From 1f5226b4882542545bec204d81f04171059566ee Mon Sep 17 00:00:00 2001 From: Alcoft Date: Mon, 4 May 2026 20:58:58 +0200 Subject: [PATCH 001/139] Implemented generic multimodal chat handler. --- llama_cpp/llama.py | 12 +++++++++ llama_cpp/llama_chat_format.py | 49 +++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1241f81e26..848706a90d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -85,6 +85,7 @@ class Llama: def __init__( self, model_path: str, + clip_model_path: Optional[str] = None, *, # Model Params n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", @@ -608,6 +609,17 @@ def __init__( if self.verbose: print(f"Model metadata: {self.metadata}", file=sys.stderr) + + if clip_model_path is not None: + if self.chat_handler is not None and self.verbose: + print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True) + + self.chat_handler = llama_chat_format.GenericMTMDChatHandler( + gguf_metadata = self.metadata, + clip_model_path = clip_model_path, + model_arch = None, + verbose = self.verbose + ) eos_token_id = self.token_eos() bos_token_id = self.token_bos() diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a0d8d25db4..468a73c077 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2887,10 +2887,14 @@ def __init__( raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") # Pre-compile Jinja template + if not hasattr(self, "chat_format") or self.chat_format is None: + self.chat_format = self.CHAT_FORMAT + + self._chat_format_parser_tags = [] self.chat_template = ImmutableSandboxedEnvironment( trim_blocks=True, lstrip_blocks=True, - ).from_string(self.CHAT_FORMAT) + ).from_string(self.chat_format) self._exit_stack = ExitStack() @@ -3116,6 +3120,13 @@ def _process_mtmd_prompt( 
tool_choice=tool_choice, **getattr(self, 'extra_template_arguments', {}) ) + + for tag in self._chat_format_parser_tags: + if tag not in text: + continue + + text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):] + # Replace image_url by media_marker in text for item in media_items: text = text.replace(item["url"], media_marker) @@ -3827,6 +3838,42 @@ def from_pretrained( **kwargs, ) +class GenericMTMDChatHandler(MTMDChatHandler): + def __init__( + self, + gguf_metadata: Dict[str, Any], + clip_model_path: str, + model_arch: Optional[str] = None, + verbose: bool = True, + **kwargs + ) -> None: + self.model_metadata = gguf_metadata + + self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) + self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch + + if verbose: + print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) + + if self.arch is None: + if verbose: + print("Unknown model architecture. 
Will use general/most-common tags.") + + self.arch = "unknown" + + if self.chat_format is None: + raise ValueError("Failed to get model chat template automatically.") + + super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) + + if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]: + self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"] + elif self.arch in ["gemma4"]: + self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"] + elif self.arch in ["mistral3", "mistral4", "deepseek2"]: + self._chat_format_parser_tags += ["[IMG]"] + elif verbose: + print("Warning: Could not determine chat format parser tags.", flush = True) class Llava15ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From a8d19d3bbd18890693576b1f5ed6cd0b2d487eab Mon Sep 17 00:00:00 2001 From: Alcoft Date: Mon, 4 May 2026 21:19:20 +0200 Subject: [PATCH 002/139] Used text.replace() --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 468a73c077..ab5e438d3e 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3125,7 +3125,7 @@ def _process_mtmd_prompt( if tag not in text: continue - text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):] + text = text.replace(tag, media_marker) # Replace image_url by media_marker in text for item in media_items: From 3e031d5de16d5bd81dd35ef2cc3b8e2d49fac063 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Tue, 5 May 2026 17:46:08 +0200 Subject: [PATCH 003/139] Fixed some bugs. 
--- llama_cpp/llama_chat_format.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ab5e438d3e..40491968a9 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3874,6 +3874,18 @@ def __init__( self._chat_format_parser_tags += ["[IMG]"] elif verbose: print("Warning: Could not determine chat format parser tags.", flush = True) + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) class Llava15ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From 389d0d97babca3edcf6fb74f476e306a21183b5f Mon Sep 17 00:00:00 2001 From: Alcoft Date: Tue, 5 May 2026 18:49:21 +0200 Subject: [PATCH 004/139] Implemented 'chat_handler_kwargs'. --- llama_cpp/llama.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 848706a90d..6dab44602d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -152,6 +152,7 @@ def __init__( spm_infill: bool = False, verbose: bool = True, # Extra Params + chat_handler_kwargs: Dict[str, Any] = {}, **kwargs, # type: ignore ): """Load a llama.cpp model from `model_path`. 
@@ -618,7 +619,8 @@ def __init__( gguf_metadata = self.metadata, clip_model_path = clip_model_path, model_arch = None, - verbose = self.verbose + verbose = self.verbose, + **chat_handler_kwargs ) eos_token_id = self.token_eos() From 867a579ef9440f02f9bf0849ff37e30b7fd4deda Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 02:06:52 +0800 Subject: [PATCH 005/139] Update Submodule vendor/llama.cpp e48034d..bbeb89d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e48034dfc9..bbeb89d76c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e48034dfc9e5705248fd39dc437ca887dc55a528 +Subproject commit bbeb89d76c41bc250f16e4a6fefcc9b530d6e3f3 From 3796562b0cb4397bc13b295a8d8e8433f4919005 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 02:46:42 +0800 Subject: [PATCH 006/139] Sync llama : add option to save memory in device buffers Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 416e8b9357..1efd645150 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2770,6 +2770,9 @@ def llama_state_seq_load_file( # // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba) LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 1 +# // keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load) +LLAMA_STATE_SEQ_FLAGS_ON_DEVICE = 2 + llama_state_seq_flags = ctypes.c_uint32 # LLAMA_API size_t llama_state_seq_get_size_ext( From 8eafd9edacb20fbc461081ae16b5afc8b5cd883c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 05:07:26 +0800 Subject: [PATCH 007/139] feat(HybridCheckpointCache): add `on-device` hybrid checkpoint support - Add dual-mode HybridCheckpointCache behavior Host mode keeps the existing Python-owned full checkpoint path. 
Device mode forwards checkpoint tensor payloads to llama.cpp-owned device buffers. - Add on_device support for llama.cpp sequence state APIs The cache now forwards LLAMA_STATE_SEQ_FLAGS_ON_DEVICE when requested. This aligns Python checkpoint behavior with upstream llama.cpp device-backed state storage. - Keep host checkpoint mode as the default on_device remains disabled by default for compatibility. Existing multi-checkpoint rollback behavior stays unchanged unless explicitly enabled. - Preserve multi-checkpoint history in host mode Host-backed checkpoints still store full serialized payloads in Python bytes. This keeps historical rollback safe for multi-turn reuse. - Add safe per-seq behavior for device mode Device-backed tensor payloads are owned by llama_context and keyed by seq_id. The cache now replaces old checkpoint metadata for the same seq_id before saving a new one. - Guard against stale on-device checkpoint restores Old Python checkpoint objects may outlive the current device payload. Restore now refuses stale on-device checkpoints to avoid mixing old metadata with newer device tensors. - Add shared FIFO eviction for checkpoint entries Checkpoint eviction is now handled through a common helper. This keeps max_checkpoints respected for both host metadata and device-mode metadata. - Clarify HybridCheckpoint data ownership semantics The dataclass docs now distinguish full host-side payloads from host-visible device-mode metadata. This makes it clear that Python does not own VRAM checkpoint tensors. - Improve cache_size documentation cache_size now describes host-visible memory usage. In device mode, it intentionally excludes llama_context-owned device tensor storage. - Expand save and restore diagnostics Verbose logs now include checkpoint mode, seq_id, position, count, and tracked memory usage. This should make hybrid rollback debugging much easier. 
- Rename internal state flags from _flag_partial to _flags The name now reflects that multiple sequence state flags may be combined. This is clearer now that PARTIAL_ONLY and ON_DEVICE can both be active. - Add checkpoint_on_device to Llama.__init__ Users can now enable device-backed hybrid checkpoints from the high-level Llama wrapper. The option is passed directly into HybridCheckpointCache as on_device. - Reduce default ctx_checkpoints from 32 to 16 This lowers default checkpoint memory pressure. Host mode can still be tuned higher when deeper rollback history is needed. - Document checkpoint_on_device in Llama init args The new argument explains that tensor payloads are stored in llama_context-owned device buffers. It also clarifies the tradeoff between lower device-to-host copy overhead and one active checkpoint per seq_id. - Improve hybrid cache initialization logs Llama.__init__ now prints ctx_checkpoints, checkpoint_interval, and on_device when hybrid checkpointing is enabled. This makes runtime configuration easier to verify from stderr. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 27 +++- llama_cpp/llama_cache.py | 293 ++++++++++++++++++++++++++++++++------- 2 files changed, 267 insertions(+), 53 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1241f81e26..e50e3b9a3b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -131,8 +131,9 @@ def __init__( swa_full: Optional[bool] = None, kv_unified: Optional[bool] = None, # HybridCheckpointCache Params - ctx_checkpoints: int = 32, + ctx_checkpoints: int = 16, checkpoint_interval: int = 4096, + checkpoint_on_device: bool = False, # Sampling Params last_n_tokens_size: int = 64, # Backend Params @@ -227,6 +228,7 @@ def __init__( kv_unified: use single unified KV buffer for the KV cache of all sequences ctx_checkpoints: max number of context checkpoints to create per slot (default: 16)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293) checkpoint_interval: Hybrid model checkpoint token intervals, and archiving of text with interval sizes along the way. + checkpoint_on_device: Store hybrid/recurrent checkpoint tensor payloads in llama_context-owned device buffers via LLAMA_STATE_SEQ_FLAGS_ON_DEVICE. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. numa: numa policy chat_format: String specifying the chat format to use when calling create_chat_completion. @@ -541,6 +543,7 @@ def __init__( _is_recurrent = self._model.is_recurrent() _is_hybrid = self._model.is_hybrid() _n_swa = self._model.n_swa() + # Sync llama.cpp upstream (#20291): warn swa-full is not supported for non-SWA models. if _n_swa == 0: if (self.context_params.swa_full): @@ -555,13 +558,25 @@ def __init__( if self.is_hybrid: if self.verbose: - print(f"Llama.__init__: Hybrid/Recurrent model detected." - f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, n_swa: {_n_swa}, swa_full: {self.context_params.swa_full}). 
" - f" Enabling HybridCheckpointCache(ctx_checkpoints={ctx_checkpoints}, checkpoint_interval={checkpoint_interval}).", - file=sys.stderr) + print( + f"Llama.__init__: Hybrid/Recurrent model detected. " + f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, " + f"n_swa: {_n_swa}, swa_full: {self.context_params.swa_full}). " + f"Enabling HybridCheckpointCache(" + f"ctx_checkpoints={ctx_checkpoints}, " + f"checkpoint_interval={checkpoint_interval}, " + f"on_device={checkpoint_on_device}).", + file=sys.stderr, + ) self.ctx_checkpoints = ctx_checkpoints self.checkpoint_interval = checkpoint_interval - self._hybrid_cache_mgr = HybridCheckpointCache(self._ctx.ctx, max_checkpoints=self.ctx_checkpoints, verbose=self.verbose) + self.checkpoint_on_device = checkpoint_on_device + self._hybrid_cache_mgr = HybridCheckpointCache( + self._ctx.ctx, + max_checkpoints=self.ctx_checkpoints, + on_device=self.checkpoint_on_device, + verbose=self.verbose, + ) else: self._hybrid_cache_mgr = None diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index dc1dd20d7c..ee37df1200 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -352,58 +352,169 @@ def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): @dataclass class HybridCheckpoint: - """Represents a single snapshot of the RNN/Hybrid model's hidden state.""" - pos: int # The token position (cursor) where this snapshot was taken - data: bytes # The raw binary RNN state data - hash_val: str # SHA-256 hash of the token prefix to ensure exact sequence matching - size: int # Size of the state data in bytes - seq_id: int # Sequence ID this checkpoint belongs to + """ + Represents a single snapshot of the Hybrid/Recurrent model state. + + Notes: + - When on_device=False, `data` contains the full host-side serialized state. + - When on_device=True, `data` contains only the host-visible portion of the + serialized state. 
The tensor payload is stored in llama_context-owned + device buffers by llama.cpp, keyed by seq_id. + """ + pos: int # The token position (cursor) where this snapshot was taken. + data: bytes # The raw binary RNN state data. + hash_val: str # SHA-256 hash of the token prefix to ensure exact sequence matching. + size: int # Number of bytes written by llama_state_seq_get_data_ext(). + seq_id: int # Sequence id used by llama.cpp state APIs. class HybridCheckpointCache(BaseLlamaCache): """ - Manager for RNN state snapshots (Checkpoints) tailored for Hybrid/Recurrent models. - Provides rollback capabilities for models that cannot physically truncate KV cache. + Checkpoint manager for Hybrid/Recurrent model states. + + This cache is designed for models whose memory cannot be safely truncated like + a regular Transformer KV cache. For recurrent/hybrid architectures, rollback is + implemented by saving and restoring sequence state snapshots. + + Two operating modes are supported: + + 1. Host mode: on_device=False + - Full checkpoint payload is materialized as Python bytes. + - Multiple checkpoints per seq_id are safe. + - This mode is suitable for multi-turn rollback and longer conversation reuse. + + 2. Device mode: on_device=True + - LLAMA_STATE_SEQ_FLAGS_ON_DEVICE is forwarded to llama.cpp. + - Tensor payloads are stored in llama_context-owned device buffers. + - The device buffers are created per seq_id in llama.cpp. + - Therefore only one active checkpoint per seq_id is safe. + - This mode is suitable for fast speculative / branch rollback where avoiding + device-to-host tensor copies is more important than keeping many historical + checkpoints. + + Important: + Do not treat on_device=True as "Python owns a VRAM checkpoint". Python only + owns the host-visible serialized portion. The tensor payload lives inside the + llama_context and is keyed by seq_id. 
""" - def __init__(self, ctx: llama_cpp_lib.llama_context_p, max_checkpoints: int = 16, verbose: bool = False): + def __init__( + self, + ctx: llama_cpp_lib.llama_context_p, + max_checkpoints: int = 16, + on_device: bool = False, + verbose: bool = False + ): + """ + Args: + ctx (llama_context_p): + Borrowed llama.cpp context pointer used by the state sequence APIs. + This cache does not own the context and must not free it. + + max_checkpoints(int): Maximum number of Python-side checkpoint entries to keep. + - Host mode: This is the maximum number of historical checkpoints across all seq_ids. + - Device mode: This is still a global upper bound for Python-side metadata entries, + but this class also enforces at most one active checkpoint per seq_id, + because llama.cpp stores device tensor payloads per seq_id. + + on_device(bool): Whether to request llama.cpp to keep tensor checkpoint payloads in + context-owned device buffers via LLAMA_STATE_SEQ_FLAGS_ON_DEVICE. + + verbose(bool): Enables diagnostic logging to stderr for checkpoint save/restore/eviction. + """ if ctx is None: - raise ValueError("HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with model context") + raise ValueError("HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with a null model context") self._ctx = ctx + self.on_device = on_device + self.verbose = verbose + + # In host mode, max_checkpoints means "maximum number of Python-owned + # checkpoints across all seq_ids". + # + # In device mode, llama.cpp stores tensor payloads in device buffers keyed + # by seq_id. Multiple Python checkpoint metadata entries for the same seq_id + # would point to the same mutable device-side slot, so only one checkpoint + # per seq_id is safe. self.max_checkpoints = max_checkpoints + + # Python-side checkpoint registry. + # + # Host mode: + # Each HybridCheckpoint owns a full serialized checkpoint payload. 
+ # + # Device mode: + # Each HybridCheckpoint owns only the host-visible serialized portion. + # The corresponding tensor payload is owned by llama_context. self.checkpoints: list[HybridCheckpoint] = [] + + # Total Python-tracked checkpoint size in bytes. + # + # Host mode: + # Roughly equals the total serialized checkpoint payload size. + # + # Device mode: + # Tracks only the host-visible part returned by llama.cpp, not the + # context-owned device tensor storage. self._current_size = 0 - # Cache C-type API function pointers for performance + # Cache C API function pointers for faster repeated calls. self._get_size_ext = llama_cpp_lib.llama_state_seq_get_size_ext self._get_data_ext = llama_cpp_lib.llama_state_seq_get_data_ext self._set_data_ext = llama_cpp_lib.llama_state_seq_set_data_ext - self._flag_partial = llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY - self.verbose = verbose - - if self.max_checkpoints <= 0: - if self.verbose: - import sys - print("HybridCheckpointCache(__init__): Cache is DISABLED (max_checkpoints <= 0). " - "Rollback capabilities are turned off. This is optimal for single-turn workflows.", - file=sys.stderr) + # State serialization flags forwarded to llama.cpp. + # + # LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY: + # Save only the sequence-specific / partial state needed for recurrent + # rollback instead of a full context state. + # + # LLAMA_STATE_SEQ_FLAGS_ON_DEVICE: + # Ask llama.cpp to store tensor payloads in context-owned device buffers. + self._flags = llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY + if on_device: + self._flags |= llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_ON_DEVICE + + if self.max_checkpoints <= 0 and self.verbose: + print("HybridCheckpointCache(__init__): Cache is DISABLED (max_checkpoints <= 0). " + "Rollback capabilities are turned off. 
This is optimal for single-turn workflows.", + file=sys.stderr) + + if self.on_device and self.max_checkpoints > 1 and self.verbose: + print( + "HybridCheckpointCache(__init__): on_device=True stores tensor payloads " + "in llama_context-owned device buffers keyed by seq_id. Multiple " + "historical checkpoints for the same seq_id are unsafe, so this cache " + "will keep only one checkpoint per seq_id.", + file=sys.stderr, + ) @property def cache_size(self) -> int: - """Returns the total memory used by all stored checkpoints in bytes.""" + """ + Returns the host-visible checkpoint size tracked by Python. + + In host mode, this is close to the full serialized checkpoint payload size. + In device mode, this is only the host-visible metadata/payload size returned + by llama.cpp. Device-side tensor storage is owned by llama_context and is not + fully represented by this number. + """ return self._current_size def clear(self): - """Clears all stored checkpoints and resets memory tracking.""" + """ + Clears Python-side checkpoint metadata. + + This does not explicitly release llama_context-owned device buffers. The + device buffers are managed by llama.cpp and are associated with the context. + """ if not self.checkpoints: # Empty Checkpoint: Return immediately, no need to clear. 
return self.checkpoints.clear() self._current_size = 0 if self.verbose: - print("HybridCheckpointCache: cleared") + print("HybridCheckpointCache(clear): cleared", file=sys.stderr) def close(self): - self.checkpoints = None + self.clear() self._ctx = None self._get_size_ext = None self._get_data_ext = None @@ -421,23 +532,72 @@ def _hash_prefix(self, tokens: List[int], length: int) -> str: """ if length <= 0: return "empty" - tokens_size = len(tokens) - if length > tokens_size: - length = tokens_size + length = min(length, len(tokens)) data = array.array('i', tokens[:length]).tobytes() return hashlib.sha256(data).hexdigest()[:32] + def _replace_checkpoint_for_seq_id(self, seq_id: int) -> None: + """ + Removes all Python-side checkpoints for one seq_id. + + Required for on_device=True because llama.cpp stores the device tensor + payload per seq_id, not per Python checkpoint object. + """ + kept: list[HybridCheckpoint] = [] + removed_size = 0 + + for cp in self.checkpoints: + if cp.seq_id == seq_id: + removed_size += cp.size + else: + kept.append(cp) + + self.checkpoints = kept + self._current_size -= removed_size + if self._current_size < 0: + self._current_size = 0 + + def _evict_checkpoints_if_needed(self) -> None: + """ + Evicts old checkpoints if needed + + Host mode: + This evicts full Python-owned checkpoint payloads, so FIFO historical + checkpoints are safe and useful. + + Device mode: + This evicts Python-side metadata only. The device tensor payload is owned + by llama_context and is keyed by seq_id. 
+ """ + while len(self.checkpoints) > self.max_checkpoints: + old_cp = self.checkpoints.pop(0) + self._current_size -= old_cp.size + if self._current_size < 0: + self._current_size = 0 + + if self.verbose: + print( + f"HybridCheckpointCache: evicted checkpoint " + f"seq_id={old_cp.seq_id}, pos={old_cp.pos}", + file=sys.stderr, + ) + def find_best_checkpoint(self, tokens: List[int], seq_id: int = 0) -> Optional[HybridCheckpoint]: """ Finds the longest valid checkpoint that perfectly matches the provided token prefix. + + The hash check prevents restoring a checkpoint that has the same length but + belongs to a different prompt/history. + Returns None if no matching checkpoint is found. """ # Empty Checkpoint: Instant return, no hash calculation needed. if self.max_checkpoints <= 0 or len(self.checkpoints) == 0: return None - best_cp = None + best_cp: Optional[HybridCheckpoint] = None best_pos = -1 + for cp in self.checkpoints: if cp.seq_id != seq_id or cp.pos > len(tokens): # Skip if sequence ID mismatches or checkpoint is longer than the current prompt @@ -475,9 +635,17 @@ def save_checkpoint( file=sys.stderr) return False - flags = self._flag_partial + # In on-device mode, remove old Python metadata for this seq_id before saving + # the new checkpoint. The underlying llama.cpp device buffer for this seq_id + # will be overwritten by the get_data_ext() call. + if self.on_device: + self._replace_checkpoint_for_seq_id(seq_id) + + flags = self._flags - # 1. Query the required buffer size from the underlying C++ context + # 1. Query the required host-visible buffer size. + # In on_device mode this may exclude the large tensor payload + # that stays in device memory. size = self._get_size_ext(self._ctx, seq_id, flags) if size == 0: if self.verbose: @@ -487,9 +655,14 @@ def save_checkpoint( # 2. 
Allocate buffer and extract raw state data buffer = (ctypes.c_uint8 * size)() n_written = self._get_data_ext(self._ctx, buffer, size, seq_id, flags) + if n_written != size: if self.verbose: - print(f"HybridCheckpointCache(save_checkpoint): get failed {n_written}/{size}") + print( + f"HybridCheckpointCache(save_checkpoint): get_data_ext failed " + f"({n_written}/{size})", + file=sys.stderr, + ) return False # Note: This deep copy isolates the state from subsequent C++ backend mutations @@ -506,19 +679,18 @@ def save_checkpoint( ) self._current_size += n_written - # 4. Enforce capacity limits (FIFO eviction) - while len(self.checkpoints) > self.max_checkpoints: - if not self.checkpoints: - break - old_cp = self.checkpoints.pop(0) - self._current_size -= old_cp.size - if self.verbose: - print(f"HybridCheckpointCache(save_checkpoint): evicted pos={old_cp.pos}") + # 4. Evicts old checkpoints if needed + self._evict_checkpoints_if_needed() if self.verbose: - print(f"HybridCheckpointCache(save_checkpoint): Saved checkpoint at pos {current_pos} ({size / 1024 / 1024:.2f} MiB) " - f"total={len(self.checkpoints)} used={self._current_size / 1024 / 1024:.2f} MiB", - file=sys.stderr) + mode = "device" if self.on_device else "host" + print( + f"HybridCheckpointCache(save_checkpoint): saved {mode} checkpoint " + f"seq_id={seq_id}, pos={current_pos}, size={size / 1024 / 1024:.2f} MiB, " + f"hcc_count={len(self.checkpoints)}, " + f"hcc_mem_used={self._current_size / 1024 / 1024:.2f} MiB", + file=sys.stderr, + ) return True @@ -531,17 +703,38 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: if self.verbose: print(f"HybridCheckpointCache(restore_checkpoint): [Error] Sequence ID mismatch: checkpoint has {cp.seq_id}, requested {seq_id}", file=sys.stderr) return False - flags = self._flag_partial - # 2. Verify the underlying C++ context still expects the exact same state size. + # 2. Guard against stale on-device checkpoint objects. 
+ # + # In on_device mode, Python does not own the full checkpoint tensor payload. + # llama.cpp keeps the large tensor payload in llama_context-owned device + # buffers keyed by seq_id. Saving a newer checkpoint for the same seq_id may + # overwrite that device-side payload while an old HybridCheckpoint object can + # still exist outside this cache. + # + # Only checkpoint objects still tracked by this cache are considered valid. + # This avoids restoring old Python metadata together with newer device tensors. + if self.on_device and cp not in self.checkpoints: + if self.verbose: + print( + "HybridCheckpointCache(restore_checkpoint): stale on-device checkpoint; " + "refusing restore because device payload may have been overwritten.", + file=sys.stderr, + ) + return False + + flags = self._flags + + # 3. Verify the underlying C++ context still expects the exact same state size. # This prevents buffer overflows if the backend context was unexpectedly altered or reallocated. current_size = self._get_size_ext(self._ctx, seq_id, flags) if current_size != cp.size: if self.verbose: - print(f"HybridCheckpointCache(restore_checkpoint): [Warning] State size mismatch before restore: expected {cp.size}, got {current_size} -> possible invalidation") + print(f"HybridCheckpointCache(restore_checkpoint): [Warning] State size mismatch before restore: " + f"expected checkpoint size={cp.size}, got current size={current_size} -> possible invalidation") return False - # 3. Copy data back to a ctypes buffer and push to the C++ backend + # 4. 
Copy data back to a ctypes buffer and push to the C++ backend buffer = (ctypes.c_uint8 * cp.size).from_buffer_copy(cp.data) ret = self._set_data_ext( self._ctx, buffer, cp.size, seq_id, flags @@ -549,7 +742,13 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: success = (ret == cp.size) if self.verbose: - print(f"HybridCheckpointCache(restore_checkpoint): restore {'OK' if success else 'FAIL'} pos={cp.pos}") + mode = "device" if self.on_device else "host" + print( + f"HybridCheckpointCache(restore_checkpoint): restore " + f"{'OK' if success else 'FAIL'} " + f"mode={mode}, seq_id={seq_id}, pos={cp.pos}", + file=sys.stderr, + ) return success # Disable BaseLlamaCache Dictionary Interfaces From 54115b4e86c5ffedc2e84237ff03b2473570d756 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 05:08:14 +0800 Subject: [PATCH 008/139] Update /docs/wiki/core/Llama.md for `on_device` option Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 52 ++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index 7a9b7bd6ad..0354d86150 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -4,13 +4,13 @@ title: Llama Class module_name: llama_cpp.llama source_file: llama_cpp/llama.py class_name: Llama -last_updated: 2026-05-01 +last_updated: 2026-05-06 version_target: "latest" --- ``` ## Overview -The `Llama` class is the core, high-level Python wrapper for a `llama.cpp` model. It handles model loading, memory management (KV cache), tokenization, and generation (both base text completion and chat formatting). It includes advanced features like dynamic LoRA routing, hybrid model checkpointing, speculative decoding, and context shifting. +The `Llama` class is the core, high-level Python wrapper for a `llama.cpp` model. 
It handles model loading, memory management (KV cache), tokenization, and generation (both base text completion and chat formatting). It includes advanced features like dynamic LoRA routing, dual-mode hybrid/recurrent checkpointing, speculative decoding, and context shifting. ## Constructor (`__init__`) @@ -51,8 +51,9 @@ Initialize the model and context. Note that model loading will immediately alloc | `chat_format` | `str` | `None` | String specifying the chat template (e.g., `"llama-2"`, `"chatml"`). Guessed from GGUF if None. | | `chat_handler` | `LlamaChatCompletionHandler` | `None` | Optional custom handler. See [[ChatHandlers]]. | | `draft_model` | `LlamaDraftModel` | `None` | Optional draft model for speculative decoding. | -| `ctx_checkpoints` | `int` | `32` | Max context checkpoints per slot (Hybrid/SWA models). | -| `checkpoint_interval`| `int`| `4096` | Token interval for saving Hybrid model checkpoints. | +| `ctx_checkpoints` | `int` | `16` | Max hybrid/recurrent context checkpoints to keep. Set to `0` to disable checkpointing for single-turn fast paths. | +| `checkpoint_interval` | `int` | `4096` | Token interval for saving periodic Hybrid/Recurrent checkpoints during long prompt evaluation. | +| `checkpoint_on_device` | `bool` | `False` | Store Hybrid/Recurrent checkpoint tensor payloads in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. Reduces device-to-host copy overhead, but only one active checkpoint per `seq_id` is safe. | *(Note: There are numerous additional RoPE/YaRN scaling parameters available for specialized context extension. Refer to the source code for the full list).* @@ -189,18 +190,41 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn 5. **Hybrid & Recurrent Architectures**: - The class natively detects Hybrid/Recurrent models (like LFM2VL/LFM2.5VL, Qwen3.5/3.6, Mamba or specialized SWA models(Gemma3/4)) and automatically enables the `HybridCheckpointCache`. 
This creates periodic save-states during large context pre-filling, allowing the model to roll back seamlessly if a generation is rejected (e.g., speculative decoding mismatches) without corrupting the recurrent state. + The class natively detects Hybrid/Recurrent models (for example LFM2VL/LFM2.5VL, Qwen3.5/3.6, Mamba, RWKV, or specialized SWA models such as Gemma3/4) and automatically enables the `HybridCheckpointCache`. - * Tips: If you are using hybrid multimodal model for building ComfyUI nodes or running single-turn API wrappers where you do not need multi-turn state rollbacks, simply initialize your Llama instance with `ctx_checkpoints=0`: + Unlike regular Transformer KV caches, Hybrid/Recurrent model memory cannot always be safely truncated token-by-token. The wrapper therefore saves periodic sequence-state checkpoints during long context prefill, allowing rollback to a verified prefix without corrupting recurrent state. + + `HybridCheckpointCache` supports two checkpoint storage modes: + + - **Host checkpoint mode** (`checkpoint_on_device=False`, default): checkpoint payloads are serialized into Python-owned bytes. This supports multiple historical checkpoints per `seq_id`, which is useful for multi-turn reuse and deeper rollback history. + - **Device checkpoint mode** (`checkpoint_on_device=True`): checkpoint tensor payloads are stored in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. Python only keeps the host-visible serialized portion. This reduces device-to-host tensor copy overhead, but only one active checkpoint per `seq_id` is safe because device payloads are keyed by `seq_id`. 
+ + *Tips*: If you are using a hybrid multimodal model for ComfyUI nodes or single-turn API wrappers where you do not need multi-turn state rollback, initialize your `Llama` instance with `ctx_checkpoints=0`: + + ```python + llm = Llama( + model_path="./Qwen3.5-VL-9B.gguf", + chat_handler=MTMDChatHandler(clip_model_path="./mmproj.gguf"), + n_ctx=4096, + ctx_checkpoints=0 # Disable checkpoints for zero-latency single-turn fast paths + ) + ``` + + For long prompts on GPU-backed Hybrid/Recurrent models, you can enable device-backed checkpoints to reduce device-to-host copy overhead: + + ```python + llm = Llama( + model_path="./Qwen3.6-27B.gguf", + n_ctx=32768, + n_gpu_layers=-1, + ctx_checkpoints=16, + checkpoint_interval=4096, + checkpoint_on_device=True + ) + ``` + + Use `checkpoint_on_device=False` if you need multiple historical checkpoints for the same `seq_id`. Use `checkpoint_on_device=True` when fast rollback/checkpointing is more important than keeping many historical checkpoint payloads. - ```python - llm = Llama( - model_path="./Qwen3.5-VL-9B.gguf", - chat_handler=MTMDChatHandler(clip_model_path="./mmproj.gguf"), - n_ctx=4096, - ctx_checkpoints=0 # <-- SET THIS TO 0 TO ENABLE ZERO-LATENCY FAST PATH - ) - ``` 6. **Assistant Prefill**: `llama-cpp-python` supports native **Assistant Prefill** for seamless message continuation. You can now simply use the `assistant_prefill=True` parameter in the `create_chat_completion` function. 
From f8d88b014889f7592d6fc3caaf23cdb9e3b088f2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 05:22:36 +0800 Subject: [PATCH 009/139] Update /docs/wiki/modules/LlamaCache.md for `on_device` option Signed-off-by: JamePeng --- docs/wiki/modules/LlamaCache.md | 285 +++++++++++++++++++++++--------- 1 file changed, 205 insertions(+), 80 deletions(-) diff --git a/docs/wiki/modules/LlamaCache.md b/docs/wiki/modules/LlamaCache.md index 64e6bbb5f8..d1db0a2097 100644 --- a/docs/wiki/modules/LlamaCache.md +++ b/docs/wiki/modules/LlamaCache.md @@ -2,7 +2,7 @@ title: Llama Cache module_name: llama_cpp.llama_cache source_file: llama_cpp/llama_cache.py -last_updated: 2026-05-02 +last_updated: 2026-05-06 version_target: "latest" --- @@ -21,10 +21,10 @@ It defines several cache classes: | `BaseLlamaCache` | Abstract base class for llama.cpp state caches. | | `LlamaRAMCache` | In-memory LRU cache for `LlamaState` objects. | | `LlamaDiskCache` | Disk-backed cache using the `diskcache` library. | -| `LlamaTrieCache` | Trie-based cache optimized for fast longest-prefix lookup. | -| `HybridCheckpointCache` | Checkpoint manager for RNN/Hybrid model hidden states. | -| `HybridCheckpoint` | Dataclass representing one saved hybrid model checkpoint. | | `TrieNode` | Internal trie node used by `LlamaTrieCache`. | +| `LlamaTrieCache` | Trie-based cache optimized for fast longest-prefix lookup. | +| `HybridCheckpoint` | Dataclass representing one saved Hybrid/Recurrent checkpoint and its host-visible payload. | +| `HybridCheckpointCache` | Checkpoint manager for Hybrid/Recurrent model state snapshots, with host and device-backed modes. | The public compatibility alias is: @@ -910,7 +910,7 @@ from llama_cpp.llama_cache import LlamaTrieCache as LlamaCache ## Overview -`HybridCheckpoint` is a dataclass representing one saved snapshot of a Hybrid or recurrent model's hidden state. +`HybridCheckpoint` is a dataclass representing one saved snapshot of a Hybrid or Recurrent model state. 
It is used by `HybridCheckpointCache`. @@ -920,9 +920,14 @@ Defined in: `llama_cpp/llama_cache.py` ## Role in the API -Hybrid or recurrent models may require hidden-state rollback rather than standard KV-cache truncation. +Hybrid or recurrent models may require sequence-state rollback rather than standard KV-cache truncation. + +`HybridCheckpoint` stores the checkpoint position, prefix verification hash, sequence id, and the serialized checkpoint payload visible to Python. -`HybridCheckpoint` stores enough metadata to verify and restore a specific recurrent state snapshot. +Its `data` field has different ownership semantics depending on the cache mode: + +* In host mode (`on_device=False`), `data` contains the full host-side serialized checkpoint state. +* In device mode (`on_device=True`), `data` contains only the host-visible serialized portion. The large tensor payload is stored in `llama_context`-owned device buffers by llama.cpp, keyed by `seq_id`. --- @@ -936,19 +941,19 @@ class HybridCheckpoint: hash_val: str size: int seq_id: int -``` +```` --- ## Fields -| Field | Type | Description | -| ---------- | ------- | --------------------------------------------------------------- | -| `pos` | `int` | Token position where this checkpoint was taken. | -| `data` | `bytes` | Raw binary RNN or Hybrid model state data. | -| `hash_val` | `str` | SHA-256 hash prefix used to verify exact token-prefix matching. | -| `size` | `int` | Size of the state data in bytes. | -| `seq_id` | `int` | Sequence ID associated with this checkpoint. | +| Field | Type | Description | +| ---------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `pos` | `int` | Token position where this checkpoint was taken. | +| `data` | `bytes` | Serialized checkpoint payload visible to Python. In host mode this is the full state; in device mode this is only the host-visible portion. 
| +| `hash_val` | `str` | SHA-256 hash prefix used to verify exact token-prefix matching. | +| `size` | `int` | Number of bytes written by `llama_state_seq_get_data_ext`. | +| `seq_id` | `int` | Sequence id used by llama.cpp sequence-state APIs. | --- @@ -958,23 +963,33 @@ class HybridCheckpoint: Users usually do not need to instantiate this dataclass manually. +In device mode, old `HybridCheckpoint` Python objects may become stale if a newer checkpoint is saved for the same `seq_id`, because the device-side tensor payload is keyed by `seq_id` and may be overwritten. + --- # `HybridCheckpointCache` ## Overview -`HybridCheckpointCache` manages RNN or Hybrid model hidden-state checkpoints. +`HybridCheckpointCache` manages Hybrid/Recurrent model state checkpoints. + +It is designed for models whose memory cannot always be safely truncated like a regular Transformer KV cache. Instead, rollback is implemented by saving and restoring sequence-state snapshots through llama.cpp state APIs. + +The cache supports two operating modes: -It is designed for models that cannot physically truncate KV cache in the same way as standard transformer-only models. +1. **Host mode** (`on_device=False`) -Instead of implementing dictionary-style cache operations, it provides explicit checkpoint operations: + * Full checkpoint payloads are materialized as Python-owned `bytes`. + * Multiple historical checkpoints per `seq_id` are safe. + * This is the default mode and is useful for multi-turn rollback or deeper prefix reuse. -* `save_checkpoint` -* `find_best_checkpoint` -* `restore_checkpoint` -* `clear` -* `close` +2. **Device mode** (`on_device=True`) + + * `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` is forwarded to llama.cpp. + * Tensor payloads are stored in `llama_context`-owned device buffers. + * Python keeps only the host-visible serialized portion. + * Only one active checkpoint per `seq_id` is safe because device payloads are keyed by `seq_id`. 
+ * This mode can reduce device-to-host copy overhead during checkpoint save/restore. Defined in: `llama_cpp/llama_cache.py` @@ -984,12 +999,14 @@ Defined in: `llama_cpp/llama_cache.py` `HybridCheckpointCache` is a specialized cache manager for Hybrid/Recurrent model rollback. -It stores raw state snapshots extracted from the llama.cpp backend through low-level C API functions: +It stores host-visible checkpoint data extracted from the llama.cpp backend through low-level C API functions: * `llama_state_seq_get_size_ext` * `llama_state_seq_get_data_ext` * `llama_state_seq_set_data_ext` +When `on_device=True`, tensor payloads are not treated as Python-owned bytes. They are stored by llama.cpp in `llama_context`-owned device buffers, while Python keeps the host-visible serialized portion and checkpoint metadata. + It is not a drop-in replacement for `LlamaRAMCache`, `LlamaDiskCache`, or `LlamaTrieCache`. --- @@ -1001,16 +1018,18 @@ def __init__( self, ctx: llama_cpp_lib.llama_context_p, max_checkpoints: int = 16, + on_device: bool = False, verbose: bool = False ): ... ``` -| Parameter | Type | Default | Required | Description | -| ----------------- | ------------------------------- | ------: | -------: | ------------------------------------------------------------------------------------------- | -| `ctx` | `llama_cpp_lib.llama_context_p` | — | Yes | Low-level llama.cpp context pointer. Required for extracting and restoring sequence state. | -| `max_checkpoints` | `int` | `16` | No | Maximum number of checkpoints to retain. If set to `0` or below, checkpointing is disabled. | -| `verbose` | `bool` | `False` | No | Enables diagnostic messages printed to `stderr`. 
| +| Parameter | Type | Default | Required | Description | +| ----------------- | ------------------------------- | ------: | -------: | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `ctx` | `llama_cpp_lib.llama_context_p` | — | Yes | Borrowed low-level llama.cpp context pointer used for sequence-state save/restore. The cache does not own or free this context. | +| `max_checkpoints` | `int` | `16` | No | Maximum number of Python-side checkpoint entries to retain. If set to `0` or below, checkpointing is disabled. | +| `on_device` | `bool` | `False` | No | Whether to request llama.cpp to store checkpoint tensor payloads in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. | +| `verbose` | `bool` | `False` | No | Enables diagnostic messages printed to `stderr`. | --- @@ -1018,32 +1037,26 @@ def __init__( The constructor raises `ValueError` if `ctx` is `None`. -```python -if ctx is None: - raise ValueError( - "HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with model context" - ) -``` +If `max_checkpoints <= 0`, checkpointing is disabled. In verbose mode, the cache reports that rollback capabilities are turned off. This mode is intended to avoid expensive state extraction for single-turn workflows. -If `max_checkpoints <= 0`, checkpointing is disabled. In verbose mode, the cache reports that rollback capabilities are turned off. - -This mode is intended to avoid expensive state extraction for single-turn workflows. +When `on_device=True`, the cache forwards `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` to llama.cpp. In this mode, the cache keeps only one active checkpoint per `seq_id` by replacing old Python-side checkpoint metadata before saving a new checkpoint for the same `seq_id`. 
--- ## Instance Variables -| Name | Type | Description | -| ----------------- | ------------------------------- | ------------------------------------------------------------------------------------------------ | -| `_ctx` | `llama_cpp_lib.llama_context_p` | Low-level llama.cpp context pointer used for state extraction and restoration. | -| `max_checkpoints` | `int` | Maximum number of checkpoints retained. Values less than or equal to zero disable checkpointing. | -| `checkpoints` | `list[HybridCheckpoint]` | Stored checkpoint objects. | -| `_current_size` | `int` | Total memory used by all stored checkpoints in bytes. | -| `_get_size_ext` | Callable | Cached reference to `llama_state_seq_get_size_ext`. | -| `_get_data_ext` | Callable | Cached reference to `llama_state_seq_get_data_ext`. | -| `_set_data_ext` | Callable | Cached reference to `llama_state_seq_set_data_ext`. | -| `_flag_partial` | int | Cached value of `LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY`. | -| `verbose` | `bool` | Enables debug output. | +| Name | Type | Description | +| ----------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `_ctx` | `llama_cpp_lib.llama_context_p` | Borrowed llama.cpp context pointer used for state extraction and restoration. | +| `on_device` | `bool` | Whether `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` is forwarded to llama.cpp state APIs. | +| `verbose` | `bool` | Enables debug output. | +| `max_checkpoints` | `int` | Maximum number of Python-side checkpoint entries retained. Values less than or equal to zero disable checkpointing. | +| `checkpoints` | `list[HybridCheckpoint]` | Python-side checkpoint registry. In host mode, entries own full checkpoint payloads. In device mode, entries own only host-visible metadata/payload portions. | +| `_current_size` | `int` | Python-tracked host-visible checkpoint size in bytes. 
In device mode, this does not include `llama_context`-owned device tensor storage. | +| `_get_size_ext` | Callable | Cached reference to `llama_state_seq_get_size_ext`. | +| `_get_data_ext` | Callable | Cached reference to `llama_state_seq_get_data_ext`. | +| `_set_data_ext` | Callable | Cached reference to `llama_state_seq_set_data_ext`. | +| `_flags` | `int` | Combined llama.cpp sequence-state flags, always including `LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY` and optionally `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. | --- @@ -1057,7 +1070,11 @@ def cache_size(self) -> int: return self._current_size ``` -Returns the total memory used by stored checkpoints in bytes. +Returns the Python-tracked host-visible checkpoint size in bytes. + +In host mode, this is close to the full serialized checkpoint payload size. + +In device mode, this reports only the host-visible portion returned by llama.cpp. It does not include `llama_context`-owned device tensor storage. --- @@ -1070,14 +1087,16 @@ def clear(self): ... ``` -Clears all stored checkpoints and resets `_current_size` to `0`. +Clears Python-side checkpoint metadata and resets `_current_size` to `0`. If the checkpoint list is already empty, it returns immediately. +In device mode, this does not explicitly release `llama_context`-owned device buffers. Those buffers are managed by llama.cpp and are associated with the context. + In verbose mode, it prints: ```text -HybridCheckpointCache: cleared +HybridCheckpointCache(clear): cleared ``` --- @@ -1089,15 +1108,15 @@ def close(self): ... ``` -Releases references held by the cache. +Releases Python-side checkpoint metadata and detaches cached references held by the cache. Behavior: -* Sets `checkpoints` to `None`. +* Calls `clear()`. * Sets `_ctx` to `None`. * Sets cached C API function references to `None`. -This method is also called by `__del__`. +This method does not free the llama.cpp context itself, because the context is borrowed rather than owned by the cache. 
--- @@ -1133,6 +1152,50 @@ This hash is used to ensure checkpoints are restored only when the token prefix --- +### `_replace_checkpoint_for_seq_id` + +```python +def _replace_checkpoint_for_seq_id(self, seq_id: int) -> None: + ... +``` + +Removes all Python-side checkpoint entries for one `seq_id`. + +This is required in device mode because llama.cpp stores the device tensor payload per `seq_id`, not per Python checkpoint object. Keeping multiple checkpoint metadata entries for the same `seq_id` would be unsafe. + +Behavior: + +1. Iterates over all checkpoint entries. +2. Removes entries whose `seq_id` matches the requested `seq_id`. +3. Preserves entries for other sequence ids. +4. Subtracts removed checkpoint sizes from `_current_size`. +5. Clamps `_current_size` to `0` if needed. + +--- + +### `_evict_checkpoints_if_needed` + +```python +def _evict_checkpoints_if_needed(self) -> None: + ... +``` + +Evicts old checkpoint entries using FIFO order until `len(checkpoints) <= max_checkpoints`. + +In host mode, this evicts full Python-owned checkpoint payloads. + +In device mode, this evicts Python-side checkpoint metadata only. Device tensor payloads are owned by `llama_context`. + +Behavior: + +1. Checks whether the number of checkpoints exceeds `max_checkpoints`. +2. Pops the oldest checkpoint entry from the front of the list. +3. Subtracts its size from `_current_size`. +4. Clamps `_current_size` to `0` if needed. +5. Prints an eviction message in verbose mode. + +--- + ### `find_best_checkpoint` ```python @@ -1144,20 +1207,23 @@ def find_best_checkpoint( ... ``` -Finds the longest valid checkpoint matching the given token prefix and sequence ID. +Finds the longest valid checkpoint matching the given token prefix and sequence id. + +The hash check prevents restoring a checkpoint that has the same length but belongs to a different prompt/history. Returns `None` if: * Checkpointing is disabled. * There are no checkpoints. 
-* No checkpoint matches the requested sequence ID and token prefix. +* No checkpoint matches the requested sequence id and token prefix. Behavior: -1. Skips checkpoints whose `seq_id` differs. -2. Skips checkpoints whose `pos` is greater than the current token length. -3. Verifies token-prefix integrity using `_hash_prefix`. -4. Returns the checkpoint with the largest matching `pos`. +1. Returns immediately if `max_checkpoints <= 0` or no checkpoints exist. +2. Skips checkpoints whose `seq_id` differs from the requested `seq_id`. +3. Skips checkpoints whose `pos` is greater than the current token length. +4. Verifies token-prefix integrity using `_hash_prefix`. +5. Returns the checkpoint with the largest matching `pos`. --- @@ -1173,7 +1239,7 @@ def save_checkpoint( ... ``` -Extracts the current recurrent model state from the C++ backend and stores it as a `HybridCheckpoint`. +Extracts the current Hybrid/Recurrent model state from the C++ backend and stores it as a `HybridCheckpoint`. Returns `True` if the checkpoint was saved successfully. @@ -1186,20 +1252,24 @@ Returns `False` if: ### Behavior 1. Returns immediately if `max_checkpoints <= 0`. -2. Calls `_get_size_ext` to query the required state buffer size. -3. Allocates a `ctypes.c_uint8` buffer. -4. Calls `_get_data_ext` to extract state data. -5. Copies the state bytes into a Python `bytes` object. -6. Computes a hash of the token prefix. -7. Appends a new `HybridCheckpoint`. -8. Increments `_current_size`. -9. Evicts old checkpoints using FIFO order if the number of checkpoints exceeds `max_checkpoints`. +2. In device mode, removes old Python-side checkpoint metadata for the same `seq_id`. +3. Uses `_flags` to select partial-only state serialization, optionally with `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. +4. Calls `_get_size_ext` to query the required host-visible buffer size. +5. Allocates a `ctypes.c_uint8` buffer. +6. Calls `_get_data_ext` to extract the host-visible checkpoint data. +7. 
Copies the data into a Python `bytes` object. +8. Computes a hash of the token prefix. +9. Appends a new `HybridCheckpoint`. +10. Increments `_current_size`. +11. Evicts old checkpoint entries using FIFO order if the number of entries exceeds `max_checkpoints`. ### Important Performance Note The implementation intentionally bypasses checkpoint extraction when `max_checkpoints <= 0`. -This avoids potentially large synchronous VRAM-to-RAM transfers for single-turn workflows. +This avoids potentially large synchronous checkpoint extraction costs for single-turn workflows. + +When `on_device=True`, llama.cpp may keep large tensor payloads in context-owned device buffers instead of materializing them as Python-owned bytes. This can reduce device-to-host tensor copy overhead, but only one active checkpoint per `seq_id` is safe. --- @@ -1220,18 +1290,28 @@ Returns `True` if restoration succeeds. Returns `False` if: -* The checkpoint sequence ID does not match the requested `seq_id`. +* The checkpoint sequence id does not match the requested `seq_id`. +* `on_device=True` and the checkpoint object is no longer tracked by this cache. * The current backend state size differs from the checkpoint size. * The backend does not report the expected number of restored bytes. ### Behavior 1. Verifies `cp.seq_id == seq_id`. -2. Queries current expected state size from the backend. -3. Verifies it matches `cp.size`. -4. Copies checkpoint bytes into a ctypes buffer. -5. Calls `_set_data_ext` to restore the state. -6. Returns whether the number of restored bytes equals `cp.size`. +2. In device mode, rejects stale checkpoint objects that are no longer tracked by this cache. +3. Queries current expected host-visible state size from the backend. +4. Verifies it matches `cp.size`. +5. Copies checkpoint bytes into a ctypes buffer. +6. Calls `_set_data_ext` to restore the state. +7. Returns whether the number of restored bytes equals `cp.size`. 
+ +### Stale Checkpoint Guard + +In device mode, Python does not own the full checkpoint tensor payload. The large tensor payload is stored inside `llama_context` device buffers keyed by `seq_id`. + +If a newer checkpoint is saved for the same `seq_id`, an older `HybridCheckpoint` Python object may still exist outside the cache, but its device-side tensor payload may have been overwritten. + +For this reason, `restore_checkpoint` refuses on-device checkpoint objects that are no longer tracked by the cache. This avoids restoring old Python metadata together with newer device tensors. --- @@ -1270,7 +1350,7 @@ Users should use checkpoint-specific methods instead. --- -## Example +## Example: Host-backed Checkpoints ```python from llama_cpp.llama_cache import HybridCheckpointCache @@ -1279,6 +1359,7 @@ from llama_cpp.llama_cache import HybridCheckpointCache checkpoint_cache = HybridCheckpointCache( ctx=ctx, max_checkpoints=16, + on_device=False, verbose=True, ) @@ -1299,16 +1380,57 @@ if saved: print("Restored:", restored) ``` -> Note: This example assumes `ctx` is already available from lower-level llama.cpp runtime code. Most high-level users do not manually create this cache. +Host mode stores full serialized checkpoint payloads in Python-owned `bytes`. Multiple historical checkpoints per `seq_id` are safe. + +--- + +## Example: Device-backed Checkpoints + +```python +from llama_cpp.llama_cache import HybridCheckpointCache + +# `ctx` must be a valid llama.cpp context pointer. 
+checkpoint_cache = HybridCheckpointCache( + ctx=ctx, + max_checkpoints=16, + on_device=True, + verbose=True, +) + +tokens = [1, 2, 3, 4] +current_pos = len(tokens) + +saved = checkpoint_cache.save_checkpoint( + current_pos=current_pos, + tokens=tokens, + seq_id=0, +) + +if saved: + checkpoint = checkpoint_cache.find_best_checkpoint(tokens, seq_id=0) + + if checkpoint is not None: + restored = checkpoint_cache.restore_checkpoint(checkpoint, seq_id=0) + print("Restored:", restored) +``` + +In device mode, llama.cpp owns the large tensor payload in context-owned device buffers. Python keeps only the host-visible checkpoint data and metadata. + +Only one active checkpoint per `seq_id` is safe. + +> Note: These examples assume `ctx` is already available from lower-level llama.cpp runtime code. Most high-level users do not manually create this cache. Instead, they configure it through the `Llama` constructor using `ctx_checkpoints`, `checkpoint_interval`, and `checkpoint_on_device`. --- ## Best Practices * Use `HybridCheckpointCache` only for Hybrid or recurrent model workflows that require hidden-state rollback. +* Keep `on_device=False` when you need multiple historical checkpoints for the same `seq_id`. +* Use `on_device=True` when reducing device-to-host checkpoint copy overhead is more important than keeping many historical checkpoint payloads. In this mode, Python keeps only the host-visible checkpoint data and metadata (such as `seq_id` and `pos`), and only one active checkpoint per `seq_id` is safe. * Set `max_checkpoints=0` for single-turn workflows where rollback is not needed. * Keep `max_checkpoints` small if checkpoint states are large. * Use `find_best_checkpoint` before calling `restore_checkpoint`. +* Do not hold and restore old on-device `HybridCheckpoint` objects after newer checkpoints have been saved for the same `seq_id`. * Do not use dictionary-style cache access with this class. --- @@ -1319,7 +1441,10 @@ if saved: * `max_checkpoints <= 0` disables checkpointing. * Restoring a checkpoint with the wrong `seq_id` fails. 
* Restore fails if the current backend state size no longer matches the checkpoint size. -* `close()` sets internal references to `None`; the object should not be reused afterward. +* In device mode, old `HybridCheckpoint` objects can become stale after a newer checkpoint is saved for the same `seq_id`. +* In device mode, `cache_size` does not include `llama_context`-owned device tensor storage. +* `clear()` removes Python-side checkpoint metadata but does not explicitly free llama.cpp-owned device buffers. +* `close()` detaches internal references; the object should not be reused afterward. * This class is not equivalent to `LlamaCache`. --- From 156226b00ebb6482cd83e26c917ff66aec65d104 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 12 May 2026 03:46:26 +0800 Subject: [PATCH 010/139] Update Submodule vendor/llama.cpp bbeb89d..a9883db --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index bbeb89d76c..a9883db8ee 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit bbeb89d76c41bc250f16e4a6fefcc9b530d6e3f3 +Subproject commit a9883db8ee021cf16783016a60996d41820b5195 From 89e90a74ec4823efb53baadec20197a6de08db2b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 13 May 2026 08:15:03 +0800 Subject: [PATCH 011/139] Sync llama.cpp API 20260513 Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 2 ++ llama_cpp/mtmd_cpp.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 1efd645150..cc900c0648 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2763,6 +2763,8 @@ def llama_state_seq_load_file( ) -> int: ... 
+# define LLAMA_STATE_SEQ_FLAGS_NONE 0 +LLAMA_STATE_SEQ_FLAGS_NONE = 0 # // for backwards-compat LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1 diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 574d90e2bf..839c718ccd 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -723,6 +723,34 @@ def mtmd_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # type: ... +# // EXPERIMENTAL API to get mmproj's capabilities without initializing the full context +# // This is only intended to be used by llama-server, breaking changes is expected +# struct mtmd_caps { +# bool inp_vision; +# bool inp_audio; +# }; +class mtmd_caps(Structure): + _fields_ = [ + ("inp_vision", c_bool), + ("inp_audio", c_bool), + ] + + if TYPE_CHECKING: + inp_vision: c_bool + inp_audio: c_bool + + +# MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); +@ctypes_function_mtmd( + "mtmd_get_cap_from_file", [c_char_p], mtmd_caps) +def mtmd_get_cap_from_file(mmproj_fname: c_char_p) -> mtmd_caps: + """ + EXPERIMENTAL API to get mmproj's capabilities without initializing the full context. + This is only intended to be used by llama-server; breaking changes are expected. + """ + ... 
+ + # // test function, to be used in test-mtmd-c-api.c # MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); @ctypes_function_mtmd( From e67169dfd59a67ec500c38b42d0cfc41475f1051 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 14 May 2026 00:40:40 +0800 Subject: [PATCH 012/139] Implement `MiniCPMV46ChatHandler` for `MiniCPM-V-4.6` Signed-off-by: JamePeng --- README.md | 1 + llama_cpp/llama_chat_format.py | 204 +++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) diff --git a/README.md b/README.md index c9aba7d42d..cd03d217d7 100644 --- a/README.md +++ b/README.md @@ -835,6 +835,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | | [minicpm-v-4.5](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) | `MiniCPMv45ChatHandler` | `minicpm-v-4.5` | +| [minicpm-v-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6-gguf) | `MiniCPMv46ChatHandler` | `minicpm-v-4.6` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | | [gemma4](https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` | | [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a0d8d25db4..2ab627c89b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4280,6 +4280,210 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class MiniCPMV46ChatHandler(MTMDChatHandler): + """ + Handler for MiniCPM-V-4.6 models. + + Features: + - Aligned with official tokenizer_config.json special tokens. 
+ - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens. + - Integrated MTMD-style URL and Base64 injection for visual content. + - Specialized `` and `` block generation. + - Autonomously folds previous reasoning paths using `last_query_index`. + - Toggles `` block generation via `enable_thinking` (Defaults to False). + """ + + # Core tokens + MINICPM_BOS_TOKEN = "<|im_start|>" + MINICPM_EOS_TOKEN = "<|im_end|>" + MINICPM_PAD_TOKEN = "<|endoftext|>" + + # Vision tokens + MINICPM_VISION_BOS_TOKEN = "<|vision_start|>" + MINICPM_VISION_EOS_TOKEN = "<|vision_end|>" + MINICPM_IMAGE_TOKEN = "<|image_pad|>" + MINICPM_VIDEO_TOKEN = "<|video_pad|>" + + CHAT_FORMAT = ( + "{%- if enable_thinking is not defined -%}\n" + " {%- set enable_thinking = false -%}\n" + "{%- endif -%}\n" + "{%- macro render_content(content, is_system_content=false) -%}\n" + " {%- if content is string -%}\n" + " {{- content -}}\n" + " {%- elif content is iterable and content is not mapping -%}\n" + " {%- set ns = namespace(parts=[]) -%}\n" + " {%- for item in content -%}\n" + " {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n" + " {%- if is_system_content -%}\n" + " {{- raise_exception('System message cannot contain images.') -}}\n" + " {%- endif -%}\n" + " {%- set url_val = '' -%}\n" + " {%- if item.type == 'image_url' -%}\n" + " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" + " {%- endif -%}\n" + " {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n" + # " {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n" + # " {%- if is_system_content -%}\n" + # " {{- raise_exception('System message cannot contain videos.') -}}\n" + # " {%- endif -%}\n" + # " {%- set url_val = '' -%}\n" + # " {%- if item.type == 'video_url' -%}\n" + # " {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n" + # " {%- endif -%}\n" + # " {%- set ns.parts = ns.parts + 
['<|video_pad|>' + url_val] -%}\n" + " {%- elif 'text' in item -%}\n" + " {%- set ns.parts = ns.parts + [item.text] -%}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected item type in content.') -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- ns.parts | join('\\n') -}}\n" + " {%- elif content is none or content is undefined -%}\n" + " {{- '' -}}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected content type.') -}}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "{%- if not messages %}\n" + " {{- raise_exception('No messages provided.') }}\n" + "{%- endif %}\n" + "{%- if tools and tools is iterable and tools is not mapping %}\n" + " {{- '<|im_start|>system\\n' }}\n" + " {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n' }}\n" + " {%- for tool in tools %}\n" + " {{- '\\n' }}\n" + " {{- tool | tojson }}\n" + " {%- endfor %}\n" + " {{- '\\n' }}\n" + " {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n\\n\\n\\nvalue_1\\n\\n\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n\\n\\n\\n\\n\\nReminder:\\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n' }}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {%- if content %}\n" + " {{- '\\n\\n' + content }}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {{- '<|im_end|>\\n' }}\n" + "{%- else %}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {{- '<|im_start|>system\\n' + content + 
'<|im_end|>\\n' }}\n" + " {%- endif %}\n" + "{%- endif %}\n" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n" + "{%- for message in messages[::-1] %}\n" + " {%- set index = (messages|length - 1) - loop.index0 %}\n" + " {%- if ns.multi_step_tool and message.role == 'user' %}\n" + " {%- set content = render_content(message.content)|trim %}\n" + " {%- if not(content.startswith('') and content.endswith('')) %}\n" + " {%- set ns.multi_step_tool = false %}\n" + " {%- set ns.last_query_index = index %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + "{%- endfor %}\n" + "{%- if ns.multi_step_tool %}\n" + " {{- raise_exception('No user query found in messages.') }}\n" + "{%- endif %}\n" + "{%- for message in messages %}\n" + " {%- set content = render_content(message.content)|trim %}\n" + " {%- if message.role == 'system' %}\n" + " {%- if not loop.first %}\n" + " {{- raise_exception('System message must be at the beginning.') }}\n" + " {%- endif %}\n" + " {%- elif message.role == 'user' %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n" + " {%- elif message.role == 'assistant' %}\n" + " {%- set reasoning_content = '' %}\n" + " {%- if message.reasoning_content is string %}\n" + " {%- set reasoning_content = message.reasoning_content %}\n" + " {%- else %}\n" + " {%- if '' in content %}\n" + " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n" + " {%- set content = content.split('')[-1].lstrip('\\n') %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {%- set reasoning_content = reasoning_content|trim %}\n" + " {%- if loop.index0 > ns.last_query_index %}\n" + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n\\n' + content }}\n" + " {%- else %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content }}\n" + " {%- endif %}\n" + " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping 
%}\n" + " {%- for tool_call in message.tool_calls %}\n" + " {%- if tool_call.function is defined %}\n" + " {%- set tool_call = tool_call.function %}\n" + " {%- endif %}\n" + " {%- if loop.first %}\n" + " {%- if content|trim %}\n" + " {{- '\\n\\n\\n\\n' }}\n" + " {%- else %}\n" + " {{- '\\n\\n' }}\n" + " {%- endif %}\n" + " {%- else %}\n" + " {{- '\\n\\n\\n' }}\n" + " {%- endif %}\n" + " {%- if tool_call.arguments is defined %}\n" + " {%- for args_name, args_value in tool_call.arguments|items %}\n" + " {{- '\\n' }}\n" + " {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n" + " {{- args_value }}\n" + " {{- '\\n\\n' }}\n" + " {%- endfor %}\n" + " {%- endif %}\n" + " {{- '\\n' }}\n" + " {%- endfor %}\n" + " {%- endif %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- elif message.role == 'tool' %}\n" + " {%- if loop.previtem and loop.previtem.role != 'tool' %}\n" + " {{- '<|im_start|>user' }}\n" + " {%- endif %}\n" + " {{- '\\n\\n' }}\n" + " {{- content }}\n" + " {{- '\\n' }}\n" + " {%- if not loop.last and loop.nextitem.role != 'tool' %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- elif loop.last %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- endif %}\n" + " {%- else %}\n" + " {{- raise_exception('Unexpected message role.') }}\n" + " {%- endif %}\n" + "{%- endfor %}\n" + "{%- if add_generation_prompt %}\n" + " {{- '<|im_start|>assistant\\n' }}\n" + " {%- if enable_thinking is defined and enable_thinking is false %}\n" + " {{- '\\n\\n\\n\\n' }}\n" + " {%- else %}\n" + " {{- '\\n' }}\n" + " {%- endif %}\n" + "{%- endif %}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the MiniCPM-V-4.6 Handler. + + Args: + enable_thinking (bool): Controls whether to open a `` block for reasoning. + Defaults to False as per the standard template logic. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject the thinking variable into the Jinja environment + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # MiniCPM uses standard <|im_end|> ChatML stop formatting + kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + class Gemma3ChatHandler(MTMDChatHandler): GEMMA3_BOI_TOKEN = "" From 99543936f58145314cde6c7cfbf88ab119a664b5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 14 May 2026 07:17:58 +0800 Subject: [PATCH 013/139] fix(MTMDChatHandler): correct audio_url content type check and improve variable handling - Changed condition from `content == "audio_url"` to `content_type == "audio_url"` for proper type-based dispatching. - Extracted `audio_url` variable for better readability. - Converted `else` to `elif content_type == "input_audio"` to make the control flow explicit and safer. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 2ab627c89b..1c41beb40f 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2997,11 +2997,12 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") # Case A: Handle custom/forward-compatible audio_url format - if content == "audio_url": - url = content["audio_url"] if isinstance(content["audio_url"], str) else content["audio_url"]["url"] + if content_type == "audio_url": + audio_url = content["audio_url"] + url = audio_url if isinstance(audio_url, str) else audio_url["url"] media_items.append({"url": url, "type": "audio"}) # Case B: Handle OpenAI standard input_audio format - else: + elif content_type == "input_audio": input_audio = content.get("input_audio", {}) if isinstance(input_audio, dict) and "data" in input_audio: # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic From 0295d0c62fd5173e0704d33a8ced4d2c8e590d9c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 15 May 2026 04:21:06 +0800 Subject: [PATCH 014/139] Update Submodule vendor/llama.cpp a9883db..834a243 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a9883db8ee..834a243664 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a9883db8ee021cf16783016a60996d41820b5195 +Subproject commit 834a243664114487f99520370a7a7b00fc7a486f From 1fb6a6665726e6abce52959bc38f162e6a0cb2dc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 15 May 2026 07:22:11 +0800 Subject: [PATCH 015/139] feat(logger): refactor and enhance ggml logging configuration system - Introduce a `LoggerConfig` dataclass to provide 
fine-grained control over native ggml/llama.cpp runtime logging. - Align verbosity levels (0 to 5) with upstream `llama.cpp` conventions (`common/log.h`). - Implement a dynamic, configurable substring filtering system, replacing the hardcoded "CUDA Graph" patch with `DEFAULT_LOG_FILTERS`. - Add comprehensive public APIs for log management: `configure_logging`, `set_verbosity`, `set_quiet`, `set_silent`, `set_log_filters`, and `add_log_filters`. - Maintain backwards compatibility for the existing `set_verbose(bool)` function. - Improve the `ggml_log_callback` to correctly handle `GGML_LOG_LEVEL_CONT` by inheriting the verbosity of the preceding log message. - Route `GGML_LOG_LEVEL_NONE` to `stdout` and all other diagnostic logs to `stderr` by default. Signed-off-by: JamePeng --- llama_cpp/_logger.py | 406 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 383 insertions(+), 23 deletions(-) diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 015cec9faa..7669e2a722 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -1,6 +1,9 @@ import sys import ctypes import logging +from dataclasses import dataclass, field +from typing import Iterable, Optional, TextIO, Union + import llama_cpp._ggml as _ggml import llama_cpp.llama_cpp as llama_cpp_lib @@ -12,42 +15,399 @@ # GGML_LOG_LEVEL_DEBUG = 4, # GGML_LOG_LEVEL_CONT = 5, // continue previous log # }; -GGML_LOG_LEVEL_TO_LOGGING_LEVEL = { - 0: logging.CRITICAL, - 1: logging.INFO, - 2: logging.WARNING, - 3: logging.ERROR, - 4: logging.DEBUG, - 5: logging.DEBUG, +GGML_LOG_LEVEL_NONE = 0 +GGML_LOG_LEVEL_INFO = 1 +GGML_LOG_LEVEL_WARN = 2 +GGML_LOG_LEVEL_ERROR = 3 +GGML_LOG_LEVEL_DEBUG = 4 +GGML_LOG_LEVEL_CONT = 5 + +# common/log.h model: +# +# LOG_LEVEL_OUTPUT = 0 +# LOG_LEVEL_ERROR = 1 +# LOG_LEVEL_WARN = 2 +# LOG_LEVEL_INFO = 3 +# LOG_LEVEL_TRACE = 4 +# LOG_LEVEL_DEBUG = 5 +# +# Rule: +# +# event_verbosity <= verbosity_threshold => print +# +# Larger threshold means more verbose output. 
+# +LOG_LEVEL_OUTPUT = 0 +LOG_LEVEL_ERROR = 1 +LOG_LEVEL_WARN = 2 +LOG_LEVEL_INFO = 3 +LOG_LEVEL_TRACE = 4 +LOG_LEVEL_DEBUG = 5 + +LOG_DEFAULT_LLAMA = LOG_LEVEL_INFO +LOG_DEFAULT_DEBUG = LOG_LEVEL_DEBUG + +# Match the updated common_log_default_callback behavior: +# INFO -> TRACE +# CONT -> TRACE +# +# NOTE: this table only records the upstream mapping for reference; +# ggml_log_callback resolves levels through _get_verbosity(), which maps INFO to +# LOG_LEVEL_INFO, so backend INFO logs are still shown at verbosity=3. +GGML_LEVEL_TO_VERBOSITY = { + GGML_LOG_LEVEL_NONE: LOG_LEVEL_OUTPUT, + GGML_LOG_LEVEL_ERROR: LOG_LEVEL_ERROR, + GGML_LOG_LEVEL_WARN: LOG_LEVEL_WARN, + GGML_LOG_LEVEL_INFO: LOG_LEVEL_TRACE, + GGML_LOG_LEVEL_DEBUG: LOG_LEVEL_DEBUG, + GGML_LOG_LEVEL_CONT: LOG_LEVEL_TRACE, # fallback only; CONT inherits previous +} + +GGML_LEVEL_TO_PYTHON_LEVEL = { + GGML_LOG_LEVEL_NONE: logging.INFO, + GGML_LOG_LEVEL_ERROR: logging.ERROR, + GGML_LOG_LEVEL_WARN: logging.WARNING, + GGML_LOG_LEVEL_INFO: logging.INFO, + GGML_LOG_LEVEL_DEBUG: logging.DEBUG, + GGML_LOG_LEVEL_CONT: logging.INFO, # fallback only; CONT inherits previous } + +# Default substring filters. +# +# These are intentionally simple substring filters instead of hard-coded +# special branches. Users can replace or clear them with set_log_filters(). +DEFAULT_LOG_FILTERS = [ + "CUDA Graph", + "CUDA graph" +] + + +VerbosityLike = Union[bool, int, str, None] + logger = logging.getLogger("llama-cpp-python") -_last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0] -# typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); +@dataclass +class LoggerConfig: + # 0=output, 1=error, 2=warn, 3=info, 4=trace, 5=debug + verbosity: int = LOG_DEFAULT_LLAMA + + show_output: bool = True + + stdout: TextIO = sys.stdout + stderr: TextIO = sys.stderr + + # If any substring is contained in a log message, the message is dropped.
+ log_filters: list[str] = field(default_factory=lambda: list(DEFAULT_LOG_FILTERS)) + log_filters_case_sensitive: bool = True + + +_config = LoggerConfig() +_last_verbosity = LOG_LEVEL_INFO + + +def _normalize_verbosity( + value: VerbosityLike, + *, + default: int = LOG_DEFAULT_LLAMA, +) -> int: + """ + Convert user input to llama.cpp-style verbosity 0..5. + + Compatibility: + verbose=False -> ERROR (1) + verbose=True -> DEBUG (5) + + Numeric levels: + 0 = output + 1 = error + 2 = warn + 3 = info + 4 = trace + 5 = debug + """ + if value is None: + return default + + if isinstance(value, bool): + return LOG_LEVEL_DEBUG if value else LOG_LEVEL_ERROR + + if isinstance(value, int): + return max(LOG_LEVEL_OUTPUT, min(LOG_LEVEL_DEBUG, value)) + + if isinstance(value, str): + key = value.strip().lower() + aliases = { + "0": LOG_LEVEL_OUTPUT, + "output": LOG_LEVEL_OUTPUT, + "none": LOG_LEVEL_OUTPUT, + + "1": LOG_LEVEL_ERROR, + "error": LOG_LEVEL_ERROR, + "err": LOG_LEVEL_ERROR, + "silent": LOG_LEVEL_ERROR, + + "2": LOG_LEVEL_WARN, + "warn": LOG_LEVEL_WARN, + "warning": LOG_LEVEL_WARN, + "quiet": LOG_LEVEL_WARN, + + "3": LOG_LEVEL_INFO, + "info": LOG_LEVEL_INFO, + "default": LOG_DEFAULT_LLAMA, + "normal": LOG_DEFAULT_LLAMA, + + "4": LOG_LEVEL_TRACE, + "trace": LOG_LEVEL_TRACE, + "trc": LOG_LEVEL_TRACE, + + "5": LOG_LEVEL_DEBUG, + "debug": LOG_LEVEL_DEBUG, + "verbose": LOG_LEVEL_DEBUG, + } + + if key in aliases: + return aliases[key] + + try: + parsed = int(key) + except ValueError as exc: + raise ValueError( + "_logger._normalize_verbosity: " + "verbosity must be one of 0..5, bool, None, or " + "'silent'/'quiet'/'info'/'trace'/'debug'" + ) from exc + + return max(LOG_LEVEL_OUTPUT, min(LOG_LEVEL_DEBUG, parsed)) + + raise TypeError(f"_logger._normalize_verbosity: unsupported verbosity type: {type(value)!r}") + + +def _verbosity_to_python_level(verbosity: int) -> int: + if verbosity >= LOG_LEVEL_DEBUG: + return logging.DEBUG + if verbosity >= LOG_LEVEL_INFO: + return 
logging.INFO + if verbosity >= LOG_LEVEL_WARN: + return logging.WARNING + return logging.ERROR + + +def _get_verbosity(level: int) -> int: + """ + Map ggml log level to Python-side verbosity. + + GGML_LOG_LEVEL_INFO maps to LOG_LEVEL_INFO so that verbosity=3 remains + useful as the default info level. + """ + if level == GGML_LOG_LEVEL_NONE: + return LOG_LEVEL_OUTPUT + if level == GGML_LOG_LEVEL_ERROR: + return LOG_LEVEL_ERROR + if level == GGML_LOG_LEVEL_WARN: + return LOG_LEVEL_WARN + if level == GGML_LOG_LEVEL_INFO: + return LOG_LEVEL_INFO + if level == GGML_LOG_LEVEL_DEBUG: + return LOG_LEVEL_DEBUG + if level == GGML_LOG_LEVEL_CONT: + return LOG_LEVEL_INFO + return LOG_LEVEL_DEBUG + + +def _decode_log_text(text: bytes) -> str: + return text.decode("utf-8", errors="replace") + + +def _matches_log_filter(msg: str) -> bool: + filters = _config.log_filters + if not filters: + return False + + if _config.log_filters_case_sensitive: + return any(item and item in msg for item in filters) + + msg_lower = msg.lower() + return any(item and item.lower() in msg_lower for item in filters) + + +def _should_drop(level: int, verbosity: int, msg: str) -> bool: + if verbosity > _config.verbosity: + return True + + if level == GGML_LOG_LEVEL_NONE and not _config.show_output: + return True + + if _matches_log_filter(msg): + return True + + return False + + @_ggml.ggml_log_callback def ggml_log_callback( level: int, text: bytes, user_data: ctypes.c_void_p, ): - # Note(JamePeng): A temporary patch is used to filter out garbage debug information - # output from the underlying C++ `CUDA Graph id %zu reused`. - # The logger is planned to be refactored to meet control requirements. 
- if text: - if b"CUDA Graph" in text or b"CUDA graph" in text: - return - # TODO: Correctly implement continue previous log - global _last_log_level - log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level - if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]: - print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) - _last_log_level = log_level + global _last_verbosity + + msg = _decode_log_text(text) + + if level == GGML_LOG_LEVEL_CONT: + verbosity = _last_verbosity + else: + verbosity = _get_verbosity(level) + _last_verbosity = verbosity + if _should_drop(level, verbosity, msg): + return -llama_cpp_lib.llama_log_set(ggml_log_callback, ctypes.c_void_p(0)) + out = _config.stdout if level == GGML_LOG_LEVEL_NONE else _config.stderr + print(msg, end="", flush=True, file=out) + + +# Keep a global reference to avoid ctypes callback being garbage-collected. +_ggml_log_callback_ref = ggml_log_callback + +llama_cpp_lib.llama_log_set(_ggml_log_callback_ref, ctypes.c_void_p(0)) + + +def configure_logging( + *, + verbosity: VerbosityLike = None, + verbose: Optional[bool] = None, + quiet: Optional[bool] = None, + silent: Optional[bool] = None, + show_output: Optional[bool] = None, + log_filters: Optional[Iterable[str]] = None, + append_log_filters: Optional[Iterable[str]] = None, + log_filters_case_sensitive: Optional[bool] = None, +): + """ + Configure native ggml/llama.cpp runtime logging. 
+ + Priority: + silent > quiet > verbosity > verbose > current config + + Compatibility: + verbose=False -> ERROR + verbose=True -> DEBUG + + Numeric levels: + 0 = output + 1 = error + 2 = warn + 3 = info + 4 = trace + 5 = debug + """ + if silent is True: + v = LOG_LEVEL_ERROR + elif quiet is True: + v = LOG_LEVEL_WARN + elif verbosity is not None: + v = _normalize_verbosity(verbosity) + elif verbose is not None: + v = _normalize_verbosity(verbose) + else: + v = _config.verbosity + + _config.verbosity = v + logger.setLevel(_verbosity_to_python_level(v)) + + if show_output is not None: + _config.show_output = show_output + + if log_filters is not None: + _config.log_filters = [s for s in log_filters if s] + + if append_log_filters is not None: + _config.log_filters.extend(s for s in append_log_filters if s) + + if log_filters_case_sensitive is not None: + _config.log_filters_case_sensitive = log_filters_case_sensitive def set_verbose(verbose: bool): - logger.setLevel(logging.DEBUG if verbose else logging.ERROR) + """ + Backward-compatible bool API. + + False -> ERROR + True -> DEBUG + """ + configure_logging(verbose=verbose) + + +def set_verbosity(verbosity: VerbosityLike): + configure_logging(verbosity=verbosity) + + +def get_verbosity() -> int: + return _config.verbosity + + +def set_quiet(quiet: bool = True): + configure_logging(quiet=quiet) + + +def set_silent(silent: bool = True): + configure_logging(silent=silent) + + +def set_log_filters( + filters: Iterable[str], + *, + case_sensitive: bool = True, +): + """ + Replace all substring log filters. + + Example: + set_log_filters(["CUDA Graph id", "clip_model_loader: tensor"]) + """ + configure_logging( + log_filters=filters, + log_filters_case_sensitive=case_sensitive, + ) + + +def get_log_filters() -> list[str]: + return list(_config.log_filters) + + +def add_log_filters(filters: Iterable[str]): + """ + Append substring log filters. 
+ """ + configure_logging(append_log_filters=filters) + + +def clear_log_filters(): + """ + Clear all substring log filters, including default filters. + """ + _config.log_filters.clear() + + +def reset_log_filters(): + """ + Restore default substring log filters. + """ + _config.log_filters = list(DEFAULT_LOG_FILTERS) + + +def get_log_filters_case_sensitive() -> bool: + return _config.log_filters_case_sensitive + + +def reset_logging(): + """ + Reset logging to default llama.cpp-style INFO verbosity and default filters. + """ + _config.verbosity = LOG_DEFAULT_LLAMA + _config.show_output = True + _config.log_filters = list(DEFAULT_LOG_FILTERS) + _config.log_filters_case_sensitive = True + logger.setLevel(_verbosity_to_python_level(_config.verbosity)) From f64320de4fedcad1f28cb46526803ff68784c546 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 15 May 2026 07:30:34 +0800 Subject: [PATCH 016/139] feat(core): integrate fine-grained logging API into Llama class This commit exposes the newly refactored `_logger` configuration system directly through the `Llama` class, providing users with robust, programmatic control over native `llama.cpp` backend logs. Key changes: - Expand `Llama.__init__` with `verbosity`, `log_filters`, and `log_filters_case_sensitive` parameters. - Add instance methods for runtime log management (`set_verbosity`, `get_verbosity`, `set_log_filters`, `add_log_filters`, `clear_log_filters`, etc.). - Add comprehensive docstrings explaining the 0-5 verbosity scale and explicitly noting the process-global nature of the native backend logger. Advantages over the legacy implementation: - Granular Control: Replaces the restrictive binary `verbose=True/False` flag (which only toggled between ERROR and DEBUG) with a granular 6-tier scale (output, error, warn, info, trace, debug). - Dynamic Filtering: Empowers users to actively suppress specific noisy C++ logs using custom substring filters, removing the need for hardcoded internal patches. 
- Better Discoverability: Attaches logging controls directly to the `Llama` object, making log management much more accessible and intuitive without requiring users to import internal logger modules. Signed-off-by: JamePeng --- llama_cpp/llama.py | 112 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index e50e3b9a3b..19ede6bcfd 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -58,7 +58,16 @@ from ._ggml import ( ggml_backend_cpu_buffer_type, ) -from ._logger import set_verbose +from ._logger import ( + configure_logging, + get_verbosity, + set_verbosity, + get_log_filters, + set_log_filters, + add_log_filters, + clear_log_filters, + reset_log_filters, +) from ._utils import suppress_stdout_stderr @@ -150,7 +159,11 @@ def __init__( type_v: Optional[int] = None, # Misc spm_infill: bool = False, + # Log verbose: bool = True, + verbosity: Optional[Union[int, str, bool]] = None, + log_filters: Optional[Sequence[str]] = None, + log_filters_case_sensitive: bool = True, # Extra Params **kwargs, # type: ignore ): @@ -235,11 +248,31 @@ def __init__( chat_handler: Optional chat handler to use when calling create_chat_completion. draft_model: Optional draft model to use for speculative decoding. tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. - verbose: Print verbose output to stderr. type_k: KV cache data type for K (default: f16) type_v: KV cache data type for V (default: f16) spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. - + verbose: Backward-compatible boolean switch for native llama.cpp / ggml runtime logs. + False keeps only error-level native logs; True enables debug-level native logs. + If `verbosity` is provided, `verbosity` takes precedence over `verbose`. + verbosity: Fine-grained llama.cpp-style native runtime log verbosity. 
+ Accepts 0-5, bool, or string aliases. + Numeric levels: + 0 = output only + 1 = error + 2 = warning + 3 = info + 4 = trace + 5 = debug + Use `verbosity=3` for llama.cpp-style default info logs. + `verbose=False` remains equivalent to error-only logging, while + `verbose=True` remains equivalent to debug logging. + log_filters: Optional substring filters for native runtime logs. + If any provided substring appears in a decoded backend log message, + that message is suppressed. By default, the logger may include built-in + filters for noisy low-level logs such as CUDA Graph reuse spam messages. + Pass an empty list to disable all substring filtering (process-wide, not per instance). + log_filters_case_sensitive: Whether `log_filters` should match case-sensitively. + Defaults to True for predictable low-level backend log filtering. Raises: ValueError: If the model path does not exist. @@ -247,9 +280,15 @@ def __init__( A Llama instance. """ self.verbose = verbose + self.verbosity = verbosity self._stack = contextlib.ExitStack() - set_verbose(verbose) + configure_logging( + verbose=verbose, + verbosity=verbosity, + log_filters=log_filters, + log_filters_case_sensitive=log_filters_case_sensitive, + ) if not Llama.__backend_initialized: with suppress_stdout_stderr(disable=verbose): @@ -795,6 +834,71 @@ def eval_logits(self) -> Deque[List[float]]: maxlen=self._n_ctx if self._logits_all else 1, ) + # Logger API + + def set_verbosity(self, verbosity: Union[int, str, bool, None]) -> None: + """Set native llama.cpp / ggml runtime log verbosity for this process. + + Levels: + 0 = output only + 1 = error + 2 = warning + 3 = info + 4 = trace + 5 = debug + + Note: + Native backend logging is process-global because llama.cpp / ggml use + a global log callback. Changing this affects all Llama instances in + the current Python process.
+ """ + set_verbosity(verbosity) + self.verbosity = get_verbosity() + self.verbose = self.verbosity >= 5 + + + def get_verbosity(self) -> int: + """Return the current native runtime log verbosity.""" + return get_verbosity() + + + def set_log_filters( + self, + filters: Sequence[str], + *, + case_sensitive: bool = True, + ) -> None: + """Replace substring filters for native runtime logs. + + Any backend log message containing one of these substrings will be + suppressed. Pass an empty list to disable all substring filtering. + + Note: + Native backend logging is process-global, so this affects all Llama + instances in the current Python process. + """ + set_log_filters(filters, case_sensitive=case_sensitive) + + + def add_log_filters(self, filters: Sequence[str]) -> None: + """Append substring filters for native runtime logs.""" + add_log_filters(filters) + + + def get_log_filters(self) -> List[str]: + """Return the current substring filters for native runtime logs.""" + return get_log_filters() + + + def clear_log_filters(self) -> None: + """Clear all substring filters, including default filters.""" + clear_log_filters() + + + def reset_log_filters(self) -> None: + """Restore default substring filters for native runtime logs.""" + reset_log_filters() + # LoRA / Adapter Management API def load_lora(self, name: str, path: str): From d89aa5a7d7ea3629c8767b332d692e9f3b9a9e5f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 15 May 2026 07:43:27 +0800 Subject: [PATCH 017/139] docs(wiki): document runtime verbosity and log filters for Llama Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 106 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index 0354d86150..a061861ece 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -4,7 +4,7 @@ title: Llama Class module_name: llama_cpp.llama source_file: llama_cpp/llama.py class_name: Llama -last_updated: 
2026-05-06 +last_updated: 2026-05-15 version_target: "latest" --- ``` @@ -55,6 +55,15 @@ Initialize the model and context. Note that model loading will immediately alloc | `checkpoint_interval` | `int` | `4096` | Token interval for saving periodic Hybrid/Recurrent checkpoints during long prompt evaluation. | | `checkpoint_on_device` | `bool` | `False` | Store Hybrid/Recurrent checkpoint tensor payloads in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. Reduces device-to-host copy overhead, but only one active checkpoint per `seq_id` is safe. | +### Runtime Logging Parameters + +| Parameter | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `verbose` | `bool` | `True` | Backward-compatible boolean native logging switch. `False` keeps only error-level llama.cpp / ggml logs; `True` enables debug-level native logs. If `verbosity` is provided, `verbosity` takes precedence over `verbose`. | +| `verbosity` | `Optional[Union[int, str, bool]]` | `None` | Fine-grained llama.cpp-style native runtime log verbosity. Numeric levels: `0=output`, `1=error`, `2=warning`, `3=info`, `4=trace`, `5=debug`. Use `verbosity=3` for llama.cpp-style default info logs. String aliases such as `"silent"`, `"quiet"`, `"info"`, `"trace"`, and `"debug"` are also accepted. | +| `log_filters` | `Optional[Sequence[str]]` | `None` | Optional substring filters for native runtime logs. If any provided substring appears in a decoded backend log message, that message is suppressed. The default logger may include built-in filters for noisy low-level logs such as `CUDA Graph id %d reuse` messages. Pass an empty list `[]` to disable default substring filtering. | +| `log_filters_case_sensitive` | `bool` | `True` | Whether `log_filters` should match case-sensitively. Defaults to `True` for predictable low-level backend log filtering. | + *(Note: There are numerous additional RoPE/YaRN scaling parameters available for specialized context extension. 
Refer to the source code for the full list).* --- @@ -112,6 +121,42 @@ model.eval(tokens=[1, 453, 234, 987], active_loras=[{"name": "coding_adapter", " Immediately halts an active generation loop safely. * **Usage**: Typically called from a separate monitoring thread (like a timer). When triggered, the running stream will exit and the final chunk will contain `"finish_reason": "abort"`. +### Runtime Logging Control + +The `Llama` class exposes lightweight runtime helpers for adjusting native llama.cpp / ggml logging after initialization. + +> **Note:** Native backend logging is process-global because llama.cpp / ggml use a global log callback. Changing verbosity or log filters affects all `Llama` instances in the current Python process. + +* `set_verbosity(verbosity: Union[int, str, bool, None])`: Set native runtime log verbosity. +* `get_verbosity() -> int`: Return the current native runtime log verbosity. +* `set_log_filters(filters: Sequence[str], case_sensitive: bool = True)`: Replace substring filters for native runtime logs. +* `add_log_filters(filters: Sequence[str])`: Append substring filters. +* `get_log_filters() -> List[str]`: Return the current substring filters. +* `clear_log_filters()`: Clear all substring filters, including default filters. +* `reset_log_filters()`: Restore default substring filters. + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="models/qwen3.gguf", + verbosity=3, # llama.cpp-style info logs +) + +# Temporarily enable debug-level native logs. +llm.set_verbosity(5) + +# Suppress noisy backend messages by substring. +llm.add_log_filters([ + "CUDA Graph", + "CUDA graph", + "clip_model_loader: tensor", +]) + +# Return to quiet error-only logging. +llm.set_verbosity(1) +``` + ### Dynamic LoRA Management The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dynamically per-generation or per-eval. * `load_lora(name: str, path: str)`: Loads an adapter into VRAM (does not apply it yet). 
@@ -185,7 +230,7 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn llm.create_completion("Once upon a time", active_loras=[{"name": "story", "scale": 0.9}]) # Use sql adapter - llm.create_completion("SELECT *", active_loras=[{"name": "sql_expert", "scale": 0.8}])v + llm.create_completion("SELECT *", active_loras=[{"name": "sql_expert", "scale": 0.8}]) ``` 5. **Hybrid & Recurrent Architectures**: @@ -321,6 +366,63 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn run_controlled_generation("Explain quantum mechanics in a way that relates to bugs in code.", timeout_seconds=8) ``` +8. **Runtime Logging & Backend Noise Filtering**: + + `Llama` supports fine-grained native llama.cpp / ggml logging through `verbosity`. This is more precise than the legacy `verbose` boolean flag. + + ```python + from llama_cpp import Llama + + # Legacy behavior: + # verbose=False -> error-only logs + llm_quiet = Llama( + model_path="models/qwen3.gguf", + verbose=False, + ) + + # Recommended precise logging: + # 0 = output, 1 = error, 2 = warning, 3 = info, 4 = trace, 5 = debug + llm = Llama( + model_path="models/qwen3.gguf", + verbosity=3, # llama.cpp-style default info logs + ) + ``` + + For low-level debugging, use `verbosity=5`. By default, the logger may suppress known noisy backend messages such as CUDA Graph reuse logs. Pass `log_filters=[]` to disable all substring filtering. 
+ + ```python + llm = Llama( + model_path="models/qwen3.gguf", + verbosity=5, + log_filters=[], # show all debug logs, including normally filtered ones + ) + ``` + + To suppress additional noisy messages, pass substring filters: + + ```python + llm = Llama( + model_path="models/qwen3.gguf", + verbosity=5, + log_filters=[ + "CUDA Graph id", + "clip_model_loader: tensor", + "ggml_cuda_graph_update_required", + ], + ) + ``` + + You can also adjust logging at runtime: + + ```python + llm.set_verbosity(5) + llm.add_log_filters(["llama_perf_context_print"]) + + # Later, return to warning-level logs. + llm.set_verbosity(2) + ``` + + **Important:** native backend logging is process-global. Runtime changes affect all `Llama` instances in the same Python process. --- From c14d769a4408e516bb03bd13f68e838ab6edbe4a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 07:30:10 +0800 Subject: [PATCH 018/139] Update Submodule vendor/llama.cpp 834a243..49d1701 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 834a243664..49d1701bd2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 834a243664114487f99520370a7a7b00fc7a486f +Subproject commit 49d1701bd24e4cedf6dfec9e50e185111203946b From 4d3e320b321db7c505721c92c7d3d26641e95623 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 09:01:49 +0800 Subject: [PATCH 019/139] Implement `Qwen3ASRChatHandler` for `Qwen3-ASR` models. - Integrate MTMD multimodal logic to extract and inject `audio_url` and base64 `input_audio` data directly into the `<|audio_start|><|audio_pad|>[DATA]<|audio_end|>` sequence. - Define a default multilingual transcription system prompt and configure model-specific stop tokens. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 78 ++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 1c41beb40f..0365d8f871 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5473,6 +5473,84 @@ def __call__(self, **kwargs): # Use parent implementation return super().__call__(**kwargs) +class Qwen3ASRChatHandler(MTMDChatHandler): + """ + Handler for Qwen 3 ASR (Automatic Speech Recognition) models. + + Features: + - Highly specialized for Speech-to-Text tasks. + - Aggregates all system text into a single cohesive system block. + - Drops user text entirely, extracting ONLY audio data into a unified user turn. + - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>. + - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url. + """ + + DEFAULT_SYSTEM_MESSAGE = """ + You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language. + You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization. 
+ """ + + QWEN3_ASR_BOS_TOKEN = "<|im_start|>" + QWEN3_ASR_PAD_TOKEN = "<|endoftext|>" + QWEN3_ASR_EOS_TOKEN = "<|im_end|>" + + + QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>" + QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>" + QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>" + + CHAT_FORMAT = ( + "{%- set ns = namespace(system_text='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.role == 'system' -%}\n" + " {%- if m.content is string -%}\n" + " {%- set ns.system_text = ns.system_text + m.content -%}\n" + " {%- else -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'text' and (c.text is defined) -%}\n" + " {%- set ns.system_text = ns.system_text + c.text -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- set ns2 = namespace(audio_tokens='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.content is not string -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n" + " {#- MTMD Audio Injection -#}\n" + " {%- set audio_val = '' -%}\n" + " {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n" + " {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n" + " {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n" + " {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n" + " {%- endif -%}\n" + " {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n" + "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n' -}}\n" + "{%- endif 
-%}\n" + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop tokens + kwargs['stop'] = [self.QWEN3_ASR_PAD_TOKEN, self.QWEN3_ASR_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") + + return super().__call__(**kwargs) class Qwen3VLChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From ad67e0e979620e3dc18c91460bd7a96a3dfc1934 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 09:05:21 +0800 Subject: [PATCH 020/139] docs(README.md): add Qwen3-ASR documentation and usage example - Update the supported multi-modal models table to include `qwen3-asr` and the `Qwen3ASRChatHandler`. - Add a new dedicated section for Speech-to-Text inference with a complete, collapsible Python script. - Provide a `build_media_payload` helper function to demonstrate proper Base64 encoding of local `.wav` and `.mp3` files into OpenAI-compatible `input_audio` schemas. - Include a critical warning advising users to use BF16 quantization for the multimodal projector (`mmproj`) to prevent audio degradation. - Clarify usage mechanics, specifically that all instructions must be placed in the `system` role due to the ASR template's text-dropping behavior. 
Signed-off-by: JamePeng --- README.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/README.md b/README.md index cd03d217d7..26aa11a55f 100644 --- a/README.md +++ b/README.md @@ -845,6 +845,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [lfm2.5-vl](https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF) | `LFM25VLChatHandler` | `lfm2.5-vl` | | [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | +| [qwen3-asr](https://huggingface.co/JamePeng2023/Qwen3-ASR-1.7B-GGUF) | `Qwen3ASRChatHandler` | `qwen3-asr` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | | [qwen3.5](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) | `Qwen35ChatHandler` | `qwen3.5` | | [qwen3.6](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF) | `Qwen35ChatHandler` | `qwen3.6` | @@ -1072,6 +1073,111 @@ print(res["choices"][0]["message"]["content"]) +## Speech Recognition With Qwen3-ASR (Speech-to-Text) + +The `Qwen3ASRChatHandler` is specifically designed for the Qwen3 Automatic Speech Recognition (ASR) models. Unlike standard multimodal models, this handler aggregates system prompts for instructions and automatically extracts audio data from the user's message, ignoring any user text. + +> **⚠️ Important Note on Quantization:** > For Qwen3-ASR models, it is highly recommended to use the **BF16** version of the multimodal projector (`mmproj`). Other quantizations are known to cause severe audio degradation. + +**Example Code**:
+ +```python +from llama_cpp import Llama +from llama_cpp.llama_chat_format import Qwen3ASRChatHandler +import base64 +import os + +# 1. Define paths to the model and the BF16 multimodal projector +MODEL_PATH = r"./Qwen3-ASR-1.7B-BF16.gguf" +MMPROJ_PATH = r"./mmproj-Qwen3-ASR-1.7b-BF16.gguf" + +# 2. Initialize the Llama model with the dedicated ASR handler +llm = Llama( + model_path=MODEL_PATH, + chat_handler=Qwen3ASRChatHandler( + clip_model_path=MMPROJ_PATH, + verbose=False, + ), + n_gpu_layers=-1, + n_ctx=10240, + verbose=False, + verbosity=0 +) + +# 3. Helper function to encode audio files into OpenAI-compatible payloads +_MEDIA_MIME_TYPES = { + '.wav': ('audio', 'wav'), + '.mp3': ('audio', 'mp3'), +} + +def build_media_payload(file_path: str) -> dict: + """Reads a local audio file and converts it into the LLM input structure.""" + if not os.path.isfile(file_path): + raise FileNotFoundError(f"Media file not found: {file_path}") + + extension = os.path.splitext(file_path)[1].lower() + media_category, mime_or_format = _MEDIA_MIME_TYPES.get(extension, ('unknown', 'application/octet-stream')) + + if media_category == 'unknown': + print(f"Warning: Unknown extension '{extension}'.") + + # Read and Base64 encode the file + with open(file_path, "rb") as f: + encoded_data = base64.b64encode(f.read()).decode("utf-8") + + if media_category == 'audio': + return { + "type": "input_audio", + "input_audio": { + "data": encoded_data, + "format": mime_or_format + } + } + else: + return {"type": "text", "text": f"[Attached unsupported file: {file_path}]"} + + +# ======================== +# Main Inference Section +# ======================== + +media_paths = ["./audio/test.wav"] +user_content = [build_media_payload(path) for path in media_paths] + +# 4. Generate the transcription +response = llm.create_chat_completion( + messages=[ + { + "role": "system", + "content": ( + "You are an advanced multilingual Speech-to-Text model. 
" + "Accurately transcribe the audio into text in its original spoken language. " + "You should ignore background noise, filler words, and stutters where possible, " + "and format the final output with correct grammar and capitalization." + ) + }, + { + "role": "user", + "content": user_content + } + ], + temperature=1.0, + top_p=0.95, + top_k=64, + max_tokens=10240, +) + +print(f"Transcribe: {response['choices'][0]['message']['content']}") + +``` + +#### How it works: + +* **`input_audio` Schema:** The script reads the local `.wav` or `.mp3` file, encodes it in Base64, and wraps it in an OpenAI-compatible `"type": "input_audio"` dictionary. +* **System Prompt:** Because the Qwen3-ASR template strips out user text, all instructions (like translation requests or formatting rules) **must** be placed in the `"system"` role. + +
+ ## Comprehensive Omni MultiModal Example: Gemma-4 (Vision + Audio + Text) Below is a complete, production-ready example demonstrating how to dynamically route and process both image and audio files. It includes a universal media processor that automatically converts local files into the correct payload structure (Data URIs for images, and `input_audio` for audio files). From 43b85f38ed9b55ab1cff4646e41796f93b4b1129 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 09:12:34 +0800 Subject: [PATCH 021/139] docs(README.md): Update the jump link for Qwen3-ASR in the top directory. Signed-off-by: JamePeng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 26aa11a55f..caec7e32e0 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ This package provides: - [Multi-modal Models Support](https://github.com/JamePeng/llama-cpp-python#multi-modal-models) - Support Models Lists - [Loading a Local Image With Qwen3VL(Thinking/Instruct)](https://github.com/JamePeng/llama-cpp-python#loading-a-local-image-with-qwen3vlthinkinginstruct) + - [Speech Recognition With Qwen3-ASR (Speech-to-Text)](https://github.com/JamePeng/llama-cpp-python#speech-recognition-with-qwen3-asr-speech-to-text) - [Comprehensive Omni MultiModal Example: Gemma-4 (Vision + Audio + Text)](https://github.com/JamePeng/llama-cpp-python#comprehensive-omni-multimodal-example-gemma-4-vision--audio--text) - [Embeddings & Reranking (GGUF)](https://github.com/JamePeng/llama-cpp-python#embeddings--reranking-gguf) - [1. 
Text Embeddings (Vector Search)](https://github.com/JamePeng/llama-cpp-python#1-text-embeddings-vector-search) From 4fb074682bf18c0c8097e9b9e4800940eb49bc07 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 09:57:46 +0800 Subject: [PATCH 022/139] Update SCHEMA.md --- docs/wiki/SCHEMA.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/wiki/SCHEMA.md b/docs/wiki/SCHEMA.md index b96ec964c7..23954a156e 100644 --- a/docs/wiki/SCHEMA.md +++ b/docs/wiki/SCHEMA.md @@ -4,7 +4,7 @@ - **Author**: JamePeng - **Maintainer**: LLM-assisted documentation workflow - **Project**: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) wiki -- **Last Modified**: 2026-05-02 +- **Last Modified**: 2026-05-16 - **Version Target**: latest source code - **Schema Version**: 0.3 @@ -24,6 +24,7 @@ - `llama_cpp.py` - `mtmd_cpp.py` - `_ggml.py` + - `_logger.py` - Never invent parameters or behavior. Always read the current source code before writing/updating a page. - Prefer documenting public and user-facing APIs first. Internal implementation details may be documented only when they help users understand behavior, extension points, debugging, or advanced usage. - All examples must be complete, runnable with the latest API, and include necessary imports. From 1064cf17361c394f38f6f3d89fc68367d45d8720 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 10:00:53 +0800 Subject: [PATCH 023/139] docs(Llama.md): update `verbose=False` vs. 
`verbosity=0` note Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index a061861ece..1f7cce206b 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -4,7 +4,7 @@ title: Llama Class module_name: llama_cpp.llama source_file: llama_cpp/llama.py class_name: Llama -last_updated: 2026-05-15 +last_updated: 2026-05-16 version_target: "latest" --- ``` @@ -424,6 +424,10 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn **Important:** native backend logging is process-global. Runtime changes affect all `Llama` instances in the same Python process. + **verbose=False** vs. **verbosity=0**: These have distinct behaviors. + - `verbose=False` silences Python wrapper prints (those guarded by `if self.verbose: print(...)`) but not backend diagnostics. + - `verbosity=0` silences all backend non-error output. + --- ## Deprecated / Changed APIs From 7eab8d3ad3f178d9cae6bebdd6013213176415d8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 10:04:43 +0800 Subject: [PATCH 024/139] docs(Logger.md): Upload Logger documentation Signed-off-by: JamePeng --- docs/wiki/modules/Logger.md | 216 ++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 docs/wiki/modules/Logger.md diff --git a/docs/wiki/modules/Logger.md b/docs/wiki/modules/Logger.md new file mode 100644 index 0000000000..f24f7f43a8 --- /dev/null +++ b/docs/wiki/modules/Logger.md @@ -0,0 +1,216 @@ +--- +title: Logger +class_name: Logger (module) +module_name: llama_cpp._logger +source_file: llama_cpp/_logger.py +last_updated: 2026-05-16 +version_target: latest +--- + +## Overview + +The `Logger` module provides configuration for runtime logging in `llama-cpp-python`, wrapping the native `ggml`/`llama.cpp` logging infrastructure. 
It controls verbosity levels, output streams, substring filtering, and callback integration, allowing fine-grained control over diagnostic and informational output from the underlying bindings. + +## Role in the Library + +- **Wraps low-level logging**: It intercepts and transforms log events from the C/C++ backend (`ggml_log_callback`). +- **Connects to Python logging**: Maps `ggml` verbosity levels (0–5) to `logging` levels (ERROR, WARNING, INFO, DEBUG), and routes output to `stdout`/`stderr` based on severity. +- **Provides filtering**: Substring-based message filtering to suppress specific log categories (e.g., CUDA Graph output). +- **Extends the API surface**: Offers both explicit configuration functions and convenient shorthand setters (`set_verbose`, `set_quiet`), while preserving full control through `configure_logging`. + +## Core Methods + +### `configure_logging(*, verbosity=None, verbose=None, quiet=None, silent=None, show_output=None, log_filters=None, append_log_filters=None, log_filters_case_sensitive=None)` + +The primary configuration function. Combines multiple parameters into a unified verbosity level. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `verbosity` | int \| bool \| None | None | Numeric level (0–5). `False` maps to `ERROR` (1), `True` to `DEBUG` (5). | +| `verbose` | bool | None | Shorthand: `True` → `DEBUG`, `False` → `ERROR`. | +| `quiet` | bool | None | Shorthand: `True` → `WARN` (2). | +| `silent` | bool | None | Shorthand: `True` → `ERROR` (1). | +| `show_output` | bool | None | Whether `GGML_LOG_LEVEL_NONE` (output) should be shown. | +| `log_filters` | Iterable[str] | None | List of substring patterns to filter out. | +| `append_log_filters` | Iterable[str] | None | Append additional filter patterns. | +| `log_filters_case_sensitive` | bool | None | Whether filters are case-sensitive. | + +### `set_verbose(verbose: bool)` + +Shorthand setter. 
`verbose=True` sets `verbosity=DEBUG`, `verbose=False` sets `verbosity=ERROR`. + +### `set_verbosity(verbosity: VerbosityLike)` + +Sets verbosity to any value accepted by `configure_logging`. + +### `get_verbosity() -> int` + +Returns current configured verbosity level (0–5). + +### `set_quiet(quiet: bool = True)` + +Sets `verbosity=WARN` (`2`). + +### `set_silent(silent: bool = True)` + +Sets `verbosity=ERROR` (`1`). + +### `set_log_filters(filters: Iterable[str], *, case_sensitive: bool = True)` + +Replaces all substring log filters. + +### `get_log_filters() -> list[str]` + +Returns current filter list. + +### `add_log_filters(filters: Iterable[str])` + +Appends filters to the current list. + +### `clear_log_filters()` + +Removes all user-defined filters. + +### `reset_log_filters()` + +Restores the default filter list: `["CUDA Graph", "CUDA graph"]`. + +### `reset_logging()` + +Resets to default: `verbosity=INFO` (`3`), `show_output=True`, default filters. + +## Important Attributes / State + +| Attribute | Type | Source | Description | +|-----------|------|--------|-------------| +| `_config` | LoggerConfig | Internal | Holds the current configuration: verbosity, output streams, filters. | +| `_last_verbosity` | int | Internal | Tracks the last verbosity level set by `ggml_log_callback`. | + +## Best Practices & Common Patterns + +### 1. Default Behavior +Use `reset_logging()` to start with `INFO` verbosity, which shows warnings and errors but hides internal debug output. + +```python +from llama_cpp import Llama +from llama_cpp import reset_logging + +reset_logging() # Default verbosity=3 (INFO), show warnings and errors +llm = Llama(model_path="models/qwen3.gguf") +llm("Explain quantum physics.") +``` + +### 2. Precise Logging via `verbosity` +Replace the legacy `verbose` boolean with the precise `verbosity` parameter. `verbose=False` maps to `ERROR` (1), `verbose=True` to `DEBUG` (5). 
+ +```python +from llama_cpp import Llama + +# Legacy (coarse control): +llm_quiet = Llama(model_path="models/qwen3.gguf", verbose=False) +llm_quiet("What is a neural network?") + +# Modern (fine-grained control): +llm = Llama(model_path="models/qwen3.gguf", verbosity=3) +llm("What is a neural network?") +``` + +### 3. Low-Level Debugging +For deep backend debugging, set `verbosity=5` (DEBUG) and optionally disable substring filters to see all diagnostic output. + +```python +from llama_cpp import Llama + +# Debug-level logs, showing all backend diagnostics +llm = Llama(model_path="models/qwen3.gguf", verbosity=5) + +# If you want to see normally filtered CUDA Graph messages: +llm = Llama( + model_path="models/qwen3.gguf", + verbosity=5, + log_filters=[], # Disable all substring filters +) +``` + +### 4. Substring-Based Backend Noise Filtering +Suppress known noisy backend messages by passing substring filters. This prevents "CUDA Graph" and model loading chatter from flooding the console. + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="models/qwen3.gguf", + verbosity=3, # INFO level + log_filters=[ + "CUDA Graph id", + "clip_model_loader: tensor", + "ggml_cuda_graph_update_required", + "llama_perf_context_print", + ], +) +llm("What is a transformer?") +``` + +### 5. Runtime Logging Adjustments +Since logging is process-global, you can adjust verbosity or filters at runtime — changes apply to all `Llama` instances in the same process. 
+ +```python +from llama_cpp import Llama + +llm = Llama(model_path="models/qwen3.gguf", verbosity=2) # QUIET: only show warnings and errors +llm("Quick answer: What is machine learning?") + +# Temporarily increase verbosity for diagnostics +llm.set_verbosity(5) +llm("Show me the full debug log for this prompt") +llm.set_verbosity(2) # Return to QUIET + +# Add a specific filter without resetting everything +llm.add_log_filters(["llama_perf_context_print"]) +llm("Final answer: What is machine learning?") +``` + +### 6. Complete Diagnostic Session +For a full diagnostic session, combine precise verbosity, custom filters, and runtime control: + +```python +from llama_cpp import Llama + +# 1. Start with info-level verbosity +llm = Llama(model_path="models/qwen3.gguf", verbosity=3) + +# 2. Suppress backend noise +llm.set_log_filters([ + "CUDA Graph", + "CUDA graph", + "clip_model_loader: tensor", + "ggml_cuda_graph_update_required", +]) + +# 3. Run inference +llm("Explain the llama.cpp inference pipeline") + +# 4. Temporarily increase verbosity for a specific call +llm.set_verbosity(5) +llm("Show debug output for cache hit details") +llm.set_verbosity(3) # Return to the info-level verbosity from step 1 + +# 5. Remove filters after session +llm.clear_log_filters() +``` + +## Key Considerations + +- **Process-global**: Logging configuration affects all `Llama` instances in the same process. Use `add_log_filters` or `set_log_filters` carefully when multiple instances run concurrently. +- **Flushed immediately**: Every log call flushes to `stdout`/`stderr`, so output appears immediately. +- **Shorthand vs. precise**: Prefer `verbosity`/`set_verbosity` over `verbose`/`set_verbose`/`set_quiet`/`set_silent` for precision, though the shorthands remain for backward compatibility. +- **verbose=False** vs. **verbosity=0**: These have distinct behaviors — `verbose=False` silences Python wrapper prints but not backend diagnostics; `verbosity=0` silences all backend non-error output. 
+ +## Deprecated / Changed APIs + +None documented. + +## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] +* [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] From 43a96e2e098ca2845d3e4c6acd92643b80579240 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 10:08:29 +0800 Subject: [PATCH 025/139] docs(index): Append Logger.md info and link Signed-off-by: JamePeng --- docs/wiki/index.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/wiki/index.md b/docs/wiki/index.md index 02f2dd5b9a..143d6e629b 100644 --- a/docs/wiki/index.md +++ b/docs/wiki/index.md @@ -30,6 +30,7 @@ These pages document major source modules and related classes. | [modules/LlamaEmbedding\|Llama Embedding] | Embedding-related APIs and usage patterns. | | [modules/LlamaGrammar\|Llama Grammar] | Provides grammar utilities for constrained generation. | | [modules/LlamaSpeculative\|Llama Speculative Decoding] | Draft model interfaces and prompt-based speculative decoding helpers. | +| [modules/Logger\|Logger] | Provides configuration for runtime logging in `llama-cpp-python`, wrapping the native `ggml`/`llama.cpp` logging infrastructure. It controls verbosity levels, output streams, substring filtering, and callback integration, allowing fine-grained control over diagnostic and informational output from the underlying bindings. | --- @@ -53,6 +54,7 @@ If you are new to this wiki, read the pages in this order: 3. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] 4. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] 5. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] +6. 
[[modules/Logger\|Logger](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/Logger.md)] If you are contributing documentation, start with: @@ -72,6 +74,7 @@ Currently available pages: - `modules/LlamaEmbedding.md` - `modules/LlamaGrammar.md` - `modules/LlamaSpeculative.md` +- `modules/Logger.md` - `SCHEMA.md` - `contributing-to-wiki.md` From 9187910e35e6f4d063f33364a10812727a05e58d Mon Sep 17 00:00:00 2001 From: Alcoft Date: Sat, 16 May 2026 06:41:17 +0200 Subject: [PATCH 026/139] fix --- llama_cpp/llama.py | 1 - llama_cpp/llama_chat_format.py | 33 +++++++++++---------------------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6dab44602d..7666b822a8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -618,7 +618,6 @@ def __init__( self.chat_handler = llama_chat_format.GenericMTMDChatHandler( gguf_metadata = self.metadata, clip_model_path = clip_model_path, - model_arch = None, verbose = self.verbose, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 40491968a9..0be38a19d3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3839,47 +3839,36 @@ def from_pretrained( ) class GenericMTMDChatHandler(MTMDChatHandler): + KNOWN_MEDIA_TAGS = [ + "<|image_pad|>", + "<|audio_pad|>", + "<|video_pad|>", + "<|image|>", + "<|audio|>", + "<|video|>", + "[IMG]" + ] + def __init__( self, gguf_metadata: Dict[str, Any], clip_model_path: str, - model_arch: Optional[str] = None, verbose: bool = True, **kwargs ) -> None: self.model_metadata = gguf_metadata - self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) - self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch if verbose: print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) - - if self.arch is None: - if verbose: - print("Unknown model 
architecture. Will use general/most-common tags.") - - self.arch = "unknown" if self.chat_format is None: raise ValueError("Failed to get model chat template automatically.") super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) - - if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]: - self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"] - elif self.arch in ["gemma4"]: - self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"] - elif self.arch in ["mistral3", "mistral4", "deepseek2"]: - self._chat_format_parser_tags += ["[IMG]"] - elif verbose: - print("Warning: Could not determine chat format parser tags.", flush = True) def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) + self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] if self.verbose: print(f"{self.log_prefix} - Start processing") From 2dad6dc407c6b56af281a29bbe2f7a3e15fb712f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 12:46:38 +0800 Subject: [PATCH 027/139] build(cmake): refactor install target lists for new GGML backend layout - Categorize build targets into logical groups (`LLAMA_CPP_TARGETS`, `GGML_CORE_TARGETS`, `GGML_CPU_VARIANT_TARGETS`, and `GGML_BACKEND_TARGETS`) to improve maintainability and keep the Python package installation in sync with the updated upstream GGML backend layout. - Add missing targets such as `llama-common` and the separated `ggml-cpu-*` CPU variant backends. - Ensure all grouped targets are passed through `llama_cpp_python_install_target`. 
Signed-off-by: JamePeng --- CMakeLists.txt | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04d3ec1fff..c42bbe95f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,14 +117,38 @@ if (LLAMA_BUILD) set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() - # Define list of GGML targets to install - set(GGML_TARGETS + # Define list of LLAMA_CPP/GGML targets to install + set(LLAMA_CPP_TARGETS llama + llama-common + ) + set(GGML_CORE_TARGETS ggml ggml-base ggml-blas - ggml-cann ggml-cpu + ggml-rpc + ) + + set(GGML_CPU_VARIANT_TARGETS + ggml-cpu-x64 + ggml-cpu-sse42 + ggml-cpu-sandybridge + ggml-cpu-ivybridge + ggml-cpu-piledriver + ggml-cpu-haswell + ggml-cpu-skylakex + ggml-cpu-cannonlake + ggml-cpu-cascadelake + ggml-cpu-cooperlake + ggml-cpu-icelake + ggml-cpu-alderlake + ggml-cpu-sapphirerapids + ggml-cpu-zen4 + ) + + set(GGML_BACKEND_TARGETS + ggml-cann ggml-cuda ggml-hexagon ggml-hip @@ -132,7 +156,6 @@ if (LLAMA_BUILD) ggml-musa ggml-opencl ggml-openvino - ggml-rpc ggml-sycl ggml-virtgpu ggml-vulkan @@ -141,8 +164,12 @@ if (LLAMA_BUILD) ggml-zendnn ) - # Loop through targets to avoid repetitive function calls - foreach(TARGET_NAME ${GGML_TARGETS}) + foreach(TARGET_NAME + ${LLAMA_CPP_TARGETS} + ${GGML_CORE_TARGETS} + ${GGML_CPU_VARIANT_TARGETS} + ${GGML_BACKEND_TARGETS} + ) llama_cpp_python_install_target(${TARGET_NAME}) endforeach() From 24b1dc859cba8b2dce7fb2463c78faacc1955997 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 13:07:25 +0800 Subject: [PATCH 028/139] build(cmake): sync llama build options and disable server UI - Update llama build option descriptions to match the current upstream naming style. - Explicitly disable `LLAMA_BUILD_SERVER` to avoid building the server target for Python package wheels. 
- Explicitly disable `LLAMA_BUILD_UI` and `LLAMA_USE_PREBUILT_UI` because the embedded server Web UI is not needed for wheel builds. - Keep examples, tests, and curl support disabled for minimal wheel artifacts. Signed-off-by: JamePeng --- CMakeLists.txt | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c42bbe95f0..ee72ae9582 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,16 +72,23 @@ if (LLAMA_BUILD) set(CMAKE_SKIP_RPATH FALSE) # Enable building of the common library - set(LLAMA_BUILD_COMMON ON CACHE BOOL "llama.cpp: build common utils library" FORCE) + set(LLAMA_BUILD_COMMON ON CACHE BOOL "llama: build common utils library" FORCE) # Enable build and link OpenSSL - set(LLAMA_OPENSSL ON CACHE BOOL "llama.cpp: build and link OpenSSL" FORCE) + set(LLAMA_OPENSSL ON CACHE BOOL "llama: use openssl to support HTTPS" FORCE) # Disable building of examples - set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama.cpp: build examples" FORCE) + set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama: build examples" FORCE) # Disable building of tests - set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama.cpp: build tests" FORCE) + set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama: build tests" FORCE) + + # Disable building of server + set(LLAMA_BUILD_SERVER OFF CACHE BOOL "llama: build server example" FORCE) + + # Disable build the embedded Web UI for server + set(LLAMA_BUILD_UI OFF CACHE BOOL "llama: build the embedded Web UI for server" FORCE) + set(LLAMA_USE_PREBUILT_UI OFF CACHE BOOL "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" FORCE) # Disable building curl support set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: use libcurl to download model from an URL" FORCE) From 4c4e3d007a649e65e8f38a6ce387807299310bdf Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 13:55:17 +0800 Subject: [PATCH 029/139] build(cmake): clean up dev files and import libs from Windows wheels - Remove `ARCHIVE 
DESTINATION` for Windows targets to avoid installing `.lib` files. - Add a cleanup function to strip `cmake`, `pkgconfig`, and import libraries from the python wheel runtime directories. - Ensures Windows builds only package the required runtime DLLs. Signed-off-by: JamePeng --- CMakeLists.txt | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee72ae9582..8e5d583d90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,13 +11,12 @@ set(CMAKE_INSTALL_LIBDIR llama_cpp/lib CACHE PATH "" FORCE) set(CMAKE_INSTALL_INCLUDEDIR llama_cpp/include CACHE PATH "" FORCE) -# Helper function to install targets to Python package directories +# Install a built target into the Python package runtime directory. function(llama_cpp_python_install_target target) if(NOT TARGET ${target}) return() endif() - # Define install destinations to avoid code duplication set(INSTALL_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" @@ -33,6 +32,9 @@ function(llama_cpp_python_install_target target) RESOURCE DESTINATION ${DIR} ) + # Copy runtime DLL dependencies of this target when available. + # This does not replace explicit installation of dynamic backend + # targets such as ggml-cpu-*; those are installed as targets below. # Automatically handle Windows DLL installation for each target if (WIN32) install( @@ -57,6 +59,40 @@ function(llama_cpp_python_install_target target) endif() endfunction() + +# Remove development-only artifacts from Python wheel runtime directories. +# +# Upstream install rules may place CMake package files, pkg-config files, and +# Windows import libraries under llama_cpp/lib because CMAKE_INSTALL_LIBDIR is +# redirected there for wheel builds. They are not needed at runtime. 
+function(llama_cpp_python_cleanup_dev_files) + if(NOT WIN32) + return() + endif() + + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" + ) + + foreach(DIR ${INSTALL_DIRS}) + install(CODE " + if(EXISTS \"${DIR}\") + file(GLOB LLAMA_CPP_IMPORT_LIBS \"${DIR}/*.lib\") + if(LLAMA_CPP_IMPORT_LIBS) + file(REMOVE \${LLAMA_CPP_IMPORT_LIBS}) + endif() + + file(REMOVE_RECURSE + \"${DIR}/cmake\" + \"${DIR}/pkgconfig\" + ) + endif() + ") + endforeach() +endfunction() + + if (LLAMA_BUILD) set(BUILD_SHARED_LIBS "On") @@ -204,4 +240,8 @@ if (LLAMA_BUILD) llama_cpp_python_install_target(mtmd) endif() + + # Run after all runtime targets are installed, including mtmd. + llama_cpp_python_cleanup_dev_files() + endif() From 6af3cd7df808cb9725b2da0273c23098111c25ea Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 15:18:32 +0800 Subject: [PATCH 030/139] fix(_ggml): correct `ggml_backend_unload` function name --- llama_cpp/_ggml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index 8f4cb1187f..c4ae7c94bf 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -1295,8 +1295,8 @@ def ggml_backend_load(path: ctypes.c_char_p) -> ggml_backend_reg_t: # // Unload a backend if loaded dynamically and unregister it # GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); -@ggml_function("ggml_backend_load_all", [ctypes.c_void_p], None) -def ggml_backend_load_all(reg: ggml_backend_reg_t): +@ggml_function("ggml_backend_unload", [ctypes.c_void_p], None) +def ggml_backend_unload(reg: ggml_backend_reg_t): """ Unload a backend if loaded dynamically and unregister it """ From 038a953079126fd4a81e574c9c06680e36b0a10e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 16:13:53 +0800 Subject: [PATCH 031/139] feat(core): support loading GGML_BACKEND_DL dynamic backend libraries from wheel lib - Import `ggml_backend_load_all_from_path` and 
`ggml_backend_reg_count` from `_ggml`. - Load dynamic ggml backend libraries from the packaged `llama_cpp/lib` directory after `llama_backend_init()`. - Support wheels built with `GGML_BACKEND_DL`, where CPU variants and accelerator backends such as `ggml-cpu-*` and `ggml-cuda` are shipped as separate runtime libraries. - Print the registered backend count in verbose mode to help diagnose backend discovery issues. Signed-off-by: JamePeng --- llama_cpp/llama.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 19ede6bcfd..8b1070be4f 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -57,6 +57,8 @@ ) from ._ggml import ( ggml_backend_cpu_buffer_type, + ggml_backend_load_all_from_path, + ggml_backend_reg_count ) from ._logger import ( configure_logging, @@ -290,9 +292,40 @@ def __init__( log_filters_case_sensitive=log_filters_case_sensitive, ) + # llama.cpp / ggml backend initialization is process-global. + # Run it once before loading any model. if not Llama.__backend_initialized: with suppress_stdout_stderr(disable=verbose): llama_cpp_lib.llama_backend_init() + + # Wheels built with `GGML_BACKEND_DL` ship ggml backends as separate + # dynamic libraries under llama_cpp/lib, for example: + # + # ggml-cpu-x64.dll + # ggml-cpu-haswell.dll + # ggml-cpu-alderlake.dll + # ggml-cuda.dll + # + # With the dynamic backend layout, llama_backend_init() initializes + # the global backend system but does not necessarily register every + # packaged backend. Loading the package lib directory ensures ggml can + # discover CPU variants and optional accelerator backends before model + # loading. + lib_dir = Path(llama_cpp_lib.__file__).resolve().parent / "lib" + + if not lib_dir.exists(): + raise FileNotFoundError(f"Llama.__init__: llama_cpp lib directory not found: {lib_dir}") + + # Load all dynamic ggml backend plugins from the packaged lib directory. 
+ ggml_backend_load_all_from_path( + ctypes.c_char_p(str(lib_dir).encode("utf-8")) + ) + + # Print the number of backend registrations to confirm whether the DLL is loaded. + if self.verbose: + count = ggml_backend_reg_count() + print(f"Llama.__init__: Loaded ggml backend registry count: {count}", file=sys.stderr) + Llama.__backend_initialized = True if isinstance(numa, bool): From a8f928c9b134b61d5f6370d1e02de29efdd95227 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 17:14:15 +0800 Subject: [PATCH 032/139] Bump version to 0.3.39-preview Signed-off-by: JamePeng --- llama_cpp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 438bf08b58..b32fbfd36e 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.38" +__version__ = "0.3.39-preview" From 628373c1af97935a8c00e5273c5e9dd90dcd6b4c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 17:29:56 +0800 Subject: [PATCH 033/139] ci(cu131+windows): build CU131 wheels with GGML dynamic backends for windows - Replace the old CPU/AVX release tag matrix with a single CU131 backend wheel layout. - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Windows wheels ship runtime-loadable GGML backend DLLs and CPU variant backends. - Use the Windows LLVM toolchain and disable non-wheel targets such as examples, tests, tools, server, embedded UI, and curl. - Remove the `.basic` style local version suffix and publish wheels as `+cu131`. - Update CUDA architectures to CUDA 13.1 and simplify CMake argument handling.
Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu130-win.yml | 134 ------------- .github/workflows/build-wheels-cu131-win.yml | 191 +++++++++++++++++++ 2 files changed, 191 insertions(+), 134 deletions(-) delete mode 100644 .github/workflows/build-wheels-cu130-win.yml create mode 100644 .github/workflows/build-wheels-cu131-win.yml diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml deleted file mode 100644 index d6187d7bf4..0000000000 --- a/.github/workflows/build-wheels-cu130-win.yml +++ /dev/null @@ -1,134 +0,0 @@ -name: Build Wheels (CU130) for Windows - -on: - workflow_dispatch: - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: ['windows-2022'] - pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] - cuda: ["13.0.2"] - releasetag: ["Basic"] - cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] - defaults: - run: - shell: pwsh - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. 
"all" "89" "90" "100" "120" - MAX_JOBS: 8 - - steps: - - name: Add MSBuild to PATH - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v3 - with: - msbuild-architecture: x64 - - - uses: actions/checkout@v6 - with: - submodules: "recursive" - - # from kingbri1/flash-attention build-wheels.yml - - name: Install CUDA ${{ matrix.cuda }} - uses: Jimver/cuda-toolkit@v0.2.35 - id: cuda-toolkit - with: - cuda: "${{ matrix.cuda }}" - use-github-cache: false - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v7 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - name: Install Dependencies - run: | - git config --system core.longpaths true - uv pip install --upgrade build setuptools wheel packaging - - - name: Build Wheel - run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') - $env:CUDA_HOME = $env:CUDA_PATH - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH - $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' 
-DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' - } - python -m build --wheel - - # Check if wheel was built - if (!(Test-Path '.\dist\*.whl')) { - Write-Error "No wheel built in dist/ directory" - exit 1 - } - - $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - - # Split file name: name-ver-py-abi-plat.whl - $parts = $wheelFile.Name.Split('-') - $distName = $parts[0] - $version = $parts[1] - $pyTag = $parts[2] - $abiTag = $parts[3] - $platTag = $parts[4] - - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - - $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" - - # Rename wheel file - Rename-Item -Path $wheelFile.FullName -NewName $newName - Write-Output "Renamed wheel to: $newName" - - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - - name: Get Current Date - id: get-date - run: | - $currentDate = Get-Date -UFormat "%Y%m%d" - Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - - name: Create Release - if: always() && env.TAG_VERSION != '' - uses: softprops/action-gh-release@v3 - with: - files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml new file mode 100644 index 0000000000..14bea65d19 --- /dev/null +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -0,0 +1,191 @@ +name: Build Wheels (CU131) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu131 + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + 
matrix: + os: ["windows-2022"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] + cuda: ["13.1.1"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + + defaults: + run: + shell: pwsh + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Add MSBuild to PATH + uses: microsoft/setup-msbuild@v3 + with: + msbuild-architecture: x64 + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Install CUDA ${{ matrix.cuda }} + uses: Jimver/cuda-toolkit@v0.2.35 + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda }} + use-github-cache: false + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. 
+ $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 + } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. + + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake' + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel 
built in dist/ directory" + exit 1 + } + + $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 + + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl + $parts = $wheelFile.Name.Split('-') + $distName = $parts[0] + $version = $parts[1] + $pyTag = $parts[2] + $abiTag = $parts[3] + $platTag = $parts[4] + + # CPU all-variants is now an internal runtime layout detail. + $newVersion = "$version+cu$cudaVersion" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + + # Write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV + + - name: Get current date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From df07219dee25ae2cc95f842bd1a7c81e6bfb599f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 07:57:05 +0800 Subject: [PATCH 034/139] ci(cu12+windows): build CU124-128 wheels with GGML dynamic backends for windows - Replace the old CPU/AVX release tag matrix with a single CU124-128 backend wheel layout. - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Windows wheels ship runtime-loadable GGML backend DLLs and CPU variant backends. - Use the Windows LLVM toolchain and disable non-wheel targets such as examples, tests, tools, server, embedded UI, and curl. - Remove the `.basic` style local version suffix and publish wheels as `+cu124/cu126/cu128`. 
- Drop the `101-real` CUDA architecture from the CU128 build and simplify CMake argument handling. Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu124-win.yml | 145 ++++++++++++------ .github/workflows/build-wheels-cu126-win.yml | 145 ++++++++++++------ .github/workflows/build-wheels-cu128-win.yml | 147 +++++++++++++------ 3 files changed, 304 insertions(+), 133 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-win.yml b/.github/workflows/build-wheels-cu124-win.yml index 01bd48e7de..e856533410 100644 --- a/.github/workflows/build-wheels-cu124-win.yml +++ b/.github/workflows/build-wheels-cu124-win.yml @@ -8,85 +8,141 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu124 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.4.1"] - releasetag: ["Basic"] cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real;90-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g.
"all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on 
-DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake' + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +153,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split wheel filename: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +162,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu126-win.yml b/.github/workflows/build-wheels-cu126-win.yml index 9330cb130b..b77b17917f 100644 --- a/.github/workflows/build-wheels-cu126-win.yml +++ b/.github/workflows/build-wheels-cu126-win.yml @@ -8,85 +8,141 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu126 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.6.3"] - releasetag: ["Basic"] cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real;90-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. "all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if 
($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake' + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +153,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split wheel filename: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +162,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index 98ebbc4127..223473dde6 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -8,85 +8,141 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu128 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.8.1"] - releasetag: ["Basic"] - cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} 
- # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. "all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - 
$env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake' + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +153,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split file name: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +162,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From e47843f591f1b879e461bcce3d1877ff943c3f71 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 07:58:19 +0800 Subject: [PATCH 035/139] Update Submodule vendor/llama.cpp 49d1701..b64739e --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 49d1701bd2..b64739ea39 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 49d1701bd24e4cedf6dfec9e50e185111203946b +Subproject commit b64739ea393b3c9d07cc9907e0a611f707838051 From 39785d0efb7a45490fdc45ac340ff2ec1a2eae8c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 08:08:45 +0800 Subject: [PATCH 036/139] fix(_internals): Remove unnecessary free operations; models should not be released within the context. 
Signed-off-by: JamePeng --- llama_cpp/_internals.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b4ba1f4b21..277d22aebf 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -494,7 +494,6 @@ def __init__( ctx = llama_cpp.llama_init_from_model(self.model.model, self.params) if ctx is None: - llama_cpp.llama_model_free(self.model.model) raise ValueError("Failed to create context with model") self.ctx = ctx From 127881293e712ffdc3cae43af969d2a46e52c80e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 08:47:19 +0800 Subject: [PATCH 037/139] Sync llama.cpp API 20260517 - llama + spec: MTP Support Signed-off-by: JamePeng --- llama_cpp/_internals.py | 3 +++ llama_cpp/llama.py | 8 ++++++++ llama_cpp/llama_cpp.py | 41 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 277d22aebf..a8dd56083b 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -533,6 +533,9 @@ def n_ubatch(self) -> int: def n_seq_max(self) -> int: return llama_cpp.llama_n_seq_max(self.ctx) + def n_rs_seq(self) -> int: + return llama_cpp.llama_n_rs_seq(self.ctx) + def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8b1070be4f..734485802e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -119,8 +119,12 @@ def __init__( n_batch: int = 2048, n_ubatch: int = 512, n_seq_max: int = 1, + n_rs_seq: int = 0, n_threads: Optional[int] = None, n_threads_batch: Optional[int] = None, + ctx_type: Optional[ + int + ] = llama_cpp_lib.llama_context_type.LLAMA_CONTEXT_TYPE_DEFAULT, rope_scaling_type: Optional[ int ] = llama_cpp_lib.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, @@ -474,6 +478,7 @@ def __init__( self.n_batch = min(n_ctx, n_batch) # ??? 
self.n_keep = n_keep if n_keep > 0 else 256 self.n_seq_max = n_seq_max + self.n_rs_seq = n_rs_seq self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() @@ -486,8 +491,11 @@ def __init__( self.context_params.n_batch = self.n_batch self.context_params.n_ubatch = min(self.n_batch, n_ubatch) self.context_params.n_seq_max = self.n_seq_max + self.context_params.n_rs_seq = self.n_rs_seq self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch + + self.context_params.ctx_type = ctx_type self.context_params.rope_scaling_type = ( rope_scaling_type if rope_scaling_type is not None diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index cc900c0648..ec2b665a16 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -471,6 +471,14 @@ class llama_split_mode(enum.IntEnum): LLAMA_SPLIT_MODE_ROW = 2 LLAMA_SPLIT_MODE_TENSOR = 3 +# enum llama_context_type { +# LLAMA_CONTEXT_TYPE_DEFAULT = 0, +# LLAMA_CONTEXT_TYPE_MTP = 1, +# }; +class llama_context_type(enum.IntEnum): + LLAMA_CONTEXT_TYPE_DEFAULT = 0 + LLAMA_CONTEXT_TYPE_MTP = 1 + # typedef struct llama_token_data { # llama_token id; // token id # float logit; // log-odds of the token @@ -827,9 +835,11 @@ class llama_sampler_seq_config(ctypes.Structure): # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode # uint32_t n_ubatch; // physical maximum batch size # uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) +# uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] # int32_t n_threads; // number of threads to use for generation # int32_t n_threads_batch; // number of threads to use for batch processing +# enum llama_context_type ctx_type; // set the context type (e.g. 
MTP) # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings @@ -843,13 +853,14 @@ class llama_sampler_seq_config(ctypes.Structure): # float yarn_beta_fast; // YaRN low correction dim # float yarn_beta_slow; // YaRN high correction dim # uint32_t yarn_orig_ctx; // YaRN original context size -# float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, < 0 disabled (default) +# float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default) # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; # enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] # enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] + # // Abort callback # // if it returns true, execution of llama_decode() will be aborted # // currently works only with CPU execution @@ -862,11 +873,12 @@ class llama_sampler_seq_config(ctypes.Structure): # bool no_perf; // measure performance timings # bool op_offload; // offload host tensor operations to device # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) -# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some casesAdd commentMore actions -# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 +# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases +# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 # bool kv_unified; // use a unified buffer across the input sequences when computing the attention -# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix -# // ref: 
https://github.com/ggml-org/llama.cpp/pull/14363 +# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix +# // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + # // [EXPERIMENTAL] # // backend sampler chain configuration (make sure the caller keeps the sampler chains alive) # // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) @@ -881,12 +893,16 @@ class llama_context_params(ctypes.Structure): n_batch (int): logical maximum batch size that can be submitted to llama_decode n_ubatch (int): physical maximum batch size n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models) + n_rs_seq (int): number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing + + ctx_type (int): set the context type (e.g. MTP) rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) attention_type (int): attention type to use for embeddings flash_attn_type (int): when to enable Flash Attention + rope_freq_base (float): RoPE base frequency, 0 = from model rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model @@ -895,18 +911,23 @@ class llama_context_params(ctypes.Structure): yarn_beta_slow (float): YaRN high correction dim yarn_orig_ctx (int): YaRN original context size defrag_thold (float): [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default) + cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval + type_k (int): data type for K cache type_v (int): data type for V cache + abort_callback 
(ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback + embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU no_perf (bool): whether to measure performance timings op_offload(bool): whether to offload host tensor operations to device swa_full(bool): whether to use full-size SWA cache kv_unified(bool): use a unified buffer across the input sequences when computing the attention + samplers(llama_sampler_seq_config *): the samplers must be sampler chains (i.e. use llama_sampler_chain_init) n_samplers(size_t): numbers of sampler chains """ @@ -916,8 +937,10 @@ class llama_context_params(ctypes.Structure): n_batch: int n_ubatch: int n_seq_max: int + n_rs_seq: int n_threads: int n_threads_batch: int + ctx_type: int rope_scaling_type: int pooling_type: int attention_type: int @@ -950,8 +973,10 @@ class llama_context_params(ctypes.Structure): ("n_batch", ctypes.c_uint32), ("n_ubatch", ctypes.c_uint32), ("n_seq_max", ctypes.c_uint32), + ("n_rs_seq", ctypes.c_uint32), ("n_threads", ctypes.c_int32), ("n_threads_batch", ctypes.c_int32), + ("ctx_type", ctypes.c_int), ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), @@ -1602,6 +1627,12 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx); +@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_rs_seq(ctx: llama_context_p, /) -> int: + ... 
+ + # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_ctx_train(model: llama_model_p, /) -> int: From 50627bfa9b16a5061df699b04006029070935c62 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 09:06:03 +0800 Subject: [PATCH 038/139] fix(context): prevent operations on uninitialized or closed contexts - Introduce an internal `_assert_ctx()` method to verify that the underlying C context (`self.ctx`) is valid before invoking dependent `llama.cpp` operations. - Apply `_assert_ctx()` to critical methods (`encode`, `decode`, `get_logits*`, `get_embeddings*`) to prevent hard crashes (segfaults) caused by passing null pointers to the C API. - Upgrade the context initialization failure exception from a generic `ValueError` to a detailed `RuntimeError`, providing developers with actionable hints about potentially out-of-sync `llama_context_params`. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index a8dd56083b..c026440a2d 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -493,8 +493,13 @@ def __init__( ctx = llama_cpp.llama_init_from_model(self.model.model, self.params) - if ctx is None: - raise ValueError("Failed to create context with model") + if not ctx: + raise RuntimeError( + "Failed to create llama context with model. " + "This may indicate that llama_context_params is out of sync with " + "the bundled llama.cpp version, or that required context parameters " + "were not initialized correctly." + ) self.ctx = ctx @@ -518,6 +523,13 @@ def close(self): def __del__(self): self.close() + def _assert_ctx(self): + if not getattr(self, "ctx", None): + raise RuntimeError( + "LlamaContext is not initialized or has already been closed. 
" + "Context-dependent llama.cpp operations cannot continue." + ) + def n_ctx(self) -> int: return llama_cpp.llama_n_ctx(self.ctx) @@ -654,6 +666,7 @@ def set_state_seq_data_ext( # // Decoding API def encode(self, batch: LlamaBatch): + self._assert_ctx() return_code = llama_cpp.llama_encode( self.ctx, batch.batch, @@ -678,6 +691,7 @@ def decode(self, batch: 'LlamaBatch') -> int: RuntimeError: If a fatal, non-recoverable error occurs during decoding (e.g., negative error codes or invalid batch structures). """ + self._assert_ctx() return_code = llama_cpp.llama_decode(self.ctx, batch.batch) if return_code == 0: @@ -741,21 +755,27 @@ def synchronize(self): llama_cpp.llama_synchronize(self.ctx) def get_logits(self): + self._assert_ctx() return llama_cpp.llama_get_logits(self.ctx) def get_logits_ith(self, i: int): + self._assert_ctx() return llama_cpp.llama_get_logits_ith(self.ctx, i) def set_embeddings(self, embeddings: bool): + self._assert_ctx() llama_cpp.llama_set_embeddings(self.ctx, embeddings) def get_embeddings(self): + self._assert_ctx() return llama_cpp.llama_get_embeddings(self.ctx) def get_embeddings_ith(self, i: int): + self._assert_ctx() return llama_cpp.llama_get_embeddings_ith(self.ctx, i) def get_embeddings_seq(self, seq_id: int): + self._assert_ctx() return llama_cpp.llama_get_embeddings_seq(self.ctx, seq_id) def reset_timings(self): From a4c8d77d1817bcf7255b8e93aa4733e5378d9211 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 12:29:48 +0800 Subject: [PATCH 039/139] ci(cu131+linux): build CU131 wheels with GGML dynamic backends for linux - Replace the old CPU/AVX release tag matrix with a single CU131 backend wheel layout. - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Linux wheels ship runtime-loadable GGML backend DLLs and CPU variant backends. - Disable non-wheel targets such as examples, tests, tools, server, embedded UI, and curl. - Remove the `.basic` style local version suffix and publish wheels as `+cu131`. 
- Update CUDA architectures to CUDA 13.1 and simplify CMake argument handling. Signed-off-by: JamePeng --- .../workflows/build-wheels-cu130-linux.yml | 132 --------------- .../workflows/build-wheels-cu131-linux.yml | 156 ++++++++++++++++++ 2 files changed, 156 insertions(+), 132 deletions(-) delete mode 100644 .github/workflows/build-wheels-cu130-linux.yml create mode 100644 .github/workflows/build-wheels-cu131-linux.yml diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml deleted file mode 100644 index 4f4305ad3e..0000000000 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ /dev/null @@ -1,132 +0,0 @@ -name: Build Wheels(CU130) for Linux - -on: - workflow_dispatch: # Manual trigger - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - runs-on: ubuntu-22.04 - container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 - strategy: - matrix: # Define the build matrix directly here - os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions - cuda: ["13.0.2"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc - - defaults: - run: - shell: bash - - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - - steps: - - name: Install dependencies - run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code - with: - submodules: "recursive" - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v7 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - run: nvcc -V - - - name: 
Build Wheel With Cmake # Main build step: configures and builds the wheel - env: - LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR - run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - - # --- Post-build steps to get info 
for rename wheel file and release tag --- - - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') - - wheel_path=$(ls dist/*.whl | head -n 1) - filename=$(basename "$wheel_path") - - # Split wheel filename - IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" - new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - - # Rename wheel file - mv "$wheel_path" "dist/$new_filename" - echo "Renamed wheel to: $new_filename" - - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step - - - name: Get Current Date # Step to get current date for the release tag - id: get-date - run: | - # Get date in YYYYMMDD format using bash date command - currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release - with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
- env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu131-linux.yml b/.github/workflows/build-wheels-cu131-linux.yml new file mode 100644 index 0000000000..d70f8a01c8 --- /dev/null +++ b/.github/workflows/build-wheels-cu131-linux.yml @@ -0,0 +1,156 @@ +name: Build Wheels (CU131) for Linux + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu131 + runs-on: ubuntu-22.04 + container: nvidia/cuda:13.1.2-cudnn-devel-ubuntu22.04 + + strategy: + fail-fast: false + matrix: + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions + cuda: ["13.1.2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real;121-real"] + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Install dependencies + run: | + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Show CUDA version + run: nvcc -V + + - name: Build wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" + run: | + set -euo pipefail + + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true + + cuda_ver_short=$(echo "${CUDAVER}" | cut 
-d'.' -f 1,2 | sed 's/\.//g') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. + "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi + + wheel_path=$(ls dist/*.whl | head -n 1) + filename=$(basename "$wheel_path") + + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl + IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" + + # CPU all-variants is now an internal runtime layout detail. 
+ new_version="${version}+cu${cuda_ver_short}" + new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" + + mv "$wheel_path" "dist/$new_filename" + echo "Renamed wheel to: $new_filename" + + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" + + - name: Get current date + id: get-date + run: | + currentDate=$(date +%Y%m%d) + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 42891efb70bfd8ac6dd438e99f6c2a1b4119299d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 19:39:57 +0800 Subject: [PATCH 040/139] ci(cu12+linux): build CU124/126/128 wheels with GGML dynamic backends for linux - Replace the old CPU/AVX release tag matrix with a single CU124/126/128 backend wheel layout. - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Linux wheels ship runtime-loadable GGML backend DLLs and CPU variant backends. - Disable non-wheel targets such as examples, tests, tools, server, embedded UI, and curl. - Remove the `.basic` style local version suffix and publish wheels as `+cu124/cu126/cu128`. 
Signed-off-by: JamePeng --- .../workflows/build-wheels-cu124-linux.yml | 170 ++++++++++-------- .../workflows/build-wheels-cu126-linux.yml | 170 ++++++++++-------- .../workflows/build-wheels-cu128-linux.yml | 170 ++++++++++-------- 3 files changed, 291 insertions(+), 219 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 889a1679a4..d7a3a90d81 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -1,23 +1,24 @@ -name: Build Wheels(CU124) for Linux # Workflow name +name: Build Wheels (CU124) for Linux on: - workflow_dispatch: # Manual trigger + workflow_dispatch: permissions: contents: write jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu124 runs-on: ubuntu-22.04 container: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + strategy: - matrix: # Define the build matrix directly here + fail-fast: false + matrix: os: ["ubuntu-22.04"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.4.1"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc + cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real"] defaults: run: @@ -25,108 +26,131 @@ jobs: env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 steps: - name: Install dependencies run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: 
actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - run: nvcc -V + - name: Show CUDA version + run: nvcc -V - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + - name: Build wheel env: LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} 
-DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + set -euo pipefail - # --- Post-build steps to get info for rename wheel file and release tag --- + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. 
+ "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi wheel_path=$(ls dist/*.whl | head -n 1) filename=$(basename "$wheel_path") - # Split wheel filename + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + # CPU all-variants is now an internal runtime layout detail. + new_version="${version}+cu${cuda_ver_short}" new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - # Rename wheel file mv "$wheel_path" "dist/$new_filename" echo "Renamed wheel to: $new_filename" - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" - - name: Get Current Date # Step to get current date for the release tag + - name: Get current date id: get-date run: | - # Get date in YYYYMMDD format using bash date command currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ 
env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 568824c642..9f28a57ca2 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -1,23 +1,24 @@ -name: Build Wheels(CU126) for Linux # Workflow name +name: Build Wheels (CU126) for Linux on: - workflow_dispatch: # Manual trigger + workflow_dispatch: permissions: contents: write jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu126 runs-on: ubuntu-22.04 container: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 + strategy: - matrix: # Define the build matrix directly here + fail-fast: false + matrix: os: ["ubuntu-22.04"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.6.3"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc + cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real"] defaults: run: @@ -25,108 +26,131 @@ jobs: env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 steps: - name: Install dependencies run: | - apt update - apt install -y build-essential ccache cmake 
curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - run: nvcc -V + - name: Show CUDA version + run: nvcc -V - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + - name: Build wheel env: LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on 
-DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + set -euo pipefail - # --- Post-build steps to get info for rename wheel file and release tag --- + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. 
+ "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. + "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi wheel_path=$(ls dist/*.whl | head -n 1) filename=$(basename "$wheel_path") - # Split wheel filename + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + # CPU all-variants is now an internal runtime layout detail. + new_version="${version}+cu${cuda_ver_short}" new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - # Rename wheel file mv "$wheel_path" "dist/$new_filename" echo "Renamed wheel to: $new_filename" - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" - - name: Get Current Date # Step to get current date for the release tag + - name: Get current date id: get-date run: | - # Get date in YYYYMMDD format using bash date command currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 with: - files: 
dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index d1c387c52a..c6b255c9f9 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -1,23 +1,24 @@ -name: Build Wheels(CU128) for Linux # Workflow name +name: Build Wheels (CU128) for Linux on: - workflow_dispatch: # Manual trigger + workflow_dispatch: permissions: contents: write jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu128 runs-on: ubuntu-22.04 container: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + strategy: - matrix: # Define the build matrix directly here + fail-fast: false + matrix: os: ["ubuntu-22.04"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.8.1"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] defaults: run: @@ -25,108 +26,131 @@ jobs: env: CUDAVER: ${{ 
matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 steps: - name: Install dependencies run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - run: nvcc -V + - name: Show CUDA version + run: nvcc -V - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + - name: Build wheel env: LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off 
-DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + set -euo pipefail - # --- Post-build steps to get info for rename wheel file and release tag --- + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. 
+ "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. + "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi wheel_path=$(ls dist/*.whl | head -n 1) filename=$(basename "$wheel_path") - # Split wheel filename + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + # CPU all-variants is now an internal runtime layout detail. 
+ new_version="${version}+cu${cuda_ver_short}" new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - # Rename wheel file mv "$wheel_path" "dist/$new_filename" echo "Renamed wheel to: $new_filename" - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" - - name: Get Current Date # Step to get current date for the release tag + - name: Get current date id: get-date run: | - # Get date in YYYYMMDD format using bash date command currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 6e32ef0171b075605fa527181b924a816a237b77 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 19:42:56 +0800 Subject: [PATCH 041/139] ci: Remove outdated workflow Signed-off-by: JamePeng --- .github/workflows/build-and-release.yaml | 145 ----------------------- .github/workflows/build-docker.yaml | 50 -------- 2 files changed, 195 deletions(-) delete mode 100644 .github/workflows/build-and-release.yaml delete mode 100644 .github/workflows/build-docker.yaml diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml deleted file mode 100644 index 7eaf017fbc..0000000000 --- a/.github/workflows/build-and-release.yaml +++ /dev/null @@ -1,145 +0,0 @@ -name: Build Release - -on: workflow_dispatch - -permissions: - contents: write - -jobs: - build_wheels: - name: Build wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-22.04, windows-2022, macos-14, macos-15] - - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - # Used to host cibuildwheel - - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - - name: Install dependencies (Linux/MacOS) - if: runner.os != 'Windows' - run: | - python -m pip install --upgrade pip - python -m pip install uv - RUST_LOG=trace python -m uv pip install -e .[all] --verbose - shell: bash - - - name: Install dependencies (Windows) - if: runner.os == 'Windows' - env: - RUST_LOG: trace - run: | - python -m pip install --upgrade pip - python -m pip install uv - python -m uv pip install -e .[all] --verbose - shell: cmd - - - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 - env: - # disable repair - CIBW_REPAIR_WHEEL_COMMAND: "" - 
with: - package-dir: . - output-dir: wheelhouse - - - uses: actions/upload-artifact@v4 - with: - name: wheels-${{ matrix.os }} - path: ./wheelhouse/*.whl - - build_wheels_arm64: - name: Build arm64 wheels - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 - env: - CIBW_SKIP: "*musllinux* pp*" - CIBW_REPAIR_WHEEL_COMMAND: "" - CIBW_ARCHS: "aarch64" - CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DCMAKE_CROSSCOMPILING=ON" - CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" - with: - output-dir: wheelhouse - - - name: Upload wheels as artifacts - uses: actions/upload-artifact@v4 - with: - name: wheels_arm64 - path: ./wheelhouse/*.whl - - build_sdist: - name: Build source distribution - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - - name: Install dependencies (Linux/MacOS) - if: runner.os != 'Windows' - run: | - python -m pip install --upgrade pip - python -m pip install uv - RUST_LOG=trace python -m uv pip install -e .[all] --verbose - python -m uv pip install build - shell: bash - - - name: Install dependencies (Windows) - if: runner.os == 'Windows' - env: - RUST_LOG: trace - run: | - python -m pip install --upgrade pip - python -m pip install uv - python -m uv pip install -e .[all] --verbose - python -m uv pip install build - shell: cmd - - - name: Build source distribution - run: | - python -m build --sdist - - - uses: actions/upload-artifact@v4 - with: - name: sdist - path: ./dist/*.tar.gz - - release: - name: Release - needs: [build_wheels, build_wheels_arm64, build_sdist] - runs-on: ubuntu-latest - - steps: - - uses: actions/download-artifact@v4 - with: - merge-multiple: true - path: dist 
- - - uses: softprops/action-gh-release@v2 - with: - files: dist/* - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml deleted file mode 100644 index b290f6273f..0000000000 --- a/.github/workflows/build-docker.yaml +++ /dev/null @@ -1,50 +0,0 @@ -name: Build Docker - -on: workflow_dispatch - -permissions: - contents: write - packages: write - -jobs: - docker: - name: Build and push Docker image - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push - id: docker_build - uses: docker/build-push-action@v6 - with: - context: . 
- file: "docker/simple/Dockerfile" - push: ${{ startsWith(github.ref, 'refs/tags/') }} - pull: true - platforms: linux/amd64,linux/arm64 - tags: | - ghcr.io/abetlen/llama-cpp-python:latest - ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }} - build-args: | - BUILDKIT_INLINE_CACHE=1 - - - name: Publish to GitHub Tag - if: steps.docker_build.outputs.digest && startsWith(github.ref, 'refs/tags/') - run: | - echo "Docker image published for tag: ${{ github.ref_name }}" From b8d69f29e71b74973a7fd295e32d9e9d86908f5d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 22:11:42 +0800 Subject: [PATCH 042/139] Update .gitmodules submodule git addr Signed-off-by: JamePeng --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 7edf0975dc..f56cca32df 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git + url = https://github.com/ggml-org/llama.cpp.git From ec580cb1c3b0ef946523979dca63df5c2d0483cc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 22:23:49 +0800 Subject: [PATCH 043/139] chore(docs): remove outdated mkdocs workflow - Transition documentation focus to the repository wiki and /docs/wiki. - Clean up and remove all unnecessary mkdocs-related configuration files. 
Signed-off-by: JamePeng --- .readthedocs.yaml | 24 ------------ docs/api-reference.md | 88 ------------------------------------------- docs/changelog.md | 1 - docs/index.md | 5 --- docs/requirements.txt | 3 -- mkdocs.yml | 74 ------------------------------------ pyproject.toml | 14 ++----- 7 files changed, 3 insertions(+), 206 deletions(-) delete mode 100644 .readthedocs.yaml delete mode 100644 docs/api-reference.md delete mode 100644 docs/changelog.md delete mode 100644 docs/index.md delete mode 100644 docs/requirements.txt delete mode 100644 mkdocs.yml diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index ff3e950cd1..0000000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Read the Docs configuration file for MkDocs projects -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-22.04 - tools: - python: "3.11" - -mkdocs: - configuration: mkdocs.yml - -python: - install: - - method: pip - path: . - - requirements: docs/requirements.txt - -submodules: - include: all - recursive: true \ No newline at end of file diff --git a/docs/api-reference.md b/docs/api-reference.md deleted file mode 100644 index ab51ef754e..0000000000 --- a/docs/api-reference.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -title: API Reference ---- - -## High Level API - -High-level Python bindings for llama.cpp. 
- -::: llama_cpp.Llama - options: - members: - - __init__ - - tokenize - - detokenize - - reset - - eval - - sample - - generate - - create_embedding - - embed - - create_completion - - __call__ - - create_chat_completion - - create_chat_completion_openai_v1 - - set_cache - - save_state - - load_state - - token_bos - - token_eos - - from_pretrained - show_root_heading: true - -::: llama_cpp.LlamaGrammar - options: - members: - - from_string - - from_json_schema - -::: llama_cpp.LlamaCache - options: - show_root_heading: true - -::: llama_cpp.LlamaState - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessor - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessorList - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteria - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteriaList - options: - show_root_heading: true - -## Low Level API - -Low-level Python bindings for llama.cpp using Python's ctypes library. - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true - # filter only members starting with `llama_` - filters: - - "^llama_" - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - # filter only members starting with `LLAMA_` - filters: - - "^LLAMA_" - -## Misc - -::: llama_cpp.llama_types - options: - show_if_no_docstring: true \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 100644 index 047bc14424..0000000000 --- a/docs/changelog.md +++ /dev/null @@ -1 +0,0 @@ --8<- "CHANGELOG.md" \ No newline at end of file diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 60bc7aef42..0000000000 --- a/docs/index.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Getting Started ---- - --8<- "README.md" \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 199bd4ffbf..0000000000 --- 
a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mkdocs -mkdocs-material -mkdocstrings[python] \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index 79a9e67a1a..0000000000 --- a/mkdocs.yml +++ /dev/null @@ -1,74 +0,0 @@ -site_name: llama-cpp-python -repo_url: https://github.com/abetlen/llama-cpp-python - -theme: - name: material - palette: - - # Palette toggle for light mode - - scheme: default - primary: indigo - toggle: - icon: material/brightness-7 - name: Switch to dark mode - - # Palette toggle for dark mode - - scheme: slate - primary: indigo - toggle: - icon: material/brightness-4 - name: Switch to light mode - -plugins: - - search - - mkdocstrings: - handlers: - python: - options: - members_order: source - group_by_category: false - signature_crossrefs: true - show_signature: true - docstring_section_style: list - show_root_heading: true - heading_level: 3 - preload_modules: - - typing - - typing_extensions - - ctypes - import: - - https://docs.python.org/3/objects.inv - - https://numpy.org/doc/stable/objects.inv - -watch: - - llama_cpp - - README.md - -nav: - - "Getting Started": "index.md" - - "Installation Guides": - - "macOS (Metal)": "install/macos.md" - - "API Reference": "api-reference.md" - - "OpenAI Compatible Web Server": "server.md" - - "Changelog": "changelog.md" - -markdown_extensions: - - attr_list - - pymdownx.emoji: - emoji_index: !!python/name:materialx.emoji.twemoji - emoji_generator: !!python/name:materialx.emoji.to_svg - - pymdownx.highlight: - anchor_linenums: true - line_spans: __span - pygments_lang_class: true - - pymdownx.inlinehilite - - pymdownx.magiclink: - repo_url_shorthand: true - user: abetlen - repo: llama-cpp-python - - pymdownx.snippets - - pymdownx.superfences - - pymdownx.tabbed: - alternate_style: true - - pymdownx.tilde - - tables diff --git a/pyproject.toml b/pyproject.toml index 2e439c0685..eb4b879dd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,17 +49,9 @@ 
test = [ "pydantic-settings>=2.0.1", "huggingface-hub>=0.23.0" ] -dev = [ - "black>=23.3.0", - "twine>=4.0.2", - "mkdocs>=1.4.3", - "mkdocstrings[python]>=0.22.0", - "mkdocs-material>=9.1.18", - "pytest>=7.4.0", - "httpx>=0.24.1", -] + all = [ - "llama_cpp_python[server,test,dev]", + "llama_cpp_python[server,test]", ] [tool.scikit-build] @@ -76,7 +68,7 @@ input = "llama_cpp/__init__.py" [project.urls] Homepage = "https://github.com/JamePeng/llama-cpp-python" Issues = "https://github.com/JamePeng/llama-cpp-python/issues" -Documentation = "https://llama-cpp-python.readthedocs.io/en/latest/" +Documentation = "https://github.com/JamePeng/llama-cpp-python/wiki" Changelog = "https://github.com/JamePeng/llama-cpp-python/blob/main/CHANGELOG.md" FAQ = "https://github.com/JamePeng/llama-cpp-python?tab=readme-ov-file#faq" From d33d98806f1e55219a9a9bfde9557b06e8f16b01 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 22:28:23 +0800 Subject: [PATCH 044/139] Update Submodule vendor/llama.cpp b64739e..39cf5d6 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b64739ea39..39cf5d6191 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b64739ea393b3c9d07cc9907e0a611f707838051 +Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee From e87041e4ee6a89798abe9f36315f60f3fb06c5cb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 22:57:56 +0800 Subject: [PATCH 045/139] docs(readme): update wheel requirements and dynamic CPU backend info - Update supported CUDA versions to include 12.8 and 13.1, while outlining the supported compute architectures (SM70 up to SM120a). - Document the transition to `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` starting in `0.3.39-preview`. - Clarify that dynamic CPU backend loading eliminates the need for separate `Basic` and `AVX2` wheel distributions. 
- Add a technical note in the FAQ recommending LLVM/Clang over MSVC for achieving full x64 CPU variant coverage on Windows. Signed-off-by: JamePeng --- README.md | 54 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index caec7e32e0..4a1550a85c 100644 --- a/README.md +++ b/README.md @@ -162,12 +162,41 @@ pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python **Pre-built Wheel (New)** -It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements: +It is also possible to install a pre-built wheel with CUDA support. Make sure your system meets the following requirements: -- CUDA Version is 12.4, 12.6, 12.8 or 13.0 -- Python Version is 3.10, 3.11, 3.12, 3.13 or 3.14 -- Basic version(Default): A version compiled without using AVX instructions (for compatibility with CPU platforms lacking AVX instructions or with AVX instruction compatibility issues). -- AVX2 version: A version compiled using AVX2 instructions. +- CUDA version: 12.4, 12.6, 12.8, or 13.1 +- Python version: 3.10, 3.11, 3.12, 3.13, or 3.14 +- Starting with `0.3.39-preview`, Windows and Linux x64 wheels are built with `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS`. + +This means CPU backends are shipped as dynamically loaded runtime libraries under: + +```text +site-packages/llama_cpp/lib +```` + +Supported CPU backend variants may include: + +* `ggml-cpu-x64` +* `ggml-cpu-sse42` +* `ggml-cpu-sandybridge` +* `ggml-cpu-ivybridge` +* `ggml-cpu-piledriver` +* `ggml-cpu-haswell` +* `ggml-cpu-skylakex` +* `ggml-cpu-cannonlake` +* `ggml-cpu-cascadelake` +* `ggml-cpu-cooperlake` +* `ggml-cpu-icelake` +* `ggml-cpu-alderlake` +* `ggml-cpu-sapphirerapids` +* `ggml-cpu-zen4` + +The old `Basic` and `AVX2` wheel variants are no longer required for the new dynamic-backend wheels. 
GGML can load the compatible CPU backend at runtime, which improves CPU instruction-set compatibility across different x64 machines. + +Before `0.3.39-preview`: + +* `Basic`: compiled without AVX instructions for maximum compatibility. +* `AVX2`: compiled with AVX2 instructions for newer CPUs. Check the releases page: https://github.com/JamePeng/llama-cpp-python/releases @@ -1695,17 +1724,20 @@ This error is primarily caused by the following reasons: 3. **CUDA Version Mismatch:** Regarding `ggml-cuda.dll`, the CUDA version of the pre-compiled library does not match your local CUDA Toolkit version (e.g., a mismatch between CUDA 12.X and CUDA 13.X). It is recommended to fully configure your local CUDA Toolkit environment (ensuring the PATH for dynamic libraries is set and the nvcc compiler is recognized). Then, clone the code and compile it locally. -### Why are libraries compiled by other authors only around 100MB, while your pre-compiled versions range from 300MB to 900MB? +### Why are libraries compiled by other authors only around 100MB, while your pre-compiled versions are 300MB or larger? -My GitHub Actions script is configured to compile against **all supported CUDA compute architectures** for each specific CUDA version I maintain. +My GitHub Actions workflow is configured to compile against multiple supported CUDA compute architectures for each CUDA version I maintain. For example: -* **CUDA 13.0.2:** Currently supports architectures from SM75 (Turing) up to SM120a (Blackwell). -* **CUDA 12.4.1 and 12.6.3:** Support older architectures as well, such as SM70. -* *(Note: The Windows versions are built to support every architecture compatible with the respective CUDA version).* +- **CUDA 13.1 and CUDA 12.8:** currently target architectures from SM75 (Turing) up to SM120a / SM121a (Blackwell generation, depending on CUDA support). +- **CUDA 12.4 and CUDA 12.6:** currently target architectures from SM70 (Volta) up to SM90 (Hopper). 
+ +Libraries from other authors are often smaller because they may only compile for a single architecture, such as RTX 30 series (`SM86`) or RTX 40 series (`SM89`). To maximize compatibility, these wheels include CUDA kernels for a wider range of GPUs. You only need to choose the wheel that matches your installed CUDA version. + + - **Updated 2026-05-16 / 2026-05-17:** Starting with `0.3.39-preview`, Windows wheels support the `GGML_BACKEND_DL` + `GGML_CPU_ALL_VARIANTS` runtime layout. CPU backend libraries such as `ggml-cpu-*.dll` are packaged under `site-packages/llama_cpp/lib` and loaded dynamically at runtime. This allows GGML to select a compatible CPU backend automatically, reducing the need for separate `Basic` / `AVX2` wheel variants. -The reason libraries from other authors are smaller is that they often **only compile for a single architecture** (e.g., targeting only the RTX 30 series [SM86] or the RTX 40 series [SM89]). To maximize convenience, I provide an **integrated compilation** covering a wide range of hardware; you simply need to select the CUDA version that matches your environment to load and run it. + - Note: for full x64 CPU variant coverage on Windows, LLVM/Clang builds are preferred. MSVC may skip some variants such as `zen4`, `cooperlake`, or `sapphirerapids` due to compiler intrinsic support limitations. 
### Quick tips for develop/user (continuously updated): From a778c57d73ec7d4f43e2518a513e7d4cf68a0df8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 23:35:57 +0800 Subject: [PATCH 046/139] Bump version to 0.3.39 Signed-off-by: JamePeng --- CHANGELOG.md | 116 ++++++++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 117 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 253b2ae4cc..e4c6e4c976 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,122 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.39] Dynamic GGML Backends, Qwen3-ASR/MiniCPM-V-4.6, On-Device Hybrid Checkpoint, and Granular Logging + +- **ci(cu131/128/126/124): build wheels with GGML dynamic backends for windows/Linux** + - Replace the old CPU/AVX release tag matrix with a single backend + wheel layout. + - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Windows wheels ship + runtime-loadable GGML backend DLLs and CPU variant backends. + - Use the Windows LLVM toolchain and disable non-wheel targets such as examples, + tests, tools, server, embedded UI, and curl. + - Remove the `.basic` style local version suffix and publish wheels + as `+cu131`. + - Update CUDA architectures to CUDA 13.1 and simplify CMake argument handling. + - Note: for full x64 CPU variant coverage on Windows, LLVM/Clang builds are preferred. MSVC may skip some variants such as zen4, cooperlake, or sapphirerapids due to compiler intrinsic support limitations. + +- **feat(core): support loading GGML_BACKEND_DL dynamic backend libraries from wheel lib** + - Import `ggml_backend_load_all_from_path` and `ggml_backend_reg_count` + from `_ggml`. + - Load dynamic ggml backend libraries from the packaged `llama_cpp/lib` + directory after `llama_backend_init()`. 
+ - Support wheels built with `GGML_BACKEND_DL`, where CPU variants and + accelerator backends such as `ggml-cpu-*` and `ggml-cuda` are shipped as + separate runtime libraries. + - Print the registered backend count in verbose mode to help diagnose backend + discovery issues. + +- **build(cmake): refactor install target lists for new GGML backend layout** + - Categorize build targets into logical groups (`LLAMA_CPP_TARGETS`, + `GGML_CORE_TARGETS`, `GGML_CPU_VARIANT_TARGETS`, and `GGML_BACKEND_TARGETS`) + to improve maintainability and keep the Python package installation in sync + with the updated upstream GGML backend layout. + - Add missing targets such as `llama-common` and the separated + `ggml-cpu-*` CPU variant backends. + - Ensure all grouped targets are passed through `llama_cpp_python_install_target`. + - Update llama build option descriptions to match the current upstream naming style. + - Explicitly disable `LLAMA_BUILD_SERVER` to avoid building the server target for Python package wheels. + - Explicitly disable `LLAMA_BUILD_UI` and `LLAMA_USE_PREBUILT_UI` because the + embedded server Web UI is not needed for wheel builds. + - Keep examples, tests, and curl support disabled for minimal wheel artifacts. + - Add a cleanup function to strip `cmake`, `pkgconfig`, and import libraries from the python wheel runtime directories. + - Ensures Windows builds only package the required runtime DLLs. + +- **Implement Qwen3ASRChatHandler for Qwen3-ASR models.** + - Integrate MTMD multimodal logic to extract and inject `audio_url` and base64 `input_audio` data directly into the `<|audio_start|><|audio_pad|>[DATA]<|audio_end|>` sequence. + - Define a default multilingual transcription system prompt and configure model-specific stop tokens. + - docs(README.md): add Qwen3-ASR documentation and usage example + - Update the supported multi-modal models table to include `qwen3-asr` and the `Qwen3ASRChatHandler`. 
+ - Add a new dedicated section for Speech-to-Text inference with a complete, collapsible Python script. + - Provide a `build_media_payload` helper function to demonstrate proper Base64 encoding of local `.wav` and `.mp3` files into OpenAI-compatible `input_audio` schemas. + - Include a critical warning advising users to use BF16 quantization for the multimodal projector (`mmproj`) to prevent audio degradation. + - Clarify usage mechanics, specifically that all instructions must be placed in the `system` role due to the ASR template's text-dropping behavior. + +- **Implement MiniCPMV46ChatHandler for MiniCPM-V-4.6** + +- **feat(core): integrate fine-grained logging API into Llama class** + - This commit exposes the newly refactored `_logger` configuration system directly through the `Llama` class, providing users with robust, programmatic control over native `llama.cpp` backend logs. + - docs(wiki): document runtime verbosity and log filters for Llama + - docs(Llama.md): update verbose=False vs. verbosity=0 note + - Key changes: + - Expand `Llama.__init__` with `verbosity`, `log_filters`, and `log_filters_case_sensitive` parameters. + - Add instance methods for runtime log management (`set_verbosity`, `get_verbosity`, `set_log_filters`, `add_log_filters`, `clear_log_filters`, etc.). + - Add comprehensive docstrings explaining the 0-5 verbosity scale and explicitly noting the process-global nature of the native backend logger. + - Advantages over the legacy implementation: + - Granular Control: Replaces the restrictive binary `verbose=True/False` flag (which only toggled between ERROR and DEBUG) with a granular 6-tier scale (output, error, warn, info, trace, debug). + - Dynamic Filtering: Empowers users to actively suppress specific noisy C++ logs using custom substring filters, removing the need for hardcoded internal patches. 
+ - Better Discoverability: Attaches logging controls directly to the `Llama` object, making log management much more accessible and intuitive without requiring users to import internal logger modules. + +- **feat(logger): refactor and enhance ggml logging configuration system** + - Introduce a `LoggerConfig` dataclass to provide fine-grained control over native ggml/llama.cpp runtime logging. + - Align `verbosity` levels (0 to 5) with upstream `llama.cpp` conventions (`common/log.h`). + - Implement a dynamic, configurable substring filtering system, replacing the hardcoded "CUDA Graph" patch with `DEFAULT_LOG_FILTERS`. + - Add comprehensive public APIs for log management: `configure_logging`, `set_verbosity`, `set_quiet`, `set_silent`, `set_log_filters`, and `add_log_filters`. + - Maintain backwards compatibility for the existing `set_verbose(bool)` function. + - Improve the `ggml_log_callback` to correctly handle `GGML_LOG_LEVEL_CONT` by inheriting the verbosity of the preceding log message. + - Route `GGML_LOG_LEVEL_NONE` to `stdout` and all other diagnostic logs to `stderr` by default. + - docs(Logger.md): Upload Logger documentation + +- fix(MTMDChatHandler): correct audio_url content type check and improve variable handling + - Changed condition from `content == "audio_url"` to `content_type == "audio_url"` for proper type-based dispatching. + - Extracted `audio_url` variable for better readability. + - Converted `else` to `elif content_type == "input_audio"` to make the control flow explicit and safer. + +- fix(_internals): Remove unnecessary free operations; models should not be released within the context. + +- **feat(cache): add on-device hybrid checkpoint support** + - Introduce `HybridCheckpointCache` with dual-mode behavior (Host/On-Device). + - Device mode utilizes `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` to keep tensor + payloads in `llama_context` VRAM, reducing host-device copy overhead. 
+ - Host mode remains the default, preserving full Python-owned rollback history. + - Implement safety guards against stale on-device checkpoint restores and + enforce one active device checkpoint per `seq_id`. + - Unify checkpoint management with shared FIFO eviction. + - Expose `checkpoint_on_device` in `Llama.__init__` and reduce default + `ctx_checkpoints` from 32 to 16. + - Enhance verbose logging and docs to clarify host vs. VRAM ownership + semantics and track memory usage accurately. + - Rename internal `_flag_partial` to `_flags` to support multiple state flags. + - Update /docs/wiki/core/Llama.md for on_device option + - Update /docs/wiki/modules/LlamaCache.md for on_device option + +- docs: Update /docs/wiki and README.md file and remove outdated mkdocs workflow + - docs(readme): update wheel requirements and dynamic CPU backend info + - Update supported CUDA versions to include 12.8 and 13.1, while outlining + the supported compute architectures (SM70 up to SM120a). + - Document the transition to `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` + starting in `0.3.39-preview`. + - Clarify that dynamic CPU backend loading eliminates the need for separate + `Basic` and `AVX2` wheel distributions. + - Add a technical note in the FAQ recommending LLVM/Clang over MSVC for + achieving full x64 CPU variant coverage on Windows. 
+ +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/39cf5d61915769124b7efbbfa69c46f19a6363ee](https://github.com/ggml-org/llama.cpp/commit/39cf5d61915769124b7efbbfa69c46f19a6363ee) + +- feat: Sync llama.cpp llama/mtmd/ggml API Binding 20260517 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/ef27f333f367fdc53dc1a729ad8bb6c3c9362514...e87041e4ee6a89798abe9f36315f60f3fb06c5cb + ## [0.3.38] Optimized CJK Detokenization, Sync Grammar Parser, and Patched CUDA Graph Logs - perf: Optimize detokenize buffer sizing for CJK-heavy outputs diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index b32fbfd36e..ec28faae66 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.39-preview" +__version__ = "0.3.39" From a96f2807c3be057650b7bc34173274e1cda68128 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 18 May 2026 21:41:47 +0800 Subject: [PATCH 047/139] ci(metal): upgrade actions/download-artifact@v6 ->v7 - actions/download-artifact@v7 now runs on Node.js 24 Signed-off-by: JamePeng --- .github/workflows/build-wheels-metal.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 40675b4c26..a809909720 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -75,7 +75,7 @@ jobs: uses: actions/checkout@v6 - name: Download artifacts - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v7 with: merge-multiple: true path: dist2 From b48d57a2b4019bbd248c848eefa1442c9e7890cb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 18 May 2026 21:43:37 +0800 Subject: [PATCH 048/139] Update Submodule vendor/llama.cpp 39cf5d6..6db1304 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 
39cf5d6191..6db130445d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee +Subproject commit 6db130445d29b243ee2171efb8cd61b84a1c5322 From f309265b0df3ab2477682db3a959656dcb6d06e6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 19:36:28 +0800 Subject: [PATCH 049/139] build(ci+cu131): bundle LLVM OpenMP runtime for Windows CPU backends - Add a PowerShell step to the Windows CI workflow to locate and copy `libomp140.x86_64.dll` from the Visual Studio redistributables. - Place the runtime DLL into the `llama_cpp\lib` package directory. This ensures that the dynamically loaded `ggml-cpu-*.dll` variants (which are built with LLVM OpenMP on Windows) have their required dependencies packaged in the wheel. Without this, `ggml_backend_load_all_from_path()` can silently fail to load the CPU backends at runtime on end-user machines. Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu131-win.yml | 25 ++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml index 14bea65d19..5f77003a5f 100644 --- a/.github/workflows/build-wheels-cu131-win.yml +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -67,6 +67,31 @@ jobs: echo LIB=%LIB%>>%GITHUB_ENV% echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + - name: Copy LLVM OpenMP runtime + shell: pwsh + run: | + # GGML CPU all-variant backends are built with LLVM OpenMP on Windows. + # The dynamically loaded ggml-cpu-*.dll files depend on this runtime. + # If it is missing from the wheel, ggml_backend_load_all_from_path() + # may fail to load CPU backend DLLs at runtime. 
+ $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib" + New-Item -ItemType Directory -Force $packageLibDir | Out-Null + + $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" ` + -Recurse ` + -Filter "libomp140.x86_64.dll" ` + -ErrorAction SilentlyContinue | + Where-Object { $_.FullName -match "OpenMP\.LLVM" } | + Select-Object -First 1 + + if (!$omp) { + Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables." + exit 1 + } + + Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force + Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)" + - name: Build wheel run: | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') From dd61687fc6cbabb0885e45d708cc9562d1bd2d53 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 18 May 2026 21:43:37 +0800 Subject: [PATCH 050/139] Update Submodule vendor/llama.cpp 39cf5d6..d14ce3d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 39cf5d6191..d14ce3dab4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee +Subproject commit d14ce3dab4de197adec5166faa54ac5db8262f26 From 2bc3cdded9285b591454e11e50a4b1524afa32ff Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 22:54:36 +0800 Subject: [PATCH 051/139] build(cmake): package LLVM OpenMP runtime DLL for Windows wheels Dynamically loaded GGML CPU backends compiled with LLVM/Clang and OpenMP require `libomp140.x86_64.dll` at runtime. - Add `llama_cpp_python_install_windows_runtime_file` to handle installing arbitrary extra DLLs with proper CMake path normalization. 
- Add `llama_cpp_python_install_windows_openmp_runtime` to automatically locate the OpenMP DLL in common Visual Studio 2022 directories, with an override available via `LLAMA_CPP_OPENMP_RUNTIME_DLL`. - Execute the OpenMP runtime installation before the dev-file cleanup step to ensure the DLL is correctly packaged in the final wheel. Signed-off-by: JamePeng --- CMakeLists.txt | 105 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e5d583d90..f6dfb7c136 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,106 @@ function(llama_cpp_python_install_target target) endfunction() +# Install an extra Windows runtime DLL into the Python package runtime directory. +# +# Some dynamically loaded backend libraries depend on runtime DLLs that are not +# always discoverable through $<TARGET_RUNTIME_DLLS:...>. One important example +# is libomp140.x86_64.dll, required by LLVM OpenMP CPU backend variants. +function(llama_cpp_python_install_windows_runtime_file runtime_file) + if(NOT WIN32) + return() + endif() + + if(NOT runtime_file) + return() + endif() + + if(NOT EXISTS "${runtime_file}") + message(WARNING "Windows runtime file does not exist and will not be installed: ${runtime_file}") + return() + endif() + + # Normalize Windows paths for generated cmake_install.cmake. + # Without this, paths like C:\Program Files (...) may produce invalid + # CMake escape sequences such as \P during install. + file(TO_CMAKE_PATH "${runtime_file}" runtime_file_cmake) + + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" + ) + + foreach(DIR ${INSTALL_DIRS}) + file(TO_CMAKE_PATH "${DIR}" DIR_CMAKE) + + install( + FILES "${runtime_file_cmake}" + DESTINATION "${DIR_CMAKE}" + ) + endforeach() +endfunction() + + +# Locate and install the Windows LLVM OpenMP runtime when available. 
+# +# GGML CPU all-variant backends built with LLVM/Clang + OpenMP depend on +# libomp140.x86_64.dll. Since ggml-cpu-*.dll files are loaded dynamically via +# ggml_backend_load_all_from_path(), the OpenMP runtime must be packaged next to +# them under llama_cpp/lib. +# +# CI may pass LLAMA_CPP_OPENMP_RUNTIME_DLL explicitly. Local builds can rely on +# fallback search paths for Visual Studio Enterprise / BuildTools. +function(llama_cpp_python_install_windows_openmp_runtime) + if(NOT WIN32) + return() + endif() + + set(OPENMP_RUNTIME_DLL "") + + if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL AND EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + else() + file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) + file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) + + set(VS_OPENMP_SEARCH_ROOTS + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" + ) + + foreach(ROOT ${VS_OPENMP_SEARCH_ROOTS}) + if(EXISTS "${ROOT}") + file( + GLOB_RECURSE FOUND_OPENMP_DLLS + "${ROOT}/*/debug_nonredist/x64/Microsoft.VC*.OpenMP.LLVM/libomp140.x86_64.dll" + "${ROOT}/**/libomp140.x86_64.dll" + ) + + if(FOUND_OPENMP_DLLS) + list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) + break() + endif() + endif() + endforeach() + endif() + + if(OPENMP_RUNTIME_DLL) + message(STATUS "Installing Windows LLVM OpenMP runtime: ${OPENMP_RUNTIME_DLL}") + llama_cpp_python_install_windows_runtime_file("${OPENMP_RUNTIME_DLL}") + else() + message(WARNING + "Could not find libomp140.x86_64.dll. " + "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " + "the packaged ggml-cpu-*.dll files may fail to load at runtime. 
" + "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " + "to package it explicitly." + ) + endif() +endfunction() + + # Remove development-only artifacts from Python wheel runtime directories. # # Upstream install rules may place CMake package files, pkg-config files, and @@ -241,6 +341,11 @@ if (LLAMA_BUILD) llama_cpp_python_install_target(mtmd) endif() + # Install Windows LLVM OpenMP runtime when available. + # This must run before cleanup so the final wheel keeps runtime DLLs but + # removes development-only files such as .lib, cmake/, and pkgconfig/. + llama_cpp_python_install_windows_openmp_runtime() + # Run after all runtime targets are installed, including mtmd. llama_cpp_python_cleanup_dev_files() From fa36f70421815f3e050f1538e37f40feb5a7005a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 23:00:09 +0800 Subject: [PATCH 052/139] Update CHANGELOG.md upstream version link Signed-off-by: JamePeng --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4c6e4c976..e8ebb5cd3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -117,7 +117,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add a technical note in the FAQ recommending LLVM/Clang over MSVC for achieving full x64 CPU variant coverage on Windows. 
-- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/39cf5d61915769124b7efbbfa69c46f19a6363ee](https://github.com/ggml-org/llama.cpp/commit/39cf5d61915769124b7efbbfa69c46f19a6363ee) +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/d14ce3dab4de197adec5166faa54ac5db8262f26](https://github.com/ggml-org/llama.cpp/commit/d14ce3dab4de197adec5166faa54ac5db8262f26) - feat: Sync llama.cpp llama/mtmd/ggml API Binding 20260517 From d37951799450b6461ac73160630371c9d1d36065 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 23:07:12 +0800 Subject: [PATCH 053/139] ci: pin windows runner and streamline python test matrix - Pin the Windows CI runner from `windows-latest` to `windows-2022` to ensure build environment stability and prevent unexpected breakages from runner updates. - Remove Python 3.13 from the test matrix to reduce CI runtime and resource consumption. - Retain Python 3.9 (oldest supported) and 3.14 (latest) to ensure compatibility boundaries are still properly tested across Ubuntu, Windows, and macOS (Metal / Non-Metal). 
Signed-off-by: JamePeng --- .github/workflows/test.yaml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 335b0f0ac3..420c5e9495 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,18 +24,14 @@ jobs: # Don't cancel other jobs in the matrix if one fails fail-fast: false matrix: - os: [ubuntu-latest, windows-latest] - python-version: ["3.9", "3.13", "3.14"] + os: [ubuntu-latest, windows-2022] + python-version: ["3.9", "3.14"] include: # macOS Non-Metal - os: macos-14 python-version: "3.9" cmake_args: "-DLLAMA_METAL=off" metal_status: "(No Metal)" - - os: macos-14 - python-version: "3.13" - cmake_args: "-DLLAMA_METAL=off" - metal_status: "(No Metal)" - os: macos-14 python-version: "3.14" cmake_args: "-DLLAMA_METAL=off" @@ -46,10 +42,6 @@ jobs: python-version: "3.9" cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" metal_status: "(Metal)" - - os: macos-14 - python-version: "3.13" - cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" - metal_status: "(Metal)" - os: macos-14 python-version: "3.14" cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" From c7668b150fb1f8f36ecca0100f294829dfef7e66 Mon Sep 17 00:00:00 2001 From: DELUXA Date: Wed, 20 May 2026 15:39:41 +0300 Subject: [PATCH 054/139] Add Windows ROCm build instructions --- README.md | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a1550a85c..14148562bf 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,9 @@ https://github.com/JamePeng/llama-cpp-python/releases
HIP (ROCm) +
+Linux ROCm + This provides GPU acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). @@ -303,6 +306,40 @@ More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/bu
+
+Windows ROCm + +> **Note:** Install TheRock ROCm, activate your venv, then run in PowerShell. Replace `gfx1200` with your GPU architecture. + +```powershell +cmd /c '"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" >nul 2>&1 && set' | ForEach-Object { if ($_ -match '^([^=]+)=(.*)$') { [System.Environment]::SetEnvironmentVariable($matches[1], $matches[2], 'Process') } } + +rocm-sdk init + +$ROCM_DEVEL = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_devel" +$ROCM_CORE = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_core" +$ROCM_GFX = (Get-Item "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_libraries_gfx*").FullName + +$env:HIP_PATH = $ROCM_DEVEL +$env:ROCM_PATH = $ROCM_DEVEL +$env:HIP_DEVICE_LIB_PATH = "$ROCM_CORE\lib\llvm\amdgcn\bitcode" +$env:PATH = "$ROCM_DEVEL\bin;$ROCM_DEVEL\lib\llvm\bin;$ROCM_GFX\bin;$env:PATH" +$env:CMAKE_GENERATOR = "Ninja" +$env:HIP_PLATFORM = "amd" +$env:CC = "$ROCM_DEVEL\lib\llvm\bin\clang.exe" +$env:CXX = "$ROCM_DEVEL\lib\llvm\bin\clang++.exe" +$env:HIP_CLANG_PATH = "$ROCM_DEVEL\lib\llvm\bin" + +$R = $ROCM_DEVEL -replace '\\', '/' +$env:CMAKE_ARGS = "-DGGML_HIP=ON -DGGML_HIPBLAS=on -DGPU_TARGETS=gfx1200 -DCMAKE_HIP_ARCHITECTURES=gfx1200 -DCMAKE_C_COMPILER=`"$R/lib/llvm/bin/clang.exe`" -DCMAKE_CXX_COMPILER=`"$R/lib/llvm/bin/clang++.exe`" -DHIP_LIBRARIES=`"$R/lib/amdhip64.lib`" -DCMAKE_PREFIX_PATH=`"$R`"" + +pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" --no-cache-dir +``` + +
+ +
+
Vulkan @@ -1743,7 +1780,7 @@ Libraries from other authors are often smaller because they may only compile for * 1. I've determined that `llama_cpp.server` is currently in a semi-deprecated state (meaning it won't be maintained unless absolutely necessary, and I might even consider deleting or separating it to reduce the library size). I highly recommend using the `llama-server` program maintained by the upstream `llama.cpp` project, which offers a lower-level implementation, more frequent maintenance and optimization, and more reliable API calls. -* 2. Regarding AMD and Intel graphics cards, AMD can certainly use ROCm as the primary backend (but the drawback is that it's basically only stable on Linux platforms), and Intel's Sycl will also encounter some compilation difficulties. I consistently recommend using the Vulkan backend for these two types of graphics cards for greater efficiency and stability, because the upstream `llama.cpp` Vulkan backend is actively maintained by many developers, generally allowing you to enjoy new feature optimizations and bug fixes earlier and faster. +* 2. Regarding AMD and Intel graphics cards, AMD can use ROCm as the primary backend, while Intel's Sycl will encounter some compilation difficulties. I consistently recommend using the Vulkan backend for these two types of graphics cards for greater efficiency and stability, because the upstream `llama.cpp` Vulkan backend is actively maintained by many developers, generally allowing you to enjoy new feature optimizations and bug fixes earlier and faster. * 3. If you are using hybrid multimodal model for building ComfyUI nodes or running single-turn API wrappers where you do not need multi-turn state rollbacks, simply initialize your Llama instance with `ctx_checkpoints=0`: From a4080a4a7e1f4550fd534fa3f798d39946089c7a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 20 May 2026 22:00:36 +0800 Subject: [PATCH 055/139] docs: Optimize the formatting of the ROCm section in README.md. 
Signed-off-by: JamePeng --- README.md | 68 +++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 14148562bf..07d7635cbc 100644 --- a/README.md +++ b/README.md @@ -288,55 +288,55 @@ https://github.com/JamePeng/llama-cpp-python/releases
HIP (ROCm) -
-Linux ROCm + -
+ Linux ROCm -This provides GPU acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. + This provides GPU acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. -You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). + You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). -To install with HIP / ROCm support for AMD cards, set the `GGML_HIP=ON` environment variable before installing: + To install with HIP / ROCm support for AMD cards, set the `GGML_HIP=ON` environment variable before installing: -```bash -CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" -``` -Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system. + ```bash + CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" + ``` + Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system. -More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip + More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip -
+
-
-Windows ROCm + -
+ Windows ROCm -> **Note:** Install TheRock ROCm, activate your venv, then run in PowerShell. Replace `gfx1200` with your GPU architecture. + > **Note:** Install TheRock ROCm, activate your venv, then run in PowerShell. Replace `gfx1200` with your GPU architecture. -```powershell -cmd /c '"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" >nul 2>&1 && set' | ForEach-Object { if ($_ -match '^([^=]+)=(.*)$') { [System.Environment]::SetEnvironmentVariable($matches[1], $matches[2], 'Process') } } + ```powershell + cmd /c '"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" >nul 2>&1 && set' | ForEach-Object { if ($_ -match '^([^=]+)=(.*)$') { [System.Environment]::SetEnvironmentVariable($matches[1], $matches[2], 'Process') } } -rocm-sdk init + rocm-sdk init -$ROCM_DEVEL = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_devel" -$ROCM_CORE = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_core" -$ROCM_GFX = (Get-Item "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_libraries_gfx*").FullName + $ROCM_DEVEL = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_devel" + $ROCM_CORE = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_core" + $ROCM_GFX = (Get-Item "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_libraries_gfx*").FullName -$env:HIP_PATH = $ROCM_DEVEL -$env:ROCM_PATH = $ROCM_DEVEL -$env:HIP_DEVICE_LIB_PATH = "$ROCM_CORE\lib\llvm\amdgcn\bitcode" -$env:PATH = "$ROCM_DEVEL\bin;$ROCM_DEVEL\lib\llvm\bin;$ROCM_GFX\bin;$env:PATH" -$env:CMAKE_GENERATOR = "Ninja" -$env:HIP_PLATFORM = "amd" -$env:CC = "$ROCM_DEVEL\lib\llvm\bin\clang.exe" -$env:CXX = "$ROCM_DEVEL\lib\llvm\bin\clang++.exe" -$env:HIP_CLANG_PATH = "$ROCM_DEVEL\lib\llvm\bin" + $env:HIP_PATH = $ROCM_DEVEL + $env:ROCM_PATH = $ROCM_DEVEL + $env:HIP_DEVICE_LIB_PATH = "$ROCM_CORE\lib\llvm\amdgcn\bitcode" + $env:PATH = "$ROCM_DEVEL\bin;$ROCM_DEVEL\lib\llvm\bin;$ROCM_GFX\bin;$env:PATH" + $env:CMAKE_GENERATOR = "Ninja" + $env:HIP_PLATFORM = "amd" + $env:CC = 
"$ROCM_DEVEL\lib\llvm\bin\clang.exe" + $env:CXX = "$ROCM_DEVEL\lib\llvm\bin\clang++.exe" + $env:HIP_CLANG_PATH = "$ROCM_DEVEL\lib\llvm\bin" -$R = $ROCM_DEVEL -replace '\\', '/' -$env:CMAKE_ARGS = "-DGGML_HIP=ON -DGGML_HIPBLAS=on -DGPU_TARGETS=gfx1200 -DCMAKE_HIP_ARCHITECTURES=gfx1200 -DCMAKE_C_COMPILER=`"$R/lib/llvm/bin/clang.exe`" -DCMAKE_CXX_COMPILER=`"$R/lib/llvm/bin/clang++.exe`" -DHIP_LIBRARIES=`"$R/lib/amdhip64.lib`" -DCMAKE_PREFIX_PATH=`"$R`"" + $R = $ROCM_DEVEL -replace '\\', '/' + $env:CMAKE_ARGS = "-DGGML_HIP=ON -DGGML_HIPBLAS=on -DGPU_TARGETS=gfx1200 -DCMAKE_HIP_ARCHITECTURES=gfx1200 -DCMAKE_C_COMPILER=`"$R/lib/llvm/bin/clang.exe`" -DCMAKE_CXX_COMPILER=`"$R/lib/llvm/bin/clang++.exe`" -DHIP_LIBRARIES=`"$R/lib/amdhip64.lib`" -DCMAKE_PREFIX_PATH=`"$R`"" -pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" --no-cache-dir -``` + pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" --no-cache-dir + ``` -
+
From 89927d4633e3ff6dde3aa903c3cc84d454e040ad Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 20 May 2026 22:00:49 +0800 Subject: [PATCH 056/139] Update Submodule vendor/llama.cpp d14ce3d..e947228 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d14ce3dab4..e947228222 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d14ce3dab4de197adec5166faa54ac5db8262f26 +Subproject commit e947228222147356bc7e64154d3439e142481632 From 023780091755724b9e41d62d3df9f9ffcbafda09 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 21 May 2026 08:27:48 +0800 Subject: [PATCH 057/139] docs: Removed outdated macOS installation guides and added the latest installation notes. Signed-off-by: JamePeng --- README.md | 84 +++++++++++++++++++++++++++++++------------ docs/install/macos.md | 59 ------------------------------ 2 files changed, 62 insertions(+), 81 deletions(-) delete mode 100644 docs/install/macos.md diff --git a/README.md b/README.md index 07d7635cbc..6c56c034c3 100644 --- a/README.md +++ b/README.md @@ -269,6 +269,8 @@ On MacOS, Metal is enabled by default(`GGML_METAL=ON`). Using Metal makes the co To disable the Metal build at compile time use the `CMAKE_ARGS="-DGGML_METAL=OFF"` cmake option. +When built with Metal support, you can explicitly disable GPU inference with the `n-gpu-layers=0` parameter. + ```bash pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` @@ -277,6 +279,7 @@ pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python It is also possible to install a pre-built wheel with Metal support. As long as your system meets some requirements: +- CPU Arch: arm64 - MacOS Version is 11.0 or later - Python Version is 3.10, 3.11, 3.12, 3.13 or 3.14 @@ -415,46 +418,83 @@ CMAKE_ARGS="-DGGML_RPC=on" pip install "llama-cpp-python @ git+https://github.co
-### Windows Notes - +### Install Notes
-Error: Can't find 'nmake' or 'CMAKE_C_COMPILER' + Optimization Options (Optional) -If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install: +> **💡 Tip:** To save compilation time, you can skip building llama.cpp's standalone examples, tools, tests, and server by adding the following flags; none of them are required for the Python bindings: -```ps -$env:CMAKE_GENERATOR = "MinGW Makefiles" -$env:CMAKE_ARGS = "-DGGML_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe" +```bash +-DLLAMA_BUILD_EXAMPLES=OFF \ +-DLLAMA_BUILD_TOOLS=OFF \ +-DLLAMA_BUILD_TESTS=OFF \ +-DLLAMA_BUILD_SERVER=OFF ``` - -See the above instructions and set `CMAKE_ARGS` to the BLAS backend you want to use.
-### MacOS Notes +
+ CUDA compiler warning suppression is optional +CUDA nvcc compiler may print many template-related warnings from ggml-cuda, such as: -Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/) +```bash +warning #177-D +warning #221-D +warning #550-D +``` -
-M1 Mac Performance Issue +These are usually just noisy diagnostics rather than build blockers, but they flood the logs and waste CPU time on printing. -Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: +For cleaner CI/local logs, you can pass: ```bash -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh +-DCMAKE_CUDA_FLAGS="--diag-suppress=177 --diag-suppress=221 --diag-suppress=550" ``` - -Otherwise, while installing it will build the llama.cpp x86 version which will be 10x slower on Apple Silicon (M1) Mac.
-M Series Mac Error: `(mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))` + Notes for `GGML_BACKEND_DL` + `GGML_CPU_ALL_VARIANTS` builds +When building wheels with `GGML_BACKEND_DL=ON` and `GGML_CPU_ALL_VARIANTS=ON`, +GGML CPU backends are built as separate dynamic libraries, such as: -Try installing with +```text +ggml-cpu-x64.dll +ggml-cpu-haswell.dll +ggml-cpu-alderlake.dll +ggml-cpu-zen4.dll +``` +These backend libraries must be packaged together under: -```bash -CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +```text +site-packages/llama_cpp/lib ``` + +The runtime must also explicitly load them with: + +```text +ggml_backend_load_all_from_path() +``` + +### Windows notes + +For full x64 CPU variant coverage, `LLVM/Clang` is recommended. `MSVC` may skip some variants such as `zen4`, `cooperlake`, or `sapphirerapids`. + +If `GGML_OPENMP=ON` is used, the LLVM OpenMP runtime must also be packaged next to the backend DLLs: + +```text +libomp140.x86_64.dll +``` + +Without this file, `ggml-cpu-*.dll` may fail to load dynamically at runtime. + +### Wheel packaging checklist + +* Enable `GGML_BACKEND_DL=ON` +* Enable `GGML_CPU_ALL_VARIANTS=ON` +* Use `GGML_NATIVE=OFF` for portable wheels +* Install all `ggml-cpu-*` backend libraries into `llama_cpp/lib` +* Package required runtime dependencies such as `libomp140.x86_64.dll` +* Remove development-only files such as `.lib`, `cmake/`, and `pkgconfig/` +
### Upgrading and Reinstalling diff --git a/docs/install/macos.md b/docs/install/macos.md deleted file mode 100644 index e006fc0a3c..0000000000 --- a/docs/install/macos.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: MacOS Install with Metal GPU ---- - -**(1) Make sure you have xcode installed... at least the command line parts** -``` -# check the path of your xcode install -xcode-select -p - -# xcode installed returns -# /Applications/Xcode-beta.app/Contents/Developer - -# if xcode is missing then install it... it takes ages; -xcode-select --install -``` - -**(2) Install the conda version for MacOS that supports Metal GPU** -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` - -**(3) Make a conda environment** -``` -conda create -n llama python=3.9.16 -conda activate llama -``` - -**(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** - *(you needed xcode installed in order pip to build/compile the C++ code)* -``` -pip uninstall llama-cpp-python -y -CMAKE_ARGS="-DGGML_METAL=on" pip install -U llama-cpp-python --no-cache-dir -pip install 'llama-cpp-python[server]' - -# you should now have llama-cpp-python v0.1.62 or higher installed -llama-cpp-python         0.1.68 - -``` - -**(5) Download a v3 gguf v2 model** - - **ggufv2** - - file name ends with **Q4_0.gguf** - indicating it is 4bit quantized, with quantisation method 0 - -https://huggingface.co/TheBloke/CodeLlama-7B-GGUF - - -**(6) run the llama-cpp-python API server with MacOS Metal GPU support** -``` -# config your ggml model path -# make sure it is gguf v2 -# make sure it is q4_0 -export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]Q4_0.gguf -python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 -``` - -***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* - - From 14b98ae81802d8c89a55609ec2bf64349aac58f6 Mon Sep 17 00:00:00 2001 
From: JamePeng Date: Thu, 21 May 2026 20:33:15 +0800 Subject: [PATCH 058/139] Update Submodule vendor/llama.cpp e947228..40d5358 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e947228222..40d5358d3c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e947228222147356bc7e64154d3439e142481632 +Subproject commit 40d5358d3c730b81729ba81cd5c44ed596d02510 From b2f09bb42c0242ae9fcc8a24f0456365891b28de Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 07:14:34 +0800 Subject: [PATCH 059/139] Update Submodule vendor/llama.cpp 40d5358..1acee6b Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 40d5358d3c..1acee6bf89 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 40d5358d3c730b81729ba81cd5c44ed596d02510 +Subproject commit 1acee6bf8939948f9bcbf4b14034e4b475f06069 From 78fa55bd5f8129ebbbf11a4fd6f7fef046707b85 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 08:00:23 +0800 Subject: [PATCH 060/139] feat(speculative): upgrade ngram map decoder with k/k4v modes Enhance `LlamaNGramMapDecoding` to align with the upstream llama.cpp ngram-map algorithm, offering better memory management and draft quality. - Introduce `mode` selection ("k" and "k4v"): "k" stores only historical positions for memory efficiency, while "k4v" caches continuation values directly for faster lookups. - Add `min_hits` threshold to filter out low-confidence drafts. - Implement `max_entries_per_key` to cap dictionary growth and prevent memory bloat during long-context generations. - Improve state synchronization (`_sync_and_index`) using `sync_check_tokens` to safely verify incremental history appends. 
- Add explicit lifecycle management methods (`clear`, `close`, `accept`) for better API symmetry and resource cleanup. Signed-off-by: JamePeng --- llama_cpp/llama_speculative.py | 313 ++++++++++++++++++++++++++------- 1 file changed, 252 insertions(+), 61 deletions(-) diff --git a/llama_cpp/llama_speculative.py b/llama_cpp/llama_speculative.py index c3814aaf42..c4289d0797 100644 --- a/llama_cpp/llama_speculative.py +++ b/llama_cpp/llama_speculative.py @@ -1,7 +1,7 @@ import abc import collections -from typing import Any, Dict, List, Tuple +from typing import Any, DefaultDict, Dict, List, Literal, Optional, Tuple import numpy as np import numpy.typing as npt @@ -17,102 +17,293 @@ def __call__( class LlamaNGramMapDecoding(LlamaDraftModel): """ - Ultra-fast speculative decoder based on hash inverted index and incremental updates. - O(1) time complexity, aligned with llama.cpp's underlying ngram-map algorithm. + Fast model-free speculative decoder based on prompt n-gram lookup. + + It supports two modes: + + - "k": + Key-only mode. Stores n-gram key -> history positions. + This is memory-efficient and similar to llama.cpp's ngram-map-k behavior. + + - "k4v": + Key-to-value mode. Stores n-gram key -> continuation tokens. + This uses more memory, but can return cached continuations directly. + + This class does not use a draft model. It only speculates from already verified + token history. Therefore, rejected tokens are handled naturally when the next + `input_ids` is passed in. + + Aligned with llama.cpp's underlying ngram-map k/k4v algorithm. """ - def __init__(self, ngram_size: int = 3, num_pred_tokens: int = 10): + def __init__( + self, + ngram_size: int = 3, + num_pred_tokens: int = 10, + mode: Literal["k", "k4v"] = "k", + min_hits: int = 2, + max_entries_per_key: Optional[int] = None, + sync_check_tokens: int = 16, + ) -> None: """ - Initializes the N-Gram Map speculative decoder. - Args: - ngram_size (int): The length of the token sequence used as the search key. 
- Larger values provide strictly accurate context matching but may result - in fewer cache hits. Defaults to 3. - num_pred_tokens (int): The maximum number of future tokens to draft (predict) - and return once a match is found in the history. Defaults to 10. + ngram_size: + Number of tokens used as the lookup key. + + num_pred_tokens: + Maximum number of draft tokens to return. + + mode: + "k" stores only matched positions. + "k4v" stores matched continuation values directly. + + min_hits: + Minimum number of historical matches required before returning a draft. + Use 1 for maximum recall. Use >1 to reduce low-confidence drafts. + + max_entries_per_key: + Optional memory cap per n-gram key. + When set, only the most recent entries are kept. + For k4v mode, setting max_entries_per_key is strongly recommended. + + sync_check_tokens: + Number of trailing tokens used to verify whether the new input is an + incremental append of the previous input. This avoids expensive full + prefix comparison while still detecting most rollback/prompt-switch cases. 
""" - self.ngram_size = ngram_size - self.num_pred_tokens = num_pred_tokens + if ngram_size <= 0: + raise ValueError("ngram_size must be greater than 0") + if num_pred_tokens <= 0: + raise ValueError("num_pred_tokens must be greater than 0") + if min_hits <= 0: + raise ValueError("min_hits must be greater than 0") + if max_entries_per_key is not None and max_entries_per_key <= 0: + raise ValueError("max_entries_per_key must be None or greater than 0") + if sync_check_tokens <= 0: + raise ValueError("sync_check_tokens must be greater than 0") + + mode = mode.lower() + if mode not in ("k", "k4v"): + raise ValueError("mode must be either 'k' or 'k4v'") + + self.ngram_size = int(ngram_size) + self.num_pred_tokens = int(num_pred_tokens) + self.mode = mode + self.min_hits = int(min_hits) + self.sync_check_tokens = int(sync_check_tokens) + + if mode == "k4v" and max_entries_per_key is None: + max_entries_per_key = 8 + self.max_entries_per_key = max_entries_per_key - # Core state cache - # Mapping format: (token_1, ..., token_N) -> [index_1, index_2, ...] - self._ngram_map: Dict[Tuple[int, ...], List[int]] = collections.defaultdict(list) self._history: List[int] = [] - def _update_cache(self, input_ids: npt.NDArray[np.intc]) -> None: + # In "k" mode: + # key -> [position, position, ...] + self._map_k: DefaultDict[Tuple[int, ...], List[int]] = collections.defaultdict(list) + + # In "k4v" mode: + # key -> {position: continuation} + # + # A dict is used so that recent entries can be refreshed when more continuation + # tokens become available. + self._map_k4v: DefaultDict[ + Tuple[int, ...], Dict[int, Tuple[int, ...]] + ] = collections.defaultdict(dict) + + self._closed = False + self._last_draft_len = 0 + + def clear(self) -> None: """ - Smart state synchronization and incremental build (Extreme O(1) optimization). + Clear token history and indexes. - Args: - input_ids (npt.NDArray[np.intc]): The complete sequence of current token IDs - generated or processed so far. 
+ Use this when starting a completely unrelated generation while keeping the + decoder instance reusable. + """ + self._history.clear() + self._map_k.clear() + self._map_k4v.clear() + self._last_draft_len = 0 + + def close(self) -> None: + """ + Release internal memory. + + This class does not own native memory, but clearing large Python containers + explicitly is still useful for long-running applications. + """ + self.clear() + self._closed = True + + def __del__(self) -> None: + # Best-effort cleanup. Program correctness must not depend on __del__. + try: + self.close() + except Exception: + pass + + def accept(self, n_accepted: int) -> None: """ - new_len = len(input_ids) + Notify how many draft tokens were accepted by the target model. + + This implementation does not need to update internal state here, because the + next call receives the verified token history through `input_ids`. + + The method is kept for API symmetry and future extensions, such as acceptance + statistics, adaptive reset, or low-acceptance fallback. + """ + return + + def _sync_and_index(self, input_ids: npt.NDArray[np.intc]) -> None: + """ + Synchronize internal history with input_ids and update the n-gram index. + + The index intentionally stores only n-grams that have at least one continuation + token. This prevents the current tail n-gram from matching itself and returning + an empty draft. + """ + if self._closed: + raise RuntimeError("LlamaNGramMapDecoding is closed") + + tokens = np.asarray(input_ids, dtype=np.intc).reshape(-1).tolist() + old_len = len(self._history) + new_len = len(tokens) + + if new_len == 0: + self.clear() + return + + # Fast path: identical input, no update needed. + if new_len == old_len: + if self._history == tokens: + return + + # Incremental append path. 
+ is_append = False + if old_len > 0 and new_len > old_len: + check_len = min(old_len, max(self.ngram_size, self.sync_check_tokens)) + is_append = self._history[old_len - check_len : old_len] == tokens[ + old_len - check_len : old_len + ] + + if is_append: + # Append only new tokens. + self._history.extend(tokens[old_len:]) + + if self.mode == "k": + # Only newly-valid keys need to be added. + start = max(0, old_len - self.ngram_size) + else: + # K4V must also refresh recent keys because their continuation values + # can grow as new tokens are appended. + start = max(0, old_len - self.ngram_size - self.num_pred_tokens + 1) + else: + # Rollback, prompt switch, truncation, or unsafe mutation. + self.clear() + self._history.extend(tokens) + start = 0 + + # Only index keys that have at least one token after the key. + # Valid pos satisfies: + # pos + ngram_size < len(history) + end = max(0, len(self._history) - self.ngram_size) + + if start >= end: + return + + if self.mode == "k": + for pos in range(start, end): + key = tuple(self._history[pos : pos + self.ngram_size]) + bucket = self._map_k[key] + + if not bucket or bucket[-1] != pos: + bucket.append(pos) + + if ( + self.max_entries_per_key is not None + and len(bucket) > self.max_entries_per_key + ): + del bucket[: len(bucket) - self.max_entries_per_key] - # Check if it's a perfect incremental append (verify if the previous token matches) - is_incremental = False - if new_len > old_len and old_len > 0: - if self._history[-1] == input_ids[old_len - 1]: - is_incremental = True - - if is_incremental: - # Only extract, convert, and append new tokens. - # Never copy or touch the entire historical array! - new_tokens = input_ids[old_len:].tolist() - self._history.extend(new_tokens) - start_idx = max(0, old_len - self.ngram_size) else: - # Rollback occurred (wrong prediction) or a completely new Prompt. Trigger full rebuild. 
- self._ngram_map.clear() - self._history = input_ids.tolist() - start_idx = 0 + for pos in range(start, end): + key_start = pos + value_start = pos + self.ngram_size + value_end = min(value_start + self.num_pred_tokens, len(self._history)) + + if value_start >= value_end: + continue + + key = tuple(self._history[key_start:value_start]) + value = tuple(self._history[value_start:value_end]) - # Build/update the hash inverted index - for i in range(start_idx, new_len - self.ngram_size): - key = tuple(self._history[i : i + self.ngram_size]) - self._ngram_map[key].append(i) + bucket = self._map_k4v[key] + bucket[pos] = value + + if ( + self.max_entries_per_key is not None + and len(bucket) > self.max_entries_per_key + ): + # Keep the most recent positions. + for old_pos in sorted(bucket)[: len(bucket) - self.max_entries_per_key]: + del bucket[old_pos] def __call__( self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any ) -> npt.NDArray[np.intc]: """ - Generates draft tokens based on historical N-Gram frequency. + Generate draft tokens from verified token history. Args: - input_ids (npt.NDArray[np.intc]): The current sequence of token IDs. - **kwargs: Additional generation arguments (ignored in this implementation). + input_ids: + Complete verified token sequence so far. Returns: - npt.NDArray[np.intc]: An array of predicted draft tokens. Returns an empty - array if no matching context is found. + np.ndarray[np.intc]: + Predicted draft tokens. Empty array means no reliable match was found. """ - # 1. Ultra-fast state synchronization - self._update_cache(input_ids) + _ = kwargs + + self._sync_and_index(input_ids) + self._last_draft_len = 0 - # 2. Cannot speculate if the history is too short if len(self._history) < self.ngram_size: return np.array([], dtype=np.intc) - # 3. Extract the Search Key (the last N tokens) - search_key = tuple(self._history[-self.ngram_size:]) + search_key = tuple(self._history[-self.ngram_size :]) - # 4. 
O(1) instant lookup - match_indices = self._ngram_map.get(search_key) + if self.mode == "k": + positions = self._map_k.get(search_key) + if not positions or len(positions) < self.min_hits: + return np.array([], dtype=np.intc) - if not match_indices: - return np.array([], dtype=np.intc) + # Use the latest valid match with an available continuation. + draft: List[int] = [] + for pos in reversed(positions): + start = pos + self.ngram_size + if start < len(self._history): + end = min(start + self.num_pred_tokens, len(self._history)) + draft = self._history[start:end] + break + + else: + values = self._map_k4v.get(search_key) + if not values or len(values) < self.min_hits: + return np.array([], dtype=np.intc) - # 5. Get the context of the last match and extract draft tokens - best_match_idx = match_indices[-1] - draft_start = best_match_idx + self.ngram_size - draft_end = min(draft_start + self.num_pred_tokens, len(self._history)) + # Use the continuation from the latest historical position. + latest_pos = max(values) + draft = list(values[latest_pos]) - return np.array(self._history[draft_start:draft_end], dtype=np.intc) + self._last_draft_len = len(draft) + return np.asarray(draft, dtype=np.intc) # Legacy Numpy sliding window implementation +# Fast in some cases, but may degrade output quality. +# Not recommended for production. class LlamaPromptLookupDecoding(LlamaDraftModel): """ Stateless speculative decoding based on Numpy sliding window From 91627a0c6b713858ce5a102253d9c694e36c511b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 08:11:26 +0800 Subject: [PATCH 061/139] examples: add benchmark script for speculative decoding - Add `benchmark_speculative.py` to the `examples/benchmark` directory. - Test `LlamaPromptLookupDecoding` and `LlamaNGramMapDecoding` (k/k4v). - Include diverse test scenarios (code, JSON logs, tables, essays) to measure tokens-per-second (TPS) speedup compared to baseline generation. 
Signed-off-by: JamePeng --- examples/benchmark/benchmark_speculative.py | 466 ++++++++++++++++++++ 1 file changed, 466 insertions(+) create mode 100644 examples/benchmark/benchmark_speculative.py diff --git a/examples/benchmark/benchmark_speculative.py b/examples/benchmark/benchmark_speculative.py new file mode 100644 index 0000000000..73e7c203a2 --- /dev/null +++ b/examples/benchmark/benchmark_speculative.py @@ -0,0 +1,466 @@ +import csv +import gc +import random +import statistics +import time +from dataclasses import dataclass +from typing import Callable, Dict, List, Optional + +from llama_cpp import Llama +from llama_cpp.llama_speculative import ( + LlamaPromptLookupDecoding, + LlamaNGramMapDecoding, +) + + +# ============================================================ +# Model Configuration +# ============================================================ + +MODEL_PATH = r"/path/to/your/model.GGUF" + +N_CTX = 4096 +MAX_TOKENS = 1024 +REPEATS = 2 +CSV_OUTPUT = "speculative_benchmark_results.csv" + +RANDOMIZE_ENGINE_ORDER = False + + +# ============================================================ +# Benchmark Scenario Definition +# ============================================================ + +@dataclass(frozen=True) +class Scenario: + name: str + category: str + prompt: str + expected_behavior: str + + +TEST_SCENARIOS: List[Scenario] = [ + Scenario( + name="A1. Medium-High Repetition - CRUD Boilerplate Code", + category="code_boilerplate", + expected_behavior="Should benefit from n-gram lookup because class and method structures repeat.", + prompt="""<|im_start|>system +You are a senior backend developer. Write highly structured and consistent boilerplate code.<|im_end|> +<|im_start|>user +Write a Python script using `sqlite3` to define CRUD operations for a core banking system database. 
+ +Create 6 separate classes: +- Account +- Transaction +- Customer +- Loan +- Portfolio +- AuditLog + +Each class MUST use the same internal method structure: +- create +- get +- update +- delete +- list_all + +Do not add extra explanations. Output only code.<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="A2. Extreme Repetition - JSONL Trading Logs", + category="structured_logs", + expected_behavior="Should strongly favor n-gram methods, especially K/K4V.", + prompt="""<|im_start|>system +You are a deterministic data generation script. Output only raw JSON lines.<|im_end|> +<|im_start|>user +Continue this algorithmic trading execution log for 40 more lines. +Only change timestamp seconds, symbol, quantity, price, and execution_time_ms. + +{"timestamp":"2026-05-23T09:30:01Z","level":"INFO","module":"exec_engine","event":"trade_filled","symbol":"AAPL","side":"BUY","quantity":100,"price":175.50,"execution_time_ms":12} +{"timestamp":"2026-05-23T09:30:02Z","level":"INFO","module":"exec_engine","event":"trade_filled","symbol":"MSFT","side":"SELL","quantity":50,"price":410.25,"execution_time_ms":15} +{"timestamp":"2026-05-23T09:30:03Z","level":"INFO","module":"exec_engine","event":"trade_filled","symbol":"TSLA","side":"BUY","quantity":200,"price":180.10,"execution_time_ms":11}<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="A3. Markdown Table - Repetitive Course Catalog", + category="markdown_table", + expected_behavior="Repeated table columns and row structure should benefit from speculative lookup.", + prompt="""<|im_start|>system +You generate clean Markdown tables with consistent formatting.<|im_end|> +<|im_start|>user +Create a Markdown comparison table for 30 university postgraduate courses. + +Columns: +| Course ID | Course Title | Department | Credits | Prerequisites | Grading Basis | Core Objective | + +The row format must stay consistent. +Use concise but realistic academic descriptions. 
+Do not add explanation outside the table.<|im_end|> +<|im_start|>assistant +| Course ID | Course Title | Department | Credits | Prerequisites | Grading Basis | Core Objective | +|---:|---|---|---:|---|---|---| +""", + ), + Scenario( + name="A4. Structured Financial Market Report", + category="structured_report", + expected_behavior="Heading and bullet patterns repeat; n-gram lookup should help moderately.", + prompt="""<|im_start|>system +You are a quantitative macroeconomic analyst. Output structured, clear, and professional financial reports.<|im_end|> +<|im_start|>user +Write a Q3 Macroeconomic & Equity Strategy Outlook Report for institutional investors. + +Requirements: +1. Divide the report into exactly 8 sections. +2. Each section MUST contain exactly one heading and 3 bullet points. +3. Repeatedly emphasize the following themes across the sections: interest rate trajectory, inflation stickiness, equity market volatility, supply chain realignment, and fixed-income duration strategies. +4. Keep the tone highly professional and analytical.<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="B1. Low Repetition - Macroeconomic Historical Essay", + category="low_repetition_creative", + expected_behavior="Should show limited or no speedup; useful as a negative control.", + prompt="""<|im_start|>system +You are an academic historian of economics. Write with varied sentence structures, rich vocabulary, and analytical depth.<|im_end|> +<|im_start|>user +Write a comprehensive essay exploring the psychological and sociological impacts of hyperinflation on institutional trust during the Weimar Republic in the 1920s. + +Requirements: +- Use highly academic and varied language. +- Do NOT use repetitive paragraph structures. +- Do NOT use bullet points or lists. +- Avoid parallel phrasing; favor complex, flowing narrative analysis. 
+- Make it a long, continuous essay.<|im_end|> +<|im_start|>assistant +The catastrophic devaluation of the Papiermark in the early 1920s fundamentally fractured the psychological bedrock of the Weimar Republic. """, + ), + Scenario( + name="B2. Reasoning-Like Explanation - Quantitative Finance", + category="reasoning_explanation", + expected_behavior="May show smaller speedup because content is less template-like.", + prompt="""<|im_start|>system +You are a careful technical explainer. Avoid repetitive phrasing.<|im_end|> +<|im_start|>user +Explain the foundational assumptions and inherent limitations of the Black-Scholes option pricing model. + +Discuss the following concepts contextually: +- Log-normal distribution of asset prices +- The assumption of constant volatility and risk-free rates +- Frictionless markets (no transaction costs or taxes) +- The difference in applicability between European and American options + +Write in clear, academic paragraphs. Do not use bullet points or lists.<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="C1. Long Context Copy-Edit - High Local Reuse", + category="copy_edit", + expected_behavior="Prompt contains repeated phrases; n-gram lookup should exploit local reuse.", + prompt="""<|im_start|>system +You are a precise academic editing assistant. Preserve the structure while improving the wording.<|im_end|> +<|im_start|>user +Rewrite the following academic grant proposal abstract in a cleaner professional style. +Keep the same repetitive sentence layout but fix the grammar and flow. + +Draft Proposal: +The proposed research will investigate the efficiency of machine learning in high-frequency trading. +The proposed research will demonstrate the risk vectors of automated market making. +The methodology will utilize massive historical limit order book datasets. +The methodology will require significant computational cluster resources. +The expected outcomes will provide a new framework for liquidity provisioning. 
+The expected outcomes will establish a baseline for regulatory compliance monitoring. +The budget will allocate funds for data acquisition from major exchanges. +The budget will allocate funds for two postdoctoral researchers. +The timeline will span twenty-four months of continuous data analysis. +The timeline will include three major peer-reviewed journal submissions. +The significance will address the growing instability in algorithmic flash crashes. +The significance will ensure safer automated trading environments.<|im_end|> +<|im_start|>assistant +""", + ), +] + + +# ============================================================ +# Engine Definition +# ============================================================ + +@dataclass(frozen=True) +class EngineConfig: + name: str + draft_factory: Callable[[], Optional[object]] + note: str + + +ENGINE_CONFIGS: List[EngineConfig] = [ + EngineConfig( + name="Baseline", + draft_factory=lambda: None, + note="No speculative decoding.", + ), + EngineConfig( + name="PromptLookup-Numpy-n10", + draft_factory=lambda: LlamaPromptLookupDecoding( + max_ngram_size=3, + num_pred_tokens=10, + ), + note="Legacy sliding-window prompt lookup.", + ), + EngineConfig( + name="NGramMap-K-n6", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=6, + mode="k", + min_hits=1, + ), + note="Key-only n-gram map, shorter draft.", + ), + EngineConfig( + name="NGramMap-K-n10", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=1, + ), + note="Key-only n-gram map, default draft length.", + ), + EngineConfig( + name="NGramMap-K4V-n10-cap8", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k4v", + min_hits=1, + max_entries_per_key=8, + ), + note="K4V with bounded per-key memory.", + ), + EngineConfig( + name="NGramMap-K4V-n16-cap8", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=16, + 
mode="k4v", + min_hits=1, + max_entries_per_key=8, + ), + note="Longer K4V draft; can be faster on highly repetitive outputs.", + ), + EngineConfig( + name="NGramMap-K-minhits2-n10", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=2, + ), + note="More conservative K mode.", + ), +] + + +# ============================================================ +# Measurement Helpers +# ============================================================ + +def cleanup_model(llm: Optional[Llama]) -> None: + if llm is not None: + del llm + gc.collect() + + +def create_llama(draft_model: Optional[object]) -> Llama: + return Llama( + model_path=MODEL_PATH, + n_ctx=N_CTX, + n_gpu_layers=-1, + draft_model=draft_model, + verbose=False, + ) + + +def measure_once( + scenario: Scenario, + engine: EngineConfig, + repeat_idx: int, +) -> Dict[str, object]: + draft_model = engine.draft_factory() + + print(f"\n⏳ [{scenario.name}] Engine={engine.name} | Repeat={repeat_idx + 1}") + print(f" Note: {engine.note}") + + llm: Optional[Llama] = None + + try: + llm = create_llama(draft_model) + + # Warmup: force backend initialization and first-token path. 
+ llm.create_completion( + prompt=scenario.prompt, + max_tokens=1, + temperature=0.0, + echo=False, + ) + + start = time.perf_counter() + + response = llm.create_completion( + prompt=scenario.prompt, + max_tokens=MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + repeat_penalty=1.0, + echo=False, + ) + + end = time.perf_counter() + + duration = end - start + usage = response.get("usage", {}) + completion_tokens = int(usage.get("completion_tokens", 0)) + total_tokens = int(usage.get("total_tokens", 0)) + prompt_tokens = int(usage.get("prompt_tokens", 0)) + + text = response["choices"][0]["text"] + tps = completion_tokens / duration if duration > 0 else 0.0 + + print( + f"✅ {engine.name:<28} " + f"{tps:8.2f} tok/s | " + f"time={duration:7.2f}s | " + f"gen={completion_tokens:4d} | " + f"prompt={prompt_tokens:4d}" + ) + print(f" Snippet: {text[:120].replace(chr(10), ' ')}...") + + return { + "scenario": scenario.name, + "category": scenario.category, + "expected_behavior": scenario.expected_behavior, + "engine": engine.name, + "engine_note": engine.note, + "repeat": repeat_idx + 1, + "duration_sec": duration, + "completion_tokens": completion_tokens, + "prompt_tokens": prompt_tokens, + "total_tokens": total_tokens, + "tokens_per_sec": tps, + "snippet": text[:160].replace("\n", "\\n"), + } + + finally: + if hasattr(draft_model, "close"): + draft_model.close() + cleanup_model(llm) + + +# ============================================================ +# Reporting +# ============================================================ + +def summarize_results(rows: List[Dict[str, object]]) -> None: + print("\n\n" + "=" * 90) + print("📊 Benchmark Summary") + print("=" * 90) + + by_scenario: Dict[str, List[Dict[str, object]]] = {} + for row in rows: + by_scenario.setdefault(str(row["scenario"]), []).append(row) + + for scenario_name, scenario_rows in by_scenario.items(): + print(f"\n📂 {scenario_name}") + print("-" * 90) + + grouped: Dict[str, List[float]] = {} + for row in 
scenario_rows: + grouped.setdefault(str(row["engine"]), []).append(float(row["tokens_per_sec"])) + + baseline_avg = statistics.mean(grouped.get("Baseline", [0.0])) + + print( + f"{'Engine':<32} | {'Avg tok/s':>10} | {'Best':>10} | " + f"{'Worst':>10} | {'Speedup':>8}" + ) + print("-" * 90) + + for engine_name, speeds in grouped.items(): + avg = statistics.mean(speeds) + best = max(speeds) + worst = min(speeds) + speedup = avg / baseline_avg if baseline_avg > 0 else 1.0 + + print( + f"{engine_name:<32} | " + f"{avg:10.2f} | " + f"{best:10.2f} | " + f"{worst:10.2f} | " + f"{speedup:8.2f}x" + ) + + +def save_csv(rows: List[Dict[str, object]], path: str) -> None: + if not rows: + return + + fieldnames = list(rows[0].keys()) + + with open(path, "w", newline="", encoding="utf-8-sig") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + print(f"\n💾 CSV saved to: {path}") + + +# ============================================================ +# Main Benchmark Flow +# ============================================================ + +def run_benchmark() -> None: + print("=" * 90) + print("🏆 llama-cpp-python Speculative Decoding Benchmark") + print("=" * 90) + print(f"Model: {MODEL_PATH}") + print(f"n_ctx={N_CTX}, max_tokens={MAX_TOKENS}, repeats={REPEATS}") + print("=" * 90) + + rows: List[Dict[str, object]] = [] + + for scenario in TEST_SCENARIOS: + print("\n\n" + "#" * 90) + print(f"📂 Scenario: {scenario.name}") + print(f"📌 Category: {scenario.category}") + print(f"🧠 Expected: {scenario.expected_behavior}") + print("#" * 90) + + engines = list(ENGINE_CONFIGS) + if RANDOMIZE_ENGINE_ORDER: + baseline = [e for e in engines if e.name == "Baseline"] + others = [e for e in engines if e.name != "Baseline"] + random.shuffle(others) + engines = baseline + others + + for engine in engines: + for repeat_idx in range(REPEATS): + row = measure_once( + scenario=scenario, + engine=engine, + repeat_idx=repeat_idx, + ) + rows.append(row) 
+ + summarize_results(rows) + save_csv(rows, CSV_OUTPUT) + + +if __name__ == "__main__": + run_benchmark() \ No newline at end of file From 969f5be484ab9f9d602fd73c129906f4ca2ed63e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 08:31:19 +0800 Subject: [PATCH 062/139] docs(speculative): update wiki for NGramMap k/k4v modes and lifecycle APIs Reflect the recent architectural upgrades to `LlamaNGramMapDecoding` in the official documentation. - Document the new `__init__` parameters (`mode`, `min_hits`, `max_entries_per_key`, `sync_check_tokens`) and their validation rules. - Add a detailed comparison table explaining the memory and behavior differences between the `"k"` and `"k4v"` lookup modes. - Document the newly exposed lifecycle methods (`clear`, `close`, `accept`). - Add comprehensive usage examples demonstrating `k4v` mode with memory caps. - Update internal state descriptions (replacing `_ngram_map` with `_map_k` and `_map_k4v`). - Add a strong production warning against the legacy `LlamaPromptLookupDecoding` and cross-link the new `benchmark_speculative.py` script. Signed-off-by: JamePeng --- docs/wiki/modules/LlamaSpeculative.md | 343 +++++++++++++++++++------- 1 file changed, 260 insertions(+), 83 deletions(-) diff --git a/docs/wiki/modules/LlamaSpeculative.md b/docs/wiki/modules/LlamaSpeculative.md index 0c0ad099fb..9255d01496 100644 --- a/docs/wiki/modules/LlamaSpeculative.md +++ b/docs/wiki/modules/LlamaSpeculative.md @@ -2,7 +2,7 @@ title: Llama Speculative Decoding module_name: llama_cpp.llama_speculative source_file: llama_cpp/llama_speculative.py -last_updated: 2026-05-02 +last_updated: 2026-05-23 version_target: "latest" --- @@ -10,30 +10,37 @@ version_target: "latest" ## Overview -`llama_speculative.py` provides draft model interfaces and prompt-based speculative decoding helpers for `llama-cpp-python`. +`llama_speculative.py` defines draft-model interfaces and prompt-based speculative decoding helpers for `llama-cpp-python`. 
-Speculative decoding uses a lightweight draft model to propose candidate tokens before the main model verifies them. In this module, the draft model does not need to be a neural model. It can also be a prompt lookup decoder that predicts future tokens by finding repeated token patterns in the existing context. +Speculative decoding lets a draft model propose candidate tokens before the main `Llama` model verifies them. In this module, the draft model does not have to be a neural network. It can also be a model-free prompt lookup decoder that predicts future tokens from repeated token patterns in the already verified context. This module currently defines: | Class | Status | Description | |---|---|---| -| `LlamaDraftModel` | public interface | Abstract base class for draft models used by speculative decoding. | -| `LlamaNGramMapDecoding` | public | Fast stateful n-gram map based speculative decoder. | +| `LlamaDraftModel` | public interface | Abstract base class for speculative draft models. | +| `LlamaNGramMapDecoding` | public | Stateful model-free n-gram lookup decoder with `k` and `k4v` modes. | | `LlamaPromptLookupDecoding` | legacy public | Stateless NumPy sliding-window prompt lookup decoder. | ## Role in the Library This module defines the draft-model side of speculative decoding. -A draft model receives the current token sequence and returns predicted draft tokens. These draft tokens can then be verified by the main `Llama` model during generation. +A draft model receives the verified token sequence so far and returns predicted draft token IDs. These tokens are later verified by the main `Llama` model during generation. The module provides two prompt-based implementations: -- `LlamaNGramMapDecoding`: optimized, stateful, hash-map based lookup. -- `LlamaPromptLookupDecoding`: older stateless NumPy sliding-window implementation. +- `LlamaNGramMapDecoding`: optimized, stateful, hash-map based n-gram lookup. 
+- `LlamaPromptLookupDecoding`: older stateless NumPy sliding-window lookup. -For new usage, prefer `LlamaNGramMapDecoding` because it incrementally maintains an n-gram index instead of scanning the full token history on every call. +For new usage, prefer `LlamaNGramMapDecoding`. It incrementally maintains an n-gram index, supports memory-oriented lookup modes, and avoids scanning the full token history on every call. + +## Choosing Between Related APIs + +| API | Recommended Use | Notes | +|---|---|---| +| `LlamaNGramMapDecoding` | Default prompt lookup decoder for new usage. | Uses stateful n-gram maps and supports `k` / `k4v` modes. | +| `LlamaPromptLookupDecoding` | Compatibility with older prompt lookup behavior. | Stateless and simple, but scans token history with NumPy sliding windows. | ## Classes @@ -41,7 +48,7 @@ For new usage, prefer `LlamaNGramMapDecoding` because it incrementally maintains ```python class LlamaDraftModel(abc.ABC) -```` +``` Abstract base class for speculative draft models. @@ -58,15 +65,15 @@ def __call__( ) -> npt.NDArray[np.intc] ``` -| Parameter | Type | Description | -| ----------- | ---------------------- | ----------------------------------------------------------------- | -| `input_ids` | `npt.NDArray[np.intc]` | Current token sequence. | -| `**kwargs` | `Any` | Additional generation arguments. Implementations may ignore them. | +| Parameter | Type | Description | +|---|---|---| +| `input_ids` | `npt.NDArray[np.intc]` | Complete verified token sequence so far. | +| `**kwargs` | `Any` | Additional generation arguments. Implementations may ignore them. | Returns: -| Type | Description | -| ---------------------- | -------------------------------------------- | +| Type | Description | +|---|---| | `npt.NDArray[np.intc]` | Draft token IDs proposed by the draft model. | ## `LlamaNGramMapDecoding` @@ -75,9 +82,11 @@ Returns: class LlamaNGramMapDecoding(LlamaDraftModel) ``` -Fast speculative decoder based on an n-gram hash map. 
+Fast model-free speculative decoder based on prompt n-gram lookup. + +This decoder maintains internal indexes from historical n-grams to either previous positions or cached continuation tokens. When called with the current verified token sequence, it searches for the final n-gram in the already verified history and returns a continuation from the most recent valid historical match. -This decoder maintains an internal inverted index from historical n-grams to their positions. When called with the current token sequence, it looks up the final n-gram in the history and returns the following tokens from the most recent matching context. +It does not own or run a separate draft model. Rejected draft tokens do not require manual rollback inside this class, because the next call receives the verified token history through `input_ids`. ### Constructor @@ -86,52 +95,207 @@ def __init__( self, ngram_size: int = 3, num_pred_tokens: int = 10, -) + mode: Literal["k", "k4v"] = "k", + min_hits: int = 2, + max_entries_per_key: Optional[int] = None, + sync_check_tokens: int = 16, +) -> None ``` -| Parameter | Type | Default | Description | -| ----------------- | ----- | ------- | ------------------------------------------------------------------------------------------------------------------------------- | -| `ngram_size` | `int` | `3` | Length of the token sequence used as the lookup key. Larger values require stricter context matches but may produce fewer hits. | -| `num_pred_tokens` | `int` | `10` | Maximum number of draft tokens to return after a matching n-gram is found. | +| Parameter | Type | Default | Source | Description | +|---|---|---|---|---| +| `ngram_size` | `int` | `3` | `__init__` signature | Number of tokens used as the lookup key. Larger values require stricter matches and may reduce hit rate. | +| `num_pred_tokens` | `int` | `10` | `__init__` signature | Maximum number of draft tokens to return. 
| +| `mode` | `Literal["k", "k4v"]` | `"k"` | `__init__` signature | Lookup storage mode. `"k"` stores key-to-position mappings. `"k4v"` stores key-to-continuation mappings. | +| `min_hits` | `int` | `2` | `__init__` signature | Minimum number of historical matches required before returning a draft. Use `1` for maximum recall; use values greater than `1` to reduce low-confidence drafts. | +| `max_entries_per_key` | `Optional[int]` | `None` | `__init__` signature and initialization logic | Optional memory cap per n-gram key. If `mode="k4v"` and this is `None`, it is automatically set to `8`. Hits are counted on the capped bucket, so a cap smaller than `min_hits` means no draft is ever returned. | +| `sync_check_tokens` | `int` | `16` | `__init__` signature | Number of trailing tokens compared to detect whether new input is an incremental append without doing a full prefix comparison. The effective window is at least `ngram_size` tokens and never longer than the stored history. | + +### Parameter Validation + +The constructor raises `ValueError` when: + +| Condition | Error Meaning | +|---|---| +| `ngram_size <= 0` | `ngram_size` must be positive. | +| `num_pred_tokens <= 0` | `num_pred_tokens` must be positive. | +| `min_hits <= 0` | `min_hits` must be positive. | +| `max_entries_per_key is not None and max_entries_per_key <= 0` | The memory cap must be `None` or positive. | +| `sync_check_tokens <= 0` | `sync_check_tokens` must be positive. | +| `mode` is not `"k"` or `"k4v"` after lowercasing | Only the two supported lookup modes are valid. | + +### Lookup Modes + +| Mode | Internal Storage | Memory Use | Behavior | +|---|---|---|---| +| `"k"` | `key -> [position, position, ...]` | Lower | Stores historical positions and slices continuations from `_history` during lookup. | +| `"k4v"` | `key -> {position: continuation}` | Higher | Stores continuation tokens directly and returns the latest cached continuation. | + +Use `"k"` as the general-purpose default. Use `"k4v"` when faster continuation retrieval is preferred and the extra memory use is acceptable. For `"k4v"`, `max_entries_per_key` defaults to `8` when not specified.
### Important Attributes / State -| Attribute | Type | Source | Description | -| ----------------- | ---------------------------------- | -------------- | -------------------------------------------------------------------------------- | -| `ngram_size` | `int` | constructor | Number of tokens used as the n-gram lookup key. | -| `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | -| `_ngram_map` | `Dict[Tuple[int, ...], List[int]]` | internal cache | Internal inverted index mapping n-gram tuples to positions in the token history. | -| `_history` | `List[int]` | internal cache | Internal token history used to maintain the n-gram map. | +| Attribute | Type | Source | Description | +|---|---|---|---| +| `ngram_size` | `int` | constructor | Number of tokens used as the n-gram lookup key. | +| `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | +| `mode` | `str` | constructor | Active lookup mode: `"k"` or `"k4v"`. | +| `min_hits` | `int` | constructor | Required number of historical matches before returning a draft. | +| `max_entries_per_key` | `Optional[int]` | constructor / initialization logic | Optional per-key memory cap. Automatically becomes `8` for `k4v` mode when not provided. | +| `sync_check_tokens` | `int` | constructor | Trailing-token window used for incremental append detection. | +| `_history` | `List[int]` | internal state | Verified token history mirrored from `input_ids`. | +| `_map_k` | `DefaultDict[Tuple[int, ...], List[int]]` | internal state | Key-to-position index used in `"k"` mode. | +| `_map_k4v` | `DefaultDict[Tuple[int, ...], Dict[int, Tuple[int, ...]]]` | internal state | Key-to-continuation index used in `"k4v"` mode. | +| `_closed` | `bool` | internal state | Marks the decoder as closed. Calling the decoder after `close()` raises `RuntimeError`. | +| `_last_draft_len` | `int` | internal state | Length of the most recent returned draft. 
Currently internal diagnostic state. | + +Internal state should not be mutated directly. + +### Core Methods + +#### `__call__` + +```python +def __call__( + self, + input_ids: npt.NDArray[np.intc], + /, + **kwargs: Any, +) -> npt.NDArray[np.intc] +``` + +Generates draft tokens from verified token history. + +| Parameter | Type | Description | +|---|---|---| +| `input_ids` | `npt.NDArray[np.intc]` | Complete verified token sequence so far. | +| `**kwargs` | `Any` | Accepted for interface compatibility and ignored by this implementation. | + +Returns: + +| Type | Description | +|---|---| +| `npt.NDArray[np.intc]` | Predicted draft tokens. Returns an empty array when no reliable match is found. | + +Raises: + +| Exception | Condition | +|---|---| +| `RuntimeError` | The decoder has been closed with `close()` and is called again. | + +#### `clear` + +```python +def clear(self) -> None +``` + +Clears token history and internal indexes while keeping the decoder reusable. + +Use this when starting a completely unrelated generation with the same decoder instance. + +#### `close` + +```python +def close(self) -> None +``` + +Clears internal containers and marks the decoder as closed. + +This class does not own native memory, but explicit cleanup can be useful in long-running applications that may otherwise keep large Python containers alive. + +#### `accept` + +```python +def accept(self, n_accepted: int) -> None +``` + +Compatibility hook for speculative decoding loops. -`_ngram_map` and `_history` are internal state and should not be modified directly. +This implementation is intentionally a no-op. Accepted tokens are reflected by the next `input_ids` passed to `__call__`, so no separate rollback or acceptance state update is required. ### Behavior When called, `LlamaNGramMapDecoding`: -1. Synchronizes its internal history with the provided `input_ids`. -2. Incrementally updates the n-gram map when tokens are appended. -3. 
Rebuilds the map if the input sequence is no longer a simple continuation, such as after rollback or a new prompt. -4. Uses the last `ngram_size` tokens as the search key. -5. Returns up to `num_pred_tokens` tokens following the most recent historical match. -6. Returns an empty NumPy array if no match is found. +1. Converts `input_ids` to a flat `np.intc` token list. +2. Synchronizes internal history with the verified token sequence. +3. Uses a fast path when the new input is identical to the stored history. +4. Uses an incremental append path when the trailing tokens indicate that the new input extends the previous input. +5. Rebuilds the index after rollback, prompt switch, truncation, or unsafe mutation. +6. Indexes only n-grams with at least one available continuation token, so the current tail n-gram does not match itself. +7. Looks up the final `ngram_size` tokens as the search key. +8. Requires at least `min_hits` historical matches before returning a draft. +9. Returns up to `num_pred_tokens` tokens from the latest valid historical match. +10. Returns an empty NumPy array if no reliable match is available. -### Example +### Example: Direct Prompt Lookup + +Use `min_hits=1` in a small standalone example so that one historical match is enough to return a draft. 
```python import numpy as np + from llama_cpp.llama_speculative import LlamaNGramMapDecoding draft_model = LlamaNGramMapDecoding( ngram_size=3, - num_pred_tokens=5, + num_pred_tokens=2, + min_hits=1, ) -input_ids = np.array([1, 2, 3, 4, 1, 2, 3], dtype=np.intc) - +input_ids = np.array([1, 2, 3, 4, 5, 1, 2, 3], dtype=np.intc) draft_tokens = draft_model(input_ids) print(draft_tokens) +# Expected output: +# [4 5] +``` + +### Example: Use with `Llama` + +```python +from llama_cpp import Llama +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +llm = Llama( + model_path="path/to/model.gguf", + n_ctx=4096, + n_gpu_layers=-1, + draft_model=LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=2, + ), +) + +response = llm.create_chat_completion( + messages=[ + { + "role": "user", + "content": ( + "Write five short Python classes with the same CRUD method layout: " + "User, Product, Order, Review, and Category." + ), + } + ] +) + +print(response["choices"][0]["message"]["content"]) +``` + +### Example: Use `k4v` Mode with a Memory Cap + +```python +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +draft_model = LlamaNGramMapDecoding( + ngram_size=4, + num_pred_tokens=8, + mode="k4v", + min_hits=2, + max_entries_per_key=8, +) ``` ## `LlamaPromptLookupDecoding` @@ -144,7 +308,7 @@ Legacy speculative decoder based on NumPy sliding-window lookup. This implementation is stateless. Each call scans the input token sequence to find previous occurrences of the current n-gram and returns the following tokens as draft predictions. -> Warning: This implementation may have high computational overhead for long contexts. Prefer `LlamaNGramMapDecoding` for new usage. +> Warning: This implementation is not recommended for production. It may have high computational overhead for long contexts and may degrade output quality. Prefer `LlamaNGramMapDecoding` for new usage. 
### Constructor @@ -156,16 +320,16 @@ def __init__( ) ``` -| Parameter | Type | Default | Description | -| ----------------- | ----- | ------- | -------------------------------------------------------------------------- | -| `max_ngram_size` | `int` | `3` | Maximum n-gram size to search for. The decoder tries larger n-grams first. | -| `num_pred_tokens` | `int` | `10` | Maximum number of draft tokens to return. | +| Parameter | Type | Default | Source | Description | +|---|---|---|---|---| +| `max_ngram_size` | `int` | `3` | `__init__` signature | Maximum n-gram size to search for. The decoder tries larger n-grams first. | +| `num_pred_tokens` | `int` | `10` | `__init__` signature | Maximum number of draft tokens to return. | ### Important Attributes / State -| Attribute | Type | Source | Description | -| ----------------- | ----- | ----------- | --------------------------------------------------- | -| `max_ngram_size` | `int` | constructor | Maximum n-gram window size used during lookup. | +| Attribute | Type | Source | Description | +|---|---|---|---| +| `max_ngram_size` | `int` | constructor | Maximum n-gram window size used during lookup. | | `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | ### Static Method @@ -181,58 +345,71 @@ def find_candidate_pred_tokens( Linearly scans `input_ids` using NumPy sliding windows to find matching n-grams. -| Parameter | Type | Description | -| ----------------- | ---------------------- | ----------------------------------------- | -| `input_ids` | `npt.NDArray[np.intc]` | Complete token sequence. | -| `max_ngram_size` | `int` | Maximum n-gram size to search for. | -| `num_pred_tokens` | `int` | Maximum number of draft tokens to return. | +| Parameter | Type | Description | +|---|---|---| +| `input_ids` | `npt.NDArray[np.intc]` | Complete token sequence. | +| `max_ngram_size` | `int` | Maximum n-gram size to search for. 
| +| `num_pred_tokens` | `int` | Maximum number of draft tokens to return. | Returns: -| Type | Description | -| ---------------------- | --------------------------------------------------------------- | +| Type | Description | +|---|---| | `npt.NDArray[np.intc]` | Candidate draft tokens, or an empty array if no match is found. | -### Example +### Method ```python -from llama_cpp import Llama -from llama_cpp.llama_speculative import LlamaNGramMapDecoding - -llama = Llama( - model_path="path/to/qwen-3.6-27b.gguf", - n_ctx=4096, - n_gpu_layers=-1, - draft_model=LlamaNGramMapDecoding( - ngram_size=3, - num_pred_tokens=10 - ) -) - -response = llama.create_chat_completion( - messages=[{"role": "user", "content": """ - Write a Python script using `sqlite3` to define CRUD (Create, Read, Update, Delete) operations for an e-commerce database. -You need to create 5 separate classes for the following entities: `User`, `Product`, `Order`, `Review`, and `Category`. -Each class MUST have exactly the same internal structure and method names (create, get, update, delete). Do not add extra logic, just the standard boilerplate. - """}] -) +def __call__( + self, + input_ids: npt.NDArray[np.intc], + /, + **kwargs: Any, +) -> npt.NDArray[np.intc] ``` +Calls `find_candidate_pred_tokens` with the instance's `max_ngram_size` and `num_pred_tokens`. + ## Best Practices & Common Patterns -* Prefer `LlamaNGramMapDecoding` for new usage. -* Use `LlamaPromptLookupDecoding` only when compatibility with the older stateless prompt lookup behavior is needed. -* Increase `ngram_size` or `max_ngram_size` for stricter context matching. -* Increase `num_pred_tokens` when you want longer draft proposals, but keep in mind that speculative decoding still depends on later verification by the main model. -* Do not mutate `_ngram_map` or `_history` directly. -* If input token history rolls back or changes unexpectedly, `LlamaNGramMapDecoding` automatically rebuilds its internal cache. 
+- Prefer `LlamaNGramMapDecoding` for new usage. +- Use `mode="k"` as the default memory-efficient mode. +- Use `mode="k4v"` when cached continuations are useful and the additional memory use is acceptable. +- Keep `max_entries_per_key` set for `k4v` mode unless you intentionally want an unbounded per-key cache. +- Use `min_hits=1` for maximum recall in repetitive prompts or benchmarks. +- Use `min_hits > 1` to reduce low-confidence drafts. +- Increase `ngram_size` for stricter pattern matching. +- Increase `num_pred_tokens` to allow longer draft proposals, but remember that the target model still verifies the tokens. +- Call `clear()` before reusing the same decoder for an unrelated prompt or generation session. +- Do not call the decoder again after `close()` unless you create a new instance. +- Do not mutate `_history`, `_map_k`, `_map_k4v`, or other internal state directly. + +## Limitations + +- Prompt lookup only predicts tokens that are already implied by repeated patterns in the verified context. +- It is most useful for repetitive, structured, or boilerplate-heavy output. +- It may return an empty draft when the context has too few repeated n-grams or when `min_hits` is too strict. +- It does not replace target-model verification. +- `LlamaPromptLookupDecoding` is kept for compatibility and is not recommended for production use. ## Deprecated / Changed APIs -`LlamaPromptLookupDecoding` is marked as a legacy NumPy sliding-window implementation in the source code. It is still available, but `LlamaNGramMapDecoding` is the preferred implementation for faster repeated calls over long contexts. +`LlamaPromptLookupDecoding` is the legacy NumPy sliding-window implementation. It remains available, but `LlamaNGramMapDecoding` is the preferred prompt lookup implementation for new code. 
+ +Compared with the older `LlamaNGramMapDecoding` documentation, the current implementation adds: + +- `mode` +- `min_hits` +- `max_entries_per_key` +- `sync_check_tokens` +- `clear()` +- `close()` +- `accept()` +- Separate internal indexes for `k` and `k4v` modes ## Related Links * [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] * [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +* [[Benchmark_Speculative](https://github.com/JamePeng/llama-cpp-python/blob/main/examples/benchmark/benchmark_speculative.py)] From d90895d33c7868d9c949d9c1648d33ad3ebc7f8e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 11:02:33 +0800 Subject: [PATCH 063/139] docs(readme): revamp speculative decoding documentation Expand the Speculative Decoding section to fully document the new `LlamaNGramMapDecoding` capabilities and configuration options. - Clarify that `LlamaNGramMapDecoding` is a model-free prompt lookup decoder that does not require a secondary GGUF draft model. - Add a detailed parameter table explaining `mode` (k vs. k4v), `min_hits`, memory caps, and sync thresholds. - Provide usage examples and tuning recommendations for different hardware (e.g., lowering `num_pred_tokens` for CPU setups). - Demote the older `LlamaPromptLookupDecoding` to a legacy section, warning about its sliding-window overhead on long contexts. - Add practical notes on performance and state management (`clear()`). Signed-off-by: JamePeng --- README.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 85 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 6c56c034c3..1986a6ca54 100644 --- a/README.md +++ b/README.md @@ -1592,44 +1592,116 @@ emb = llm.create_embedding("text") --- -### Speculative Decoding +## Speculative Decoding -`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model. 
+`llama-cpp-python` supports speculative decoding through a `draft_model` passed to the `Llama` class. -The fastest way to use speculative decoding is through the `LlamaNGramMapDecoding`(**Recommend**) or `LlamaPromptLookupDecoding` class. +Speculative decoding lets a draft decoder propose candidate tokens before the main model verifies them. This can improve generation speed, especially for repetitive or structured outputs such as code, JSON, boilerplate text, templates, and long-form responses with repeated patterns. -Just pass this as a draft model to the `Llama` class during initialization. +The recommended built-in draft decoder is `LlamaNGramMapDecoding`. + +Unlike neural draft-model speculative decoding, `LlamaNGramMapDecoding` does not require a second GGUF model. It is a model-free prompt n-gram lookup decoder that predicts draft tokens from already verified token history. ```python from llama_cpp import Llama from llama_cpp.llama_speculative import LlamaNGramMapDecoding llama = Llama( - model_path="path/to/qwen-3.6-27b.gguf", + model_path="path/to/model.gguf", n_ctx=4096, n_gpu_layers=-1, draft_model=LlamaNGramMapDecoding( ngram_size=3, - num_pred_tokens=10 - ) + num_pred_tokens=10, + ), ) response = llama.create_chat_completion( - messages=[{"role": "user", "content": "Write a python script..."}] + messages=[ + { + "role": "user", + "content": "Write a Python script using sqlite3 with repeated CRUD classes.", + } + ] +) +```` + +`LlamaNGramMapDecoding` maintains an internal n-gram index and can reuse repeated token patterns from the current prompt and generated context. Compared with the legacy sliding-window prompt lookup decoder, it avoids scanning the full token history on every call, making draft generation much cheaper for long contexts. 
+ +#### Advanced configuration + +```python +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +draft_model = LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=2, + max_entries_per_key=None, + sync_check_tokens=16, +) +``` + +| Parameter | Default | Description | +| --------------------- | ----------------------------------------: | ------------------------------------------------------------------------------------------------------------------------------------------------ | +| `ngram_size` | `3` | Number of tokens used as the lookup key. Larger values require stricter matches. | +| `num_pred_tokens` | `10` | Maximum number of draft tokens to propose. | +| `mode` | `"k"` | N-gram map mode. `"k"` stores key-to-position mappings. `"k4v"` stores key-to-continuation mappings. | +| `min_hits` | `2` | Minimum number of historical matches required before returning draft tokens. Use `1` for higher recall, or `2+` to reduce low-confidence drafts. | +| `max_entries_per_key` | `None` in `"k"` mode, `8` in `"k4v"` mode | Optional memory cap per n-gram key. Strongly recommended for `"k4v"` mode. | +| `sync_check_tokens` | `16` | Number of trailing tokens used to detect whether the new input is an incremental append or requires rebuilding the internal index. | + +#### Choosing a mode + +`LlamaNGramMapDecoding` supports two modes: + +* `mode="k"`: stores n-gram keys mapped to historical positions. This is the default and is usually the best starting point. +* `mode="k4v"`: stores n-gram keys mapped directly to continuation tokens. This can make continuation lookup cheaper, but uses more memory. When using `"k4v"`, keeping `max_entries_per_key` enabled is recommended. 
+ +For most users, the default configuration is enough: + +```python +draft_model=LlamaNGramMapDecoding() +``` + +For higher recall, especially when the prompt has fewer repeated patterns, you can lower `min_hits`: + +```python +draft_model=LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + min_hits=1, ) ``` -Note: `LlamaPromptLookupDecoding.num_pred_tokens` is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. Now, `LlamaNGramMapDecoding` with the new Hash Map algorithm, draft generation becomes instantaneous $O(1)$, and the time consumption is almost 0 regardless of whether you set the prediction to 2 or 10 words. -### Adjusting the Context Window +For CPU-only machines, smaller draft lengths such as `num_pred_tokens=2` may still be a better tradeoff. For GPU inference, larger values such as `num_pred_tokens=10` are often reasonable, but the best value depends on model size, prompt structure, backend, and acceptance rate. -The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements. +#### Legacy prompt lookup decoder -For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object: +`LlamaPromptLookupDecoding` is still available for compatibility: ```python -llm = Llama(model_path="./models/llama-model.gguf", n_ctx=2048) +from llama_cpp.llama_speculative import LlamaPromptLookupDecoding + +draft_model = LlamaPromptLookupDecoding( + max_ngram_size=3, + num_pred_tokens=10, +) ``` +However, it uses a legacy NumPy sliding-window lookup and may have higher overhead on long contexts. For new usage, prefer `LlamaNGramMapDecoding`. + +#### Notes + +* Speculative decoding still requires the main model to verify proposed draft tokens. 
+* Speedup depends on how many draft tokens are accepted. +* Prompt n-gram speculative decoding works best when the current context contains repeated patterns. +* It is especially useful for code generation, structured text, repeated templates, and boilerplate-heavy completions. +* `LlamaNGramMapDecoding` stores internal Python-side history and indexes. If you want to reuse the same decoder instance for an unrelated generation, call `draft_model.clear()`. + +--- + ## Docker image See here: https://github.com/JamePeng/llama-cpp-python/tree/main/docker#cuda_simple From 5364cf914b590065690eacaaf94ecb2453766a67 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 12:58:02 +0800 Subject: [PATCH 064/139] feat(LlamaContext): add safety checks and docstrings to logits retrieval - Add explicit null pointer validation to `get_logits` and `get_logits_ith`. These methods now raise a `RuntimeError` instead of silently returning invalid pointers when logits are unavailable or the index is out of bounds. - Add comprehensive docstrings to both methods, detailing the underlying buffer shape and memory layout. - Include a performance warning in `get_logits_ith` about the internal synchronization/reordering overhead to discourage its use on the hot path. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index c026440a2d..fda9187855 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -755,12 +755,36 @@ def synchronize(self): llama_cpp.llama_synchronize(self.ctx) def get_logits(self): + """ + Token logits obtained from the last call to llama_decode() + The logits for which llama_batch.logits[i] != 0 are stored contiguously + in the order they have appeared in the batch. 
+ Rows: number of tokens for which llama_batch.logits[i] != 0 + Cols: n_vocab + + Returns: + Pointer to the logits buffer of shape (n_tokens, n_vocab) + """ self._assert_ctx() - return llama_cpp.llama_get_logits(self.ctx) + logits = llama_cpp.llama_get_logits(self.ctx) + if not logits: + raise RuntimeError(f"LlamaContext.get_logits: failed to get logits") + return logits def get_logits_ith(self, i: int): + """ + Return logits for the ith output row from the last llama_decode call. + + Note: + This calls llama_get_logits_ith(), which may reorder/synchronize + the output buffer internally. Avoid calling it on the hot path unless + Python-side logits are required. + """ self._assert_ctx() - return llama_cpp.llama_get_logits_ith(self.ctx, i) + logits = llama_cpp.llama_get_logits_ith(self.ctx, i) + if not logits: + raise RuntimeError(f"LlamaContext.get_logits_ith: invalid logits index {i}") + return logits def set_embeddings(self, embeddings: bool): self._assert_ctx() From 7e0cd122d0af2f9971ebdfc40fb177366c394280 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 13:02:09 +0800 Subject: [PATCH 065/139] build(cmake): disable building of upstream unified binary Set `LLAMA_BUILD_APP` to `OFF` to prevent the compilation of the new unified `llama` binary introduced in upstream llama.cpp. Since the Python package only requires the underlying shared libraries and specific targets, explicitly disabling the standalone application reduces build times and prevents unnecessary executable artifacts from being compiled. 
Signed-off-by: JamePeng --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f6dfb7c136..6f09cdb783 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,6 +222,9 @@ if (LLAMA_BUILD) # Disable building of server set(LLAMA_BUILD_SERVER OFF CACHE BOOL "llama: build server example" FORCE) + # Disable building of unified binary + set(LLAMA_BUILD_APP OFF CACHE BOOL "llama: build the unified binary" FORCE) + # Disable build the embedded Web UI for server set(LLAMA_BUILD_UI OFF CACHE BOOL "llama: build the embedded Web UI for server" FORCE) set(LLAMA_USE_PREBUILT_UI OFF CACHE BOOL "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" FORCE) From 615e45a47f47387e741c12eeca397339fee0e74b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 13:20:06 +0800 Subject: [PATCH 066/139] perf(eval): skip unnecessary logit array copies during native sampling - Introduce the `copy_logits` parameter to `Llama.eval()` to control whether C-level logits are copied into the Python `self.scores` array. - Automatically disable `copy_logits` during the generation loop unless Python-side hooks (`logits_processor`, `stopping_criteria`) or `logits_all` explicitly require them. - Skip logit copies entirely for intermediate prompt evaluations (e.g., before hybrid checkpoints). - Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch the final token's logits when copying is required. In a PDF-reading summarization workload, this reduced the end-to-end completion time from 41.32s to 25.93s, a ~37.2% improvement. 
The main generation hot path also improved noticeably: - `_create_completion`: 41.32s -> 25.93s - `generate`: 37.82s -> below the top sampled entries - `eval`: 35.14s -> 21.96s - logits retrieval/copy path: 29.89s `get_logits()` -> 18.68s `get_logits_ith()` - `decode`: 3.89s -> 2.25s - `detokenize`: 2.60s -> 1.33s - `sample`: 2.35s -> 2.03s This significantly reduces CPU overhead and memory bandwidth during generation, as the native `llama.cpp` sampler reads directly from the C context without needing to expose the `n_vocab` array to Python on every token. Signed-off-by: JamePeng --- llama_cpp/llama.py | 52 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 734485802e..e9d16438e5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1035,11 +1035,20 @@ def eval( tokens: Sequence[int], active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + copy_logits: bool = True, ): """Evaluate a list of tokens. Args: - tokens: The list of tokens to evaluate. + tokens: The token ids to evaluate. + active_loras: Optional LoRA adapters to apply for this evaluation. + Each item should contain a ``name`` and an optional ``scale``. + control_vector: Optional control vector configuration to apply during + this evaluation. + copy_logits: Whether to copy the final logits into ``self.scores`` when + ``logits_all`` is disabled. Set to ``False`` for native sampler paths + that sample directly from the llama context and do not need + Python-side logits. 
""" n_eval = len(tokens) if n_eval == 0: @@ -1246,9 +1255,11 @@ def eval( if self.verbose: print(f"Llama.eval: [Periodic Checkpoint] HybridCheckpoint save failed at pos {current_pos}, skipping update", file=sys.stderr) - # Save the final logit if not in _logits_all mode - if not self._logits_all: - logits_ptr = self._ctx.get_logits() + # Save the final logits only when Python-side logits are required. + # Native sampler can sample directly from ctx, so normal generation does not + # need to copy n_vocab floats into self.scores on every token. + if not self._logits_all and copy_logits: + logits_ptr = self._ctx.get_logits_ith(-1) logits_view = np.ctypeslib.as_array(logits_ptr, shape=(self._n_vocab,)) self.scores[0, :] = logits_view @@ -1666,6 +1677,14 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): self._sampling_ctx = LlamaSamplingContext(params, self._model) + # Native sampler samples directly from ctx. Python-side logits are only needed + # for compatibility hooks that explicitly consume self._scores. + copy_logits = ( + self._logits_all + or logits_processor is not None + or stopping_criteria is not None + ) + sample_idx = self.n_tokens + len(tokens) - 1 tokens = list(tokens) @@ -1685,8 +1704,13 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): body_tokens = tokens[:-1] last_token = [tokens[-1]] - # 1. Evaluate up to N-1 - self.eval(body_tokens, active_loras=active_loras, control_vector=control_vector) + # 1. Evaluate up to N-1 without copying logits. + self.eval( + body_tokens, + active_loras=active_loras, + control_vector=control_vector, + copy_logits=False, + ) # 2. Save the N-1 state snapshot current_history = self._input_ids[:self.n_tokens].tolist() @@ -1695,11 +1719,21 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): tokens=current_history, seq_id=0 ) - # 3. Evaluate the final token to refresh logits - self.eval(last_token, active_loras=active_loras, control_vector=control_vector) + # 3. 
Evaluate final token. Copy logits only if Python-side hooks need them. + self.eval( + last_token, + active_loras=active_loras, + control_vector=control_vector, + copy_logits=copy_logits, + ) else: # Standard evaluation or single-token generation step - self.eval(tokens, active_loras=active_loras, control_vector=control_vector) + self.eval( + tokens, + active_loras=active_loras, + control_vector=control_vector, + copy_logits=copy_logits, + ) # Sample loop while sample_idx < self.n_tokens: From 4d50e5860798ac4e1706e0de250e01c298a0f126 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 16:34:00 +0800 Subject: [PATCH 067/139] =?UTF-8?q?docs(CUDA):=20Add=20note=20about=20PDL?= =?UTF-8?q?=20optimization=20for=20newer=20NVIDIA=20GPUs=20(CC=20=E2=89=A5?= =?UTF-8?q?=2090)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: JamePeng --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 1986a6ca54..cc83e9814c 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,9 @@ $env:CMAKE_ARGS = "-DGGML_CUDA=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` +Note: **Programmatic Dependent Launch (PDL)** is a CUDA optimization for newer NVIDIA GPUs (CC >= 90; does not include Ada). +It enables stream-level dependency-driven concurrent execution of CUDA kernels within the same stream, achieving a kernel launch overhead reduction similar to CUDA Graphs. If you have a newer NVIDIA GPU (e.g. `Hopper`, `Blackwell` and above), you can achieve significant speedups and latency reduction in token generation across nearly all models when compiling with `-DGGML_CUDA_PDL=ON`. + **Pre-built Wheel (New)** It is also possible to install a pre-built wheel with CUDA support.
Make sure your system meets the following requirements: From 8a107375f0e4e2d482ce64ad1e886ffb6ac5df37 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 16:51:25 +0800 Subject: [PATCH 068/139] docs(development): add AI agent prompt for git commit generation Introduce `git-commit-generation-agent.md` to the development wiki to standardize the creation of high-quality git commit messages using LLM assistants. - Define the system persona, core principles (Conventional Commits, DCO), and strict formatting rules for generating commits. - Provide concrete template examples for build, performance, and documentation updates. - Ensure future maintainers and contributors can easily generate consistent, maintainer-level commits that explicitly explain the "Why" and "How" of code changes. Signed-off-by: JamePeng --- .../git-commit-generation-agent.md | 214 ++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 docs/wiki/development/git-commit-generation-agent.md diff --git a/docs/wiki/development/git-commit-generation-agent.md b/docs/wiki/development/git-commit-generation-agent.md new file mode 100644 index 0000000000..4cce635154 --- /dev/null +++ b/docs/wiki/development/git-commit-generation-agent.md @@ -0,0 +1,214 @@ +--- +title: Git Commit Generation Agent +page_type: development-helper +source_file: docs/wiki/development/git-commit-generation-agent.md +last_updated: 2026-05-23 +version_target: "latest" +author: JamePeng +audience: maintainers +--- + +# Git Commit Generation Agent for `llama-cpp-python` + +## Overview + +This page defines a maintainer-facing LLM helper workflow for generating +high-quality, descriptive, and standardized Git commit messages for +`llama-cpp-python`. + +## System Persona +You are an expert C++/Python developer and a core maintainer of the +`llama-cpp-python` project. 
Your task is to generate clear, accurate, and +standardized Git commit messages based on provided diffs, source snippets, +benchmark notes, issue references, or maintainer summaries. + +## Core Principles + +The project follows the **Conventional Commits** specification and requires a +**Developer Certificate of Origin (DCO) Sign-off**. + +Generated commit messages must prioritize: + +- **Why** the change was needed. +- **How** the change was implemented. +- **What** user-visible, runtime, build, packaging, or documentation behavior + changed. +- **What** future maintainers need to know when reading the project history. + +## Input Requirements + +The agent may receive: + +- A full Git diff +- A changed file list +- Source snippets +- Benchmark results +- Maintainer notes +- Issue or PR references +- A natural-language summary of changes + +When the input is incomplete, generate the best possible commit message from the +provided information, but do not invent implementation details. + +## Formatting Rules + +### 1. 
Header Line (Subject) +Use the following format: + +```text +<type>(<scope>): <subject> +```` + +Allowed types: + +| Type | Use for | +| ---------- | ----------------------------------------------------------- | +| `feat` | New features or user-facing capabilities | +| `fix` | Bug fixes | +| `docs` | Documentation-only changes | +| `build` | CMake, build scripts, compiler flags, packaging build logic | +| `perf` | Performance optimizations | +| `ci` | GitHub Actions or other workflow changes | +| `chore` | Maintenance, cleanup, or non-user-facing changes | +| `refactor` | Internal restructuring without behavior change | +| `test` | Test additions or updates | + +Recommended scopes: + +* `llama` +* `core` +* `bindings` +* `sampling` +* `speculative` +* `cache` +* `chat` +* `multimodal` +* `embedding` +* `types` +* `cmake` +* `windows` +* `cuda` +* `metal` +* `ci` +* `docs` +* `readme` +* `packaging` + +Subject rules: + +* Use imperative mood, such as `add`, `fix`, `update`, `skip`, `expose`. +* Do not use past tense, such as `added`, `fixed`, or `updated`. +* Keep the subject under 72 characters when possible. +* Use lowercase unless a proper noun, symbol, or API name requires otherwise. +* Do not end the subject with a period. + +### 2. Body +Leave one blank line between the header and the body. +The body should: +* Start with a short paragraph explaining the motivation or problem. +* Use bullets when the diff contains multiple logical changes. +* Mention important files, classes, functions, flags, or APIs using Markdown + backticks. +* Keep lines wrapped at around 72-80 characters. +* Mention user-visible behavior changes when relevant. +* Mention performance impact only when supported by the input. + +### 3. Footer (Sign-off) +* Leave one blank line after the body. +* You MUST append a generic DCO sign-off line at the very end. +* **Format:** `Signed-off-by: Developer Name <email@example.com>` + +--- + +## Accuracy Rules + +* Do not invent changed files, functions, APIs, benchmarks, flags, or behavior.
+* Do not claim performance improvements unless benchmark data is provided or the + diff clearly supports the optimization. +* Do not mention issue or PR numbers unless provided by the user. +* Do not include migration notes unless the change affects user-facing APIs. +* If the change is documentation-only, do not imply runtime behavior changed. +* If the change is internal-only, do not overstate it as a user-facing feature. +* Prefer specific technical descriptions over generic wording. + +## Output Rules + +When the user provides a code diff or a summary of changes, analyze the intent +and output only the raw Git commit message. + +Do not: + +* Wrap the commit message in Markdown code fences. +* Add explanations before or after the commit message. +* Add headings such as `Commit message:`. +* Include alternative versions unless explicitly requested. + +## Output Examples + +### Example 1: Build System Change +```text +build(cmake): package LLVM OpenMP runtime DLL for Windows wheels + +Dynamically loaded GGML CPU backends compiled with LLVM/Clang and OpenMP +require `libomp140.x86_64.dll` at runtime. Since this dependency is not +always caught by `$`, it must be packaged manually. + +- Add `llama_cpp_python_install_windows_runtime_file` to handle installing + arbitrary extra DLLs with proper CMake path normalization. +- Add fallback search logic to locate the OpenMP DLL in common Visual Studio + directories. +- Execute the installation before the dev-file cleanup step to ensure the + DLL is correctly packaged in the final Python wheel. + +Signed-off-by: Developer Name + +``` + +### Example 2: Performance Optimization + +```text +perf(eval): skip unnecessary logit array copies during native sampling + +Introduce a `copy_logits` flag to `Llama.eval()` to control whether C-level +logits are copied into the Python `self.scores` array. 
+ +- Automatically disable `copy_logits` during the generation loop unless + Python-side hooks (`logits_processor`, `stopping_criteria`) explicitly + require them. +- Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch + the final token's logits when copying is required. + +This significantly reduces CPU overhead and memory bandwidth during generation, +as the native `llama.cpp` sampler reads directly from the C context without +needing to expose the `n_vocab` array to Python on every token. + +Signed-off-by: Developer Name + +``` + +### Example 3: Documentation Update + +```text +docs(speculative): document n-gram map k/k4v modes and new parameters + +Reflect the recent architectural upgrades to `LlamaNGramMapDecoding` in +the official documentation. + +- Document the new `__init__` parameters (`mode`, `min_hits`, + `max_entries_per_key`) and their validation rules. +- Add a detailed comparison table explaining the memory and behavior + differences between the `"k"` and `"k4v"` lookup modes. +- Add a strong production warning against the legacy `LlamaPromptLookupDecoding` + implementation. + +Signed-off-by: Developer Name + +``` + +## Execution + +When the user provides a code diff or a summary of changes, analyze the intent and output ONLY the raw Git commit message following the exact structure and tone demonstrated above. + +## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] From 1b0ae7097688a0c328f5c4149afa7b9f519318fd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 17:14:35 +0800 Subject: [PATCH 069/139] docs(wiki): add development helper to index Introduce the development section in the wiki index so maintainer-facing workflows and LLM-assisted helper tools are discoverable from the main navigation. - Add a Development section with a link to the Git commit generation agent. Include the helper in the recommended reading order for new wiki users. 
- Add development/git-commit-generation-agent.md to the available pages list. Signed-off-by: JamePeng jame_peng@sina.com --- docs/wiki/index.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/wiki/index.md b/docs/wiki/index.md index 143d6e629b..c721fc4e89 100644 --- a/docs/wiki/index.md +++ b/docs/wiki/index.md @@ -34,6 +34,18 @@ These pages document major source modules and related classes. --- +### Development + +This section contains maintainer-facing development notes, workflows, and LLM-assisted helper tools for working on `llama-cpp-python`. + +#### Pages + +| Page | Description | +|---|---| +| [[development/Git Commit Generation Agent]] | Helper workflow for generating clear, structured, and source-aware Git commit messages. | + +--- + ### Wiki Maintenance These pages define how the wiki should be written, updated, and reviewed. @@ -55,9 +67,9 @@ If you are new to this wiki, read the pages in this order: 4. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] 5. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] 6. [[modules/Logger\|Logger](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/Logger.md)] +7. [[development/Git Commit Generation Agent](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/development/git-commit-generation-agent.md)] If you are contributing documentation, start with: - 1. [[SCHEMA|Wiki Schema](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md)] 2. 
[[contributing-to-wiki|Contributing to the Wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md)] @@ -75,6 +87,7 @@ Currently available pages: - `modules/LlamaGrammar.md` - `modules/LlamaSpeculative.md` - `modules/Logger.md` +- `development/git-commit-generation-agent.md` - `SCHEMA.md` - `contributing-to-wiki.md` From 0239328f3f22ba87fd74351d96c5c65f6c95f95a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 25 May 2026 19:55:26 +0800 Subject: [PATCH 070/139] Update Submodule vendor/llama.cpp 1acee6b..328874d Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 1 + vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index ec2b665a16..238a1a4fe1 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2804,6 +2804,7 @@ def llama_state_seq_load_file( LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 1 # // keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load) +# // Getting the state for a seq_id with this flag invalidates all prior states gotten for that seq_id with this flag. 
LLAMA_STATE_SEQ_FLAGS_ON_DEVICE = 2 llama_state_seq_flags = ctypes.c_uint32 diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1acee6bf89..328874d054 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1acee6bf8939948f9bcbf4b14034e4b475f06069 +Subproject commit 328874d054e0eb44591202a23c209cf02c18e3cb From a32daf797a5e1aea527c8957da1f25f631ba98e9 Mon Sep 17 00:00:00 2001 From: Jay0360 Date: Wed, 27 May 2026 21:12:15 +0800 Subject: [PATCH 071/139] fix: wire LFM VL chat handlers into server loader --- llama_cpp/server/model.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 37c5195687..6b3fd1dd15 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -199,6 +199,34 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = llama_cpp.llama_chat_format.Qwen25VLChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) + elif settings.chat_format == "lfm2-vl": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.LFM2VLChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.LFM2VLChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) + elif settings.chat_format == "lfm2.5-vl": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.LFM25VLChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.LFM25VLChatHandler( + clip_model_path=settings.clip_model_path, 
verbose=settings.verbose + ) elif settings.chat_format == "hf-autotokenizer": assert ( settings.hf_pretrained_model_name_or_path is not None From d9cc25bcb4e563eed910454f6fb5faa5b736124a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 27 May 2026 22:10:47 +0800 Subject: [PATCH 072/139] Update Submodule vendor/llama.cpp 328874d..617255d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 328874d054..617255d437 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 328874d054e0eb44591202a23c209cf02c18e3cb +Subproject commit 617255d437898fcef6c3d80d4994b307454da850 From 4a6c311364ca3463619c107d37e0ae8a4c0cd98b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 28 May 2026 00:55:17 +0800 Subject: [PATCH 073/139] refactor(internals): align model metadata wrappers with llama.cpp API - Use `llama_vocab_n_tokens()` instead of the old vocab size helper. - Add Python wrappers for model description, size, chat template, and trained RoPE frequency scaling. - Clarify model capability helpers with docstrings matching llama.cpp semantics. - Rename `desc()` and `size()` to `model_desc()` and `model_size()` to make their scope explicit. - Drop the unused `get_tensor()` stub since llama.cpp does not expose it. - Route rerank template lookup through `LlamaModel.model_chat_template()` for consistency with the internal model abstraction. 
Signed-off-by: JamePeng --- llama_cpp/_internals.py | 65 +++++++++++++++++++++++++++--------- llama_cpp/llama.py | 9 ++++- llama_cpp/llama_embedding.py | 4 +-- 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index fda9187855..5416ce2416 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -102,7 +102,7 @@ def vocab_type(self) -> int: return llama_cpp.llama_vocab_type(self.model) def n_vocab(self) -> int: - return llama_cpp.llama_n_vocab(self.vocab) + return llama_cpp.llama_vocab_n_tokens(self.vocab) def n_ctx_train(self) -> int: return llama_cpp.llama_model_n_ctx_train(self.model) @@ -131,41 +131,76 @@ def n_head_kv(self) -> int: def n_swa(self) -> int: return llama_cpp.llama_model_n_swa(self.model) + def rope_freq_scale_train(self) -> float: + """ + Get the model's RoPE frequency scaling factor + """ + return llama_cpp.llama_model_rope_freq_scale_train(self.model) + + def model_desc(self) -> str: + """ + Get a string describing the model type + """ + buf = ctypes.create_string_buffer(256) + llama_cpp.llama_model_desc(self.model, buf, 256) + return buf.value.decode("utf-8") + + def model_size(self) -> int: + """ + Returns the total size of all the tensors in the model in bytes + """ + return llama_cpp.llama_model_size(self.model) + + def model_chat_template(self, name: bytes) -> str: + """ + Get the default chat template. 
Returns nullptr if not available + If name is NULL, returns the default chat template + """ + return llama_cpp.llama_model_chat_template(self.model, name).decode("utf-8") + def n_params(self) -> int: + """ + Returns the total number of parameters in the model + """ return llama_cpp.llama_model_n_params(self.model) def has_encoder(self) -> bool: + """ + Returns true if the model contains an encoder that requires llama_encode() call + """ return llama_cpp.llama_model_has_encoder(self.model) def has_decoder(self) -> bool: + """ + Returns true if the model contains a decoder that requires llama_decode() call + """ return llama_cpp.llama_model_has_decoder(self.model) def decoder_start_token(self) -> int: + """ + For encoder-decoder models, this function returns id of the token that must be provided + to the decoder to start generating output sequence. For other models, it returns -1. + """ return llama_cpp.llama_model_decoder_start_token(self.model) def is_recurrent(self) -> bool: + """ + Returns true if the model is recurrent (like Mamba, RWKV, etc.) + """ return llama_cpp.llama_model_is_recurrent(self.model) def is_hybrid(self) -> bool: + """ + Returns true if the model is hybrid (like Jamba, Granite, etc.) + """ return llama_cpp.llama_model_is_hybrid(self.model) def is_diffusion(self) -> bool: + """ + Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) 
+ """ return llama_cpp.llama_model_is_diffusion(self.model) - def rope_freq_scale_train(self) -> float: - return llama_cpp.llama_model_rope_freq_scale_train(self.model) - - def desc(self) -> str: - buf = ctypes.create_string_buffer(1024) - llama_cpp.llama_model_desc(self.model, buf, 1024) - return buf.value.decode("utf-8") - - def size(self) -> int: - return llama_cpp.llama_model_size(self.model) - - def get_tensor(self, name: str) -> ctypes.c_void_p: - raise NotImplementedError("get_tensor is not implemented in llama.cpp") - # Vocab def token_get_text(self, token: int) -> str: diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index e9d16438e5..c2d2757e13 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -696,13 +696,20 @@ def __init__( try: self.metadata = self._model.metadata() + self.model_desc = self._model.model_desc() + # The total size of all the tensors in the model in bytes + self.model_size = self._model.model_size() + except Exception as e: self.metadata = {} if self.verbose: print(f"Failed to load metadata: {e}", file=sys.stderr) if self.verbose: - print(f"Model metadata: {self.metadata}", file=sys.stderr) + print(f"Model desc: {self.model_desc}, " + f"Model size: {self.model_size / (1024 * 1024):.2f} MB, " + f"Model metadata: {self.metadata}", + file=sys.stderr) eos_token_id = self.token_eos() bos_token_id = self.token_bos() diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 7c8ad1e90f..0c1df339ce 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -303,9 +303,7 @@ def rank(self, query: str, documents: List[str]) -> List[float]: # 1. Attempt to retrieve the built-in 'rerank' chat template from model metadata. # Modern GGUF models often include a template for formatting query/document pairs. 
- rerank_template = llama_cpp.llama_model_chat_template(self._model.model, b"rerank") - if rerank_template: - rerank_template = rerank_template.decode("utf-8") + rerank_template = self._model.model_chat_template(b"rerank") batch_inputs: List[List[int]] = [] From 677db7b0d5b834ae3d3831af4702ec21986ab335 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Thu, 28 May 2026 00:12:35 +0200 Subject: [PATCH 074/139] Resolve file conflicts. --- .github/workflows/build-wheels-cu131-win.yml | 25 -------------------- 1 file changed, 25 deletions(-) diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml index 5f77003a5f..14bea65d19 100644 --- a/.github/workflows/build-wheels-cu131-win.yml +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -67,31 +67,6 @@ jobs: echo LIB=%LIB%>>%GITHUB_ENV% echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% - - name: Copy LLVM OpenMP runtime - shell: pwsh - run: | - # GGML CPU all-variant backends are built with LLVM OpenMP on Windows. - # The dynamically loaded ggml-cpu-*.dll files depend on this runtime. - # If it is missing from the wheel, ggml_backend_load_all_from_path() - # may fail to load CPU backend DLLs at runtime. - $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib" - New-Item -ItemType Directory -Force $packageLibDir | Out-Null - - $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" ` - -Recurse ` - -Filter "libomp140.x86_64.dll" ` - -ErrorAction SilentlyContinue | - Where-Object { $_.FullName -match "OpenMP\.LLVM" } | - Select-Object -First 1 - - if (!$omp) { - Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables." 
- exit 1 - } - - Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force - Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)" - - name: Build wheel run: | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') From 4794c8c20ee731838cbc2c8d601ccb2c245d6893 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Thu, 28 May 2026 01:52:48 +0200 Subject: [PATCH 075/139] Added support when using the keyword 'audio' instead of 'audio_url'. --- llama_cpp/llama_chat_format.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f9b9d52367..254195f95a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2996,13 +2996,13 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa media_items.append({"url": url, "type": "image"}) # 2. Audio Processing - elif content_type in ["audio_url", "input_audio"]: + elif content_type in ["audio", "audio_url", "input_audio"]: if not self.is_support_audio: raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") # Case A: Handle custom/forward-compatible audio_url format - if content_type == "audio_url": - audio_url = content["audio_url"] + if content_type == "audio_url" or content_type == "audio": + audio_url = content[content_type] url = audio_url if isinstance(audio_url, str) else audio_url["url"] media_items.append({"url": url, "type": "audio"}) # Case B: Handle OpenAI standard input_audio format From 103639ce04b72d09e09ce895f3c8d8cfba518e13 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 28 May 2026 21:48:01 +0800 Subject: [PATCH 076/139] Update Submodule vendor/llama.cpp 617255d..6ed481e Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 617255d437..6ed481eea4 160000 --- a/vendor/llama.cpp +++ 
b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 617255d437898fcef6c3d80d4994b307454da850 +Subproject commit 6ed481eea4cf4ed40777db2fa29e8d08eb712b3b From 6c9e7bf92c346806f91ef06f2522b0def7611f10 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 29 May 2026 22:51:33 +0800 Subject: [PATCH 077/139] feat(chat_handler): update multimodal handlers for Qwen2.5-VL, Qwen3-VL, and PaddleOCR - Update PaddleOCRChatHandler to support version 1.6 - Add token configuration and stop sequences for Qwen2.5-VL and Qwen3-VL - Standardize input_ids initialization in __call__ methods for Qwen2.5-VL, Qwen3-ASR, and Qwen3-VL handlers Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0365d8f871..cf5dca2492 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5324,7 +5324,7 @@ def __call__(self, **kwargs): class PaddleOCRChatHandler(MTMDChatHandler): """ - Handler for PaddleOCR 1.5 multimodal models. + Handler for PaddleOCR 1.5/1.6 multimodal models. 
""" PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" @@ -5431,6 +5431,11 @@ def __call__(self, **kwargs): class Qwen25VLChatHandler(MTMDChatHandler): + + QWEN25_VL_BOS_TOKEN = "<|endoftext|>" + QWEN25_VL_PAD_TOKEN = "<|endoftext|>" + QWEN25_VL_EOS_TOKEN = "<|im_end|>" + CHAT_FORMAT = ( "{% set image_count = namespace(value=0) %}" "{% for message in messages %}" @@ -5462,6 +5467,8 @@ class Qwen25VLChatHandler(MTMDChatHandler): ) def __call__(self, **kwargs): + kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN] + llama = kwargs['llama'] if hasattr(llama, 'input_ids'): @@ -5547,12 +5554,22 @@ def __call__(self, **kwargs): # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN] + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + if self.verbose: print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") return super().__call__(**kwargs) class Qwen3VLChatHandler(MTMDChatHandler): + + QWEN3_VL_BOS_TOKEN = "<|endoftext|>" + QWEN3_VL_PAD_TOKEN = "<|endoftext|>" + QWEN3_VL_EOS_TOKEN = "<|im_end|>" + CHAT_FORMAT = ( "{{- '<|im_start|>system\n' -}}" "{%- if messages[0].content is string and messages[0].role == 'system' -%}" @@ -5661,6 +5678,8 @@ def __init__( self.extra_template_arguments["add_vision_id"] = add_vision_id def __call__(self, **kwargs): + kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN] + llama = kwargs['llama'] if hasattr(llama, 'input_ids'): From 69e740ce51b064be36fa5e28214839429f89c94e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 30 May 2026 01:21:11 +0800 Subject: [PATCH 078/139] Update Submodule vendor/llama.cpp 6ed481e..06d26df Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6ed481eea4..06d26dfdff 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 
@@ -Subproject commit 6ed481eea4cf4ed40777db2fa29e8d08eb712b3b +Subproject commit 06d26dfdff4097dc51eac20155371a9cfd53e094 From e7976f42b23ce29491d1b48bd044682ce4f261a2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 30 May 2026 01:24:56 +0800 Subject: [PATCH 079/139] feat(mtmd): improve fallback chat template for multimodal models - Add BOS/EOS token handling to the default MTMD chat format. - Use a clearer role-based template with explicit USER and ASSISTANT prefixes. - Append a newline after each message to keep generated prompts readable. - Treat EOS as the end marker for the serialized conversation history before the optional generation prompt. - Improve fallback behavior for multimodal GGUF models that do not provide a chat template, such as OCR-oriented models like DeepSeek-OCR 1/2. - Make the default system prompt a single normalized string while preserving its original meaning. - Clean up minor formatting around MTMD context parameter initialization. This improves prompt compatibility for multimodal models that either lack a GGUF chat template or are not yet covered by a complete custom chat handler. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index cf5dca2492..71228d0627 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2811,21 +2811,20 @@ def generate_streaming(tools, functions, function_call, prompt): class MTMDChatHandler: DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( -"""You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, -while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful.""" +"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " +"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful." 
) CHAT_FORMAT = ( + "{{ bos_token if bos_token is defined else '' }}" "{% for message in messages %}" "{% if message.role == 'system' %}" "{{ message.content }}" - "{% endif %}" - - "{% if message.role == 'user' %}" + "{% elif message.role == 'user' %}" + "USER: " "{% if message.content is string %}" - "\nUSER: {{ message.content }}" + "{{ message.content }}" "{% elif message.content is iterable %}" - "\nUSER: " "{% for content in message.content %}" "{% if content.type == 'image_url' %}" "{{ content.image_url if content.image_url is string else content.image_url.url }}" @@ -2842,15 +2841,19 @@ class MTMDChatHandler: "{% endif %}" "{% endfor %}" "{% endif %}" - "{% endif %}" - "{% if message.role == 'assistant' and message.content is not none %}" - "\nASSISTANT: {{ message.content }}" + "{% elif message.role == 'assistant' and message.content is not none %}" + "ASSISTANT: {{ message.content }}" "{% endif %}" + "{{ \"\n\" }}" "{% endfor %}" + "{% if eos_token is defined %}" + "{{ eos_token }}" + "{% endif %}" + "{% if add_generation_prompt %}" - "\nASSISTANT: " + "ASSISTANT: " "{% endif %}" ) @@ -2906,7 +2909,7 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): self.mctx_params.use_gpu = self.use_gpu self.mctx_params.print_timings = self.verbose self.mctx_params.n_threads = llama_model.n_threads - self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO self.mctx_params.warmup = True if self.image_min_tokens > 0: self.mctx_params.image_min_tokens = self.image_min_tokens From 1df7ffc07b7a8f52000614d9f63a90f8b80f0d6f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 30 May 2026 02:16:12 +0800 Subject: [PATCH 080/139] docs(Readme): Update Deepseek-OCR-2-GGUF Link Signed-off-by: JamePeng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index cc83e9814c..c39df4abd7 100644 --- 
a/README.md +++ b/README.md @@ -953,6 +953,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [granite-docling](https://huggingface.co/ibm-granite/granite-docling-258M-GGUF) | `GraniteDoclingChatHandler` | `granite-docling` | | [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | | [lfm2.5-vl](https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF) | `LFM25VLChatHandler` | `lfm2.5-vl` | +| [deepseek-ocr](https://huggingface.co/JamePeng2023/DeepSeek-OCR-2-GGUF) | `MTMDChatHandler` | `None` | | [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-asr](https://huggingface.co/JamePeng2023/Qwen3-ASR-1.7B-GGUF) | `Qwen3ASRChatHandler` | `qwen3-asr` | From c4efcff5c1534e0a3946809bec6d0e97e374bf4a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 19:13:39 +0800 Subject: [PATCH 081/139] Update Submodule vendor/llama.cpp 06d26df..d4c8e2c Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 06d26dfdff..d4c8e2c29c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 06d26dfdff4097dc51eac20155371a9cfd53e094 +Subproject commit d4c8e2c29ce2fb9a251a0a4a16d6c857b4f70f8c From 6a7fde40a2d96bee1da4c004bf3ac0c31b2432d4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 19:27:11 +0800 Subject: [PATCH 082/139] ci : update metal build/test job to macos-26/macos-15-intel - Build on the Tahoe runners in order to enable the tensor API for M5 and A19. 
Signed-off-by: JamePeng --- .github/workflows/build-wheels-metal.yaml | 7 +++---- .github/workflows/test.yaml | 12 ++++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index a809909720..2b00d1abaa 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -8,8 +8,8 @@ permissions: jobs: build_wheels: - name: Build wheels (Metal macos) - runs-on: macos-latest + name: Build wheels (Metal macos-26) + runs-on: macos-26 outputs: version: ${{steps.get_version.outputs.version}} @@ -53,8 +53,7 @@ jobs: -DCMAKE_CROSSCOMPILING=on -DGGML_METAL=on -DGGML_METAL_USE_BF16=on - -DGGML_METAL_EMBED_LIBRARY=off - -DGGML_METAL_SHADER_DEBUG=on" + -DGGML_METAL_EMBED_LIBRARY=on" with: package-dir: . output-dir: wheelhouse2 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 420c5e9495..a9f359d1cd 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -28,21 +28,21 @@ jobs: python-version: ["3.9", "3.14"] include: # macOS Non-Metal - - os: macos-14 + - os: macos-15-intel python-version: "3.9" - cmake_args: "-DLLAMA_METAL=off" + cmake_args: "-DLLAMA_METAL=off -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3" metal_status: "(No Metal)" - - os: macos-14 + - os: macos-15-intel python-version: "3.14" - cmake_args: "-DLLAMA_METAL=off" + cmake_args: "-DLLAMA_METAL=off -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3" metal_status: "(No Metal)" # macOS Metal - - os: macos-14 + - os: macos-26 python-version: "3.9" cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" metal_status: "(Metal)" - - os: macos-14 + - os: macos-26 python-version: "3.14" cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" metal_status: "(Metal)" From ea0907d3870aabbeaf669f42bd1b484a2d7e7c83 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 20:06:49 +0800 Subject: [PATCH 
083/139] refactor(llama_cpp): wrap llama constants into enum.IntEnum - Group global `LLAMA_*` constants into `enum.IntEnum` classes (`llama_vocab_type`, `llama_vocab_pre_type`, `llama_rope_type`, etc.) for better type safety and organization. - Sync new values for `llama_vocab_pre_type` (`SARVAM_MOE`, `MINICPM5`, `WHITESPACE`). Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 245 +++++++++++++++++++++-------------------- 1 file changed, 128 insertions(+), 117 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 238a1a4fe1..62c4c81ef9 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -122,20 +122,21 @@ # LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization # LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming # }; -LLAMA_VOCAB_TYPE_NONE = 0 -"""For models without vocab""" -LLAMA_VOCAB_TYPE_SPM = 1 -"""LLaMA tokenizer based on byte-level BPE with byte fallback""" -LLAMA_VOCAB_TYPE_BPE = 2 -"""GPT-2 tokenizer based on byte-level BPE""" -LLAMA_VOCAB_TYPE_WPM = 3 -"""BERT tokenizer based on WordPiece""" -LLAMA_VOCAB_TYPE_UGM = 4 -"""T5 tokenizer based on Unigram""" -LLAMA_VOCAB_TYPE_RWKV = 5 -"""RWKV tokenizer based on greedy tokenization""" -LLAMA_VOCAB_TYPE_PLAMO2 = 6 -"""PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming""" +class llama_vocab_type(enum.IntEnum): + LLAMA_VOCAB_TYPE_NONE = 0 + """For models without vocab""" + LLAMA_VOCAB_TYPE_SPM = 1 + """LLaMA tokenizer based on byte-level BPE with byte fallback""" + LLAMA_VOCAB_TYPE_BPE = 2 + """GPT-2 tokenizer based on byte-level BPE""" + LLAMA_VOCAB_TYPE_WPM = 3 + """BERT tokenizer based on WordPiece""" + LLAMA_VOCAB_TYPE_UGM = 4 + """T5 tokenizer based on Unigram""" + LLAMA_VOCAB_TYPE_RWKV = 5 + """RWKV tokenizer based on greedy tokenization""" + LLAMA_VOCAB_TYPE_PLAMO2 = 6 + """PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming""" # NOTE: Deprecated and will be removed in 
the future. (already gone in llama.cpp) @@ -193,58 +194,65 @@ # LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, # LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, # LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, +# LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51, +# LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52, +# LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53, # }; -LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 -LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 -LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2 -LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3 -LLAMA_VOCAB_PRE_TYPE_FALCON = 4 -LLAMA_VOCAB_PRE_TYPE_MPT = 5 -LLAMA_VOCAB_PRE_TYPE_STARCODER = 6 -LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 -LLAMA_VOCAB_PRE_TYPE_REFACT = 8 -LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9 -LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10 -LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11 -LLAMA_VOCAB_PRE_TYPE_OLMO = 12 -LLAMA_VOCAB_PRE_TYPE_DBRX = 13 -LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 -LLAMA_VOCAB_PRE_TYPE_PORO = 15 -LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 -LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 -LLAMA_VOCAB_PRE_TYPE_VIKING = 18 -LLAMA_VOCAB_PRE_TYPE_JAIS = 19 -LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20 -LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21 -LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22 -LLAMA_VOCAB_PRE_TYPE_BLOOM = 23 -LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24 -LLAMA_VOCAB_PRE_TYPE_EXAONE = 25 -LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 -LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 -LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 -LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 -LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 -LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 -LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 -LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33 -LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 -LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 -LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36 -LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37 -LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38 -LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39 -LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40 -LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41 -LLAMA_VOCAB_PRE_TYPE_AFMOE = 42 -LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43 -LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 -LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 -LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 -LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 
-LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48 -LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49 -LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50 +class llama_vocab_pre_type(enum.IntEnum): + LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 + LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2 + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3 + LLAMA_VOCAB_PRE_TYPE_FALCON = 4 + LLAMA_VOCAB_PRE_TYPE_MPT = 5 + LLAMA_VOCAB_PRE_TYPE_STARCODER = 6 + LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 + LLAMA_VOCAB_PRE_TYPE_REFACT = 8 + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9 + LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10 + LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11 + LLAMA_VOCAB_PRE_TYPE_OLMO = 12 + LLAMA_VOCAB_PRE_TYPE_DBRX = 13 + LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 + LLAMA_VOCAB_PRE_TYPE_PORO = 15 + LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 + LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 + LLAMA_VOCAB_PRE_TYPE_VIKING = 18 + LLAMA_VOCAB_PRE_TYPE_JAIS = 19 + LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20 + LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21 + LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22 + LLAMA_VOCAB_PRE_TYPE_BLOOM = 23 + LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24 + LLAMA_VOCAB_PRE_TYPE_EXAONE = 25 + LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 + LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 + LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 + LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 + LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 + LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 + LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33 + LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 + LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 + LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36 + LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37 + LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38 + LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39 + LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40 + LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41 + LLAMA_VOCAB_PRE_TYPE_AFMOE = 42 + LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43 + LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 + LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 + LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 + LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 + LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48 + LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49 + LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50 + 
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51 + LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52 + LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53 # // note: these values should be synchronized with ggml_rope @@ -257,12 +265,13 @@ # LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE, # LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION, # }; -LLAMA_ROPE_TYPE_NONE = -1 -LLAMA_ROPE_TYPE_NORM = 0 -LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 -LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 -LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 -LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 +class llama_rope_type(enum.IntEnum): + LLAMA_ROPE_TYPE_NONE = -1 + LLAMA_ROPE_TYPE_NORM = 0 + LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 + LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 + LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 + LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 # enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file @@ -274,13 +283,14 @@ # LLAMA_TOKEN_TYPE_UNUSED = 5, # LLAMA_TOKEN_TYPE_BYTE = 6, # }; -LLAMA_TOKEN_TYPE_UNDEFINED = 0 -LLAMA_TOKEN_TYPE_NORMAL = 1 -LLAMA_TOKEN_TYPE_UNKNOWN = 2 -LLAMA_TOKEN_TYPE_CONTROL = 3 -LLAMA_TOKEN_TYPE_USER_DEFINED = 4 -LLAMA_TOKEN_TYPE_UNUSED = 5 -LLAMA_TOKEN_TYPE_BYTE = 6 +class llama_token_type(enum.IntEnum): + LLAMA_TOKEN_TYPE_UNDEFINED = 0 + LLAMA_TOKEN_TYPE_NORMAL = 1 + LLAMA_TOKEN_TYPE_UNKNOWN = 2 + LLAMA_TOKEN_TYPE_CONTROL = 3 + LLAMA_TOKEN_TYPE_USER_DEFINED = 4 + LLAMA_TOKEN_TYPE_UNUSED = 5 + LLAMA_TOKEN_TYPE_BYTE = 6 # enum llama_token_attr { @@ -355,45 +365,46 @@ # # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; -LLAMA_FTYPE_ALL_F32 = 0 -LLAMA_FTYPE_MOSTLY_F16 = 1 -LLAMA_FTYPE_MOSTLY_Q4_0 = 2 -LLAMA_FTYPE_MOSTLY_Q4_1 = 3 -LLAMA_FTYPE_MOSTLY_Q8_0 = 7 -LLAMA_FTYPE_MOSTLY_Q5_0 = 8 -LLAMA_FTYPE_MOSTLY_Q5_1 = 9 -LLAMA_FTYPE_MOSTLY_Q2_K = 10 -LLAMA_FTYPE_MOSTLY_Q3_K_S = 11 -LLAMA_FTYPE_MOSTLY_Q3_K_M = 12 -LLAMA_FTYPE_MOSTLY_Q3_K_L = 13 -LLAMA_FTYPE_MOSTLY_Q4_K_S = 14 
-LLAMA_FTYPE_MOSTLY_Q4_K_M = 15 -LLAMA_FTYPE_MOSTLY_Q5_K_S = 16 -LLAMA_FTYPE_MOSTLY_Q5_K_M = 17 -LLAMA_FTYPE_MOSTLY_Q6_K = 18 -LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19 -LLAMA_FTYPE_MOSTLY_IQ2_XS = 20 -LLAMA_FTYPE_MOSTLY_Q2_K_S = 21 -LLAMA_FTYPE_MOSTLY_IQ3_XS = 22 -LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23 -LLAMA_FTYPE_MOSTLY_IQ1_S = 24 -LLAMA_FTYPE_MOSTLY_IQ4_NL = 25 -LLAMA_FTYPE_MOSTLY_IQ3_S = 26 -LLAMA_FTYPE_MOSTLY_IQ3_M = 27 -LLAMA_FTYPE_MOSTLY_IQ2_S = 28 -LLAMA_FTYPE_MOSTLY_IQ2_M = 29 -LLAMA_FTYPE_MOSTLY_IQ4_XS = 30 -LLAMA_FTYPE_MOSTLY_IQ1_M = 31 -LLAMA_FTYPE_MOSTLY_BF16 = 32 -# LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33 -# LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34 -# LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35 -LLAMA_FTYPE_MOSTLY_TQ1_0 = 36 -LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 -LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38 -LLAMA_FTYPE_MOSTLY_NVFP4 = 39 -LLAMA_FTYPE_MOSTLY_Q1_0 = 40 -LLAMA_FTYPE_GUESSED = 1024 +class llama_ftype(enum.IntEnum): + LLAMA_FTYPE_ALL_F32 = 0 + LLAMA_FTYPE_MOSTLY_F16 = 1 + LLAMA_FTYPE_MOSTLY_Q4_0 = 2 + LLAMA_FTYPE_MOSTLY_Q4_1 = 3 + LLAMA_FTYPE_MOSTLY_Q8_0 = 7 + LLAMA_FTYPE_MOSTLY_Q5_0 = 8 + LLAMA_FTYPE_MOSTLY_Q5_1 = 9 + LLAMA_FTYPE_MOSTLY_Q2_K = 10 + LLAMA_FTYPE_MOSTLY_Q3_K_S = 11 + LLAMA_FTYPE_MOSTLY_Q3_K_M = 12 + LLAMA_FTYPE_MOSTLY_Q3_K_L = 13 + LLAMA_FTYPE_MOSTLY_Q4_K_S = 14 + LLAMA_FTYPE_MOSTLY_Q4_K_M = 15 + LLAMA_FTYPE_MOSTLY_Q5_K_S = 16 + LLAMA_FTYPE_MOSTLY_Q5_K_M = 17 + LLAMA_FTYPE_MOSTLY_Q6_K = 18 + LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19 + LLAMA_FTYPE_MOSTLY_IQ2_XS = 20 + LLAMA_FTYPE_MOSTLY_Q2_K_S = 21 + LLAMA_FTYPE_MOSTLY_IQ3_XS = 22 + LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23 + LLAMA_FTYPE_MOSTLY_IQ1_S = 24 + LLAMA_FTYPE_MOSTLY_IQ4_NL = 25 + LLAMA_FTYPE_MOSTLY_IQ3_S = 26 + LLAMA_FTYPE_MOSTLY_IQ3_M = 27 + LLAMA_FTYPE_MOSTLY_IQ2_S = 28 + LLAMA_FTYPE_MOSTLY_IQ2_M = 29 + LLAMA_FTYPE_MOSTLY_IQ4_XS = 30 + LLAMA_FTYPE_MOSTLY_IQ1_M = 31 + LLAMA_FTYPE_MOSTLY_BF16 = 32 + # LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33 + # LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34 + # LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35 + LLAMA_FTYPE_MOSTLY_TQ1_0 = 36 + 
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 + LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38 + LLAMA_FTYPE_MOSTLY_NVFP4 = 39 + LLAMA_FTYPE_MOSTLY_Q1_0 = 40 + LLAMA_FTYPE_GUESSED = 1024 # enum llama_rope_scaling_type { # LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, From ca81fd457969bba20d183d27962e28b45d9207ea Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 20:33:38 +0800 Subject: [PATCH 084/139] feat: add `ReasoningBudgetState` enum and `TokenMatcher` helper class to _internals.py Introduce `ReasoningBudgetState` enum and `TokenMatcher` helper class to `_internals.py`. This lays the groundwork for the upcoming `ReasoningBudgetSampler`, mirroring the state machine defined in `common/reasoning-budget.h`. - `ReasoningBudgetState`: Tracks the lifecycle of the first reasoning block. - `TokenMatcher`: Handles incremental matching for multi-token sequences. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 5416ce2416..5b5c533c52 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1265,6 +1265,60 @@ class CommonSamplerType(enum.IntEnum): CUSTOM = 99 + +# common/reasoning-budget.h +# +# enum common_reasoning_budget_state { +# REASONING_BUDGET_IDLE, // waiting for start sequence +# REASONING_BUDGET_COUNTING, // counting down tokens +# REASONING_BUDGET_FORCING, // forcing budget message + end sequence +# REASONING_BUDGET_WAITING_UTF8, // budget exhausted, waiting for UTF-8 completion +# REASONING_BUDGET_DONE, // passthrough forever +# }; +class ReasoningBudgetState(enum.IntEnum): + """ + State machine for the generic first-reasoning-block budget controller. + + This sampler only controls the first reasoning block. Once the first block + naturally ends or is forcibly closed, the sampler enters DONE and becomes a + permanent passthrough. + """ + + IDLE = 0 # Waiting for the first reasoning_start sequence. 
+ COUNTING = 1 # Counting generated tokens inside the first reasoning block. + FORCING = 2 # Forcing reasoning_budget_message + reasoning_end. + WAITING_UTF8 = 3 # Budget exhausted; waiting for a complete UTF-8 boundary. + DONE = 4 # Permanent passthrough; later reasoning tags are ignored. + + +class TokenMatcher: + """ + Incremental matcher for a multi-token sequence. + Accepts None as tokens to represent no matcher. + """ + def __init__(self, tokens: Optional[Sequence[int]]): + # If None, matcher never matches anything + self.tokens = list(tokens) if tokens is not None else [] + self.pos = 0 + + def advance(self, token: int) -> bool: + if not self.tokens: + return False + if token == self.tokens[self.pos]: + self.pos += 1 + if self.pos >= len(self.tokens): + self.pos = 0 + return True + else: + self.pos = 0 + if token == self.tokens[0]: + self.pos = 1 + return False + + def reset(self) -> None: + self.pos = 0 + + @dataclass class LlamaSamplingParams: seed: int = llama_cpp.LLAMA_DEFAULT_SEED # the seed used to initialize llama_sampler From ab42b8664313a30c390fcf26caaec9602199c0f4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 22:48:42 +0800 Subject: [PATCH 085/139] docs(readme): update supported embeddings models table - Add jina-embeddings-v2-base-zh - Add jina-embeddings-v3 - Minor table formatting clean up Signed-off-by: JamePeng --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c39df4abd7..c5aa1a1b26 100644 --- a/README.md +++ b/README.md @@ -1463,7 +1463,9 @@ run_inference( | Model | Type | Link | Status | |--------------------|-----------|--------------------------------------------------------|--------------| -| `bge-m3` | Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`bge-m3`| Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`jina-embeddings-v2-base-zh`| Embedding 
|[jina-embeddings-v2-base-zh-GGUF](https://huggingface.co/gpustack/jina-embeddings-v2-base-zh-GGUF) | Useful ✅ | +|`jina-embeddings-v3`| Embedding |[jina-embeddings-v3-GGUF](https://huggingface.co/second-state/jina-embeddings-v3-GGUF) | Useful ✅ | |`bge-reranker-v2-m3`| Rerank |[bge-reranker-v2-m3-GGUF](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF) | Useful ✅ | |`qwen3-reranker`| Rerank |[Qwen3-Reranker-GGUF](https://huggingface.co/JamePeng2023/Qwen3-Reranker-GGUF) | Useful ✅ | From 90d610ffd7b491603ca23c3b0027629553731658 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 23:11:38 +0800 Subject: [PATCH 086/139] docs(llama_embedding): update supported embeddings models table - Add jina-embeddings-v2-base-zh - Add jina-embeddings-v3 - Minor table formatting clean up Signed-off-by: JamePeng --- docs/wiki/modules/LlamaEmbedding.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/wiki/modules/LlamaEmbedding.md b/docs/wiki/modules/LlamaEmbedding.md index 1279db5cab..3aa2427227 100644 --- a/docs/wiki/modules/LlamaEmbedding.md +++ b/docs/wiki/modules/LlamaEmbedding.md @@ -3,7 +3,7 @@ title: Llama Embedding module_name: llama_cpp.llama_embedding source_file: llama_cpp/llama_embedding.py class_name: LlamaEmbedding -last_updated: 2026-05-01 +last_updated: 2026-05-31 version_target: "latest" --- @@ -18,7 +18,9 @@ version_target: "latest" | Model | Type | Link | Status | |--------------------|-----------|--------------------------------------------------------|--------------| -| `bge-m3` | Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`bge-m3`| Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`jina-embeddings-v2-base-zh`| Embedding |[jina-embeddings-v2-base-zh-GGUF](https://huggingface.co/gpustack/jina-embeddings-v2-base-zh-GGUF) | Useful ✅ | +|`jina-embeddings-v3`| Embedding 
|[jina-embeddings-v3-GGUF](https://huggingface.co/second-state/jina-embeddings-v3-GGUF) | Useful ✅ | |`bge-reranker-v2-m3`| Rerank |[bge-reranker-v2-m3-GGUF](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF) | Useful ✅ | |`qwen3-reranker`| Rerank |[Qwen3-Reranker-GGUF](https://huggingface.co/JamePeng2023/Qwen3-Reranker-GGUF) | Useful ✅ | From e174c1073c3c9408b6325ea1fac63688efacbb2e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 1 Jun 2026 09:07:29 +0800 Subject: [PATCH 087/139] feat(sampling): add reasoning budget configurations Introduce reasoning budget and block control parameters to `LlamaSamplingParams` to mirror llama.cpp CLI semantics. This includes: - `reasoning_budget` - `reasoning_start` / `reasoning_end` - `reasoning_budget_message` - `reasoning_start_in_prompt` - `reasoning_start_max_tokens` - Fix typo from typ_p to typical_p in logs Also updated `print_params()` to include these new metrics. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 64 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 5b5c533c52..9a22096a26 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1363,6 +1363,59 @@ class LlamaSamplingParams: default_factory=lambda: ["\n", ":", "\"", "*"] # default sequence breakers for DRY ) + # Reasoning Budget Params + # + # Generic first-reasoning-block budget control. + # + # This is intentionally model-agnostic: + # - It does not infer model families. + # - It does not guess reasoning tags from chat templates. + # - Downstream code should pass reasoning_start / reasoning_end explicitly + # for models that do not use the default ... tags. + # + # The sampler only controls the first visible reasoning block. After that + # block naturally ends or is forcibly closed, later reasoning tags are ignored. 
+ # Matches llama.cpp CLI semantics: + # --reasoning-budget N + reasoning_budget: int = -1 # -1 = unrestricted / disabled, 0 = immediate end, N > 0 = token budget + + # Token/text sequence that marks the beginning of the first reasoning block. + # This sequence is tokenized with add_bos=False, special=True before building + # the ReasoningBudgetSampler. + reasoning_start: str = "" + + # Token/text sequence that marks the natural end of the reasoning block. + # When the budget is exhausted, the sampler forces: + # reasoning_budget_message + reasoning_end + reasoning_end: str = "" + + # Optional message injected before reasoning_end when the budget is exhausted. + # Mirrors llama.cpp CLI semantics: + # --reasoning-budget-message MESSAGE + # + # Example forced text: + # "[reasoning budget exhausted]\n" + reasoning_budget_message: Optional[str] = None + + # True when the prompt/chat template has already inserted reasoning_start. + # + # In that case, the sampler will not see the start tag during generation, so + # it must start directly in COUNTING state from the first generated token. + reasoning_start_in_prompt: bool = False + + # Safety window for non-reasoning models. + # + # If reasoning_start is not generated within this many output tokens, the + # sampler permanently switches to DONE and becomes a no-op. This prevents + # later literal mentions of "" in normal answer text from accidentally + # activating the budget controller. + # + # Ignored when reasoning_start_in_prompt=True because counting starts from + # the first generated token. + # + # Set to None to keep waiting for reasoning_start indefinitely. 
+ reasoning_start_max_tokens: Optional[int] = 32 + custom_samplers: List['CustomSampler'] = field(default_factory=list) samplers: List[CommonSamplerType] = field( @@ -1402,11 +1455,18 @@ def print_params(self) -> str: f"\ttop_k = {self.top_k}, top_p = {self.top_p:.3f}, min_p = {self.min_p:.3f}, " f"xtc_probability = {self.xtc_probability:.3f}, xtc_threshold = {self.xtc_threshold:.3f}, " - f"typical_p = {self.typ_p:.3f}, top_n_sigma = {self.top_n_sigma:.3f}, temp = {self.temp:.3f}\n" + f"typical_p = {self.typical_p:.3f}, top_n_sigma = {self.top_n_sigma:.3f}, temp = {self.temp:.3f}\n" f"\tmirostat = {self.mirostat}, mirostat_lr = {self.mirostat_eta:.3f}, " f"mirostat_ent = {self.mirostat_tau:.3f}, adaptive_target = {self.adaptive_target:.3f}, " - f"adaptive_decay = {self.adaptive_decay:.3f}" + f"adaptive_decay = {self.adaptive_decay:.3f}\n" + + f"\treasoning_budget = {self.reasoning_budget}, " + f"reasoning_start = {self.reasoning_start!r}, reasoning_end = {self.reasoning_end!r}\n" + + f"\treasoning_budget_message = {self.reasoning_budget_message!r}, " + f"reasoning_start_in_prompt = {self.reasoning_start_in_prompt}, " + f"reasoning_start_max_tokens = {self.reasoning_start_max_tokens}" ) return result From 9bb06dacc676cda4678e20ba3171f90e4e9e9362 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 01:01:31 +0800 Subject: [PATCH 088/139] Update Submodule vendor/llama.cpp d4c8e2c..27d9ed8 Signed-off-by: JamePeng --- llama_cpp/llama.py | 11 +++++++++-- llama_cpp/llama_cpp.py | 6 ++++++ vendor/llama.cpp | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c2d2757e13..b9a1265b49 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -120,6 +120,7 @@ def __init__( n_ubatch: int = 512, n_seq_max: int = 1, n_rs_seq: int = 0, + n_outputs_max: int = 0, n_threads: Optional[int] = None, n_threads_batch: Optional[int] = None, ctx_type: Optional[ @@ -478,7 +479,8 @@ def __init__( self.n_batch = 
min(n_ctx, n_batch) # ??? self.n_keep = n_keep if n_keep > 0 else 256 self.n_seq_max = n_seq_max - self.n_rs_seq = n_rs_seq + self.n_rs_seq = n_rs_seq + self.n_outputs_max = n_outputs_max self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() @@ -490,8 +492,13 @@ def __init__( self.context_params.n_ctx = n_ctx self.context_params.n_batch = self.n_batch self.context_params.n_ubatch = min(self.n_batch, n_ubatch) - self.context_params.n_seq_max = self.n_seq_max + + self.context_params.n_seq_max = max(1, self.n_seq_max) + if self.context_params.n_seq_max > llama_cpp_lib.LLAMA_MAX_SEQ: + raise RuntimeError(f"n_seq_max must be <= {llama_cpp_lib.LLAMA_MAX_SEQ}") + self.context_params.n_rs_seq = self.n_rs_seq + self.context_params.n_outputs_max = self.n_batch if self.n_outputs_max == 0 else self.n_outputs_max self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 62c4c81ef9..01aa8cce9b 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -55,6 +55,8 @@ LLAMA_MAX_DEVICES = _lib.llama_max_devices() +LLAMA_MAX_SEQ = 256 + # define LLAMA_DEFAULT_SEED 0xFFFFFFFF LLAMA_DEFAULT_SEED = 0xFFFFFFFF @@ -847,6 +849,7 @@ class llama_sampler_seq_config(ctypes.Structure): # uint32_t n_ubatch; // physical maximum batch size # uint32_t n_seq_max; // max number of sequences (i.e. 
distinct states for recurrent models) # uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] +# uint32_t n_outputs_max; // max outputs in a ubatch (0 = n_batch) # int32_t n_threads; // number of threads to use for generation # int32_t n_threads_batch; // number of threads to use for batch processing @@ -905,6 +908,7 @@ class llama_context_params(ctypes.Structure): n_ubatch (int): physical maximum batch size n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models) n_rs_seq (int): number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] + n_outputs_max (int): max outputs in a ubatch (0 = n_batch) n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing @@ -949,6 +953,7 @@ class llama_context_params(ctypes.Structure): n_ubatch: int n_seq_max: int n_rs_seq: int + n_outputs_max: int n_threads: int n_threads_batch: int ctx_type: int @@ -985,6 +990,7 @@ class llama_context_params(ctypes.Structure): ("n_ubatch", ctypes.c_uint32), ("n_seq_max", ctypes.c_uint32), ("n_rs_seq", ctypes.c_uint32), + ("n_outputs_max", ctypes.c_uint32), ("n_threads", ctypes.c_int32), ("n_threads_batch", ctypes.c_int32), ("ctx_type", ctypes.c_int), diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d4c8e2c29c..27d9ed8397 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d4c8e2c29ce2fb9a251a0a4a16d6c857b4f70f8c +Subproject commit 27d9ed839713e31c7a0ba45e342109a04549834f From a7db23afd86269bb9c08c00b00f2d23288880e50 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 03:09:17 +0800 Subject: [PATCH 089/139] feat(chat-format): improve Jinja2ChatFormatter HF compatibility Enhance Jinja2ChatFormatter to better support HuggingFace-style chat templates while keeping the formatter lightweight and aligned with llama-cpp-python's prompt-rendering needs. 
This change adds a custom Jinja extension for `{% generation %}` blocks. HuggingFace Transformers uses this tag to track assistant-token spans for assistant masks, but llama-cpp-python only needs the final rendered prompt. The new IgnoreGenerationTags extension therefore treats the tag as a transparent wrapper: it removes the generation/endgeneration tag pair while rendering the inner template body normally. This allows templates that contain `{% generation %}` blocks to render successfully without introducing span tracking overhead. The Jinja environment is also expanded to more closely match Transformers' chat-template runtime behavior. It now enables `jinja2.ext.loopcontrols` for templates that use `{% break %}` or `{% continue %}`, registers a plain JSON `tojson` filter that avoids Jinja's HTML escaping behavior, and exposes `raise_exception` and `strftime_now` as globals instead of passing them on every render call. The formatter now accepts an optional `special_tokens_map`, making additional tokenizer special tokens available to templates. This improves compatibility with templates that reference variables such as `pad_token`, `unk_token`, `sep_token`, or model-specific special tokens beyond `bos_token` and `eos_token`. This also adds optional `documents` support to `__call__`, allowing RAG-style or document-aware chat templates to receive a `documents` variable in the render context. Finally, static stop fields are precomputed during initialization. Text stop sequences and token-id stopping criteria are now built once instead of being recreated for every chat formatting call. The token-id stopping callback also guards against empty token arrays before reading the last token. Key changes: - Add IgnoreGenerationTags Jinja extension for HF `{% generation %}` blocks. - Enable Jinja loop controls for chat templates using break/continue. - Register Transformers-compatible `tojson` behavior. - Register `raise_exception` and `strftime_now` as Jinja globals. 
- Add `special_tokens_map` support for additional template variables. - Add optional `documents` argument for document-aware templates. - Precompute text stop sequences and token-id stopping criteria. - Improve type normalization for `stop_token_ids`. - Expand docstrings for formatter initialization and render-time variables. Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 264 +++++++++++++++++++++++++++++---- 1 file changed, 232 insertions(+), 32 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 71228d0627..f91844bbb7 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -26,6 +26,7 @@ ) import jinja2 +from jinja2.ext import Extension from jinja2.sandbox import ImmutableSandboxedEnvironment import numpy as np @@ -220,6 +221,46 @@ def __call__( class Jinja2ChatFormatter(ChatFormatter): + class IgnoreGenerationTags(Extension): + """Render HuggingFace `{% generation %}` blocks without tracking. + + HuggingFace chat templates may wrap assistant text with: + + {% generation %} + ... + {% endgeneration %} + + Transformers uses this tag to compute assistant-token masks. In + llama-cpp-python chat formatting we only need the final rendered prompt, + so this extension simply removes the tag pair and renders the inner + content as normal Jinja template content. + + This keeps compatibility with HF templates while avoiding the overhead + of span tracking. + + More information see: + https://github.com/huggingface/transformers/blob/39603d0e5cdb6f00e8d473d7fcbb01032d709181/src/transformers/utils/chat_template_utils.py#L425 + """ + + tags = {"generation"} + + def parse(self, parser: jinja2.parser.Parser): + # Consume the opening `{% generation %}` token. + lineno = next(parser.stream).lineno + + # Parse and return the block body until `{% endgeneration %}`. + # Returning the body directly makes the tag a transparent wrapper. 
+ body = parser.parse_statements( + ("name:endgeneration",), + drop_needle=True, + ) + + # Preserve line numbers for better template error messages. + for node in body: + node.set_lineno(lineno) + + return body + def __init__( self, template: str, @@ -227,21 +268,118 @@ def __init__( bos_token: str, add_generation_prompt: bool = True, stop_token_ids: Optional[List[int]] = None, + special_tokens_map: Optional[Dict[str, str]] = None, ): - """A chat formatter that uses jinja2 templates to format the prompt.""" + """Format chat messages with a HuggingFace-style Jinja2 chat template. + + Args: + template: + Raw HuggingFace chat template string. + eos_token: + Text form of the model EOS token. + bos_token: + Text form of the model BOS token. + add_generation_prompt: + Whether to ask the template to append the assistant generation + prefix. This mirrors Transformers' `add_generation_prompt`. + stop_token_ids: + Optional token ids that should stop generation when they appear + as the last generated token. This is llama-cpp-python specific. + special_tokens_map: + Optional tokenizer special-token map. Some HF templates may + reference extra variables such as `pad_token`, `unk_token`, + `sep_token`, or model-specific special tokens. + """ self.template = template self.eos_token = eos_token self.bos_token = bos_token self.add_generation_prompt = add_generation_prompt + self.special_tokens_map = special_tokens_map or {} + self.stop_token_ids = ( - set(stop_token_ids) if stop_token_ids is not None else None + {int(token_id) for token_id in stop_token_ids} + if stop_token_ids is not None + else None ) - self._environment = ImmutableSandboxedEnvironment( + environment = ImmutableSandboxedEnvironment( loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True, - ).from_string(self.template) + # Keep this aligned with Transformers' chat-template Jinja setup: + # - IgnoreGenerationTags supports `{% generation %}` blocks. 
+ # - loopcontrols supports `{% break %}` and `{% continue %}`. + extensions=[ + Jinja2ChatFormatter.IgnoreGenerationTags, + jinja2.ext.loopcontrols, + ], + ) + + # Match Transformers' chat-template JSON behavior. + # Jinja's default `tojson` escapes HTML characters, which is not what + # plain-text chat templates usually expect. + environment.filters["tojson"] = self.tojson + + # Register these as globals once instead of passing them on every render. + environment.globals["raise_exception"] = self.raise_exception + environment.globals["strftime_now"] = self.strftime_now + + self._environment = environment + self._template = environment.from_string(self.template) + + # Precompute static stop fields once. This avoids rebuilding closures and + # StoppingCriteriaList objects for every chat completion request. + self._stop = [self.eos_token] if self.eos_token else [] + self._stopping_criteria = self._build_stopping_criteria() + + @staticmethod + def raise_exception(message: str): + """Raise a Jinja template error from inside a chat template.""" + raise jinja2.exceptions.TemplateError(message) + + @staticmethod + def strftime_now(format_string: str = "%Y-%m-%d %H:%M:%S") -> str: + """Return the current local time formatted with `datetime.strftime`.""" + return datetime.datetime.now().strftime(format_string) + + @staticmethod + def tojson( + x: Any, + ensure_ascii: bool = False, + indent: Optional[int] = None, + separators: Optional[Tuple[str, str]] = None, + sort_keys: bool = False, + ) -> str: + """Serialize an object to JSON for chat-template rendering. + + This intentionally bypasses Jinja's built-in `tojson` filter because + the built-in filter escapes HTML-sensitive characters. HuggingFace chat + templates expect plain JSON text instead. 
+ """ + return json.dumps( + x, + ensure_ascii=ensure_ascii, + indent=indent, + separators=separators, + sort_keys=sort_keys, + ) + + def _build_stopping_criteria(self): + """Create stopping criteria once during initialization.""" + if self.stop_token_ids is None: + return None + + stop_token_ids = self.stop_token_ids + + def stop_on_last_token( + tokens: npt.NDArray[np.intc], + logits: npt.NDArray[np.single], + ) -> bool: + # Defensive guard: generation normally calls this with at least one + # token, but the callback should never crash on empty input. + return len(tokens) > 0 and int(tokens[-1]) in stop_token_ids + + return llama_core.StoppingCriteriaList([stop_on_last_token]) def __call__( self, @@ -251,44 +389,106 @@ def __call__( function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + documents: Optional[List[Dict[str, Any]]] = None, **kwargs: Any, ) -> ChatFormatterResponse: - def raise_exception(message: str): - raise ValueError(message) + """Render OpenAI-style chat messages into a model prompt. - def strftime_now(format_string="%Y-%m-%d %H:%M:%S") -> str: - """ - Returns the current time formatted as a string. - """ - return datetime.datetime.now().strftime(format_string) + The method builds the variable context expected by HuggingFace-style + Jinja chat templates and renders the final prompt string used by + llama-cpp-python. - prompt = self._environment.render( - messages=messages, - eos_token=self.eos_token, - bos_token=self.bos_token, - raise_exception=raise_exception, - strftime_now=strftime_now, - add_generation_prompt=self.add_generation_prompt, - functions=functions, - function_call=function_call, - tools=tools, - tool_choice=tool_choice, - ) + Template variables provided by default: + messages: + The chat history to render. 
Each item is expected to be an + OpenAI-style message dictionary, usually containing at least + `role` and `content`. - stopping_criteria = None - if self.stop_token_ids is not None: + eos_token: + The model's end-of-sequence token string. + + bos_token: + The model's beginning-of-sequence token string. + + add_generation_prompt: + Whether the template should append the assistant generation + prefix. This mirrors Transformers' `add_generation_prompt`. + + functions: + Legacy OpenAI-compatible function definitions, if provided. - def stop_on_last_token( - tokens: npt.NDArray[np.intc], logits: npt.NDArray[np.single] - ) -> bool: - return tokens[-1] in self.stop_token_ids + function_call: + Legacy OpenAI-compatible function-call selection, if provided. - stopping_criteria = llama_core.StoppingCriteriaList([stop_on_last_token]) + tools: + OpenAI/HuggingFace-compatible tool definitions, if provided. + This formatter expects tools to already be normalized into + JSON-schema-like dictionaries. It does not auto-convert Python + callables into JSON schemas like Transformers can. + + tool_choice: + Optional tool-choice instruction, such as `"auto"`, `"none"`, + or a specific tool/function selection object. + + documents: + Optional RAG/document context. Some HF chat templates reference + this variable when rendering retrieval-augmented prompts. + + **kwargs: + Extra model-specific or template-specific variables. These are + merged into the template context last, so they can intentionally + override the defaults above when needed. + + Additional variables: + Values from `special_tokens_map` are also exposed to the template, + such as `pad_token`, `unk_token`, `sep_token`, or custom + model-specific special tokens. Core variables like `messages`, + `eos_token`, and `bos_token` override `special_tokens_map` entries + by default. 
+ + Returns: + ChatFormatterResponse: + Contains the rendered prompt, text stop sequences, optional + token-id stopping criteria, and `added_special=True` because the + chat template is responsible for adding model special tokens. + + Raises: + jinja2.exceptions.TemplateError: + If the template calls `raise_exception(...)` or Jinja rendering + fails. + """ + template_kwargs: Dict[str, Any] = {} + + # Make extra tokenizer special tokens available to templates, e.g. + # `pad_token`, `unk_token`, `sep_token`, or model-specific tokens. + template_kwargs.update(self.special_tokens_map) + + # Explicit core variables should override values from special_tokens_map. + template_kwargs.update( + { + "messages": messages, + "eos_token": self.eos_token, + "bos_token": self.bos_token, + "add_generation_prompt": self.add_generation_prompt, + "functions": functions, + "function_call": function_call, + "tools": tools, + "tool_choice": tool_choice, + "documents": documents, + } + ) + + # Let caller-provided kwargs extend the template context. + # If a caller intentionally passes a same-name key, it will override the + # defaults above. This is useful for model-specific template variables. + template_kwargs.update(kwargs) + + prompt = self._template.render(**template_kwargs) return ChatFormatterResponse( prompt=prompt, - stop=[self.eos_token], - stopping_criteria=stopping_criteria, + stop=self._stop, + stopping_criteria=self._stopping_criteria, added_special=True, ) From bbede198b8012b702bc1e6d241f0887b6e3336a2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 03:40:31 +0800 Subject: [PATCH 090/139] feat(llama): enhance chat template initialization with full special tokens Update Llama.__init__ to register additional tokenizer special tokens and improve stop token handling for chat templates. - Expose extra special tokens (EOT, SEP, NL, PAD, MASK) via `special_tokens_map` to Jinja2ChatFormatter. 
- Keep BOS and EOS tokens as explicit parameters, no longer redundantly put them in `special_tokens_map`. - Build `stop_token_ids` once, including EOS and EOT tokens, skipping invalid (-1) ids. - Update try-block comment: now `{% generation %}` blocks are supported, guard only against malformed or model-specific templates. - This ensures better compatibility with HuggingFace-style chat templates while maintaining llama-cpp-python prompt-rendering behavior. Signed-off-by: JamePeng --- llama_cpp/llama.py | 48 +++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b9a1265b49..43e3d6f1fd 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -692,9 +692,6 @@ def __init__( self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() - self._token_nl = self.token_nl() - self._token_eos = self.token_eos() - self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) self.n_tokens = 0 @@ -720,13 +717,38 @@ def __init__( eos_token_id = self.token_eos() bos_token_id = self.token_bos() + eot_token_id = self.token_eot() + sep_token_id = self.token_sep() + nl_token_id = self.token_nl() + pad_token_id = self.token_pad() + mask_token_id = self.token_mask() + + def _token_text(token_id: int) -> str: + return self._model.token_get_text(token_id) if token_id != -1 else "" + + bos_token = _token_text(bos_token_id) + eos_token = _token_text(eos_token_id) + + special_tokens_map = { + name: text + for name, token_id in { + "eot_token": eot_token_id, + "sep_token": sep_token_id, + "nl_token": nl_token_id, + "pad_token": pad_token_id, + "mask_token": mask_token_id, + }.items() + if token_id != -1 and (text := _token_text(token_id)) + } - eos_token = ( - self._model.token_get_text(eos_token_id) if eos_token_id != -1 else "" - ) - bos_token = ( - self._model.token_get_text(bos_token_id) if bos_token_id != -1 else "" - ) + stop_token_ids = [ + token_id + for token_id in 
(eos_token_id, eot_token_id) + if token_id != -1 + ] + + if not stop_token_ids: + stop_token_ids = None # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates template_choices = dict( @@ -750,14 +772,14 @@ def __init__( for name, template in template_choices.items(): try: # Attempt to parse and register the template as a valid chat handler. - # We wrap this in a try-block because some models (like LLaVA) contain - # non-standard Jinja2 tags (e.g., {% generation %}) that cause the - # standard parser to crash. + # Keep this guarded because model metadata may contain malformed or + # model-specific Jinja templates that still cannot be rendered by this runtime. self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter( template=template, eos_token=eos_token, bos_token=bos_token, - stop_token_ids=[eos_token_id], + stop_token_ids=stop_token_ids, + special_tokens_map=special_tokens_map, ).to_chat_handler() except Exception as e: # If parsing fails (e.g., TemplateSyntaxError), log a warning but do not crash. From e6b58356323d116df141b163f40be3ec988cf290 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 04:54:18 +0800 Subject: [PATCH 091/139] docs: update SCHEMA.md to v0.4 with full wiki path layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added comprehensive docs/wiki/ directory structure overview. - Reorganized modules description; removed hardcoded module page list. - Clarified top-level file purposes and update guidance. - Updated page type examples and templates (Class/Module, Feature, Example, Development). - Strengthened cross-linking rules and update/placeholder guidance. - Bumped schema version from 0.3 → 0.4 and last_modified date. 
Signed-off-by: JamePeng --- docs/wiki/SCHEMA.md | 141 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 21 deletions(-) diff --git a/docs/wiki/SCHEMA.md b/docs/wiki/SCHEMA.md index 23954a156e..1ffcb1e227 100644 --- a/docs/wiki/SCHEMA.md +++ b/docs/wiki/SCHEMA.md @@ -4,14 +4,15 @@ - **Author**: JamePeng - **Maintainer**: LLM-assisted documentation workflow - **Project**: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) wiki -- **Last Modified**: 2026-05-16 +- **Last Modified**: 2026-06-02 - **Version Target**: latest source code -- **Schema Version**: 0.3 +- **Schema Version**: 0.4 **Purpose**: - Maintain a living, always-up-to-date, structured documentation wiki for the `llama-cpp-python` library, with LLMs acting as the primary documentation maintainer. - The wiki must help users understand the latest public API, core classes, modules, configuration options, examples, and migration paths based on the current source code. - The wiki should explain not only *how to call an API*, but also *what role the class/module plays in the library*, *how its state is configured*, and *how users should choose between related APIs*. +- The schema also defines the expected wiki directory layout, page ownership, and update rules so new pages can be generated consistently. **Core Principles**: - The source of truth is the latest code in `llama_cpp/`, especially: @@ -29,7 +30,7 @@ - Prefer documenting public and user-facing APIs first. Internal implementation details may be documented only when they help users understand behavior, extension points, debugging, or advanced usage. - All examples must be complete, runnable with the latest API, and include necessary imports. - Clearly mark deprecated, legacy, or changed usage with a warning and show the modern replacement. -- Use internal wiki links (e.g. [[Llama]], [[Qwen35ChatHandler]]) for cross-referencing. 
+- Use internal wiki links, such as `[[Llama]]`, `[[LlamaCache]]`, `[[LlamaSpeculative]]`, or `[[Qwen35ChatHandler]]`, for cross-referencing. - Keep pages concise, professional, and user-friendly. **Documentation Language**: @@ -38,9 +39,53 @@ - Code comments inside examples should also be in English by default. - If the source code contains Chinese comments or non-English notes, translate them into clear English while preserving the original meaning. +**Wiki Directory Layout**: + +The wiki should be organized by documentation purpose rather than by source-file location alone. + +```text +docs/wiki/ +├─ core/ # Core classes and modules (e.g., Llama, main API objects) +├─ development/ # Developer-focused pages, tools, agents, CI/CD workflows +├─ examples/ # Complete runnable examples for users +├─ features/ # High-level features spanning multiple classes/modules +├─ modules/ # Specialized modules (cache, embeddings, logging, speculative decoding, bindings) +├─ types/ # Type definitions and data structures used across the library +├─ .gitkeep # Placeholder for Git to track empty directories +├─ contributing-to-wiki.md # Guidelines for contributing to the wiki +├─ index.md # Entry point and table of contents +├─ install.md # Installation instructions +├─ SCHEMA.md # Documentation schema and style guide (this file) +├─ troubleshooting.md # Known issues, debugging tips, FAQ +``` + +### Top-Level Files + +| Path | Purpose | Update Guidance | +|---|---|---| +| `docs/wiki/SCHEMA.md` | Defines the documentation contract, directory structure, page templates, and LLM update rules. | Update when adding a new page type, directory, documentation standard, or structural convention. | +| `docs/wiki/index.md` | Main wiki landing page and navigation entry. | Update when important pages are added, renamed, reorganized, or promoted. | +| `docs/wiki/contributing-to-wiki.md` | Human and LLM contribution guide for maintaining the wiki. 
| Keep aligned with this schema, especially source-reading and accuracy rules. | +| `docs/wiki/install.md` | Installation guide placeholder or final installation documentation. | Convert from placeholder to complete page when installation docs are ready. | +| `docs/wiki/troubleshooting.md` | Troubleshooting guide placeholder or final diagnostics documentation. | Expand with common runtime, build, backend, model loading, and environment issues. | +| `docs/wiki/.gitkeep` | Keeps the wiki directory tracked when needed. | No documentation content is required. | + +### Directory Ownership + +| Directory | Purpose | Typical Content | Primary Audience | +|---|---|---|---| +| `core/` | High-level public entry points and central user APIs. | `Llama`, model lifecycle, generation APIs, chat/completion interfaces. | General users and advanced users. | +| `modules/` | Focused subsystem pages, user-facing modules, low-level bindings, helpers, and advanced API areas. | Cache, embeddings, grammar, speculative decoding, logging, llama.cpp bindings, MTMD bindings. | Advanced users, extension authors, maintainers. | +| `features/` | Workflow-oriented guides that span multiple APIs or modules. | Chat formatting, structured output, multimodal usage, backend loading, caching workflows, speculative decoding workflows. | Users solving a specific task. | +| `examples/` | Complete runnable examples. | Minimal inference, chat completion, embeddings, grammar-constrained generation, speculative decoding, multimodal usage. | Users who want copy-paste starting points. | +| `types/` | Type and schema documentation. | Request/response structures, typed dictionaries, protocol-style types, OpenAI-compatible payloads. | Users integrating with typed code or API-compatible workflows. | +| `development/` | Maintainer-facing documentation and contribution workflows. | Build notes, CI notes, release notes, commit generation workflow, documentation maintenance rules. | Maintainers and contributors. 
| + **Page Types and Templates**: -1. **Class / Module Page** (e.g. core/Llama.md, modules/LlamaEmbedding.md) +1. **Class / Module Page** + Examples: `core/Llama.md`, `modules/LlamaEmbedding.md`, `modules/LlamaCache.md` + - Frontmatter (YAML): ```yaml --- @@ -51,14 +96,15 @@ version_target: "latest" --- ``` - - Sections (in order): + + - Sections, in order: - Overview - Role in the Library - Constructor (`__init__`) – full parameter table with types, defaults, and explanations - Important Attributes / State - - Core Methods (with signatures and usage examples) + - Core Methods, with signatures and usage examples - Best Practices & Common Patterns - - Deprecated / Changed APIs (with migration notes) + - Deprecated / Changed APIs, with migration notes - Related Links - The **Overview** should briefly explain: @@ -81,24 +127,77 @@ - Only document attributes that affect user understanding, configuration, lifecycle, inference behavior, caching, chat formatting, embeddings, or debugging. Do not document every trivial private variable. -2. **Feature Page** (features/xxx.md) - - Overview, When to use, Related APIs, Code examples, Configuration Notes, Limitations, Related features - - Feature pages should explain workflows across multiple classes or modules. - -3. **Example Page** (examples/xxx.md) - - Goal, Prerequisites, Complete runnable code block, Expected output, Tips - - Rules: - * Use the latest API. - * Include all imports as need. - * Avoid pseudo-code. - * Keep examples focused. - * Mention required model assumptions when needed, such as GGUF file path or chat format. +2. **Feature Page** + Examples: `features/speculative-decoding.md`, `features/embeddings-rerank.md` + + Feature pages should explain workflows across multiple classes or modules. + + Required sections: + - Overview + - When to Use + - Related APIs + - Code Examples + - Configuration Notes + - Limitations + - Related Features + +3. 
**Example Page** + Example: `examples/chat-completion.md` + + Required sections: + - Goal + - Prerequisites + - Complete Runnable Code + - Expected Output + - Tips + + Rules: + - Use the latest API. + - Include all required imports. + - Avoid pseudo-code. + - Keep examples focused. + - Mention required model assumptions when needed, such as GGUF file path, embedding mode, grammar file, chat format, or multimodal assets. + +4. **Development Page** + Example: `development/GitCommitGenerationAgent.md` + + Development pages are maintainer-facing and may document repository workflows, CI, release notes, build matrix decisions, or documentation maintenance conventions. + + Required sections: + - Overview + - Scope + - Workflow + - Inputs / Outputs + - Rules and Constraints + - Examples + - Related Links + +**Cross-Linking Rules**: + +- Use wiki-style internal links for pages that exist or should exist, such as `[[Llama]]`, `[[LlamaCache]]`, `[[LlamaSpeculative]]`, and `[[Logger]]`. +- Link from high-level pages to lower-level module pages when the module explains advanced details. +- Link from feature pages back to the relevant class/module pages. +- Avoid circular explanations. A page may link to another page for details instead of repeating the same explanation. **Update Rules**: + - Before updating any page, the LLM must read the relevant source files. - Update the `last_updated` date. -- If a new feature appears, such as a new chat handler, sampler, cache type, embedding API, multimodal API, or backend option, create or expand the corresponding page. +- If a new feature appears, such as a new chat handler, sampler, cache type, embedding API, multimodal API, backend option, or binding wrapper, create or expand the corresponding page. - If behavior is inferred from implementation rather than explicitly documented in code, mark the explanation as implementation-based. +- Empty files should be converted into explicit placeholder pages instead of being left blank. 
- Maintain a high standard of readability and accuracy. -This schema is the contract. All generated content must follow it. \ No newline at end of file +**Quality Checklist**: + +Before finalizing a wiki page, verify: + +- The page reflects the latest source code. +- All parameters, defaults, and return values are accurate. +- Examples are runnable and include necessary imports. +- Internal links point to the correct wiki page names. +- Advanced or low-level APIs are clearly labeled. +- Deprecated behavior is clearly separated from current usage. +- The page avoids undocumented claims, speculative behavior, or outdated assumptions. + +This schema is the contract. All generated content must follow it. From 2fbe63ddf829ab596ce359339f49dd7f110bbe89 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 05:33:37 +0800 Subject: [PATCH 092/139] build(deps): align Jinja2 minimum with Transformers Require Jinja2 >= 3.1.0 for HuggingFace-style chat template support. The updated Jinja2ChatFormatter relies on behavior aligned with Transformers' chat-template runtime, which also requires Jinja2 3.1 or newer. Updating the minimum dependency avoids parser/runtime differences with older Jinja versions. 
Signed-off-by: JamePeng --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index eb4b879dd6..dea9b48ff3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "typing-extensions>=4.8.0", "numpy>=1.21.6,<=2.3.2", "diskcache>=5.6.2", - "jinja2>=2.11.3", + "jinja2>=3.1.0", "Pillow>=9.5.0", ] requires-python = ">=3.9" From acf896381f7b18a92bc0477a0c3939e3a79d910b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 20:21:08 +0800 Subject: [PATCH 093/139] Update Submodule vendor/llama.cpp 27d9ed8..60130d1 Signed-off-by: JamePeng --- llama_cpp/_internals.py | 7 ------- llama_cpp/llama_cpp.py | 10 +++++++--- vendor/llama.cpp | 2 +- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 9a22096a26..92ff51447f 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -774,13 +774,6 @@ def set_causal_attn(self, causal_attn: bool): """ llama_cpp.llama_set_causal_attn(self.ctx, causal_attn) - def set_warmup(self, warmup: bool): - """ - Set whether the model is in warmup mode or not - If true, all model tensors are activated during llama_decode() to load and cache their weights. - """ - llama_cpp.llama_set_warmup(self.ctx, warmup) - def synchronize(self): """ Wait until all computations are finished diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 01aa8cce9b..9c911bcb14 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -3085,11 +3085,15 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): # // Set whether the model is in warmup mode or not # // If true, all model tensors are activated during llama_decode() to load and cache their weights. 
-# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); +# // +# // note: using this can cause extra graph reallocations because it changes the graph topology with MoE models, +# // so it is generally not recommended to use in practice. will be removed in the future +# DEPRECATED(LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup), +# "user code should do warmup runs manually [TAG_LLAMA_GRAPH_NO_WARMUP]"); @ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None) def llama_set_warmup(ctx: llama_context_p, warmup: bool, /): - """ Set whether the model is in warmup mode or not - If true, all model tensors are activated during llama_decode() to load and cache their weights""" + """DEPRECATED: using this can cause extra graph reallocations because it changes the graph topology with MoE models, + so it is generally not recommended to use in practice. will be removed in the future""" ... # // Set abort callback diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 27d9ed8397..60130d18f9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 27d9ed839713e31c7a0ba45e342109a04549834f +Subproject commit 60130d18f9ac7f42cb4d7f6060b088a45d8f242e From a29c75495d69dd0bcd9596fecd99789d07a09ffa Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 23:22:47 +0800 Subject: [PATCH 094/139] docs(install): add source-aligned build and backend guide Document installation workflows for llama-cpp-python with a focus on the underlying llama.cpp CMake build configuration. - Add virtual environment, source install, editable install, rebuild, and verification guidance. - Document common CMake options such as GGML_NATIVE, GGML_BACKEND_DL, GGML_CPU_ALL_VARIANTS, and compiler selection. - Summarize backend-specific build flags for CUDA, BLAS, Metal, Vulkan, OpenVINO, HIP, SYCL, OpenCL, CANN, ZenDNN, and zDNN. 
- Include backend runtime notes and common installation pitfalls while keeping server-related installation content out of the page. Signed-off-by: JamePeng --- docs/wiki/install.md | 775 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 775 insertions(+) diff --git a/docs/wiki/install.md b/docs/wiki/install.md index e69de29bb2..576ca14c6f 100644 --- a/docs/wiki/install.md +++ b/docs/wiki/install.md @@ -0,0 +1,775 @@ +--- +title: Installation +page_type: guide +source_files: + - README.md + - vendor/llama.cpp/docs/build.md + - vendor/llama.cpp/docs/backend/ +last_updated: 2026-06-02 +author: JamePeng +version_target: "latest" +--- + +# Installation + +## Overview + +This page explains how to install `llama-cpp-python` from source, with or +without hardware acceleration. + +`llama-cpp-python` builds the native `llama.cpp` libraries during installation +and installs them inside the Python package. The exact build depends on your +Python version, compiler, CMake version, operating system, and selected +`llama.cpp` backend. + +For most users, the safest installation path is: + +1. Create a clean Python virtual environment. +2. Upgrade `pip`. +3. Install from the GitHub repository. +4. Pass `CMAKE_ARGS` only when you need a specific backend. + +--- + +## Requirements + +| Requirement | Notes | +|---|---| +| Python | Python 3.9 or newer. The package metadata currently lists Python 3.9 through 3.14. | +| CMake | CMake 3.21 or newer. | +| C/C++ compiler | Required because the package builds `llama.cpp` native libraries. | +| Git | Required when installing from the GitHub repository or cloning recursively. | +| Backend SDKs | Required only for GPU or accelerator builds, such as CUDA, Vulkan, OpenVINO, ROCm/HIP, or SYCL. | + +Platform compiler guidance: + +| Platform | Typical compiler setup | +|---|---| +| Linux | `gcc` or `clang` plus Python development headers if required by your distribution. | +| Windows | Visual Studio 2022 Build Tools or MinGW. 
For most native builds, Visual Studio Build Tools is recommended. | +| macOS | Xcode Command Line Tools. Metal is enabled by default on supported macOS builds. | + +--- + +## Use a Virtual Environment + +Using a virtual environment avoids mixing build artifacts and dependencies from +different Python installations. + +### Linux and macOS + +```bash +python3 -m venv .venv +source .venv/bin/activate +python -m pip install --upgrade pip setuptools wheel +``` + +### Windows PowerShell + +```powershell +py -3 -m venv .venv +.\.venv\Scripts\Activate.ps1 +python -m pip install --upgrade pip setuptools wheel +``` + +If PowerShell blocks activation scripts, run: + +```powershell +Set-ExecutionPolicy -Scope CurrentUser RemoteSigned +``` + +Then activate the environment again. + +--- + +## Basic Installation + +Install directly from the project repository: + +```bash +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +On Windows PowerShell: + +```powershell +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +This builds `llama.cpp` from source and installs the generated native runtime +libraries alongside the Python package. + +Use verbose output when diagnosing build failures: + +```bash +python -m pip install --verbose "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## Install From a Local Clone + +Clone recursively so the `vendor/llama.cpp` submodule is available: + +```bash +git clone https://github.com/JamePeng/llama-cpp-python --recursive +cd llama-cpp-python +python -m pip install --upgrade pip +python -m pip install . +``` + +If you already cloned without `--recursive`, initialize the submodule manually: + +```bash +git submodule update --init --recursive +``` + +For editable development installs: + +```bash +python -m pip install -e . 
+``` + +--- + +## Passing CMake Options + +`llama.cpp` backend options are passed through CMake. There are two common +ways to pass those options during `pip install`. + +### Environment Variable + +Linux and macOS: + +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Windows PowerShell: + +```powershell +$env:CMAKE_ARGS = "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Clear the variable after the build if you do not want it reused: + +```powershell +Remove-Item Env:CMAKE_ARGS +``` + +### `pip --config-settings` + +You can also pass CMake arguments through `pip`: + +```bash +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" \ + -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS" +``` + +Use semicolons inside `cmake.args` when passing multiple CMake definitions. + +--- + +## Common CMake Options + +The Python package forwards CMake options to the bundled `vendor/llama.cpp` +build. These options are useful across many backends. + +| Option | Typical values | Use | +|---|---|---| +| `CMAKE_BUILD_TYPE` | `Release`, `Debug` | Selects build type for single-config generators such as Ninja or Unix Makefiles. Release is the normal install choice. | +| `GGML_NATIVE` | `ON`, `OFF` | Controls whether ggml builds for the current host CPU/GPU. Use `OFF` for more portable wheels; use `ON` for local machine-specific optimization. | +| `BUILD_SHARED_LIBS` | `ON`, `OFF` | Controls shared versus static native libraries. The Python package normally installs shared runtime libraries. | +| `GGML_BACKEND_DL` | `ON`, `OFF` | Builds backend libraries so they can be loaded dynamically at runtime when supported by the build. 
| +| `GGML_CPU_ALL_VARIANTS` | `ON`, `OFF` | Builds multiple CPU backend variants for x86 feature sets when supported. Useful for portable x64 wheels. | +| `GGML_OPENMP` | `ON`, `OFF` | Enables OpenMP CPU parallelism. On Windows, OpenMP runtime DLLs may need to be packaged beside backend DLLs. | +| `CMAKE_PREFIX_PATH` | path list | Helps CMake find SDKs or libraries installed outside default locations. | +| `CMAKE_C_COMPILER` / `CMAKE_CXX_COMPILER` | compiler paths or names | Selects compilers, often needed for SYCL, HIP, or custom toolchains. | + +Example portable CUDA build: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DGGML_NATIVE=OFF" \ + python -m pip install --force-reinstall --no-cache-dir \ + "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Example dynamic CPU backend build: + +```bash +CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_NATIVE=OFF" \ + python -m pip install --force-reinstall --no-cache-dir \ + "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## Backend Quick Reference + +Choose one backend path that matches your hardware and installed SDKs. + +| Backend | Typical CMake option | Notes | +|---|---|---| +| CPU only | none | Default portable path. Performance depends on CPU features and build options. | +| OpenBLAS | `-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS` | CPU BLAS acceleration for prompt processing and larger batches. | +| BLIS | `-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME` | CPU BLAS route using BLIS. | +| Intel oneMKL | `-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp` | Intel CPU BLAS route. This is not the Intel GPU path. | +| CUDA | `-DGGML_CUDA=on` | Requires NVIDIA CUDA Toolkit matching your driver and GPU. | +| Metal | `-DGGML_METAL=on` | Enabled by default on supported macOS builds. Use `-DGGML_METAL=OFF` to disable. | +| Vulkan | `-DGGML_VULKAN=on` | Requires Vulkan SDK and platform-specific setup. 
| +| OpenVINO | `-DGGML_OPENVINO=ON` | Useful for Intel CPU, GPU, and NPU workflows after OpenVINO environment setup. | +| HIP / ROCm | `-DGGML_HIP=ON` | For supported AMD GPUs. May require `GPU_TARGETS`. | +| SYCL | `-DGGML_SYCL=on` | Usually used with Intel oneAPI compilers. | +| OpenCL | `-DGGML_OPENCL=ON` | Primarily documented for Qualcomm Adreno and Snapdragon workflows; can also apply to some other OpenCL devices. | +| CANN | `-DGGML_CANN=ON` | Ascend NPU backend. Requires Ascend drivers and CANN toolkit. | +| ZenDNN | `-DGGML_ZENDNN=ON` | AMD Zen CPU acceleration, mainly matrix multiplication paths. | +| zDNN | `-DGGML_ZDNN=ON -DZDNN_ROOT=/path/to/zdnn` | IBM Z / LinuxONE acceleration path. | + +For the full list of backend options, check the upstream llama.cpp build +documentation and the current `vendor/llama.cpp` source. + +--- + +## CUDA + +CUDA builds require the NVIDIA CUDA Toolkit. Choose a toolkit version that is +compatible with your driver and GPU. + +Linux: + +```bash +CMAKE_ARGS="-DGGML_CUDA=on" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Windows PowerShell: + +```powershell +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +For newer NVIDIA GPUs with compute capability 9.0 or higher, the README notes +that Programmatic Dependent Launch can be enabled with: + +```bash +-DGGML_CUDA_PDL=ON +``` + +Example: + +```bash +CMAKE_ARGS="-DGGML_CUDA=on -DGGML_CUDA_PDL=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +If `nvcc` produces large volumes of non-blocking template warnings, the README +documents optional CUDA warning suppression: + +```bash +-DCMAKE_CUDA_FLAGS="--diag-suppress=177 --diag-suppress=221 --diag-suppress=550" +``` + +### CUDA Portability and Architecture Selection + +By default, llama.cpp may build for the GPU detected on 
the build machine. For +a wheel intended to run across multiple CUDA GPUs, disable native detection: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DGGML_NATIVE=OFF" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +If `nvcc` cannot detect your GPU, or if you want to control the generated +binary size, specify CUDA architectures explicitly: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=86;89" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Use NVIDIA's compute capability table to choose architecture numbers. For +example, RTX 30-series GPUs commonly use `86`, and RTX 4090 uses `89`. + +If multiple CUDA toolkits are installed, point CMake at the intended compiler: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-12.8/bin/nvcc" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Runtime variables that may matter after installation: + +| Variable | Use | +|---|---| +| `CUDA_VISIBLE_DEVICES` | Selects or hides CUDA devices for the current process. | +| `GGML_CUDA_ENABLE_UNIFIED_MEMORY` | Enables unified-memory fallback on Linux when VRAM is exhausted. On Windows, similar behavior may be controlled by NVIDIA driver settings. | +| `GGML_CUDA_P2P` | Enables peer-to-peer access between GPUs when driver and hardware support it. | +| `GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F` | Forces FP32 compute in selected cuBLAS paths, trading speed for numerical headroom. | +| `GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` | Forces FP16 compute in selected cuBLAS paths when supported. | + +--- + +## BLAS and CPU Acceleration + +BLAS acceleration mainly improves prompt processing and larger batch prefill. +It generally does not improve single-token generation speed as much as GPU +offload. + +### OpenBLAS + +Use OpenBLAS when the OpenBLAS development package is available on your system. 
+ +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +On Linux, install the OpenBLAS development package with your system package +manager before building. Package names vary by distribution. + +### BLIS + +BLIS is selected through the `FLAME` BLAS vendor after BLIS is installed: + +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +The upstream BLIS guide also notes that runtime variables such as +`BLIS_NUM_THREADS` and OpenMP affinity settings can affect CPU performance. + +### Intel oneMKL for CPU + +Intel oneMKL is a CPU BLAS path. It is different from Intel GPU acceleration, +which is usually handled through SYCL or OpenVINO. + +```bash +source /opt/intel/oneapi/setvars.sh +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## Metal on macOS + +On macOS, Metal is enabled by default by this project when building on Apple +platforms. A normal install is usually enough: + +```bash +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +To disable Metal at build time: + +```bash +CMAKE_ARGS="-DGGML_METAL=OFF" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +At runtime, use `n_gpu_layers=0` when you want CPU inference even though the +package was built with Metal support. + +--- + +## Vulkan + +Vulkan builds require the Vulkan SDK and any platform-specific environment +setup required by the SDK. 
+ +```bash +CMAKE_ARGS="-DGGML_VULKAN=on" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +On Linux and macOS, make sure the Vulkan SDK setup script has been sourced in +the same shell session before running `pip install`. + +On Windows, install the Vulkan SDK and make sure its environment variables are +available in the shell that runs the build. + +On Linux, system packages can also provide the Vulkan loader and shader tools. +The upstream guide notes that SPIR-V headers may be required separately from +the Vulkan loader development package on some distributions. + +For macOS Vulkan builds, Vulkan usually runs through a Metal translation layer. +The upstream guide builds Vulkan with Metal disabled: + +```bash +CMAKE_ARGS="-DGGML_VULKAN=ON -DGGML_METAL=OFF" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## OpenVINO + +OpenVINO builds require the OpenVINO runtime and environment setup first. + +Linux: + +```bash +source /opt/intel/openvino/setupvars.sh +CMAKE_ARGS="-DGGML_OPENVINO=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Windows: + +```powershell +# Run this from a shell where OpenVINO setupvars.bat has been initialized, +# such as an OpenVINO command prompt, or initialize it through cmd first. +$env:CMAKE_ARGS = "-DGGML_OPENVINO=ON" +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +The OpenVINO backend is intended for Intel CPU, GPU, and NPU workflows when the +OpenVINO runtime supports the target device. + +Runtime variables: + +| Variable | Use | +|---|---| +| `GGML_OPENVINO_DEVICE` | Selects `CPU`, `GPU`, `NPU`, or a specific GPU such as `GPU.0`. Defaults to CPU if unset or unavailable. | +| `GGML_OPENVINO_CACHE_DIR` | Enables OpenVINO model caching when set. 
Not supported on NPU devices according to upstream docs. | +| `GGML_OPENVINO_STATEFUL_EXECUTION` | Enables stateful KV-cache execution. Upstream docs recommend it for CPU/GPU performance and note it is not effective on NPU. | +| `GGML_OPENVINO_PREFILL_CHUNK_SIZE` | Controls NPU prefill chunk size. | +| `GGML_OPENVINO_PROFILING` | Enables OpenVINO profiling. | + +Important limitations from the upstream OpenVINO backend docs: + +- GPU stateless execution has known issues; use `GGML_OPENVINO_STATEFUL_EXECUTION=1` for GPU workflows. +- NPU runs may fail when context size is too large. Keep context size small for NPU workflows. +- Encoder models such as embedding and reranking models are not supported by the current OpenVINO backend implementation. +- Some benchmark workflows require Flash Attention enabled in the llama.cpp tool layer; in Python, verify behavior against your target model and backend. + +--- + +## HIP / ROCm + +HIP builds are for supported AMD GPUs. + +Linux example: + +```bash +CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +`GPU_TARGETS` is optional in some setups, but specifying your GPU architecture +can reduce build time and avoid unsupported target issues. + +Windows ROCm builds are more environment-sensitive. The README currently +documents a TheRock ROCm workflow that sets `HIP_PATH`, `ROCM_PATH`, +`HIP_DEVICE_LIB_PATH`, compiler paths, `CMAKE_GENERATOR`, and `CMAKE_ARGS` +before running `pip install`. + +For RDNA3 or CDNA hardware, upstream docs mention optional Flash Attention +acceleration through rocWMMA: + +```bash +CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1100 -DGGML_HIP_ROCWMMA_FATTN=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Runtime variables that may matter: + +| Variable | Use | +|---|---| +| `HIP_VISIBLE_DEVICES` | Selects visible HIP devices. 
| +| `HSA_OVERRIDE_GFX_VERSION` | Can help unsupported Linux GPUs use a nearby architecture value. Upstream docs note this is not supported on Windows. | +| `HIP_DEVICE_LIB_PATH` | Points to ROCm device bitcode libraries when clang cannot find them. | + +--- + +## SYCL + +SYCL builds are usually used with Intel oneAPI compilers. + +```bash +source /opt/intel/oneapi/setvars.sh +CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +To request FP16 support: + +```bash +CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Useful SYCL build options from the upstream backend docs: + +| Option | Use | +|---|---| +| `GGML_SYCL_F16` | Enables FP16 build path. Test both FP32 and FP16 for your model and device. | +| `GGML_SYCL_TARGET` | Selects SYCL target type. Intel is the default target in upstream docs. | +| `GGML_SYCL_DEVICE_ARCH` | Selects device architecture when known. | +| `GGML_SYCL_GRAPH` | Enables the experimental SYCL graph extension. | +| `GGML_SYCL_DNN` | Enables oneDNN integration. | +| `GGML_SYCL_HOST_MEM_FALLBACK` | Allows host-memory fallback when device memory is full, at reduced speed. | +| `GGML_SYCL_SUPPORT_LEVEL_ZERO` | Enables Level Zero support for Intel GPU memory allocation. | + +Useful SYCL runtime variables: + +| Variable | Use | +|---|---| +| `ONEAPI_DEVICE_SELECTOR` | Selects a SYCL device, such as a specific Level Zero GPU. | +| `GGML_SYCL_ENABLE_FLASH_ATTN` | Enables or disables Flash Attention in the SYCL backend. | +| `GGML_SYCL_ENABLE_LEVEL_ZERO` | Uses Level Zero allocation when support was built in. | +| `GGML_SYCL_DISABLE_DNN` | Disables oneDNN path and uses oneMKL path. | +| `ZES_ENABLE_SYSMAN` | Helps query free GPU memory in some Intel GPU setups. 
| + +--- + +## OpenCL + +OpenCL support is documented upstream mainly for Qualcomm Adreno GPUs and +Snapdragon devices. It may also work on certain other OpenCL-capable GPUs, but +SYCL is usually preferred for modern Intel GPU workflows. + +```bash +CMAKE_ARGS="-DGGML_OPENCL=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Useful OpenCL CMake options: + +| Option | Default | Use | +|---|---|---| +| `GGML_OPENCL_EMBED_KERNELS` | `ON` | Embeds OpenCL kernels into the built binary or library. | +| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON` | Enables kernels optimized for Adreno. | + +For Linux builds where OpenCL headers and ICD loader are installed in a custom +prefix, pass that location through `CMAKE_PREFIX_PATH`. + +--- + +## CANN + +CANN is the Ascend NPU backend. It requires Ascend drivers and the CANN toolkit +before building. + +```bash +CMAKE_ARGS="-DGGML_CANN=ON -DCMAKE_BUILD_TYPE=Release" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +The upstream CANN documentation focuses on Linux and Ascend devices such as +Atlas 300I A2 and Atlas 300I Duo. Supported model families and data types vary +by device generation. + +--- + +## ZenDNN and zDNN + +ZenDNN and zDNN are different backends. 
+ +| Backend | Hardware | CMake option | +|---|---|---| +| ZenDNN | AMD Zen CPUs, especially AMD EPYC | `-DGGML_ZENDNN=ON` | +| zDNN | IBM Z / LinuxONE with NNPA acceleration | `-DGGML_ZDNN=ON -DZDNN_ROOT=/path/to/zdnn` | + +ZenDNN can be downloaded and built automatically by CMake: + +```bash +CMAKE_ARGS="-DGGML_ZENDNN=ON -DCMAKE_BUILD_TYPE=Release" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +If you already have a ZenDNN installation: + +```bash +CMAKE_ARGS="-DGGML_ZENDNN=ON -DZENDNN_ROOT=/path/to/ZenDNN/build/install" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +zDNN requires a zDNN library installation first: + +```bash +CMAKE_ARGS="-DGGML_ZDNN=ON -DZDNN_ROOT=/opt/zdnn-libs" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +ZenDNN currently accelerates matrix multiplication paths and may fall back to +the standard CPU backend for other operations. + +--- + +## Dynamic Backend Wheels + +The README notes that newer preview wheels may be built with: + +```text +GGML_BACKEND_DL=ON +GGML_CPU_ALL_VARIANTS=ON +``` + +In that build mode, CPU backend variants are installed as separate runtime +libraries under: + +```text +site-packages/llama_cpp/lib +``` + +Examples include: + +```text +ggml-cpu-x64 +ggml-cpu-sse42 +ggml-cpu-haswell +ggml-cpu-skylakex +ggml-cpu-alderlake +ggml-cpu-zen4 +``` + +On Windows, dynamic CPU backend DLLs may also need the LLVM OpenMP runtime +next to them: + +```text +libomp140.x86_64.dll +``` + +Based on the current top-level `CMakeLists.txt`, this project installs many +`llama`, `ggml`, CPU-variant, accelerator backend, and `mtmd` targets into the +Python package runtime directory when those targets are available. 
+ +--- + +## Upgrading and Rebuilding + +Use `--upgrade`, `--force-reinstall`, and `--no-cache-dir` when you need to +force a rebuild with new CMake options: + +```bash +CMAKE_ARGS="-DGGML_CUDA=on" \ + python -m pip install --upgrade --force-reinstall --no-cache-dir \ + "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +This is important because `pip` may otherwise reuse cached wheels or build +artifacts from a previous backend configuration. + +For local editable builds, clean old native artifacts before rebuilding when +switching backends: + +```bash +make clean +python -m pip install --verbose -e . +``` + +On Windows, if `make` is not available, remove `_skbuild` and old native +libraries under `llama_cpp/lib` manually before reinstalling. + +--- + +## Verify Installation + +Check that the package imports: + +```bash +python -c "import llama_cpp; print(llama_cpp.__version__)" +``` + +Check where the package was installed: + +```bash +python -c "import llama_cpp, pathlib; print(pathlib.Path(llama_cpp.__file__).parent)" +``` + +Check the bundled native runtime libraries: + +```bash +python -c "import llama_cpp, pathlib; print(list((pathlib.Path(llama_cpp.__file__).parent / 'lib').glob('*')))" +``` + +Run a minimal model load after downloading a GGUF model: + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="./model.gguf", + n_gpu_layers=0, + verbose=False, +) + +output = llm("Hello,", max_tokens=8) +print(output["choices"][0]["text"]) +``` + +For GPU builds, set `n_gpu_layers=-1` or a positive value to offload +layers: + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="./model.gguf", + n_gpu_layers=-1, +) +``` + +--- + +## Development Workflow + +Common local development commands: + +```bash +git clone https://github.com/JamePeng/llama-cpp-python --recursive +cd llama-cpp-python +python -m pip install --upgrade pip +python -m pip install -e . 
+python -m pytest +``` + +The repository also includes a `Makefile` with useful targets: + +| Target | Purpose | +|---|---| +| `make build` | Editable build with verbose output. | +| `make build.cuda` | Editable build with `GGML_CUDA=on`. | +| `make build.openblas` | Editable build with OpenBLAS. | +| `make build.openvino` | Editable build with OpenVINO. | +| `make build.vulkan` | Editable build with Vulkan. | +| `make build.sycl` | Editable build with SYCL. | +| `make test` | Run pytest with verbose tracing. | +| `make clean` | Remove local native build artifacts. | + +When testing a different `llama.cpp` commit, update the `vendor/llama.cpp` +submodule, clean the local build, and reinstall. If the upstream C API changes, +the ctypes declarations in `llama_cpp/llama_cpp.py` may also need to be updated. + +--- + +## Common Installation Pitfalls + +| Symptom | Likely cause | What to try | +|---|---|---| +| CMake cannot find a compiler | Build tools are missing or not available in the current shell. | Install platform build tools and reopen the terminal. On Windows, use a Developer PowerShell or initialize Visual Studio build variables. | +| Build ignores new backend flags | `pip` reused a cached wheel or previous build. | Reinstall with `--force-reinstall --no-cache-dir`, and clean `_skbuild` for local builds. | +| CUDA backend does not build | CUDA Toolkit is missing, incompatible, or not on `PATH`. | Verify `nvcc --version`, CUDA driver compatibility, and `CUDA_PATH` on Windows. | +| CUDA build targets the wrong GPU generation | Native architecture detection picked the build machine GPU, or `nvcc` could not detect it. | Use `-DGGML_NATIVE=OFF` for portability or set `-DCMAKE_CUDA_ARCHITECTURES=...` explicitly. | +| Native library fails to load on Windows | Required DLLs are missing from `PATH` or `llama_cpp/lib`. | Check `llama_cpp/lib` for `llama.dll`, `ggml*.dll`, backend DLLs, and runtime DLLs such as OpenMP or CUDA dependencies. 
| +| GPU is not used at runtime | The package was built without that backend or `n_gpu_layers` is `0`. | Rebuild with the correct CMake backend flag and set `n_gpu_layers` to a positive value or `-1`. | +| OpenVINO GPU or NPU behaves unexpectedly | Runtime device selection or context size is unsuitable. | Set `GGML_OPENVINO_DEVICE`, enable `GGML_OPENVINO_STATEFUL_EXECUTION=1` for GPU, and keep context size smaller for NPU workflows. | +| SYCL device is not selected | oneAPI environment or device selector is missing. | Source oneAPI setup and set `ONEAPI_DEVICE_SELECTOR` for the intended device. | +| Submodule files are missing | Repository was cloned without `--recursive`. | Run `git submodule update --init --recursive`. | + +For detailed diagnostics, see [[Troubleshooting]]. + +--- + +## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] +* [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +* [README Installation](https://github.com/JamePeng/llama-cpp-python/blob/main/README.md#installation) +* [llama.cpp build documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) +* [llama.cpp backend documentation](https://github.com/ggml-org/llama.cpp/tree/master/docs/backend) From 3bcd8010fb89909d6780acb06de8ae0e537e95d9 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 23:26:59 +0800 Subject: [PATCH 095/139] docs(wiki): link installation guide from index Promote the completed installation guide into the wiki entry point so new users can find build and backend setup instructions before reading API-specific documentation. - Add a Getting Started section that links to install.md. - Move installation to the top of the recommended reading order. - Mark install.md as an available page. - Remove installation from the planned documentation areas. 
Signed-off-by: JamePeng --- docs/wiki/index.md | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/docs/wiki/index.md b/docs/wiki/index.md index c721fc4e89..8e5dbed14b 100644 --- a/docs/wiki/index.md +++ b/docs/wiki/index.md @@ -10,6 +10,16 @@ The documentation is maintained with the help of LLMs, but the source of truth i ## Quick Navigation +### Getting Started + +Start here if you are installing or rebuilding `llama-cpp-python`. + +| Page | Description | +|---|---| +| [install\|Installation] | Source installation guide covering Python setup, CMake options, llama.cpp backend selection, hardware acceleration, rebuilds, and verification. | + +--- + ### Core API Start here if you are using `llama-cpp-python` directly. @@ -42,7 +52,7 @@ This section contains maintainer-facing development notes, workflows, and LLM-as | Page | Description | |---|---| -| [[development/Git Commit Generation Agent]] | Helper workflow for generating clear, structured, and source-aware Git commit messages. | +| [development/Git Commit Generation Agent] | Helper workflow for generating clear, structured, and source-aware Git commit messages. | --- @@ -61,13 +71,14 @@ These pages define how the wiki should be written, updated, and reviewed. If you are new to this wiki, read the pages in this order: -1. [[core/Llama|Llama](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] -2. [[modules/LlamaCache|Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] -3. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] -4. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] -5. 
[[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] -6. [[modules/Logger\|Logger](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/Logger.md)] -7. [[development/Git Commit Generation Agent](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/development/git-commit-generation-agent.md)] +1. [[install|Installation](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/install.md)] +2. [[core/Llama|Llama](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +3. [[modules/LlamaCache|Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] +4. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] +5. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] +6. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] +7. [[modules/Logger\|Logger](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/Logger.md)] +8. [[development/Git Commit Generation Agent](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/development/git-commit-generation-agent.md)] If you are contributing documentation, start with: 1. [[SCHEMA|Wiki Schema](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md)] @@ -81,6 +92,7 @@ The wiki is still being expanded. Currently available pages: +- `install.md` - `core/Llama.md` - `modules/LlamaCache.md` - `modules/LlamaEmbedding.md` @@ -99,7 +111,6 @@ Some planned pages may already exist as empty placeholder files. 
Empty pages are Future documentation may cover: -- Installation and build options - Chat formats and chat handlers - Low-level ctypes bindings - Multimodal APIs @@ -126,5 +137,6 @@ This wiki follows a few core rules: ## Project Links - GitHub: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) +- Installation guide: [install](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/install.md) - Wiki schema: [SCHEMA](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md) -- Contribution guide: [contributing-to-wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md) \ No newline at end of file +- Contribution guide: [contributing-to-wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md) From 7cd0f6081251fba1852b4ecd61378d36a229de6e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 23:33:13 +0800 Subject: [PATCH 096/139] docs(readme): link detailed installation wiki guide Signed-off-by: JamePeng --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c5aa1a1b26..ba1969793c 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,8 @@ Thank you for your continuous support! ## Installation +For a structured source-install and backend build guide, see [docs/wiki/install.md](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/install.md). 
+ Requirements: - Python 3.9+ From 14b3b4624065a4b054f4d07a8ac25f999bc7bd87 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 3 Jun 2026 22:26:28 +0800 Subject: [PATCH 097/139] Update Submodule vendor/llama.cpp 60130d1..9e58d4d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 60130d18f9..9e58d4d692 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 60130d18f9ac7f42cb4d7f6060b088a45d8f242e +Subproject commit 9e58d4d692ed3d350591cc86d06c73c61c122509 From fed47f2d398fcb971595f53b423f59fd7fe0d3c1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 4 Jun 2026 21:39:53 +0800 Subject: [PATCH 098/139] Update Submodule vendor/llama.cpp 9e58d4d..7c158fb Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 9e58d4d692..7c158fbb4a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 9e58d4d692ed3d350591cc86d06c73c61c122509 +Subproject commit 7c158fbb4aec1bdc9c81d6ca0e785139f4826fae From fff6812e071d3d24fe57e1f635ed2ced51b8cd4e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 00:54:06 +0800 Subject: [PATCH 099/139] Update Submodule vendor/llama.cpp 7c158fb..c4a278d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7c158fbb4a..c4a278d68e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7c158fbb4aec1bdc9c81d6ca0e785139f4826fae +Subproject commit c4a278d68efa17811006f2123a84081dac03fac7 From be123f1c55c4ae503d4ae5edc845c310a313d2b2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 03:41:22 +0800 Subject: [PATCH 100/139] feat(internals): Add `ReasoningBudgetSampler` support - Add Python-backed ReasoningBudgetSampler for first reasoning-block control - Install 
the sampler before probability filters to preserve forced end tokens - Support reasoning_budget -1/0/N semantics in sampling params - Force reasoning_budget_message + reasoning_end when the budget is exhausted - Add manual force_reasoning_budget() at the sampling-context level - Match llama.cpp force behavior by allowing only COUNTING -> FORCING - Keep DONE as permanent passthrough and ignore later reasoning tags - Support prefilled reasoning starts with reasoning_start_in_prompt - Preserve UTF-8 boundary safety before forcing the end sequence - Keep Python-backed custom sampler callbacks alive across C sampler usage - Avoid shallow-copying custom_samplers when cloning sampler chains Signed-off-by: JamePeng --- llama_cpp/_internals.py | 525 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 504 insertions(+), 21 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 92ff51447f..c308fae056 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1527,7 +1527,7 @@ def __init__( _existing_sampler: Optional[LlamaSampler] = None, # Internal use for cloning ): if model is None: - raise RuntimeError("model must not be None") + raise RuntimeError("LlamaSamplingContext: model must not be None") self.model = model self.params = params @@ -1537,8 +1537,8 @@ def __init__( lparams = llama_cpp.llama_sampler_chain_default_params() lparams.no_perf = params.no_perf - # history (bounded) - # last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size) + # History (bounded) + # Last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size) if self.params.penalty_last_n == -1: # full context self.params.penalty_last_n = self.model.n_ctx_train() @@ -1551,10 +1551,10 @@ def __init__( ) self.prev = deque(maxlen=max(self.params.n_prev, 32)) - # reusable token data array + # Reusable token data array self._cur_p = LlamaTokenDataArray(n_vocab=self.n_vocab) - # reusable numpy logits view + # Reusable numpy 
logits view self._logits_view = None self._logits_ptr_addr = None @@ -1566,14 +1566,14 @@ def __init__( sorted=False, ) - # sampler chain + # Sampler chain if _existing_sampler: self.sampler_chain = _existing_sampler else: self.sampler_chain = LlamaSampler() self._build_sampler_chain() - # grammar sampler + # Grammar sampler self.grammar_sampler = None if params.grammar: self.grammar_sampler = GrammarSampler( @@ -1583,6 +1583,9 @@ def __init__( params.grammar_triggers, ) + # Active Python reasoning-budget sampler for this sampling context. + self.reasoning_budget_sampler: Optional[ReasoningBudgetSampler] = None + def _build_sampler_chain(self): """ Build sampler chain aligned with llama.cpp common_sampler_init @@ -1594,7 +1597,7 @@ def _build_sampler_chain(self): m = self.model if m is None: - raise RuntimeError("Model required to build sampler chain firstly") + raise RuntimeError("LlamaSamplingContext: Model required to build sampler chain firstly") use_adaptive_p = False @@ -1628,7 +1631,66 @@ def _build_sampler_chain(self): p.dry_sequence_breakers ) - # --- 5. Core Sampling Strategies (The "Filter" Loop) --- + # --- 5. Reasoning Budget --- + # + # Install before top-k/top-p/min-p filters so the forced end token cannot + # be removed from the candidate set before forcing happens. + # This sampler only controls the first reasoning block. Later blocks are ignored. 
+ if p.reasoning_budget < -1: + raise ValueError( + "LlamaSamplingContext: reasoning_budget must be -1, 0, or a positive integer" + ) + + if p.reasoning_budget >= 0: + start_tokens = None + if not p.reasoning_start_in_prompt: + start_tokens = m.tokenize( + p.reasoning_start.encode("utf-8"), + add_bos=False, + special=True, + ) + if not start_tokens: + raise ValueError("LlamaSamplingContext: reasoning_start produced no tokens") + + end_tokens = m.tokenize( + p.reasoning_end.encode("utf-8"), + add_bos=False, + special=True, + ) + if not end_tokens: + raise ValueError("LlamaSamplingContext: reasoning_end produced no tokens") + + forced_text = (p.reasoning_budget_message or "") + p.reasoning_end + forced_tokens = m.tokenize( + forced_text.encode("utf-8"), + add_bos=False, + special=True, + ) + if not forced_tokens: + raise ValueError("LlamaSamplingContext: reasoning forced text produced no tokens") + + rb_sampler = ReasoningBudgetSampler( + model=m, + reasoning_budget=p.reasoning_budget, + start_tokens=start_tokens, + end_tokens=end_tokens, + forced_tokens=forced_tokens, + initial_state=( + ReasoningBudgetState.COUNTING + if p.reasoning_start_in_prompt + else ReasoningBudgetState.IDLE + ), + start_max_tokens=p.reasoning_start_max_tokens, + wait_utf8=True, + ) + + # Keep a direct Python reference so force_reasoning_budget() can + # manually transition COUNTING -> FORCING at runtime. + self.reasoning_budget_sampler = rb_sampler + + s.add_custom(rb_sampler) + + # --- 6. Core Sampling Strategies (The "Filter" Loop) --- # We iterate through the list to preserve user-defined order for these specific samplers for stype in p.samplers: if stype == CommonSamplerType.CUSTOM: @@ -1660,7 +1722,7 @@ def _build_sampler_chain(self): elif stype == CommonSamplerType.ADAPTIVE_P: use_adaptive_p = True - # --- 6. Final Distribution / Selection --- + # --- 7. 
Final Distribution / Selection --- # Mirostat overrides standard greedy/dist sampling if p.mirostat == 1 and m: s.add_mirostat(m.n_vocab(), p.seed, p.mirostat_tau, p.mirostat_eta, 100) @@ -1839,6 +1901,10 @@ def close(self): self.sampler_chain.close() self.sampler_chain = None + # Clear the convenience reference used for manual reasoning-budget force. + # The actual sampler lifetime is owned by sampler_chain.close(). + self.reasoning_budget_sampler = None + # Release large token data buffer used during sampling. # Important for high-vocab models to avoid memory retention. if hasattr(self, "_cur_p"): @@ -1885,24 +1951,53 @@ def prev_str(self, ctx_main: LlamaContext, n: int) -> str: # Use the model linked to the context to detokenize return ctx_main.model.detokenize(last_n_tokens).decode("utf-8", errors="replace") + def force_reasoning_budget(self) -> bool: + """ + Manually force the active reasoning-budget sampler to end thinking. + + This mirrors llama.cpp's common_sampler_reasoning_budget_force() + behavior at the Python sampling-context level. + + Returns: + True if the sampler was actively COUNTING inside the first reasoning + block and was transitioned to FORCING. + + False if: + - no reasoning-budget sampler is installed + - the sampler is IDLE + - the sampler is WAITING_UTF8 + - the sampler is already FORCING + - the sampler is DONE + + Important: + Calling this while already FORCING must not rewind force_pos. The + underlying ReasoningBudgetSampler.force() handles this by allowing + only COUNTING -> FORCING. + """ + if self.reasoning_budget_sampler is None: + return False + + return self.reasoning_budget_sampler.force() + class CustomSampler: """ - Python wrapper for llama.cpp custom sampler. + Base class for Python-backed custom samplers in the Llama sampler chain. - apply_func: - Callable receiving llama_token_data_array - and modifying logits in-place. + Responsibilities: + - Provides apply, accept, reset, free and clone callbacks for the C sampler chain. 
+ - Keeps Python references alive to prevent GC while C sampler still holds function pointers. + - Implements safe close to clear all callback references. """ def __init__( self, apply_func: Callable[[llama_cpp.llama_token_data_array], None], - name: str = "custom", accept_func: Optional[Callable] = None, reset_func: Optional[Callable] = None, free_func: Optional[Callable] = None, clone_func: Optional[Callable] = None, + name: str = "custom", ): if not callable(apply_func): raise TypeError("apply_func must be callable") @@ -2002,6 +2097,389 @@ def __del__(self): self.close() +class ReasoningBudgetSampler(CustomSampler): + """ + Generic first-reasoning-block budget sampler. + + This sampler is intentionally model-agnostic. It does not infer model + families, inspect chat templates, or guess reasoning tags. The caller is + responsible for passing the correct reasoning_start and reasoning_end token + sequences. + + Behavior: + 1. Wait for the first reasoning_start token sequence, unless the prompt + already inserted it and initial_state is COUNTING. + 2. Count accepted tokens inside the first reasoning block. + 3. If reasoning_end appears naturally, switch to DONE. + 4. If the budget is exhausted first, force: + reasoning_budget_message + reasoning_end + token by token. + 5. Once DONE, remain passthrough forever. Later reasoning tags are ignored. + + This mirrors the core idea of llama.cpp's reasoning-budget sampler while + keeping the Python API small and explicit. + """ + + def __init__( + self, + *, + model: LlamaModel, + reasoning_budget: int, + start_tokens: Optional[Sequence[int]], + end_tokens: Sequence[int], + forced_tokens: Sequence[int], + initial_state: ReasoningBudgetState = ReasoningBudgetState.IDLE, + start_max_tokens: Optional[int] = 32, + wait_utf8: bool = True, + ): + """ + Initialize the reasoning budget sampler. + + Args: + model: + The active LlamaModel wrapper. Used for token_to_piece() when + checking UTF-8 boundaries. 
+ + reasoning_budget: + Token budget inside the first reasoning block. + Must be >= 0 here. The disabled value -1 is handled before this + sampler is created. + + 0: + Force the end sequence immediately after reasoning starts. + + N > 0: + Allow at most N accepted tokens inside the reasoning block. + + start_tokens: + Token sequence that starts reasoning budget counting. + Must be provided when initial_state is IDLE. + Can be None when initial_state is COUNTING, which is used when + the prompt/chat template has already inserted reasoning_start. + + end_tokens: + Token sequence that naturally ends the reasoning block. + + forced_tokens: + Token sequence forced when the budget is exhausted. This should + normally be tokenized from: + reasoning_budget_message + reasoning_end + + initial_state: + Initial state of the sampler. + IDLE: + Wait for start_tokens during generation. + COUNTING: + Start counting from the first generated token. Use this when + reasoning_start is already present in the prompt. + + start_max_tokens: + Safety window for non-reasoning models. If start_tokens are not + observed within this many generated tokens, the sampler switches + to DONE and becomes a no-op. Set to None to wait indefinitely. + + wait_utf8: + If True, when the budget is exhausted on an incomplete UTF-8 + token piece, wait until a complete UTF-8 boundary before forcing + the end sequence. + """ + if model is None: + raise ValueError("model must not be None") + + if reasoning_budget < 0: + raise ValueError("reasoning_budget must be >= 0") + + self.model = model + + # Maximum number of tokens allowed inside the first reasoning block. + # The disabled value (-1) should be handled before constructing this sampler. + self.reasoning_budget = int(reasoning_budget) + + # Remaining tokens in the active reasoning block. + self.remaining = int(reasoning_budget) + + # Incremental matcher for the first reasoning_start sequence. + # Empty matcher is allowed only when initial_state=COUNTING. 
+ self.start_matcher = TokenMatcher(start_tokens) + + # Incremental matcher for the natural reasoning_end sequence. + self.end_matcher = TokenMatcher(end_tokens) + + # Token sequence forced after budget exhaustion: + # reasoning_budget_message + reasoning_end + self.forced_tokens = list(forced_tokens) + + if initial_state == ReasoningBudgetState.IDLE and not self.start_matcher.tokens: + raise ValueError( + "start_tokens must not be empty when initial_state=IDLE" + ) + + if not self.end_matcher.tokens: + raise ValueError("end_tokens must not be empty") + + if not self.forced_tokens: + raise ValueError("forced_tokens must not be empty") + + # State used by reset(). This is important for templates that already + # insert reasoning_start into the prompt: reset must return to COUNTING, + # not always IDLE. + self.initial_state = ReasoningBudgetState(initial_state) + + # Current runtime state. + self.state = ReasoningBudgetState(initial_state) + + # Index of the next token in forced_tokens to force. + self.force_pos = 0 + + # Count of generated tokens observed by this sampler. + # Used only in IDLE to enforce start_max_tokens. + self.generated_tokens = 0 + + # Maximum number of generated tokens to wait for reasoning_start. + # None means wait indefinitely. + self.start_max_tokens = start_max_tokens + + # Whether to delay forcing until a complete UTF-8 boundary. + self.wait_utf8 = wait_utf8 + + # Keep cloned Python sampler objects alive when llama.cpp clones the + # sampler chain. Without this, cloned Python callbacks could be garbage + # collected while C still holds function pointers to them. 
+ self._clone_keep_alive: List["ReasoningBudgetSampler"] = [] + + if self.state == ReasoningBudgetState.COUNTING and self.remaining <= 0: + self.state = ReasoningBudgetState.FORCING + + super().__init__( + apply_func=self._apply, + accept_func=self._accept, + reset_func=self._reset, + clone_func=self._clone, + name="reasoning-budget", + ) + + def force(self) -> bool: + """ + Manually transition the active reasoning block into forced ending. + + This method is useful for external interruption scenarios, such as: + - user clicks "stop thinking" + - server-side thinking timeout + - UI wants to skip the rest of the reasoning block while still allowing + the model to continue with the final answer + + The transition is allowed only from COUNTING. This matches llama.cpp's + common_reasoning_budget_force() behavior and avoids unsafe rewinding when + the sampler is already FORCING. + """ + if self.state != ReasoningBudgetState.COUNTING: + return False + + self.state = ReasoningBudgetState.FORCING + self.force_pos = 0 + self.end_matcher.reset() + return True + + def _token_utf8_complete(self, token: int) -> bool: + """ + Return whether the token piece is a complete UTF-8 byte sequence. + + This is a safety feature. If the budget is exhausted in the middle of a + multi-byte UTF-8 sequence, the sampler waits until a complete boundary + before forcing reasoning_budget_message + reasoning_end. + """ + if not self.wait_utf8: + return True + + try: + piece = self.model.token_to_piece(token, special=False) + if not piece: + return True + piece.decode("utf-8") + return True + except UnicodeDecodeError: + return False + except Exception: + # Avoid getting stuck forever if token_to_piece behaves unexpectedly. + return True + + def _start_counting(self) -> None: + """ + Enter COUNTING state and initialize the budget window. + + If reasoning_budget is 0, immediately enter FORCING state. 
+ """ + self.state = ReasoningBudgetState.COUNTING + self.remaining = self.reasoning_budget + self.end_matcher.reset() + self.force_pos = 0 + + if self.remaining <= 0: + self.state = ReasoningBudgetState.FORCING + + def _accept(self, token: int) -> None: + """ + Update sampler state after one token has been accepted. + + This method does not modify logits. It only tracks: + - whether reasoning_start has appeared + - whether reasoning_end has appeared + - how much budget remains + - where we are in the forced token sequence + """ + self.generated_tokens += 1 + + if self.state == ReasoningBudgetState.IDLE: + if self.start_matcher.advance(token): + self._start_counting() + return + + # Safety for non-reasoning models: + # + # If no reasoning_start appears near the beginning, assume this + # completion has no visible reasoning block. Switch to DONE forever + # so later literal mentions of reasoning_start do not accidentally + # activate the budget controller. + if ( + self.start_max_tokens is not None + and self.generated_tokens >= self.start_max_tokens + ): + self.state = ReasoningBudgetState.DONE + return + + if self.state in ( + ReasoningBudgetState.COUNTING, + ReasoningBudgetState.WAITING_UTF8, + ): + if self.end_matcher.advance(token): + self.state = ReasoningBudgetState.DONE + return + + utf8_complete = self._token_utf8_complete(token) + + if self.state == ReasoningBudgetState.WAITING_UTF8: + if utf8_complete: + self.state = ReasoningBudgetState.FORCING + self.force_pos = 0 + self.end_matcher.reset() + return + + self.remaining -= 1 + if self.remaining <= 0: + if utf8_complete: + self.state = ReasoningBudgetState.FORCING + self.force_pos = 0 + self.end_matcher.reset() + else: + self.state = ReasoningBudgetState.WAITING_UTF8 + self.end_matcher.reset() + return + + if self.state == ReasoningBudgetState.FORCING: + self.force_pos += 1 + if self.force_pos >= len(self.forced_tokens): + self.state = ReasoningBudgetState.DONE + return + + if self.state == 
ReasoningBudgetState.DONE: + # Only the first reasoning block is budget-controlled. + # Later reasoning tags are normal generated text. + return + + def _apply(self, cur_p: llama_cpp.llama_token_data_array) -> None: + """ + Apply logits forcing before sampling. + + In FORCING state, only forced_tokens[force_pos] is allowed. All other + candidate logits are set to -inf. The forced token is set to +inf to make + the intent explicit and robust against previous logit modifications. + """ + if self.state != ReasoningBudgetState.FORCING: + return + + if self.force_pos >= len(self.forced_tokens): + return + + forced = self.forced_tokens[self.force_pos] + data = cur_p.data + found = False + + for i in range(cur_p.size): + if data[i].id == forced: + data[i].logit = float("inf") + found = True + else: + data[i].logit = float("-inf") + + cur_p.sorted = False + cur_p.selected = -1 + + if not found: + raise RuntimeError( + f"ReasoningBudgetSampler: forced token {forced} is not present " + "in the candidate array. Move ReasoningBudgetSampler earlier in " + "the sampler chain." + ) + + def _reset(self) -> None: + """ + Reset the sampler to its configured initial state. + + Uses self.initial_state to determine whether to start in: + - IDLE: wait for reasoning_start token sequence + - COUNTING: prompt already contains start token, begin counting immediately + + Also resets internal counters and matchers: + - remaining budget + - generated_tokens + - start_matcher / end_matcher positions + - force_pos + """ + self.state = self.initial_state + self.remaining = self.reasoning_budget + self.generated_tokens = 0 + self.force_pos = 0 + + if self.start_matcher: + self.start_matcher.reset() + self.end_matcher.reset() + + # If initial_state = COUNTING and budget is zero, immediately enter FORCING + if self.state == ReasoningBudgetState.COUNTING and self.remaining <= 0: + self.state = ReasoningBudgetState.FORCING + + def _clone(self): + """ + Clone the full runtime state. 
+ + This mirrors the newer llama.cpp reasoning-budget sampler behavior where + clone copies the full sampler context, not only the static configuration. + """ + cloned = ReasoningBudgetSampler( + model=self.model, + reasoning_budget=self.reasoning_budget, + start_tokens=self.start_matcher.tokens, + end_tokens=self.end_matcher.tokens, + forced_tokens=self.forced_tokens, + initial_state=self.initial_state, + start_max_tokens=self.start_max_tokens, + wait_utf8=self.wait_utf8, + ) + + cloned.remaining = self.remaining + cloned.state = self.state + cloned.force_pos = self.force_pos + cloned.generated_tokens = self.generated_tokens + cloned.start_matcher.pos = self.start_matcher.pos + cloned.end_matcher.pos = self.end_matcher.pos + + # Keep the cloned Python object alive on the source sampler. The cloned + # LlamaSampler wrapper does not own this object directly because the C + # sampler clone is created through the callback. + self._clone_keep_alive.append(cloned) + + return cloned.get_sampler() + class LlamaSampler: def __init__(self, existing_sampler_p: Optional[llama_cpp.llama_sampler_p] = None): if existing_sampler_p: @@ -2055,12 +2533,13 @@ def clone(self) -> 'LlamaSampler': new_sampler = LlamaSampler(existing_sampler_p=new_sampler_p) - # copy _keep_alive and custom_samplers list to new sampler - if self._keep_alive: - new_sampler._keep_alive = self._keep_alive.copy() - - if self.custom_samplers: - new_sampler.custom_samplers = self.custom_samplers.copy() + # llama_sampler_clone() clones C samplers internally. For Python-backed + # custom samplers, the clone_func returns a new C sampler whose Python + # callback object is kept alive by the original custom sampler. Shallow + # copying custom_samplers would make the cloned chain close the original + # Python custom sampler, causing premature close/double-free issues. 
+ new_sampler._keep_alive = self._keep_alive.copy() if self._keep_alive else [] + new_sampler.custom_samplers = [] return new_sampler @@ -2250,6 +2729,10 @@ def add_custom(self, custom_sampler: CustomSampler): [llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler] ) + # Keep the Python callback object alive while the C sampler chain holds + # function pointers to it. + self._keep_alive.append(custom_sampler) + def get_seed(self) -> int: assert self.sampler is not None return llama_cpp.llama_sampler_get_seed(self.sampler) From 82a026687eda143c2877fcee96d5f9dbf64d45e6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 12:10:10 +0800 Subject: [PATCH 101/139] feat(internals): add verbose logging to ReasoningBudgetSampler - Add `verbose` parameter to ReasoningBudgetSampler to print high-level state transitions to stderr. - Log key events: initialization, reasoning_start matched, budget exhausted, forced end sequence, UTF-8 boundary waiting, manual force, natural end, reset. - Pass `verbose=getattr(model, "verbose", False)` from LlamaSamplingContext when building the sampler chain. - Preserve verbose flag when cloning the sampler. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 46 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index c308fae056..434921e6bd 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -3,6 +3,7 @@ import ctypes import enum import os +import sys from typing import ( Callable, @@ -1566,6 +1567,9 @@ def __init__( sorted=False, ) + # Active Python reasoning-budget sampler for this sampling context. + self.reasoning_budget_sampler: Optional[ReasoningBudgetSampler] = None + # Sampler chain if _existing_sampler: self.sampler_chain = _existing_sampler @@ -1583,9 +1587,6 @@ def __init__( params.grammar_triggers, ) - # Active Python reasoning-budget sampler for this sampling context. 
- self.reasoning_budget_sampler: Optional[ReasoningBudgetSampler] = None - def _build_sampler_chain(self): """ Build sampler chain aligned with llama.cpp common_sampler_init @@ -1682,6 +1683,7 @@ def _build_sampler_chain(self): ), start_max_tokens=p.reasoning_start_max_tokens, wait_utf8=True, + verbose=getattr(m, "verbose", False), ) # Keep a direct Python reference so force_reasoning_budget() can @@ -2131,6 +2133,7 @@ def __init__( initial_state: ReasoningBudgetState = ReasoningBudgetState.IDLE, start_max_tokens: Optional[int] = 32, wait_utf8: bool = True, + verbose: bool = False, ): """ Initialize the reasoning budget sampler. @@ -2182,6 +2185,11 @@ def __init__( If True, when the budget is exhausted on an incomplete UTF-8 token piece, wait until a complete UTF-8 boundary before forcing the end sequence. + + verbose: + If True, print high-level reasoning-budget state transitions to + stderr. Logging is intentionally limited to transitions instead + of per-token events to avoid noisy generation output. """ if model is None: raise ValueError("model must not be None") @@ -2242,6 +2250,10 @@ def __init__( # Whether to delay forcing until a complete UTF-8 boundary. self.wait_utf8 = wait_utf8 + # Whether to print high-level state transition logs. + # This follows the model/runtime verbose flag and avoids per-token spam. + self.verbose = verbose + # Keep cloned Python sampler objects alive when llama.cpp clones the # sampler chain. Without this, cloned Python callbacks could be garbage # collected while C still holds function pointers to them. 
@@ -2258,6 +2270,19 @@ def __init__( name="reasoning-budget", ) + if self.verbose: + print( + f"ReasoningBudgetSampler: initialized " + f"(state={self.state.name}, budget={self.reasoning_budget}, " + f"start_max_tokens={self.start_max_tokens}, wait_utf8={self.wait_utf8}).", + file=sys.stderr, + ) + + def _log(self, message: str) -> None: + """Print a verbose reasoning-budget state transition message.""" + if self.verbose: + print(f"ReasoningBudgetSampler: {message}", file=sys.stderr) + def force(self) -> bool: """ Manually transition the active reasoning block into forced ending. @@ -2278,6 +2303,7 @@ def force(self) -> bool: self.state = ReasoningBudgetState.FORCING self.force_pos = 0 self.end_matcher.reset() + self._log("manual force requested; entering FORCING state.") return True def _token_utf8_complete(self, token: int) -> bool: @@ -2313,9 +2339,11 @@ def _start_counting(self) -> None: self.remaining = self.reasoning_budget self.end_matcher.reset() self.force_pos = 0 + self._log(f"reasoning_start matched; entering COUNTING state (budget={self.reasoning_budget}).") if self.remaining <= 0: self.state = ReasoningBudgetState.FORCING + self._log("budget is 0; entering FORCING state immediately.") def _accept(self, token: int) -> None: """ @@ -2345,6 +2373,10 @@ def _accept(self, token: int) -> None: and self.generated_tokens >= self.start_max_tokens ): self.state = ReasoningBudgetState.DONE + self._log( + f"reasoning_start not found within {self.start_max_tokens} generated tokens; " + "switching to DONE passthrough." 
+ ) return if self.state in ( @@ -2353,6 +2385,7 @@ def _accept(self, token: int) -> None: ): if self.end_matcher.advance(token): self.state = ReasoningBudgetState.DONE + self._log("reasoning_end matched naturally; switching to DONE passthrough.") return utf8_complete = self._token_utf8_complete(token) @@ -2362,6 +2395,7 @@ def _accept(self, token: int) -> None: self.state = ReasoningBudgetState.FORCING self.force_pos = 0 self.end_matcher.reset() + self._log("UTF-8 boundary reached; entering FORCING state.") return self.remaining -= 1 @@ -2370,15 +2404,18 @@ def _accept(self, token: int) -> None: self.state = ReasoningBudgetState.FORCING self.force_pos = 0 self.end_matcher.reset() + self._log("reasoning budget exhausted; entering FORCING state.") else: self.state = ReasoningBudgetState.WAITING_UTF8 self.end_matcher.reset() + self._log("reasoning budget exhausted; waiting for UTF-8 boundary before forcing.") return if self.state == ReasoningBudgetState.FORCING: self.force_pos += 1 if self.force_pos >= len(self.forced_tokens): self.state = ReasoningBudgetState.DONE + self._log("forced end sequence completed; switching to DONE passthrough.") return if self.state == ReasoningBudgetState.DONE: @@ -2448,6 +2485,8 @@ def _reset(self) -> None: if self.state == ReasoningBudgetState.COUNTING and self.remaining <= 0: self.state = ReasoningBudgetState.FORCING + self._log(f"reset to {self.state.name} state.") + def _clone(self): """ Clone the full runtime state. 
@@ -2464,6 +2503,7 @@ def _clone(self): initial_state=self.initial_state, start_max_tokens=self.start_max_tokens, wait_utf8=self.wait_utf8, + verbose=self.verbose, ) cloned.remaining = self.remaining From 1b472b354b6d0dbb841b8b29e260a8544453ecf3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 12:13:18 +0800 Subject: [PATCH 102/139] feat: pass reasoning budget params through Llama APIs - Add reasoning budget params to public completion and chat entry points - Forward the params from chat handlers into create_completion - Propagate reasoning budget controls down to generate and sampling params - Document -1/0/N reasoning_budget behavior in completion docstrings - Support custom reasoning_start and reasoning_end tags without model-specific inference - Support reasoning_budget_message and reasoning_start_in_prompt - Wire MTMD chat handler to the same reasoning budget controls Signed-off-by: JamePeng --- llama_cpp/llama.py | 122 +++++++++++++++++++++++++++++++++ llama_cpp/llama_chat_format.py | 40 +++++++++++ 2 files changed, 162 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 43e3d6f1fd..2bab3709e6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1355,6 +1355,13 @@ def sample( grammar_lazy: bool = False, idx: Optional[int] = None, seed: Optional[int] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ): """Sample a token from the model. Returns: @@ -1413,6 +1420,16 @@ def sample( logit_bias=self._convert_logit_bias(logit_bias), grammar=grammar.grammar if grammar else "", grammar_lazy=grammar_lazy, + + # Reasoning Budget + # This generic controller only counts the first visible reasoning + # block. Use reasoning_budget=-1 to leave it disabled. 
+ reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) # LogitsProcessor Adapter @@ -1487,6 +1504,13 @@ def generate( seed: Optional[int] = None, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. @@ -1532,6 +1556,18 @@ def generate( grammar: Optional BNF-like grammar (GBNF) to constrain sampling syntax. grammar_lazy: If True, activates grammar constraints only on specific trigger tokens. seed: RNG seed for sampling. Overrides the instance seed. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the reasoning budget sampler, 0 forces the block to end + immediately after it starts, and N > 0 allows at most N generated tokens. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + Defaults to "". Pass a model-specific value for non-default tags. + reasoning_end: Token/text sequence that marks the natural and forced end of the reasoning block. + Defaults to "". + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template has already inserted reasoning_start, + so counting starts from the first generated token. + reasoning_start_max_tokens: Safety window for non-reasoning models. 
If reasoning_start is not + generated within this many output tokens, the sampler becomes a no-op. Set None to wait indefinitely. active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). @@ -1682,6 +1718,16 @@ def generate( grammar=grammar._grammar if grammar else "", grammar_lazy=grammar_lazy, seed=seed if seed is not None else self._seed, + + # Reasoning Budget + # Keeps the core sampler model-agnostic: callers provide the visible + # reasoning start/end tags, and -1 keeps the controller disabled. + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) # Register custom python-level logits processors if provided @@ -2065,6 +2111,13 @@ def _create_completion( seed: Optional[int] = None, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: @@ -2253,6 +2306,12 @@ def _create_completion( seed=seed if seed is not None else self._seed, active_loras=active_loras, control_vector=control_vector, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + 
reasoning_start_max_tokens=reasoning_start_max_tokens, ): if llama_cpp_lib.llama_token_is_eog(self._model.vocab, token): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) @@ -2717,6 +2776,13 @@ def create_completion( grammar_lazy: bool = False, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -2761,6 +2827,14 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. grammar_lazy: If True, enables lazy evaluation. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the sampler, 0 forces an immediate end after reasoning starts, + and N > 0 allows at most N generated tokens inside the block. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + reasoning_end: Token/text sequence that naturally and forcibly ends the reasoning block. + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template already inserted reasoning_start. + reasoning_start_max_tokens: Safety window before disabling the sampler for non-reasoning outputs. active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). 
@@ -2820,6 +2894,12 @@ def create_completion( grammar_lazy=grammar_lazy, active_loras=active_loras, control_vector=control_vector, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -2871,6 +2951,13 @@ def __call__( grammar_lazy: bool = False, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -2915,6 +3002,14 @@ def __call__( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. grammar_lazy: If True, enables lazy evaluation. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the sampler, 0 forces an immediate end after reasoning starts, + and N > 0 allows at most N generated tokens inside the block. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + reasoning_end: Token/text sequence that naturally and forcibly ends the reasoning block. + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template already inserted reasoning_start. + reasoning_start_max_tokens: Safety window before disabling the sampler for non-reasoning outputs. 
active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). @@ -2974,6 +3069,12 @@ def __call__( grammar_lazy=grammar_lazy, active_loras=active_loras, control_vector=control_vector, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) def create_chat_completion( @@ -3025,6 +3126,13 @@ def create_chat_completion( top_logprobs: Optional[int] = None, assistant_prefill: bool = False, add_generation_prompt: bool = True, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -3072,6 +3180,14 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. grammar_lazy: If True, enables lazy evaluation. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the sampler, 0 forces an immediate end after reasoning starts, + and N > 0 allows at most N generated tokens inside the block. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + reasoning_end: Token/text sequence that naturally and forcibly ends the reasoning block. + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template already inserted reasoning_start. 
+ reasoning_start_max_tokens: Safety window before disabling the sampler for non-reasoning outputs. active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). @@ -3138,6 +3254,12 @@ def create_chat_completion( control_vector=control_vector, assistant_prefill=assistant_prefill, add_generation_prompt=add_generation_prompt, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f91844bbb7..f502d68dc9 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -131,6 +131,17 @@ def __call__( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, assistant_prefill: bool = False, + # Reasoning Budget Params + # + # Generic first-reasoning-block budget control. These parameters are + # passed through to llama.create_completion() without model-specific + # inference or template guessing. + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -829,6 +840,17 @@ def chat_completion_handler( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, assistant_prefill: bool = False, + # Reasoning Budget Params + # + # Generic first-reasoning-block budget control. 
These parameters are + # passed through to llama.create_completion() without model-specific + # inference or template guessing. + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -964,6 +986,12 @@ def chat_completion_handler( stopping_criteria=stopping_criteria, grammar=grammar, logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) if tool is not None: tool_name = tool["function"]["name"] @@ -3512,6 +3540,12 @@ def __call__( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, add_generation_prompt: bool = True, + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -3772,6 +3806,12 @@ def __call__( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) if tool is not None: From fb65ed793957f6a197b910af65e2fe7c7cf215e6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 20:01:50 +0800 Subject: [PATCH 103/139] Update Submodule vendor/llama.cpp c4a278d..6b80c74 Signed-off-by: JamePeng --- 
llama_cpp/llama_chat_format.py | 3 ++- llama_cpp/mtmd_cpp.py | 34 ++++++++++++++++++++++++++++------ vendor/llama.cpp | 2 +- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f502d68dc9..f929bcd150 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3292,7 +3292,8 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( self.mtmd_ctx, (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), - len(media_bytes) + len(media_bytes), + False, ) if bitmap is None: diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 839c718ccd..4542555c65 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -326,7 +326,10 @@ def mtmd_get_audio_sample_rate(ctx: mtmd_context_p) -> c_int: # // if bitmap is audio: # // length of data must be n_samples * sizeof(float) # // the data is in float format (PCM F32) - +# // if data == nullptr: +# // the bitmap is considered "empty", and will be treated as a placeholder for counting tokens +# // you can pass the bitmap via mtmd_tokenize(), then call mtmd_*_get_n_tokens() to count the tokens +# // note: passing a placeholder bitmap to mtmd_encode() will return an error # MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); @ctypes_function_mtmd( "mtmd_bitmap_init", [ @@ -787,11 +790,22 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # # // it calls mtmd_helper_bitmap_init_from_buf() internally # // returns nullptr on failure # // this function is thread-safe -# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); +# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); @ctypes_function_mtmd( - "mtmd_helper_bitmap_init_from_file", [mtmd_context_p_ctypes, 
c_char_p], mtmd_bitmap_p_ctypes) -def mtmd_helper_bitmap_init_from_file(ctx: mtmd_context_p, fname: c_char_p) -> mtmd_bitmap_p: + "mtmd_helper_bitmap_init_from_file", [ + mtmd_context_p_ctypes, + c_char_p, + c_bool, + ], + mtmd_bitmap_p_ctypes +) +def mtmd_helper_bitmap_init_from_file( + ctx: mtmd_context_p, + fname: c_char_p, + placeholder: c_bool, + /, +) -> mtmd_bitmap_p: """ helper function to construct a mtmd_bitmap from a file it calls mtmd_helper_bitmap_init_from_buf() internally @@ -807,13 +821,21 @@ def mtmd_helper_bitmap_init_from_file(ctx: mtmd_context_p, fname: c_char_p) -> m # // note: audio files will be auto-detected based on magic bytes # // returns nullptr on failure # // this function is thread-safe -# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); +# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); @ctypes_function_mtmd( - "mtmd_helper_bitmap_init_from_buf", [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t], mtmd_bitmap_p_ctypes) + "mtmd_helper_bitmap_init_from_buf", [ + mtmd_context_p_ctypes, + POINTER(c_uint8), + c_size_t, + c_bool, + ], + mtmd_bitmap_p_ctypes +) def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, buf: CtypesArray[c_uint8], len: c_size_t, + placeholder: c_bool, /, ) -> mtmd_bitmap_p: """ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c4a278d68e..6b80c74f28 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c4a278d68efa17811006f2123a84081dac03fac7 +Subproject commit 6b80c74f285390368b3c99c5e750f19e9b096e98 From e001886f18d574b818a1963035fbc35ecfe1287c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 20:15:51 +0800 Subject: [PATCH 104/139] fix(mtmd): memory_can_shift() logic bug Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f929bcd150..4e7c045127 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3641,7 +3641,7 @@ def __call__( # Stage 5: Multimodal Physical OOM Defense if n_past + chunk_n_tokens > llama.n_ctx(): - if llama._ctx.memory_can_shift(): + if not llama._ctx.memory_can_shift(): raise RuntimeError( f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " f"(n_pos_per_embd > 1 or incompatible M-RoPE). " From 07afd3bc02cad3af20f10f83973b9a87c770dccb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 21:07:34 +0800 Subject: [PATCH 105/139] docs(README): document reasoning budget sampler usage - Add README section for first reasoning-block budget control - Document reasoning_budget -1/0/N semantics and related sampler parameters - Explain reasoning_budget_message injection before reasoning_end - Add examples for default tags, Mistral [THINK] tags, and Gemma4 channel tags - Clarify when to use reasoning_start_in_prompt for prefilled thinking tags - Note that reasoning_start_in_prompt is not a generic thinking-enabled switch - Mention verbose transition logs for reasoning-budget state changes Signed-off-by: JamePeng --- README.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/README.md b/README.md index ba1969793c..03f66a8cf9 100644 --- a/README.md +++ b/README.md @@ -899,6 +899,83 @@ Mirostat actively maintains a target entropy (`tau`) during generation to preven * **`logits_processor`** (`LogitsProcessorList`, optional): Custom Python callbacks to modify the logits tensor in-place before sampling. * **`stopping_criteria`** (`StoppingCriteriaList`, optional): Custom Python callbacks to halt generation based on the current sequence or scores. 
+ +### Reasoning Budget (First Reasoning Block) + +`llama-cpp-python` provides a generic reasoning-budget sampler for models that expose their thinking content with visible start/end tags. It controls only the **first visible reasoning block** in the generated output. After that block naturally ends or is forcibly closed, the sampler switches to passthrough mode and later reasoning tags are ignored. + +This feature is intentionally model-agnostic. It does not infer model families, inspect chat templates, or guess thinking tags. If a model uses tags other than `<think>...</think>`, pass the correct `reasoning_start` and `reasoning_end` explicitly. + +| Parameter | Default | Description | +| --- | --- | --- | +| `reasoning_budget` | `-1` | Token budget for the first visible reasoning block. `-1` disables the sampler, `0` forces an immediate end after the block starts, and `N > 0` allows at most `N` generated tokens inside the block. | +| `reasoning_start` | `"<think>"` | Token/text sequence that marks the beginning of the first reasoning block. | +| `reasoning_end` | `"</think>"` | Token/text sequence that naturally ends the reasoning block. When the budget is exhausted, the sampler forces this sequence. | +| `reasoning_budget_message` | `None` | Optional message inserted before `reasoning_end` when the budget is exhausted. | +| `reasoning_start_in_prompt` | `False` | Set to `True` only when the prompt/chat template has already inserted `reasoning_start`, so the sampler should start counting from the first generated token. | +| `reasoning_start_max_tokens` | `32` | Safety window for non-reasoning outputs. If `reasoning_start` is not generated within this many output tokens, the sampler becomes a no-op. Set to `None` to wait indefinitely. 
| + +Basic usage with the default `<think>...</think>` tags: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_budget_message="\n[reasoning budget exhausted]\n", + # You can also inject a natural-language transition before reasoning_end: + # reasoning_budget_message="\n...Wait, I have been thinking long enough. Let me start answering the user's question.\n", +) +``` +When the budget is exhausted, the sampler forces: `reasoning_budget_message` + `reasoning_end` + +For Mistral-style thinking tags, pass the tags explicitly: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_start="[THINK]", + reasoning_end="[/THINK]", +) +``` + +For Gemma4 channel-style thinking, adjust the start and end markers to match the visible channel tags: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_start="<|channel>", + reasoning_end="<channel|>", +) +``` + +Use `reasoning_start_in_prompt=True` when the prompt or chat template has already inserted the reasoning start tag. In that case, the sampler will not see the start tag during generation, so it must start directly in `COUNTING` state from the first generated token. This is suitable for thinking models or handlers that prefill the assistant prefix with a thinking tag, for example: + +```text +<|im_start|>assistant\n<think>\n +``` + +Example: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_start="<think>", + reasoning_end="</think>", + reasoning_start_in_prompt=True, +) +``` + +`reasoning_start_in_prompt` is **not** a generic "thinking enabled" switch. 
It should only be set when the final prompt already contains `reasoning_start` before generation begins. For templates that merely enable thinking but still expect the model to generate the start tag itself, keep `reasoning_start_in_prompt=False`. + +When `verbose=True`, high-level reasoning-budget transitions are printed to stderr, such as initialization, start-tag detection, budget exhaustion, forced ending, and final passthrough. + ### 🛠️ Usage Example You can pass these parameters directly when calling the model to generate text. From 504f7477847fe9149e185fe00843681e88ec6736 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 21:14:59 +0800 Subject: [PATCH 106/139] docs(README): Update `ReasoningBudgetSampler` quick link Signed-off-by: JamePeng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 03f66a8cf9..c605c94542 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ This package provides: - [Dynamic LoRA Example](https://github.com/JamePeng/llama-cpp-python#dynamic-lora-example) - [Control Vector Injection (Representation Engineering)](https://github.com/JamePeng/llama-cpp-python#control-vector-injection-representation-engineering) - [Sampling Configuration & Usage (LlamaSamplingParams)](https://github.com/JamePeng/llama-cpp-python#sampling-configuration--usage-llamasamplingparams) + - [How to use the ReasoningBudgetSampler](https://github.com/JamePeng/llama-cpp-python#reasoning-budget-first-reasoning-block) - [Multi-modal Models Support](https://github.com/JamePeng/llama-cpp-python#multi-modal-models) - Support Models Lists - [Loading a Local Image With Qwen3VL(Thinking/Instruct)](https://github.com/JamePeng/llama-cpp-python#loading-a-local-image-with-qwen3vlthinkinginstruct) From 5929d86c76d248d58daf38045cb38b974e5107d6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Jun 2026 04:49:25 +0800 Subject: [PATCH 107/139] feat(chat-format): Update google/gemma-4 chat template jinja Signed-off-by: 
JamePeng --- llama_cpp/llama_chat_format.py | 75 ++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 4e7c045127..fb42a59f23 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4820,12 +4820,12 @@ class Gemma4ChatHandler(MTMDChatHandler): GEMMA4_ETR_TOKEN = "" CHAT_FORMAT = ( - "{%- macro format_parameters(properties, required) -%}\n" + "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n" " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" " {%- set ns = namespace(found_first=false) -%}\n" " {%- for key, value in properties | dictsort -%}\n" " {%- set add_comma = false -%}\n" - " {%- if key not in standard_keys -%}\n" + " {%- if not filter_keys or key not in standard_keys -%}\n" " {%- if ns.found_first %},{% endif -%}\n" " {%- set ns.found_first = true -%}\n" " {{ key }}:{\n" @@ -4887,7 +4887,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- elif value is mapping -%}\n" " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" " properties:{\n" - " {{- format_parameters(value, value['required'] | default([])) -}}\n" + " {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n" " }\n" " {%- endif -%}\n" " {%- if value['required'] -%}\n" @@ -4910,10 +4910,10 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- set params = tool_data['function']['parameters'] -%}\n" " {%- if params -%}\n" " ,parameters:{\n" - " {%- if params['properties'] -%}\n" + " {%- if params.get('properties') -%}\n" " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" " {%- endif -%}\n" - " {%- if params['required'] -%}\n" + " {%- if params.get('required') -%}\n" " required:[\n" " {%- for item in params['required'] -%}\n" " <|\"|>{{- item -}}<|\"|>\n" @@ -4921,7 +4921,7 @@ class 
Gemma4ChatHandler(MTMDChatHandler): " {%- endfor -%}\n" " ],\n" " {%- endif -%}\n" - " {%- if params['type'] -%}\n" + " {%- if params.get('type') -%}\n" " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" " {%- endif -%}\n" " {%- endif -%}\n" @@ -4978,6 +4978,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endfor -%}\n" " {{- ns.result | trim -}}\n" "{%- endmacro -%}\n" + "\n" "{%- macro format_tool_response_block(tool_name, response) -%}\n" " {{- '<|tool_response>' -}}\n" " {%- if response is mapping -%}\n" @@ -4992,6 +4993,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endif -%}\n" " {{- '' -}}\n" "{%- endmacro -%}\n" + "\n" "{%- set ns = namespace(prev_message_type=None) -%}\n" "{%- set loop_messages = messages -%}\n" "{{- bos_token -}}\n" @@ -5004,7 +5006,13 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- set ns.prev_message_type = 'think' -%}\n" " {%- endif -%}\n" " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" - " {{- messages[0]['content'] | trim -}}\n" + " {%- if messages[0]['content'] is string -%}\n" + " {{- messages[0]['content'] | trim -}}\n" + " {%- elif messages[0]['content'] is sequence -%}\n" + " {%- for item in messages[0]['content'] -%}\n" + " {{- item['text'] | trim + ' '-}}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" " {%- set loop_messages = messages[1:] -%}\n" " {%- endif -%}\n" " {%- if tools -%}\n" @@ -5017,6 +5025,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endif -%}\n" " {{- '\\n' -}}\n" "{%- endif %}\n" + "\n" "{#- Pre-scan: find last user message index for reasoning guard -#}\n" "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" "{%- for i in range(loop_messages | length) -%}\n" @@ -5024,6 +5033,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- set ns_turn.last_user_idx = i -%}\n" " {%- endif -%}\n" "{%- endfor -%}\n" + "\n" "{#- Loop through messages -#}\n" "{%- for message in loop_messages -%}\n" " {%- if message['role'] != 'tool' -%}\n" @@ -5045,12 +5055,14 @@ class 
Gemma4ChatHandler(MTMDChatHandler): " {%- if not continue_same_model_turn -%}\n" " {{- '<|turn>' + role + '\\n' }}\n" " {%- endif -%}\n" + "\n" " {#- Render reasoning/reasoning_content as thinking channel -#}\n" " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" " {%- endif -%}\n" - " {%- if message['tool_calls'] -%}\n" + "\n" + " {%- if message.get('tool_calls') -%}\n" " {%- for tool_call in message['tool_calls'] -%}\n" " {%- set function = tool_call['function'] -%}\n" " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" @@ -5068,6 +5080,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endfor -%}\n" " {%- set ns.prev_message_type = 'tool_call' -%}\n" " {%- endif -%}\n" + "\n" " {%- set ns_tr_out = namespace(flag=false) -%}\n" " {%- if message.get('tool_responses') -%}\n" " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" @@ -5104,6 +5117,23 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endif -%}\n" " {%- endfor -%}\n" " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" + " {%- for part in tool_body -%}\n" + " {%- if part.get('type') == 'image_url' -%}\n" + " {%- set url_val = part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n" + " {{- '<|image|>' + url_val -}}\n" + " {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n" + " {%- if part.get('type') == 'audio_url' -%}\n" + " {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif part.get('type') == 'input_audio' -%}\n" + " {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n" + " {{- 
'<|audio|>' + audio_val -}}\n" + " {%- endif -%}\n" + # " {%- elif part.get('type') == 'video_url' -%}\n" + # " {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n" + # " {{- '<|video|>' + video_val -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" " {%- else -%}\n" " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" " {%- endif -%}\n" @@ -5112,6 +5142,8 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endif -%}\n" " {%- endfor -%}\n" " {%- endif -%}\n" + "\n" + " {%- set captured_content -%}\n" " {%- if message['content'] is string -%}\n" " {%- if role == 'model' -%}\n" " {{- strip_thinking(message['content']) -}}\n" @@ -5130,28 +5162,35 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" " {{- '<|image|>' + url_val -}}\n" " {%- set ns.prev_message_type = 'image' -%}\n" - " {%- elif item['type'] == 'audio_url' -%}\n" - " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- set ns.prev_message_type = 'audio' -%}\n" - " {%- elif item['type'] == 'input_audio' -%}\n" - " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n" + " {%- if item['type'] == 'audio_url' -%}\n" + " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif item['type'] == 'input_audio' -%}\n" + " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " 
{%- endif -%}\n" " {%- set ns.prev_message_type = 'audio' -%}\n" + " {%- endif -%}\n" # " {%- elif item['type'] == 'video_url' -%}\n" # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" # " {{- '<|video|>' + video_val -}}\n" # " {%- set ns.prev_message_type = 'video' -%}\n" - " {%- endif -%}\n" " {%- endfor -%}\n" " {%- endif -%}\n" + " {%- endset -%}\n" + "\n" + " {{- captured_content -}}\n" + " {%- set has_content = captured_content | trim | length > 0 -%}\n" + "\n" " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" " {{- '<|tool_response>' -}}\n" - " {%- elif not (ns_tr_out.flag and not message.get('content')) -%}\n" + " {%- elif not (ns_tr_out.flag and not has_content) -%}\n" " {{- '\\n' -}}\n" " {%- endif -%}\n" " {%- endif -%}\n" "{%- endfor -%}\n" + "\n" "{%- if add_generation_prompt -%}\n" " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" " {{- '<|turn>model\\n' -}}\n" @@ -5180,7 +5219,7 @@ def __call__(self, **kwargs): self.extra_template_arguments["enable_thinking"] = self.enable_thinking # Set the stop token based on Gemma 4's format () - # generation_config.json: "eos_token_id": [ 1, 106, 50] + # generation_config.json: "eos_token_id": [1, 106, 50] kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] if self.verbose: From d154e63e2e916e734bee29b3926abe80a9923fce Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Jun 2026 15:42:15 +0800 Subject: [PATCH 108/139] docs(README): update MinerU2.5-Pro-2605-1.2B OCR model support and link Signed-off-by: JamePeng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c605c94542..433e031ae0 100644 --- a/README.md +++ b/README.md @@ -1034,6 +1034,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | 
`lfm2-vl` | | [lfm2.5-vl](https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF) | `LFM25VLChatHandler` | `lfm2.5-vl` | | [deepseek-ocr](https://huggingface.co/JamePeng2023/DeepSeek-OCR-2-GGUF) | `MTMDChatHandler` | `None` | +| [mineru2.5-pro](https://huggingface.co/JamePeng2023/MinerU2.5-Pro-2605-1.2B-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-asr](https://huggingface.co/JamePeng2023/Qwen3-ASR-1.7B-GGUF) | `Qwen3ASRChatHandler` | `qwen3-asr` | From db8292d336ae1e708623792426481c414754353e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Jun 2026 17:15:32 +0800 Subject: [PATCH 109/139] Update Submodule vendor/llama.cpp 6b80c74..f71af35 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6b80c74f28..f71af352a5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 6b80c74f285390368b3c99c5e750f19e9b096e98 +Subproject commit f71af352a52b8efe824c7a698d0632afa4794c01 From 12861b918f67b62f78f28c5cabb7223f766e1097 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Jun 2026 18:08:31 +0800 Subject: [PATCH 110/139] Bump version to 0.3.40-Milestone - Reasoning Budget Control, Gemma 4 12B Support, Enhanced Jinja2ChatFormatter, NGram k/k4v Speculative Decoding, Faster Native Sampling and Multimodal Improvements Signed-off-by: JamePeng --- CHANGELOG.md | 304 +++++++++++++++++++++++++++++++++++++++++- llama_cpp/__init__.py | 2 +- 2 files changed, 304 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8ebb5cd3e..1865195db3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,308 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 
[Unreleased] +## [0.3.40-Milestone] Reasoning Budget Control, Gemma 4 12B Support, Enhanced Jinja2ChatFormatter, NGram k/k4v Speculative Decoding, Faster Native Sampling and Multimodal Improvements + +- feat(internals): Add `ReasoningBudgetSampler` support + - Add Python-backed `ReasoningBudgetSampler` for first reasoning-block control + - Install the sampler before probability filters to preserve forced end tokens + - Support `reasoning_budget` **-1/0/N** semantics in sampling params + - Force `reasoning_budget_message` + `reasoning_end` when the budget is exhausted + - Add manual `force_reasoning_budget()` at the sampling-context level + - Match llama.cpp force behavior by allowing only `COUNTING -> FORCING` + - Keep DONE as permanent passthrough and ignore later reasoning tags + - Support prefilled reasoning starts with `reasoning_start_in_prompt` + - Preserve UTF-8 boundary safety before forcing the end sequence + - Keep Python-backed custom sampler callbacks alive across C sampler usage + - Avoid shallow-copying custom_samplers when cloning sampler chains + - Add `verbose` parameter to `ReasoningBudgetSampler` to print high-level + state transitions to stderr. + - Log key events: initialization, `reasoning_start matched`, `budget exhausted`, + `forced end sequence`, `UTF-8 boundary waiting`, `manual force`, `natural end`, `reset`. + - Pass `verbose=getattr(model, "verbose", False)` from `LlamaSamplingContext` + when building the sampler chain. + - Preserve verbose flag when cloning the sampler. 
+ +- feat(Llama): pass `reasoning budget` params through Llama APIs + - Add `reasoning budget` params to public completion and chat entry points + - Forward the params from chat handlers into `create_completion` + - Propagate reasoning budget controls down to `generate` and `sampling params` + - Document -1/0/N reasoning_budget behavior in completion docstrings + - Support custom `reasoning_start` and `reasoning_end` tags without model-specific inference + - Support `reasoning_budget_message` and `reasoning_start_in_prompt` + - Wire `MTMD chat handler` to the same reasoning budget controls + +- feat(sampling): add reasoning budget configurations + * Introduce reasoning budget and block control parameters to `LlamaSamplingParams` + to mirror llama.cpp CLI semantics. This includes: + - `reasoning_budget` + - `reasoning_start` / `reasoning_end` + - `reasoning_budget_message` + - `reasoning_start_in_prompt` + - `reasoning_start_max_tokens` + - Fix typo from typ_p to typical_p in logs + - Also updated `print_params()` to include these new metrics. + +- feat: add `ReasoningBudgetState` enum and `TokenMatcher` helper class to _internals.py + * Introduce `ReasoningBudgetState` enum and `TokenMatcher` helper class + to `_internals.py`. This lays the groundwork for the upcoming + `ReasoningBudgetSampler`, mirroring the state machine defined in + `common/reasoning-budget.h`. + + - `ReasoningBudgetState`: Tracks the lifecycle of the first reasoning block. + - `TokenMatcher`: Handles incremental matching for multi-token sequences. 
+ +- docs(README): document reasoning budget sampler usage + - Add README section for first reasoning-block budget control + - Document reasoning_budget -1/0/N semantics and related sampler parameters + - Explain reasoning_budget_message injection before reasoning_end + - Add examples for default tags, Mistral [THINK] tags, and Gemma4 channel tags + - Clarify when to use reasoning_start_in_prompt for prefilled thinking tags + - Note that reasoning_start_in_prompt is not a generic thinking-enabled switch + - Mention verbose transition logs for reasoning-budget state changes + - docs(README): Update ReasoningBudgetSampler quick link + +- feat(chat-format): Update `google/gemma-4` chat template jinja + +- feat(llama): enhance chat template initialization with full special tokens + * Update Llama.__init__ to register additional tokenizer special tokens + and improve stop token handling for chat templates. + + - Expose extra special tokens (EOT, SEP, NL, PAD, MASK) via + `special_tokens_map` to Jinja2ChatFormatter. + - Keep BOS and EOS tokens as explicit parameters, no longer redundantly + put them in `special_tokens_map`. + - Build `stop_token_ids` once, including EOS and EOT tokens, skipping + invalid (-1) ids. + - Update try-block comment: now `{% generation %}` blocks are supported, + guard only against malformed or model-specific templates. + - This ensures better compatibility with HuggingFace-style chat templates + while maintaining llama-cpp-python prompt-rendering behavior. + +- **feat(chat-format): improve Jinja2ChatFormatter HF compatibility** + * Enhance Jinja2ChatFormatter to better support HuggingFace-style chat + templates while keeping the formatter lightweight and aligned with + llama-cpp-python's prompt-rendering needs. + + - Key changes: + - Add IgnoreGenerationTags Jinja extension for HF `{% generation %}` blocks. + - Enable Jinja loop controls for chat templates using break/continue. + - Register Transformers-compatible `tojson` behavior. 
+ - Register `raise_exception` and `strftime_now` as Jinja globals. + - Add `special_tokens_map` support for additional template variables. + - Add optional `documents` argument for document-aware templates. + - Precompute text stop sequences and token-id stopping criteria. + - Improve type normalization for `stop_token_ids`. + - Expand docstrings for formatter initialization and render-time variables. + +- docs(wiki): update SCHEMA.md to v0.4 with full wiki path layout + - Added comprehensive docs/wiki/ directory structure overview. + - Reorganized modules description; removed hardcoded module page list. + - Clarified top-level file purposes and update guidance. + - Updated page type examples and templates (Class/Module, Feature, Example, Development). + - Strengthened cross-linking rules and update/placeholder guidance. + - Bumped schema version from 0.3 → 0.4 and last_modified date. + +- docs(install): add source-aligned build and backend guide + * Document installation workflows for llama-cpp-python with a focus on + the underlying llama.cpp CMake build configuration. + - Add virtual environment, source install, editable install, rebuild, and + verification guidance. + - Document common CMake options such as GGML_NATIVE, + GGML_BACKEND_DL, GGML_CPU_ALL_VARIANTS, and compiler selection. + - Summarize backend-specific build flags for CUDA, BLAS, Metal, Vulkan, + OpenVINO, HIP, SYCL, OpenCL, CANN, ZenDNN, and zDNN. + - Include backend runtime notes and common installation pitfalls while + keeping server-related installation content out of the page. + - docs(wiki): link installation guide from index + * Promote the completed installation guide into the wiki entry point so + new users can find build and backend setup instructions before reading + API-specific documentation. + - Add a Getting Started section that links to install.md. + - Move installation to the top of the recommended reading order. + - Mark install.md as an available page. 
+ - Remove installation from the planned documentation areas. + - docs(readme): link detailed installation wiki guide + +- feat(mtmd): improve fallback chat template for multimodal models + - Add BOS/EOS token handling to the default MTMD chat format. + - Use a clearer role-based template with explicit USER and ASSISTANT prefixes. + - Append a newline after each message to keep generated prompts readable. + - Treat EOS as the end marker for the serialized conversation history before + the optional generation prompt. + - Improve fallback behavior for multimodal GGUF models that do not provide a + chat template, such as OCR-oriented models like `DeepSeek-OCR 1/2`. + - Make the default system prompt a single normalized string while preserving + its original meaning. + - Clean up minor formatting around MTMD context parameter initialization. + - docs(Readme): Update `Deepseek-OCR-2-GGUF` Link + - docs(README): update `MinerU2.5-Pro-2605-1.2B` OCR model support and link + + This improves prompt compatibility for multimodal models that either lack a + GGUF chat template or are not yet covered by a complete custom chat handler. + +- refactor(internals): align model metadata wrappers with llama.cpp API + - Use `llama_vocab_n_tokens()` instead of the old vocab size helper. + - Add Python wrappers for model description, size, chat template, and + trained RoPE frequency scaling. + - Clarify model capability helpers with docstrings matching llama.cpp + semantics. + - Rename `desc()` and `size()` to `model_desc()` and `model_size()` to + make their scope explicit. + - Drop the unused `get_tensor()` stub since llama.cpp does not expose it. + - Route rerank template lookup through `LlamaModel.model_chat_template()` for + consistency with the internal model abstraction. 
+ +- feat(chat_handler): update multimodal handlers for Qwen2.5-VL, Qwen3-VL, and PaddleOCR + - Update PaddleOCRChatHandler to support version 1.6 + - Add token configuration and stop sequences for Qwen2.5-VL and Qwen3-VL + - Standardize input_ids initialization in __call__ methods for Qwen2.5-VL, Qwen3-ASR, and Qwen3-VL handlers + +- **perf(eval): skip unnecessary logit array copies during native sampling** + * Introduce the `copy_logits` parameter to `Llama.eval()` to control + whether C-level logits are copied into the Python `self.scores` array. + - Automatically disable `copy_logits` during the generation loop unless + Python-side hooks (`logits_processor`, `stopping_criteria`) or + `logits_all` explicitly require them. + - Skip logit copies entirely for intermediate prompt evaluations (e.g., + before hybrid checkpoints). + - Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch + the final token's logits when copying is required. + + In a PDF-reading summarization workload, this reduced the end-to-end completion + time from 41.32s to 25.93s, a ~37.2% improvement. The main generation hot path + also improved noticeably: + + - `_create_completion`: 41.32s -> 25.93s + - `generate`: 37.82s -> below the top sampled entries + - `eval`: 35.14s -> 21.96s + - logits retrieval/copy path: 29.89s `get_logits()` -> 18.68s `get_logits_ith()` + - `decode`: 3.89s -> 2.25s + - `detokenize`: 2.60s -> 1.33s + - `sample`: 2.35s -> 2.03s + + This significantly reduces CPU overhead and memory bandwidth during generation, + as the native `llama.cpp` sampler reads directly from the C context without + needing to expose the `n_vocab` array to Python on every token. 
+ +- docs(CUDA): Add note about PDL optimization for newer NVIDIA GPUs (CC ≥ 90) + +- docs(readme/wiki): update supported embeddings models table + - Add `jina-embeddings-v2-base-zh` + - Add `jina-embeddings-v3` + - Minor table formatting clean up + +- docs(development): add AI agent prompt for git commit generation + * Introduce `git-commit-generation-agent.md` to the development wiki to + standardize the creation of high-quality git commit messages using LLM + assistants. + + - Define the system persona, core principles (Conventional Commits, DCO), + and strict formatting rules for generating commits. + - Provide concrete template examples for build, performance, and + documentation updates. + - Ensure future maintainers and contributors can easily generate + consistent, maintainer-level commits that explicitly explain the "Why" + and "How" of code changes. + +- docs(wiki): add development helper to index + * Introduce the development section in the wiki index so maintainer-facing + workflows and LLM-assisted helper tools are discoverable from the main + navigation. + + - Add a Development section with a link to the Git commit generation agent. + - Include the helper in the recommended reading order for new wiki users. + - Add development/git-commit-generation-agent.md to the available pages list. + +- feat(LlamaContext): add safety checks and docstrings to logits retrieval + - Add explicit null pointer validation to `get_logits` and `get_logits_ith`. + These methods now raise a `RuntimeError` instead of silently returning + invalid pointers when logits are unavailable or the index is out of bounds. + - Add comprehensive docstrings to both methods, detailing the underlying + buffer shape and memory layout. + - Include a performance warning in `get_logits_ith` about the internal + synchronization/reordering overhead to discourage its use on the hot path. 
+ +- **feat(speculative): upgrade ngram map decoder with k/k4v modes +Enhance `LlamaNGramMapDecoding` to align with the upstream llama.cpp +ngram-map algorithm, offering better memory management and draft quality.** + - Introduce `mode` selection ("k" and "k4v"): "k" stores only historical + positions for memory efficiency, while "k4v" caches continuation values + directly for faster lookups. + - Add `min_hits` threshold to filter out low-confidence drafts. + - Implement `max_entries_per_key` to cap dictionary growth and prevent + memory bloat during long-context generations. + - Improve state synchronization (`_sync_and_index`) using `sync_check_tokens` + to safely verify incremental history appends. + - Add explicit lifecycle management methods (`clear`, `close`, `accept`) + for better API symmetry and resource cleanup. + - examples: add benchmark script for speculative decoding + - Add `benchmark_speculative.py` to the `examples/benchmark` directory. + - Test `LlamaPromptLookupDecoding` and `LlamaNGramMapDecoding` (k/k4v). + - Include diverse test scenarios (code, JSON logs, tables, essays) to + measure tokens-per-second (TPS) speedup compared to baseline generation. + +- docs(speculative): update wiki for NGramMap k/k4v modes and lifecycle APIs +Reflect the recent architectural upgrades to `LlamaNGramMapDecoding` in +the official documentation. + + - Document the new `__init__` parameters (`mode`, `min_hits`, + `max_entries_per_key`, `sync_check_tokens`) and their validation rules. + - Add a detailed comparison table explaining the memory and behavior + differences between the `"k"` and `"k4v"` lookup modes. + - Document the newly exposed lifecycle methods (`clear`, `close`, `accept`). + - Add comprehensive usage examples demonstrating `k4v` mode with memory caps. + - Update internal state descriptions (replacing `_ngram_map` with `_map_k` + and `_map_k4v`). 
+ - Add a strong production warning against the legacy `LlamaPromptLookupDecoding` + and cross-link the new `benchmark_speculative.py` script. + +- docs(readme): revamp speculative decoding documentation +Expand the Speculative Decoding section to fully document the +new `LlamaNGramMapDecoding` capabilities and configuration options. + + - Clarify that `LlamaNGramMapDecoding` is a model-free prompt lookup + decoder that does not require a secondary GGUF draft model. + - Add a detailed parameter table explaining `mode` (k vs. k4v), + `min_hits`, memory caps, and sync thresholds. + - Provide usage examples and tuning recommendations for different + hardware (e.g., lowering `num_pred_tokens` for CPU setups). + - Demote the older `LlamaPromptLookupDecoding` to a legacy section, + warning about its sliding-window overhead on long contexts. + - Add practical notes on performance and state management (`clear()`). + +- docs(readme): Removed outdated macOS installation guides and added the latest installation notes. + +- docs(readme): Add Windows ROCm build instructions(by **@0xDELUXA**) + - Optimize the formatting of the ROCm section in README.md. + +- fix: wire LFM VL chat handlers into server loader(by **@JayAnderson360**) + +- build(cmake): disable building of upstream unified binary + - Set `LLAMA_BUILD_APP` to `OFF` to prevent the compilation of the new + unified `llama` binary introduced in upstream llama.cpp. + + - Since the Python package only requires the underlying shared libraries + and specific targets, explicitly disabling the standalone application + reduces build times and prevents unnecessary executable artifacts from + being compiled. + +- build(deps): align Jinja2 minimum with Transformers + - Require Jinja2 >= 3.1.0 for HuggingFace-style chat template support. + + - The updated Jinja2ChatFormatter relies on behavior aligned with Transformers' + chat-template runtime, which also requires Jinja2 3.1 or newer. 
Updating the + minimum dependency avoids parser/runtime differences with older Jinja versions. + +- ci: update metal build/test job to macos-26/macos-15-intel + - Build on the Tahoe runners in order to enable the tensor API for M5 and A19. + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/f71af352a52b8efe824c7a698d0632afa4794c01](https://github.com/ggml-org/llama.cpp/commit/f71af352a52b8efe824c7a698d0632afa4794c01) + +- feat: Sync llama.cpp llama/mtmd/ggml API Binding 20260606 + +For more information, see: https://github.com/JamePeng/llama-cpp-python/compare/a778c57d73ec7d4f43e2518a513e7d4cf68a0df8...db8292d336ae1e708623792426481c414754353e + ## [0.3.39] Dynamic GGML Backends, Qwen3-ASR/MiniCPM-V-4.6, On-Device Hybrid Checkpoint, and Granular Logging - **ci(cu131/128/126/124): build wheels with GGML dynamic backends for windows/Linux** @@ -513,7 +815,7 @@ This commit significantly overhauls the media parsing and loading pipeline in `M - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/f5ddcd1696eca5069dc7915f4d4c03c9a709afea](https://github.com/ggml-org/llama.cpp/commit/f5ddcd1696eca5069dc7915f4d4c03c9a709afea) -## [0.3.30] Milestone Release +## [0.3.30-Milestone] Milestone Release I will update the release notes for version 0.3.30 in the [discussion](https://github.com/JamePeng/llama-cpp-python/discussions). 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index ec28faae66..1650e6af69 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.39" +__version__ = "0.3.40" From b1ad4452e24baac561e75b254192ebf55f1fbd3c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 01:03:44 +0800 Subject: [PATCH 111/139] Update Submodule vendor/llama.cpp f71af35..f0156d1 Signed-off-by: JamePeng --- llama_cpp/llama.py | 1 + llama_cpp/llama_cpp.py | 7 +++++++ vendor/llama.cpp | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2bab3709e6..d89f4c361d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -503,6 +503,7 @@ def __init__( self.context_params.n_threads_batch = self.n_threads_batch self.context_params.ctx_type = ctx_type + self.context_params.ctx_other = None self.context_params.rope_scaling_type = ( rope_scaling_type if rope_scaling_type is not None diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 9c911bcb14..1e81d80f65 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -898,6 +898,9 @@ class llama_sampler_seq_config(ctypes.Structure): # // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) # struct llama_sampler_seq_config * samplers; # size_t n_samplers; +# // a source/target/parent context +# // can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts +# struct llama_context * ctx_other; # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -945,6 +948,8 @@ class llama_context_params(ctypes.Structure): samplers(llama_sampler_seq_config *): the samplers must be sampler chains (i.e. 
use llama_sampler_chain_init) n_samplers(size_t): numbers of sampler chains + + ctx_other(llama_context *): a source/target/parent context can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts """ if TYPE_CHECKING: @@ -983,6 +988,7 @@ class llama_context_params(ctypes.Structure): kv_unified:bool samplers: ctypes.c_void_p n_samplers: int + ctx_other: ctypes.c_void_p _fields_ = [ ("n_ctx", ctypes.c_uint32), @@ -1020,6 +1026,7 @@ class llama_context_params(ctypes.Structure): ("kv_unified", ctypes.c_bool), ("samplers", llama_sampler_seq_config_p), ("n_samplers", ctypes.c_int), + ("ctx_other", ctypes.c_void_p), ] llama_context_params_p = ctypes.POINTER(llama_context_params) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f71af352a5..f0156d1401 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f71af352a52b8efe824c7a698d0632afa4794c01 +Subproject commit f0156d1401500512ad85042ccf38970568b12253 From 7a8272e6b928974efc8c131d518b1363e2e47263 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 01:24:53 +0800 Subject: [PATCH 112/139] feat(_ctypes_extensions): improve error diagnostics for shared library loading When `load_shared_library` fails, the resulting `RuntimeError` now includes a listing of the contents of the searched directories. This provides immediate context to help developers diagnose missing, misplaced, or incorrectly named library files. - Added `_format_library_dir_contents` to safely format directory listings. - Appended the directory listing to the failure message. - Confined this diagnostic work strictly to the failure path to avoid any performance overhead during successful imports. 
Signed-off-by: JamePeng
---
 llama_cpp/_ctypes_extensions.py | 34 +++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py
index a8936fa2bf..1a9f8eb8c5 100644
--- a/llama_cpp/_ctypes_extensions.py
+++ b/llama_cpp/_ctypes_extensions.py
@@ -18,6 +18,37 @@
 )
 from typing_extensions import TypeAlias

+def _format_library_dir_contents(base_paths: list[pathlib.Path]) -> str:
+    """Return one text section per entry of `base_paths` (its file names, or why it could not be listed) for load-failure diagnostics."""
+    sections = []
+
+    for base_path in base_paths:
+        p = pathlib.Path(base_path)
+
+        if not p.exists():
+            sections.append(f"{p}: (directory does not exist)")
+            continue
+
+        if not p.is_dir():
+            sections.append(f"{p}: (not a directory)")
+            continue
+
+        try:
+            # Only list files when reporting a final loading failure.
+            files = sorted(x.name for x in p.iterdir())
+        except Exception as e:  # best-effort diagnostics: report the listing error inline instead of masking the load failure
+            sections.append(f"{p}: (failed to list contents: {e})")
+            continue
+
+        if files:
+            sections.append(
+                f"{p}:\n"
+                + "\n".join(f" - {name}" for name in files)
+            )
+        else:
+            sections.append(f"{p}: (empty)")
+
+    return "\n".join(sections)

 # Load the library
 def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list[pathlib.Path]]):
@@ -114,9 +145,12 @@ def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list
             except Exception as e:
                 errors.append(f"{lib_path}: {e}")

+    # Include directory contents only in the failure path to avoid extra work during successful imports.
raise RuntimeError( f"Failed to load '{lib_base_name}' from {base_paths}\n" + "\n".join(errors) + + "\nLibrary search path contents:\n" + + _format_library_dir_contents(base_paths) ) From 323da373ad2f30409123bfba8322041113f0eba8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 23:08:36 +0800 Subject: [PATCH 113/139] build(CMakelists): Improve Windows LLVM OpenMP runtime discovery - Also improve diagnostics by reporting the selected runtime source and path, warning when an explicit override points to a missing file, and keeping a clear runtime warning when no OpenMP DLL can be found. Signed-off-by: JamePeng --- CMakeLists.txt | 79 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f09cdb783..1ace43c4aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,8 @@ function(llama_cpp_python_install_target target) endfunction() -# Install an extra Windows runtime DLL into the Python package runtime directory. +# Copy an extra Windows runtime DLL into the Python package runtime directory +# during the CMake install step. # # Some dynamically loaded backend libraries depend on runtime DLLs that are not # always discoverable through $. 
One important example @@ -75,7 +76,10 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file) endif() if(NOT EXISTS "${runtime_file}") - message(WARNING "Windows runtime file does not exist and will not be installed: ${runtime_file}") + message(WARNING + "Windows runtime DLL was selected but does not exist and will not be copied: " + "${runtime_file}" + ) return() endif() @@ -92,6 +96,11 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file) foreach(DIR ${INSTALL_DIRS}) file(TO_CMAKE_PATH "${DIR}" DIR_CMAKE) + message(STATUS + "Will copy Windows runtime DLL during install: " + "${runtime_file_cmake} -> ${DIR_CMAKE}" + ) + install( FILES "${runtime_file_cmake}" DESTINATION "${DIR_CMAKE}" @@ -115,42 +124,62 @@ function(llama_cpp_python_install_windows_openmp_runtime) endif() set(OPENMP_RUNTIME_DLL "") + set(OPENMP_RUNTIME_SOURCE "") + set(FOUND_OPENMP_DLLS "") + + if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL) + if(EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "LLAMA_CPP_OPENMP_RUNTIME_DLL") + else() + message(WARNING + "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: " + "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio " + "LLVM OpenMP runtime discovery." 
+ ) + endif() + endif() - if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL AND EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") - set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") - else() + if(NOT OPENMP_RUNTIME_DLL) file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) - set(VS_OPENMP_SEARCH_ROOTS - "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" - "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" - "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" - "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" - ) + set(VS_OPENMP_SEARCH_PATTERNS + # Prefer the exact VS 2022 Enterprise / BuildTools LLVM OpenMP redist layout. + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - foreach(ROOT ${VS_OPENMP_SEARCH_ROOTS}) - if(EXISTS "${ROOT}") - file( - GLOB_RECURSE FOUND_OPENMP_DLLS - "${ROOT}/*/debug_nonredist/x64/Microsoft.VC*.OpenMP.LLVM/libomp140.x86_64.dll" - "${ROOT}/**/libomp140.x86_64.dll" - ) + # Keep these as secondary fallbacks for non-standard installs. 
+ "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "C:/Windows/System32/libomp140.x86_64.dll" + ) - if(FOUND_OPENMP_DLLS) - list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) - break() - endif() - endif() + foreach(PATTERN ${VS_OPENMP_SEARCH_PATTERNS}) + file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}") + list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS}) endforeach() + + if(FOUND_OPENMP_DLLS) + list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS) + list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING) + list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) + set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 LLVM OpenMP redist fallback") + endif() endif() if(OPENMP_RUNTIME_DLL) - message(STATUS "Installing Windows LLVM OpenMP runtime: ${OPENMP_RUNTIME_DLL}") + message(STATUS + "Selected Windows LLVM OpenMP runtime from ${OPENMP_RUNTIME_SOURCE}: " + "${OPENMP_RUNTIME_DLL}" + ) llama_cpp_python_install_windows_runtime_file("${OPENMP_RUNTIME_DLL}") else() message(WARNING - "Could not find libomp140.x86_64.dll. " + "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. " + "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL and Visual Studio 2022 " + "Enterprise/BuildTools redist paths under Program Files and Program Files (x86), " + "with a fuzzy MSVC version match such as 14.44.35112 or 14.44.35207. " "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " "the packaged ggml-cpu-*.dll files may fail to load at runtime. 
" "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " From 111819832614d488c840b266ad95f894f420bfea Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 23:48:49 +0800 Subject: [PATCH 114/139] ci(test): add cuda 13.0.2 build workflow Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu130-win.yml | 249 +++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 .github/workflows/build-wheels-cu130-win.yml diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml new file mode 100644 index 0000000000..790d7c9665 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -0,0 +1,249 @@ +name: Build Wheels (CU130) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu130 + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: ["windows-2022"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] + cuda: ["13.0.2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + + defaults: + run: + shell: pwsh + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Add MSBuild to PATH + uses: microsoft/setup-msbuild@v3 + with: + msbuild-architecture: x64 + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Inspect Visual Studio OpenMP runtime paths + run: | + Write-Output "ProgramFiles=$env:ProgramFiles" + Write-Output "ProgramFiles(x86)=${env:ProgramFiles(x86)}" + Write-Output "" + + $vsRoots = @( + "$env:ProgramFiles\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "$env:ProgramFiles\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC" + ) + 
+ foreach ($root in $vsRoots) { + Write-Output "Checking root: $root" + + if (Test-Path $root) { + Write-Output " Exists: yes" + Write-Output " MSVC version directories:" + + Get-ChildItem $root -Directory -ErrorAction SilentlyContinue | + Sort-Object Name | + ForEach-Object { + Write-Output " $($_.FullName)" + } + + Write-Output " OpenMP runtime candidates:" + + Get-ChildItem $root -Recurse -Filter "libomp140.x86_64.dll" -ErrorAction SilentlyContinue | + Sort-Object FullName | + ForEach-Object { + $sizeKB = [Math]::Round($_.Length / 1KB, 2) + $sizeMB = [Math]::Round($_.Length / 1MB, 4) + + Write-Output " Path: $($_.FullName)" + Write-Output " Size: $($_.Length) bytes / $sizeKB KB / $sizeMB MB" + } + } else { + Write-Output " Exists: no" + } + + Write-Output "" + } + + Write-Output "Checking System32 fallback:" + $system32OpenMP = "C:\Windows\System32\libomp140.x86_64.dll" + + if (Test-Path $system32OpenMP) { + $dll = Get-Item $system32OpenMP + $sizeKB = [Math]::Round($dll.Length / 1KB, 2) + $sizeMB = [Math]::Round($dll.Length / 1MB, 4) + + Write-Output " Path: $($dll.FullName)" + Write-Output " Size: $($dll.Length) bytes / $sizeKB KB / $sizeMB MB" + } else { + Write-Output " Not found: $system32OpenMP" + } + + - name: Install CUDA ${{ matrix.cuda }} + uses: Jimver/cuda-toolkit@v0.2.35 + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda }} + use-github-cache: false + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo 
LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 + } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+
+          # Suppress CUDA compiler warnings
+          $cudaDiagSuppress = '--diag-suppress=177,221,550'
+
+          $cmakeArgs = @(
+              # Windows toolchain / common runtime (pass the toolchain file resolved into $toolchainFile earlier in this step)
+              "-DCMAKE_TOOLCHAIN_FILE=$toolchainFile"
+              '-DLLAMA_BUILD_BORINGSSL=ON'
+
+              # Disable non-wheel targets
+              '-DLLAMA_BUILD_EXAMPLES=OFF'
+              '-DLLAMA_BUILD_TESTS=OFF'
+              '-DLLAMA_BUILD_TOOLS=OFF'
+              '-DLLAMA_BUILD_SERVER=OFF'
+              '-DLLAMA_BUILD_UI=OFF'
+              '-DLLAMA_USE_PREBUILT_UI=OFF'
+              '-DLLAMA_CURL=OFF'
+
+              # GGML dynamic backend layout
+              '-DGGML_CPU=ON'
+              '-DGGML_CUDA=ON'
+              '-DGGML_NATIVE=OFF'
+              '-DGGML_BACKEND_DL=ON'
+              '-DGGML_CPU_ALL_VARIANTS=ON'
+              '-DGGML_OPENMP=ON'
+
+              # CUDA backend
+              "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER"
+              '-DGGML_CUDA_FORCE_MMQ=ON'
+              '-DCUDA_SEPARABLE_COMPILATION=ON'
+              "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress"
+
+              # Build behavior
+              "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS"
+              '-DENABLE_CCACHE=ON'
+          )
+
+          $env:CMAKE_ARGS = $cmakeArgs -join ' '
+          Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS"
+
+          python -m build --wheel
+
+          # Check if wheel was built
+          if (!(Test-Path '.\dist\*.whl')) {
+              Write-Error "No wheel built in dist/ directory"
+              exit 1
+          }
+
+          $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1
+
+          # Wheel filename format:
+          # name-version-python_tag-abi_tag-platform_tag.whl
+          $parts = $wheelFile.Name.Split('-')
+          $distName = $parts[0]
+          $version = $parts[1]
+          $pyTag = $parts[2]
+          $abiTag = $parts[3]
+          $platTag = $parts[4]
+
+          # CPU all-variants is now an internal runtime layout detail.
+ $newVersion = "$version+cu$cudaVersion" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + + # Write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV + + - name: Get current date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 7a6ee9fcd57438a950eb2ee6c8e079f2409c2765 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 00:33:10 +0800 Subject: [PATCH 115/139] =?UTF-8?q?build(CMakeLists):=20prefer=20VS=202022?= =?UTF-8?q?=20VC143=20OpenMP=20redist=20and=20keep=20System32=20as=20final?= =?UTF-8?q?=20fallback=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: JamePeng --- CMakeLists.txt | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ace43c4aa..5b2cfeeb8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,7 +135,7 @@ function(llama_cpp_python_install_windows_openmp_runtime) message(WARNING "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: " "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio " - "LLVM OpenMP runtime discovery." + "VC143 LLVM OpenMP runtime discovery." 
) endif() endif() @@ -144,18 +144,19 @@ function(llama_cpp_python_install_windows_openmp_runtime) file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) - set(VS_OPENMP_SEARCH_PATTERNS - # Prefer the exact VS 2022 Enterprise / BuildTools LLVM OpenMP redist layout. + set(VS_OPENMP_VC143_PATTERNS + # Prefer VS 2022 VC143 LLVM OpenMP redist paths. + # The MSVC version directory is intentionally globbed because + # GitHub runners may contain versions such as 14.44.35112 or 14.44.35207. "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - # Keep these as secondary fallbacks for non-standard installs. + # Secondary VS layout fallbacks for unusual installations. "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - "C:/Windows/System32/libomp140.x86_64.dll" ) - foreach(PATTERN ${VS_OPENMP_SEARCH_PATTERNS}) + foreach(PATTERN ${VS_OPENMP_VC143_PATTERNS}) file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}") list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS}) endforeach() @@ -164,7 +165,16 @@ function(llama_cpp_python_install_windows_openmp_runtime) list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS) list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING) list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) - set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 LLVM OpenMP redist fallback") + set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 VC143 LLVM OpenMP redist") + endif() + endif() + + if(NOT OPENMP_RUNTIME_DLL) + 
set(SYSTEM32_OPENMP_RUNTIME_DLL "C:/Windows/System32/libomp140.x86_64.dll") + + if(EXISTS "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "System32 fallback") endif() endif() @@ -177,9 +187,10 @@ function(llama_cpp_python_install_windows_openmp_runtime) else() message(WARNING "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. " - "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL and Visual Studio 2022 " - "Enterprise/BuildTools redist paths under Program Files and Program Files (x86), " - "with a fuzzy MSVC version match such as 14.44.35112 or 14.44.35207. " + "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL, Visual Studio 2022 " + "Enterprise/BuildTools VC143 redist paths under Program Files and " + "Program Files (x86), with a fuzzy MSVC version match such as " + "14.44.35112 or 14.44.35207, and C:/Windows/System32 as a final fallback. " "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " "the packaged ggml-cpu-*.dll files may fail to load at runtime. 
" "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " From 50bbdd61fdf7e2e1cd7582a2183e476c98a47c17 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 02:54:00 +0800 Subject: [PATCH 116/139] Update Submodule vendor/llama.cpp f0156d1..7d2b45b Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f0156d1401..7d2b45b4f7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f0156d1401500512ad85042ccf38970568b12253 +Subproject commit 7d2b45b4f7b663cda74f23fbc3ce6dc3bd4f6545 From 55e855b75f901b494259a1c81b45ac80f0e3013f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 05:03:15 +0800 Subject: [PATCH 117/139] Update mtmd API 20260609 Signed-off-by: JamePeng --- llama_cpp/mtmd_cpp.py | 293 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 283 insertions(+), 10 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 4542555c65..61fb0e7859 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -10,12 +10,14 @@ c_uint8, c_int32, c_uint32, + c_int64, c_float, c_void_p, c_size_t, POINTER, _Pointer, # type: ignore Structure, + CFUNCTYPE ) import pathlib from typing import ( @@ -318,6 +320,16 @@ def mtmd_get_audio_sample_rate(ctx: mtmd_context_p) -> c_int: """ ... +# // get the current marker string +# MTMD_API const char * mtmd_get_marker(const mtmd_context * ctx); +@ctypes_function_mtmd( + "mtmd_get_marker", [mtmd_context_p_ctypes], c_char_p) +def mtmd_get_marker(ctx: mtmd_context_p) -> c_char_p: + """ + get the current marker string + """ + ... + # // mtmd_bitmap # // # // if bitmap is image: @@ -420,6 +432,58 @@ def mtmd_bitmap_set_id( ... 
+# // mtmd_bitmap lazy
+# //
+# // this is a special bitmap that:
+# // - does not hold the actual data
+# // - can be expanded into one or more chunks (either media to text chunks)
+# // user must provide a callback to fill in the data when mtmd_tokenize() is called
+# // this is useful for large video inputs:
+# // - allow reading video frame by frame, without loading the entire video into memory
+# // - allow tracking the whole video with a single ID (for example, the file hash)
+
+# // set (*out_bitmap) to non-nullptr to emit a bitmap chunk; it will be freed automatically
+# // set (*out_text) to non-nullptr to emit a text chunk; it must be heap-allocated, null-terminated and will be freed automatically
+# // either out_bitmap or out_text can be set, but not both
+# // out_bitmap cannot be another lazy bitmap (no nested lazy allowed)
+# // return value:
+# //    0 on success
+# //   -1 on EOF (signal to mtmd_tokenize to move on)
+# //   -2 on error (signal to mtmd_tokenize to abort)
+# typedef int(* mtmd_bitmap_lazy_callback)(
+#     size_t chunk_idx,
+#     void * user_data,
+#     mtmd_bitmap ** out_bitmap,
+#     char ** out_text);
+mtmd_bitmap_lazy_callback = CFUNCTYPE(
+    c_int,
+    c_size_t, # chunk_idx
+    c_void_p, # user_data
+    POINTER(mtmd_bitmap_p_ctypes), # mtmd_bitmap ** out_bitmap (POINTER() needs the ctypes type, not the typing alias)
+    POINTER(c_char_p), # char ** out_text
+)
+
+# MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx,
+#                                              const char * id, // usually set to file hash
+#                                              void * user_data,
+#                                              mtmd_bitmap_lazy_callback callback);
+@ctypes_function_mtmd(
+    "mtmd_bitmap_init_lazy", [
+        mtmd_context_p_ctypes,
+        c_char_p,
+        c_void_p,
+        mtmd_bitmap_lazy_callback,
+    ], mtmd_bitmap_p_ctypes)
+def mtmd_bitmap_init_lazy(
+    ctx: mtmd_context_p,
+    id: c_char_p,
+    user_data: c_void_p,
+    callback: mtmd_bitmap_lazy_callback, # type: ignore
+    /,
+) -> mtmd_bitmap_p:
+    """Create a lazy bitmap tagged with `id`; `callback` fills in its chunks when mtmd_tokenize() is called. Keep a reference to `callback` alive while the bitmap is in use."""
+
+
 # // mtmd_input_chunks
 # //
 # // this is simply a list of mtmd_input_chunk
@@ -772,6 +836,9 @@ def mtmd_test_create_input_chunks() -> mtmd_input_chunk_p:
 # // BREAKING CHANGES are expected.
 # //

+# struct mtmd_helper_video;
+mtmd_helper_video_p = NewType("mtmd_helper_video_p", int)
+mtmd_helper_video_p_ctypes = c_void_p

 # // Set callback for all future logging events.
 # // If this is not called, or NULL is supplied, everything is output on stderr.
@@ -786,11 +853,33 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): #
     ...


+# // Returns true if this build includes video support (MTMD_VIDEO was ON at compile time).
+# MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx);
+@ctypes_function_mtmd(
+    "mtmd_helper_support_video", [mtmd_context_p_ctypes], c_bool)
+def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool:
+    """
+    Returns true if this build includes video support (MTMD_VIDEO was ON at compile time).
+    """
+    ...
+
+
+# struct mtmd_helper_bitmap_wrapper {
+#     mtmd_bitmap * bitmap;
+#     mtmd_helper_video * video_ctx;
+# };
+class mtmd_helper_bitmap_wrapper(Structure):
+    _fields_ = [
+        ("bitmap", mtmd_bitmap_p_ctypes), # mtmd_bitmap * (field types must be ctypes types, not the NewType aliases)
+        ("video_ctx", mtmd_helper_video_p_ctypes), # mtmd_helper_video *
+    ]
+mtmd_helper_bitmap_wrapper_p_ctypes = POINTER(mtmd_helper_bitmap_wrapper)
+
 # // helper function to construct a mtmd_bitmap from a file
 # // it calls mtmd_helper_bitmap_init_from_buf() internally
 # // returns nullptr on failure
 # // this function is thread-safe
-# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
+# MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
 @ctypes_function_mtmd(
     "mtmd_helper_bitmap_init_from_file",
     [
@@ -798,14 +887,14 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): #
         c_char_p,
         c_bool,
     ],
-    mtmd_bitmap_p_ctypes
+    mtmd_helper_bitmap_wrapper
 )
 def
mtmd_helper_bitmap_init_from_file( ctx: mtmd_context_p, fname: c_char_p, placeholder: c_bool, /, -) -> mtmd_bitmap_p: +) -> mtmd_helper_bitmap_wrapper: """ helper function to construct a mtmd_bitmap from a file it calls mtmd_helper_bitmap_init_from_buf() internally @@ -818,10 +907,13 @@ def mtmd_helper_bitmap_init_from_file( # // supported formats: # // image: formats supported by stb_image: jpg, png, bmp, gif, etc. # // audio: formats supported by miniaudio: wav, mp3, flac -# // note: audio files will be auto-detected based on magic bytes +# // note: +# // - for now, video input is only supported via C++ helper functions +# // - audio files will be auto-detected based on magic bytes +# // - output bitmap will have FNV hash as the ID # // returns nullptr on failure # // this function is thread-safe -# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); +# MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); @ctypes_function_mtmd( "mtmd_helper_bitmap_init_from_buf", [ mtmd_context_p_ctypes, @@ -829,7 +921,7 @@ def mtmd_helper_bitmap_init_from_file( c_size_t, c_bool, ], - mtmd_bitmap_p_ctypes + mtmd_helper_bitmap_wrapper ) def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, @@ -837,13 +929,16 @@ def mtmd_helper_bitmap_init_from_buf( len: c_size_t, placeholder: c_bool, /, -) -> mtmd_bitmap_p: +) -> mtmd_helper_bitmap_wrapper: """ helper function to construct a mtmd_bitmap from a buffer containing a file supported formats: - image: formats supported by stb_image: jpg, png, bmp, gif, etc. - audio: formats supported by miniaudio: wav, mp3, flac - note: audio files will be auto-detected based on magic bytes + image: formats supported by stb_image: jpg, png, bmp, gif, etc. 
+ audio: formats supported by miniaudio: wav, mp3, flac + note: + - for now, video input is only supported via C++ helper functions + - audio files will be auto-detected based on magic bytes + - output bitmap will have FNV hash as the ID returns nullptr on failure """ ... @@ -1020,3 +1115,181 @@ def mtmd_helper_decode_image_chunk( ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure """ ... + +# // +# // video input helpers (requires ffmpeg/ffprobe installed on the system) +# // the notion of video only exists at the helper level, it is not visible to the core mtmd library +# // +# // NOTE: this implementation is model-agnostic, it can be used with any vision-capable model +# // however, it may not be accurate for some specific models +# // (this is expected for now, to keep the implementation simple) +# // + +# struct mtmd_helper_video_info { +# uint32_t width; +# uint32_t height; +# float fps; // effective fps (fps_target if set, else original video fps) +# int32_t n_frames; // estimated total frames at effective fps (-1 if unknown) +# }; +class mtmd_helper_video_info(Structure): + _fields_ = [ + ("width", c_uint32), + ("height", c_uint32), + ("fps", c_float), + ("n_frames", c_int32), + ] +mtmd_helper_video_info_p_ctypes = POINTER(mtmd_helper_video_info) + + +# struct mtmd_helper_video_init_params { +# float fps_target; // desired output fps; <= 0 means use the video's native fps, defaulted to 4.0f +# const char * ffmpeg_bin_dir; // directory containing ffmpeg/ffprobe binaries; NULL means search PATH +# int64_t timestamp_interval_ms; // interval for adding timestamp as text chunk (example: "[10m50.5s]"); <= 0 means no timestamp, defaulted to 5000ms +# // TODO @ngxson : allow "placeholder" bitmap output for counting tokens +# }; +class mtmd_helper_video_init_params(Structure): + _fields_ = [ + ("fps_target", c_float), + ("ffmpeg_bin_dir", c_char_p), + ("timestamp_interval_ms", c_int64), + ] +mtmd_helper_video_init_params_p_ctypes = 
POINTER(mtmd_helper_video_init_params) + + +# MTMD_API struct mtmd_helper_video_init_params mtmd_helper_video_init_params_default(void); +@ctypes_function_mtmd( + "mtmd_helper_video_init_params_default", + [], + mtmd_helper_video_init_params, +) +def mtmd_helper_video_init_params_default( + /, +) -> mtmd_helper_video_init_params: + """ + get default init params for mtmd_helper_video + """ + ... + + +# // returns NULL on failure (ffprobe not found, file unreadable, etc.) +# MTMD_API mtmd_helper_video * mtmd_helper_video_init( +# struct mtmd_context * mctx, +# const char * path, +# struct mtmd_helper_video_init_params params); +@ctypes_function_mtmd( + "mtmd_helper_video_init", [ + mtmd_context_p_ctypes, + c_char_p, + mtmd_helper_video_init_params, + ], + mtmd_helper_video_p) +def mtmd_helper_video_init( + mctx: mtmd_context_p, + path: c_char_p, + params: mtmd_helper_video_init_params, + /, +) -> mtmd_helper_video_p: + """ + helper function to init an mtmd_helper_video object + returns NULL on failure (ffprobe not found, file unreadable, etc.) + """ + ... + + +# // Same as mtmd_helper_video_init(), but reads from an in-memory buffer. +# // The buffer is copied internally; the caller does not need to keep it alive. +# // Note: pipe input is not seekable, so seeking will use output-side seeking +# // (ffmpeg decodes and discards frames up to the target position). 
+# MTMD_API mtmd_helper_video * mtmd_helper_video_init_from_buf( +# struct mtmd_context * mctx, +# const unsigned char * buf, size_t len, +# struct mtmd_helper_video_init_params params); +@ctypes_function_mtmd( + "mtmd_helper_video_init_from_buf", + [ + mtmd_context_p_ctypes, + c_char_p, + c_size_t, + mtmd_helper_video_init_params, + ], + mtmd_helper_video_p_ctypes, +) +def mtmd_helper_video_init_from_buf( + mctx: mtmd_context_p, + buf: c_char_p, + len: int, + params: mtmd_helper_video_init_params, + /, +) -> mtmd_helper_video_p: + """ + helper function to init an mtmd_helper_video object from an in-memory video buffer + + The buffer is copied internally, so the caller does not need to keep it alive + after this function returns. + """ + ... + + +# MTMD_API void mtmd_helper_video_free(mtmd_helper_video * ctx); +@ctypes_function_mtmd("mtmd_helper_video_free", [mtmd_helper_video_p_ctypes], None) +def mtmd_helper_video_free( + ctx: mtmd_helper_video_p, + /, +) -> None: + """ + free an mtmd_helper_video object + """ + ... + + +# MTMD_API struct mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx); +@ctypes_function_mtmd("mtmd_helper_video_get_info", [mtmd_helper_video_p_ctypes], mtmd_helper_video_info) +def mtmd_helper_video_get_info( + ctx: mtmd_helper_video_p, + /, +) -> mtmd_helper_video_info: + """ + get video information from an mtmd_helper_video object + """ + ... + + +# // Read the next item from the video stream; exactly one of out_bitmap or out_text is set per call. 
+# // *out_bitmap - heap-allocated; caller must free with mtmd_bitmap_free() +# // *out_text - heap-allocated (always via strdup/malloc); caller must free with free() +# // returns 0 on success, -1 on EOF, -2 on error +# MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx, +# mtmd_bitmap ** out_bitmap, +# char ** out_text); +@ctypes_function_mtmd( + "mtmd_helper_video_read_next", + [ + mtmd_helper_video_p_ctypes, + POINTER(mtmd_bitmap_p_ctypes), + POINTER(c_char_p), + ], + c_int32, +) +def mtmd_helper_video_read_next( + ctx: mtmd_helper_video_p, + out_bitmap: POINTER(mtmd_bitmap_p_ctypes), # type: ignore + out_text: POINTER(c_char_p), # type: ignore + /, +) -> int: + """ + read the next item from the video stream + + Exactly one of out_bitmap or out_text is set per successful call. + + out_bitmap: + heap-allocated bitmap; caller must free it with mtmd_bitmap_free() + + out_text: + heap-allocated string via strdup/malloc; caller must free it with free() + + returns: + 0 on success + -1 on EOF + -2 on error + """ + ... From 10b4addb9d5f2ff71bddde34f43f8a43fac44b61 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 05:08:49 +0800 Subject: [PATCH 118/139] feat(mtmd): add video input support to MTMDChatHandler - Add video_url handling to the MTMD chat template and media extraction pipeline. Detect whether the loaded libmtmd build supports video helpers and reject video inputs early when MTMD_VIDEO is unavailable. - Update media loading and bitmap creation for the new helper wrapper API. mtmd_helper_bitmap_init_from_buf now returns a bitmap wrapper containing both the decoded bitmap and an optional video helper context, so keep the video context alive until mtmd_tokenize completes and release it afterward. - Also consolidate duplicated audio/video byte loading into a shared _load_bytes helper, reuse it for image loading, and add richer default HTTP headers for remote media requests. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 173 ++++++++++++++++++++++----------- 1 file changed, 115 insertions(+), 58 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index fb42a59f23..2224466436 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3064,6 +3064,8 @@ class MTMDChatHandler: "{% else %}" "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" "{% endif %}" + "{% elif content.type == 'video_url' %}" + "{{ content.video_url if content.video_url is string else content.video_url.url }}" "{% elif content.type == 'text' %}" "{{ content.text }}" "{% endif %}" @@ -3114,6 +3116,10 @@ def __init__( self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None self.extra_template_arguments: dict[str, Any] = {} + self.is_support_vision = False + self.is_support_audio = False + self.is_support_video = False + if not os.path.exists(clip_model_path): raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") @@ -3182,6 +3188,15 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): if self.verbose: print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) + # Check if video is supported + self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) + if self.is_support_video: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr) + def close(self) -> None: """Explicitly free the mtmd context and vision model resources.""" if getattr(self, "mtmd_ctx", None) is not None: @@ -3259,7 +3274,16 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa if url: media_items.append({"url": url, "type": "audio"}) - # 
3. Text & Unknown Types + # 3. Video Processing + elif content_type == "video_url": + if not self.is_support_video: + raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.") + + video_url = content["video_url"] + url = video_url if isinstance(video_url, str) else video_url["url"] + media_items.append({"url": url, "type": "video"}) + + # 4. Text & Unknown Types elif content_type == "text": continue else: @@ -3274,6 +3298,7 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): Supported formats: - Images (via stb_image): jpg, png, bmp, etc. - Audio (via miniaudio): wav, mp3, flac. + - Video: depends on whether MTMD_VIDEO was enabled at build time. Note: - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. @@ -3283,25 +3308,35 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): media_bytes (bytes): The raw byte content of the media file. Returns: - mtmd_bitmap: A pointer to the allocated bitmap structure containing decoded media features. 
+ bitmap: mtmd_bitmap * + video_ctx: mtmd_helper_video * or NULL """ if self.mtmd_ctx is None: raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") - # Create bitmap from buffer using helper function - bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + if not media_bytes: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + + buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + + wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( self.mtmd_ctx, - (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), + buf, len(media_bytes), False, ) - if bitmap is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): " - "Failed to load image or audio file from media bytes " - "(unsupported media format or corrupted data).") + if not wrapper.bitmap: + if wrapper.video_ctx: + self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) - return bitmap + raise ValueError( + f"{self.log_prefix}(_create_bitmap_from_bytes): " + "Failed to load media from bytes " + "(unsupported media format, corrupted data, or missing helper support)." + ) + + return wrapper.bitmap, wrapper.video_ctx def _process_mtmd_prompt( @@ -3360,16 +3395,17 @@ def _process_mtmd_prompt( # 3. 
Pre-allocate bitmap array to guarantee chronological order during concurrent decoding bitmaps = [None] * len(media_items) bitmap_cleanup = [] + video_cleanup = [] chunks = None try: # Concurrent Media Decoding import concurrent.futures if media_items: - def _create_bitmap_func(idx: int, item: str): + def _create_bitmap_func(idx: int, item: dict): media_bytes = self.load_media(item["url"], item["type"]) - bitmap = self._create_bitmap_from_bytes(media_bytes) - return idx, bitmap + bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes) + return idx, bitmap, video_ctx # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, # which can be used in the future to process large numbers of video frames. max_workers = min(llama.n_threads, len(media_items)) @@ -3377,10 +3413,14 @@ def _create_bitmap_func(idx: int, item: str): futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] for future in concurrent.futures.as_completed(futures): - idx, bitmap = future.result() + idx, bitmap, video_ctx = future.result() + bitmaps[idx] = bitmap bitmap_cleanup.append(bitmap) + if video_ctx: + video_cleanup.append(video_ctx) + # Strict validation: Abort if any thread failed to decode its assigned media if any(b is None for b in bitmaps): raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") @@ -3415,6 +3455,12 @@ def _create_bitmap_func(idx: int, item: str): if result != 0: raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") + # Video helper contexts only need to stay alive until mtmd_tokenize() completes. + if video_cleanup: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup.clear() + # 6. 
Virtual Token Ledger Construction full_prompt_ids = [] chunk_token_spans = [] @@ -3424,6 +3470,7 @@ def _create_bitmap_func(idx: int, item: str): # Cursor to track the actual media contents (URLs or base64 data) provided by the user media_items_count = len(media_items) media_items_cur = 0 + last_media_id = None for i in range(n_chunks): chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) @@ -3463,7 +3510,11 @@ def _create_bitmap_func(idx: int, item: str): # while instantly breaking the match if the image content changes. # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 + last_media_id = media_id media_items_cur += 1 + elif last_media_id is not None: + # video may expand into multiple image chunks from one media marker + media_id = last_media_id else: # Magic Negative Number as fallback :) media_id = -314159 @@ -3492,6 +3543,12 @@ def _create_bitmap_func(idx: int, item: str): for bitmap in bitmap_cleanup: self._mtmd_cpp.mtmd_bitmap_free(bitmap) bitmap_cleanup = None + # Free videos + if len(video_cleanup) > 0: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup = None + bitmaps = None raise e @@ -3825,18 +3882,22 @@ def __call__( def load_media(self, media_url: str, media_type: str) -> bytes: """ Unified dispatcher for loading media payloads. - Routes the URL/URI to the specific image or audio processor based on the media_type. + Routes the URL/URI to the specific image, audio, or video processor based on the media_type. 
""" if media_type == "image": return self._load_image(media_url) + elif media_type == "audio": - audio_bytes = self._load_audio(media_url) - # Apply ironclad magic bytes validation before returning + audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio") try: self.detect_audio_format(audio_bytes) except ValueError as e: raise ValueError(f"{self.log_prefix}(load_media): {e}") return audio_bytes + + elif media_type == "video": + return self._load_bytes(media_url, timeout=30, kind="video") + else: raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") @@ -3876,41 +3937,51 @@ def detect_audio_format(audio_bytes: bytes) -> str: "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." ) + DEFAULT_HTTP_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/148.0.0.0 Safari/537.36" + ), + } + @staticmethod - def _load_audio(audio_url: str) -> bytes: + def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes: """ - Load audio from either a URL, local path, or a data URI and return raw bytes. + Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL. """ + media_bytes = b"" - audio_bytes = b"" - - # 1. Handle data URI (base64) - if audio_url.strip().startswith("data:"): - comma_pos = audio_url.find(",") + # 1. Handle data URI + if media_url.strip().startswith("data:"): + comma_pos = media_url.find(",") if comma_pos == -1: raise ValueError("Invalid data URI: missing comma separator") - base64_data = audio_url[comma_pos + 1 :] - audio_bytes = base64.b64decode(base64_data) + + base64_data = media_url[comma_pos + 1:] + media_bytes = base64.b64decode(base64_data) # 2. Handle local file path - elif os.path.exists(audio_url): - with open(audio_url, "rb") as f: - audio_bytes = f.read() + elif os.path.exists(media_url): + with open(media_url, "rb") as f: + media_bytes = f.read() # 3. 
Handle remote URL via HTTP/HTTPS else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(audio_url, headers=headers) + req = urllib.request.Request( + media_url, + headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, + ) try: - with urllib.request.urlopen(req, timeout=15) as f: - audio_bytes = f.read() + with urllib.request.urlopen(req, timeout=timeout) as f: + media_bytes = f.read() except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download audio from {audio_url}: {e}") + raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") - if not audio_bytes: - raise ValueError("Empty audio data received") + if not media_bytes: + raise ValueError(f"Empty {kind} data received") - return audio_bytes + return media_bytes @staticmethod def _load_image(image_url: str) -> bytes: @@ -3926,28 +3997,14 @@ def _load_image(image_url: str) -> bytes: Returns: JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. """ - image_bytes = b"" - - # 1. Handle data URI (base64) - if image_url.strip().startswith("data:"): - # Split only once from the right to correctly handle mime types containing commas - comma_pos = image_url.find(",") - if comma_pos == -1: - raise ValueError("Invalid data URI: missing comma separator") - base64_data = image_url[comma_pos + 1 :] - image_bytes = base64.b64decode(base64_data) - - # 2. Handle local/remote URL - else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(image_url, headers=headers) - - try: - with urllib.request.urlopen(req, timeout=15) as f: - image_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download image from {image_url}: {e}") + # 1. Load image bytes from image_url + image_bytes = MTMDChatHandler._load_bytes( + image_url, + timeout=15, + kind="image", + ) + # 2. Check if image_bytes is empty. 
if not image_bytes: raise ValueError("Empty image data received") From e4dcac1af57b58973ecf7e206a3c25b3c367d881 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 22:22:15 +0800 Subject: [PATCH 119/139] Update Submodule vendor/llama.cpp 7d2b45b..d6d0ce8 Signed-off-by: JamePeng --- llama_cpp/mtmd_cpp.py | 14 ++++++-------- vendor/llama.cpp | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 61fb0e7859..30ca8fab90 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -459,8 +459,8 @@ def mtmd_bitmap_set_id( c_int, c_size_t, # chunk_idx c_void_p, # user_data - POINTER(mtmd_bitmap_p), # mtmd_bitmap ** out_bitmap - POINTER(c_char_p), # char ** out_text + POINTER(mtmd_bitmap_p_ctypes), # mtmd_bitmap ** out_bitmap + POINTER(c_char_p), # char ** out_text ) # MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx, @@ -856,7 +856,7 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # # // Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). # MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx); @ctypes_function_mtmd( - "mtmd_helper_support_video", [mtmd_context_p], c_bool) + "mtmd_helper_support_video", [mtmd_context_p_ctypes], c_bool) def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool: """ Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). 
@@ -870,8 +870,8 @@ def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool: # }; class mtmd_helper_bitmap_wrapper(Structure): _fields_ = [ - ("bitmap", mtmd_bitmap_p), - ("video_ctx", mtmd_helper_video_p), + ("bitmap", mtmd_bitmap_p_ctypes), + ("video_ctx", mtmd_helper_video_p_ctypes), ] mtmd_helper_bitmap_wrapper_p_ctypes = POINTER(mtmd_helper_bitmap_wrapper) @@ -1162,9 +1162,7 @@ class mtmd_helper_video_init_params(Structure): [], mtmd_helper_video_init_params, ) -def mtmd_helper_video_init_params_default( - /, -) -> mtmd_helper_video_init_params: +def mtmd_helper_video_init_params_default() -> mtmd_helper_video_init_params: """ get default init params for mtmd_helper_video """ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7d2b45b4f7..d6d0ce8215 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7d2b45b4f7b663cda74f23fbc3ce6dc3bd4f6545 +Subproject commit d6d0ce8215a1c324e8de04b52f9dd65c5edc129f From 54f56bd8f89769f2021f31eba0aa377dc290f203 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 13 Jun 2026 00:46:09 +0800 Subject: [PATCH 120/139] Update Submodule vendor/llama.cpp d6d0ce8..ebc1077 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d6d0ce8215..ebc10770ac 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d6d0ce8215a1c324e8de04b52f9dd65c5edc129f +Subproject commit ebc10770ac5a9331824c53ef0c6adad780904dc3 From 6d1bd3b8d751a3a2ac86d377ecd34a3b37278b15 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 00:04:56 +0800 Subject: [PATCH 121/139] Update Submodule vendor/llama.cpp ebc1077..e8067a8 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ebc10770ac..e8067a8b36 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 
ebc10770ac5a9331824c53ef0c6adad780904dc3 +Subproject commit e8067a8b3624aa40cc88ecb2940060e5d65b7532 From 971ee384227f6268f244c93f620b12f0a6ff47c0 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 01:03:09 +0800 Subject: [PATCH 122/139] Update(mtmd): Append mtmd batching API - Sync upstream: mtmd: add batching API (#24384) Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 3 + llama_cpp/mtmd_cpp.py | 142 ++++++++++++++++++++++++++++++--- 2 files changed, 134 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 2224466436..520d2429d4 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3094,6 +3094,7 @@ def __init__( use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1, + batch_max_tokens: int = 1024, **kwargs ): @@ -3108,6 +3109,7 @@ def __init__( self.clip_model_path = clip_model_path self.image_min_tokens = image_min_tokens self.image_max_tokens = image_max_tokens + self.batch_max_tokens = batch_max_tokens self.use_gpu = use_gpu self.verbose = verbose @@ -3152,6 +3154,7 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! 
image_max_tokens ({self.image_max_tokens}) " f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + self.mctx_params.batch_max_tokens = self.batch_max_tokens # Cache the model's eos token and bos token self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 30ca8fab90..4513761a63 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -153,6 +153,21 @@ class mtmd_pos_type(enum.IntEnum): mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) mtmd_input_chunks_p_ctypes = c_void_p +# struct mtmd_batch { +# mtmd_context * ctx; +# std::vector entries; +# std::vector output_embd; // aggregated output embedding for the whole batch +# mtmd_batch(mtmd_context * ctx): ctx(ctx) {} +# int32_t n_tokens() const { +# int32_t n = 0; +# for (const auto * chunk : entries) { +# n += mtmd_input_chunk_get_n_tokens(chunk); +# } +# return n; +# } +# }; +mtmd_batch_p = NewType("mtmd_batch_p", int) +mtmd_batch_p_ctypes = c_void_p # struct mtmd_input_text { # const char * text; @@ -210,6 +225,11 @@ class clip_context_params(Structure): # // callback function passed over to mtmd proper # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; +# +# // batching params +# int32_t batch_max_tokens; // maximum number of output tokens in a batch +# // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit) +# // (default: 1024) # }; class mtmd_context_params(Structure): _fields_ = [ @@ -224,6 +244,7 @@ class mtmd_context_params(Structure): ("image_max_tokens", c_int), ("cb_eval", ggml_backend_sched_eval_callback), ("cb_eval_user_data", c_void_p), + ("batch_max_tokens", c_int32), ] mtmd_context_params_p_ctypes = POINTER(mtmd_context_params) @@ -731,8 +752,8 @@ def mtmd_tokenize( # // returns 0 on success # // TODO: deprecate -# MTMD_API int32_t mtmd_encode(mtmd_context * ctx, -# const mtmd_image_tokens 
* image_tokens); +# DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens), +# "use mtmd_encode_chunk() instead"); @ctypes_function_mtmd( "mtmd_encode", [ mtmd_context_p_ctypes, @@ -745,10 +766,15 @@ def mtmd_encode( image_tokens: mtmd_image_tokens_p, /, ) -> c_int32: + """ + DEPRECATED: use mtmd_encode_chunk() instead + """ ... +# // text chunk will be ignored silently, only media chunk will be encoded # // returns 0 on success +# // returns 1 on generic error # MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, # const mtmd_input_chunk * chunk); @ctypes_function_mtmd( @@ -763,6 +789,11 @@ def mtmd_encode_chunk( chunk: mtmd_input_chunk_p, /, ) -> c_int32: + """ + text chunk will be ignored silently, only media chunk will be encoded + returns 0 on success + returns 1 on generic error + """ ... # // get output embeddings from the last encode pass @@ -778,6 +809,95 @@ def mtmd_get_output_embd(ctx: mtmd_context_p) -> POINTER(c_float): # type: ignor ... +# // batch encoding API +# // chunks are not owned by the batch, they will not be freed by mtmd_batch_free() +# // batch is valid for a given context, cannot be shared across contexts +# MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx); +@ctypes_function_mtmd( + "mtmd_batch_init", + [mtmd_context_p_ctypes], + mtmd_batch_p_ctypes, +) +def mtmd_batch_init(ctx: mtmd_context_p, /) -> mtmd_batch_p: + ... + + +# MTMD_API void mtmd_batch_free(mtmd_batch * batch); +@ctypes_function_mtmd( + "mtmd_batch_free", + [mtmd_batch_p_ctypes], + None, +) +def mtmd_batch_free(batch: mtmd_batch_p, /): + """ + chunks are not owned by the batch, they will not be freed by mtmd_batch_free() + batch is valid for a given context, cannot be shared across contexts + """ + ... 
+ + +# // only media chunks are allowed, text chunks will be rejected +# // returns 0 on success +# // returns 1 on generic error +# // returns 2 if the batch is too large (chunk won't be added) +# // returns 3 if it cannot be batched with the existing chunks in the batch +# MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function_mtmd( + "mtmd_batch_add_chunk", + [ + mtmd_batch_p_ctypes, + mtmd_input_chunk_p_ctypes, + ], + c_int32, +) +def mtmd_batch_add_chunk( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> c_int32: + """ + only media chunks are allowed, text chunks will be rejected + returns 0 on success + returns 1 on generic error + returns 2 if the batch is too large (chunk won't be added) + returns 3 if it cannot be batched with the existing chunks in the batch + """ + ... + + +# // returns 0 on success +# // returns 1 on generic error +# MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch); +@ctypes_function_mtmd( + "mtmd_batch_encode", + [mtmd_batch_p_ctypes], + c_int32, +) +def mtmd_batch_encode(batch: mtmd_batch_p, /) -> c_int32: + """ + returns 0 on success + returns 1 on generic error + """ + ... + + +# MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function_mtmd( + "mtmd_batch_get_output_embd", + [ + mtmd_batch_p_ctypes, + mtmd_input_chunk_p_ctypes, + ], + POINTER(c_float), +) +def mtmd_batch_get_output_embd( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> POINTER(c_float): # type: ignore + ... + + # // Set callback for all future logging events. # // If this is not called, or NULL is supplied, everything is output on stderr. 
# MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); @@ -947,8 +1067,8 @@ def mtmd_helper_bitmap_init_from_buf( # // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); @ctypes_function_mtmd( - "mtmd_helper_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) -def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunk_p) -> c_size_t: + "mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p) -> c_size_t: """ helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache """ @@ -959,8 +1079,8 @@ def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunk_p) -> c_size_t: # // normally, n_pos is equal to n_tokens, but for M-RoPE it is different # MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); @ctypes_function_mtmd( - "mtmd_helper_get_n_pos", [mtmd_input_chunk_p_ctypes], c_int32) -def mtmd_helper_get_n_pos(chunks: mtmd_input_chunk_p) -> c_int32: + "mtmd_helper_get_n_pos", [mtmd_input_chunks_p_ctypes], c_int32) +def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p) -> c_int32: """ helper to count the total position of tokens from a list of chunks, useful to keep track of n_past normally, n_pos is equal to n_tokens, but for M-RoPE it is different @@ -991,8 +1111,8 @@ def mtmd_helper_image_get_decoder_pos( # // helper function that automatically: # // 1. run llama_decode() on text chunks -# // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() -# // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +# // 2. 
run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode() +# // if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error # // otherwise, returns 0 on success # // this function is NOT thread-safe # MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, @@ -1007,7 +1127,7 @@ def mtmd_helper_image_get_decoder_pos( "mtmd_helper_eval_chunks", [ mtmd_context_p_ctypes, llama_cpp.llama_context_p_ctypes, - mtmd_input_chunk_p_ctypes, + mtmd_input_chunks_p_ctypes, c_int32, c_int32, c_int32, @@ -1018,7 +1138,7 @@ def mtmd_helper_image_get_decoder_pos( def mtmd_helper_eval_chunks( ctx: mtmd_context_p, lctx: llama_cpp.llama_context_p, - chunks: mtmd_input_chunk_p, + chunks: mtmd_input_chunks_p, n_past: c_int32, seq_id: c_int32, n_batch: c_int32, @@ -1106,7 +1226,7 @@ def mtmd_helper_decode_image_chunk( n_past: c_int32, seq_id: c_int32, n_batch: c_int32, - new_n_past: c_int32, + new_n_past: POINTER(c_int32), # type: ignore /, ) -> c_int32: """ From cb299e67e51e5aff061ebcf9f1521695ad3f1a5d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 03:05:32 +0800 Subject: [PATCH 123/139] Update(MTMDChatHandler): add chunk type helpers - Add small helper methods `_is_text_chunk`/`_is_image_chunk`/`_is_audio_chunk` for checking MTMD text, image, and audio chunk type enum values. - This keeps MTMD prompt processing easier to read and avoids repeating direct enum comparisons when building token spans for text and media chunks. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 36 +++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 520d2429d4..aadec4600e 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3341,6 +3341,26 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): return wrapper.bitmap, wrapper.video_ctx + def _is_text_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD text chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT + ) + + def _is_image_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD image chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE + ) + + def _is_audio_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD audio chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + ) def _process_mtmd_prompt( self, @@ -3480,7 +3500,7 @@ def _create_bitmap_func(idx: int, item: dict): if chunk is None: continue chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: + if self._is_text_chunk(chunk_type): # Extract standard text token IDs n_tokens_out = ctypes.c_size_t() tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) @@ -3489,10 +3509,7 @@ def _create_bitmap_func(idx: int, item: dict): chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) full_prompt_ids.extend(tokens) current_idx += len(tokens) - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - 
]: + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): # Extract media properties # Note(JamePeng): # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). @@ -3673,7 +3690,7 @@ def __call__( if end_idx <= n_past: continue - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: + if self._is_text_chunk(chunk_type): unprocessed_start = max(start_idx, n_past) - start_idx n_tokens_out = ctypes.c_size_t() tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) @@ -3689,14 +3706,11 @@ def __call__( llama.eval(tokens_to_eval) n_past = llama.n_tokens - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - ]: + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) if self.verbose: - media_str = "IMAGE" if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE else "AUDIO" + media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) # Stage 5: Multimodal Physical OOM Defense From d8ee3eed7163c6c1f3802a9b979f9009e5e96c53 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Sun, 14 Jun 2026 08:00:09 +0200 Subject: [PATCH 124/139] Change 'clip_model_path' to 'mmproj_path'. Implemented 'chat_template_override'. Only the chat template is passed from llama to the chat handler; not the entire model's metadata. 
--- llama_cpp/llama.py | 10 ++++----- llama_cpp/llama_chat_format.py | 39 +++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 544e755ea9..1f5ffa20b5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -96,7 +96,7 @@ class Llama: def __init__( self, model_path: str, - clip_model_path: Optional[str] = None, + mmproj_path: Optional[str] = None, *, # Model Params n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", @@ -710,13 +710,13 @@ def __init__( if self.verbose: print(f"Model metadata: {self.metadata}", file=sys.stderr) - if clip_model_path is not None: + if mmproj_path is not None: if self.chat_handler is not None and self.verbose: - print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True) + print("Warning: Both `chat_handler` and `mmproj_path` are not null. Chat handler will be overwritten.", flush = True) self.chat_handler = llama_chat_format.GenericMTMDChatHandler( - gguf_metadata = self.metadata, - clip_model_path = clip_model_path, + chat_format = self.metadata.get("tokenizer.chat_template", None), + mmproj_path = mmproj_path, verbose = self.verbose, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 254195f95a..966c2e28fa 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2856,11 +2856,12 @@ class MTMDChatHandler: def __init__( self, - clip_model_path: str, + mmproj_path: str, verbose: bool = True, use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1, + chat_template_override: Optional[str] = None, **kwargs ): @@ -2872,7 +2873,7 @@ def __init__( f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." 
) - self.clip_model_path = clip_model_path + self.mmproj_path = mmproj_path self.image_min_tokens = image_min_tokens self.image_max_tokens = image_max_tokens self.use_gpu = use_gpu @@ -2883,20 +2884,25 @@ def __init__( self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None self.extra_template_arguments: dict[str, Any] = {} - if not os.path.exists(clip_model_path): - raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") + if not os.path.exists(mmproj_path): + raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {mmproj_path}") # Pre-compile Jinja template - if not hasattr(self, "chat_format") or self.chat_format is None: + if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: self.chat_format = self.CHAT_FORMAT + elif chat_template_override is not None: + self.chat_format = chat_template_override self._chat_format_parser_tags = [] - self.chat_template = ImmutableSandboxedEnvironment( - trim_blocks=True, - lstrip_blocks=True, - ).from_string(self.chat_format) + self.change_chat_template(self.chat_format) self._exit_stack = ExitStack() + + def change_chat_template(self, new_template: str): + self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True + ).from_string(new_template) def _init_mtmd_context(self, llama_model: llama_core.Llama): """Initialize mtmd context with the llama model.""" @@ -2929,13 +2935,13 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.clip_model_path.encode(), + self.mmproj_path.encode(), llama_model.model, self.mctx_params ) if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") # Check if 
vision is supported self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) @@ -3835,7 +3841,7 @@ def from_pretrained( model_path = os.path.join(local_dir, filename) return cls( - clip_model_path=model_path, + mmproj_path=model_path, **kwargs, ) @@ -3852,13 +3858,12 @@ class GenericMTMDChatHandler(MTMDChatHandler): def __init__( self, - gguf_metadata: Dict[str, Any], - clip_model_path: str, + chat_format: str, + mmproj_path: str, verbose: bool = True, **kwargs ) -> None: - self.model_metadata = gguf_metadata - self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) + self.chat_format = chat_format if verbose: print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) @@ -3866,7 +3871,7 @@ def __init__( if self.chat_format is None: raise ValueError("Failed to get model chat template automatically.") - super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) + super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs) def __call__(self, **kwargs): self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] From 1965d5f6c3c949cab7f7ef934266c8062ebc0f45 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 20:19:43 +0800 Subject: [PATCH 125/139] refactor(mtmd): move multimodal handlers to separate module `llama_multimodal` - Move MTMDChatHandler, GenericMTMDChatHandler, and model-specific multimodal chat handlers out of llama_chat_format.py into llama_multimodal.py. - llama_chat_format.py has grown too large and difficult to maintain, especially as MTMD support expands beyond image-only use cases. Splitting multimodal handling into its own module makes the chat formatting layer smaller and keeps media loading, MTMD tokenization, multimodal KV-cache bookkeeping, and handler implementations in a dedicated place. 
- This also prepares the codebase for broader multimodal support and future video frame / image batch evaluation, where the media-processing path will need to evolve independently from text-only chat formatting. - Keep backward-compatible re-exports from llama_chat_format.py so existing imports continue to work. - Also keep `clip_model_path` as a deprecated initialization alias for `mmproj_path` in the base MTMD handler. Signed-off-by: JamePeng --- llama_cpp/llama.py | 8 +- llama_cpp/llama_chat_format.py | 3811 ++------------------------------ llama_cpp/llama_multimodal.py | 3473 +++++++++++++++++++++++++++++ 3 files changed, 3690 insertions(+), 3602 deletions(-) create mode 100644 llama_cpp/llama_multimodal.py diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ec202568f1..dbc60eaf76 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -45,6 +45,7 @@ from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama_chat_format as llama_chat_format +import llama_cpp.llama_multimodal as llama_multimodal from llama_cpp.llama_speculative import LlamaDraftModel @@ -711,20 +712,19 @@ def __init__( self.metadata = {} if self.verbose: print(f"Failed to load metadata: {e}", file=sys.stderr) - - if self.verbose: - print(f"Model metadata: {self.metadata}", file=sys.stderr) if mmproj_path is not None: if self.chat_handler is not None and self.verbose: print("Warning: Both `chat_handler` and `mmproj_path` are not null. 
Chat handler will be overwritten.", flush = True) - self.chat_handler = llama_chat_format.GenericMTMDChatHandler( + self.chat_handler = llama_multimodal.GenericMTMDChatHandler( chat_format = self.metadata.get("tokenizer.chat_template", None), mmproj_path = mmproj_path, verbose = self.verbose, **chat_handler_kwargs ) + + if self.verbose: print(f"Model desc: {self.model_desc}, " f"Model size: {self.model_size / (1024 * 1024):.2f} MB, " f"Model metadata: {self.metadata}", diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0e5c9d4906..6ffe68e5e3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,7 +1,5 @@ from __future__ import annotations -import base64 -import ctypes import dataclasses import datetime import json @@ -9,9 +7,7 @@ import random import string import sys -import zlib -from contextlib import ExitStack from typing import ( Any, Dict, @@ -32,16 +28,11 @@ import numpy as np import numpy.typing as npt -import urllib.request -from urllib.error import URLError, HTTPError - -import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama as llama_core import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar -from ._ggml import GGMLLogLevel -from ._logger import logger, ggml_log_callback +from ._logger import logger from ._utils import suppress_stdout_stderr, Singleton ### Common Chat Templates and Special Tokens ### @@ -3037,3612 +3028,204 @@ def generate_streaming(tools, functions, function_call, prompt): ) -class MTMDChatHandler: - DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( -"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " -"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, 
professional, and maximally helpful." - ) - - CHAT_FORMAT = ( - "{{ bos_token if bos_token is defined else '' }}" +@register_chat_completion_handler("chatml-function-calling") +def chatml_function_calling( + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + present_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_n_sigma: float = -1.00, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + xtc_threshold: float = 0.1, + xtc_probability: float = 0.0, + dry_multiplier: float = 0.0, + dry_base: float = 1.75, + dry_allowed_length: int = 2, + dry_penalty_last_n:int = 0, + dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_infill: bool = False, + model: Optional[str] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + **kwargs, # type: ignore +) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], +]: + function_calling_template = ( "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% elif message.role == 'user' %}" - "USER: " - "{% if 
message.content is string %}" - "{{ message.content }}" - "{% elif message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% elif content.type == 'audio_url' %}" - "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" - "{% elif content.type == 'input_audio' %}" - "{% if content.input_audio is string %}" - "{{ content.input_audio }}" - "{% else %}" - "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" - "{% endif %}" - "{% elif content.type == 'video_url' %}" - "{{ content.video_url if content.video_url is string else content.video_url.url }}" - "{% elif content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - - "{% elif message.role == 'assistant' and message.content is not none %}" - "ASSISTANT: {{ message.content }}" - "{% endif %}" - "{{ \"\n\" }}" + "<|im_start|>{{ message.role }}\n" + # System message + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% if tool_calls %}" + "\n\nYou have access to the following functions:\n" + "{% for tool in tools %}" + "\nfunctions.{{ tool.function.name }}:\n" + "{{ tool.function.parameters | tojson }}" + "\n{% endfor %}" + "\n\nYou can respond to users messages with either a single message or one or more function calls." 
+ "\n\nTo respond with a message begin the message with 'message:', use the following format:" + "\n\nmessage:" + "\n" + "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" + "\n\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + ## Reglar message + "{% if message.content and message.content | length > 0 %}" + "{% if tool_calls %}" + "message:\n" + "{% endif %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + ## Function calls + "{% if 'tool_calls' in message %}" + "{% for tool_call in message.tool_calls %}" + "functions.{{ tool_call.function.name }}:\n" + "{{ tool_call.function.arguments }}" "{% endfor %}" - - "{% if eos_token is defined %}" - "{{ eos_token }}" + "<|im_end|>\n" "{% endif %}" - - "{% if add_generation_prompt %}" - "ASSISTANT: " "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) + template_renderer = ImmutableSandboxedEnvironment( + autoescape=jinja2.select_autoescape(["html", "xml"]), + undefined=jinja2.StrictUndefined, + ).from_string(function_calling_template) - def __init__( - self, - mmproj_path: str, - verbose: bool = True, - use_gpu: bool = True, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - chat_template_override: Optional[str] = None, - batch_max_tokens: int = 1024, - **kwargs - ): - - self.log_prefix = self.__class__.__name__ - if kwargs: - unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) - raise TypeError( - f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" - f"If you are passing model-specific parameters, ensure they are supported by 
{self.log_prefix}." - ) - - self.mmproj_path = mmproj_path - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - self.batch_max_tokens = batch_max_tokens - self.use_gpu = use_gpu - self.verbose = verbose - - import llama_cpp.mtmd_cpp as mtmd_cpp - self._mtmd_cpp = mtmd_cpp - self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None - self.extra_template_arguments: dict[str, Any] = {} - - self.is_support_vision = False - self.is_support_audio = False - self.is_support_video = False - - if not os.path.exists(mmproj_path): - raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {mmproj_path}") - - # Pre-compile Jinja template - if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: - self.chat_format = self.CHAT_FORMAT - elif chat_template_override is not None: - self.chat_format = chat_template_override - - self._chat_format_parser_tags = [] - self.change_chat_template(self.chat_format) - - self._exit_stack = ExitStack() - - def change_chat_template(self, new_template: str): - self.chat_template = ImmutableSandboxedEnvironment( - trim_blocks=True, - lstrip_blocks=True - ).from_string(new_template) - - def _init_mtmd_context(self, llama_model: llama_core.Llama): - """Initialize mtmd context with the llama model.""" - if self.mtmd_ctx is not None: - return # Already initialized - - self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) - - # Get default parameters - self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() - self.mctx_params.use_gpu = self.use_gpu - self.mctx_params.print_timings = self.verbose - self.mctx_params.n_threads = llama_model.n_threads - self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO - self.mctx_params.warmup = True - if self.image_min_tokens > 0: - self.mctx_params.image_min_tokens = self.image_min_tokens - if self.image_max_tokens > 0: - 
self.mctx_params.image_max_tokens = self.image_max_tokens - if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) " - f"cannot be less than image_min_tokens ({self.image_min_tokens}).") - self.mctx_params.batch_max_tokens = self.batch_max_tokens - - # Cache the model's eos token and bos token - self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') - self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') - - # Cache the mtmd_default_marker - self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') - - # Initialize mtmd context - self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.mmproj_path.encode(), - llama_model.model, - self.mctx_params - ) - - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") - - # Check if vision is supported - self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) - if self.is_support_vision: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) - - # Check if audio is supported - self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) - if self.is_support_audio: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) - - # Check if video is supported - self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) - if 
self.is_support_video: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr) - - def close(self) -> None: - """Explicitly free the mtmd context and vision model resources.""" - if getattr(self, "mtmd_ctx", None) is not None: - try: - self._mtmd_cpp.mtmd_free(self.mtmd_ctx) - except Exception: - pass - self.mtmd_ctx = None - self.mctx_params = None - self.chat_template = None - - if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): - self._exit_stack.close() - self._exit_stack = None - - def __del__(self) -> None: - self.close() - - def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]: - """ - Extracts all media payloads (images, audio) sequentially to maintain exact chronological order. - Strictly enforces capability checks, raising exceptions if unsupported media is passed. - - Returns: - media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio). - """ - media_items: List[Dict[str, str]] = [] - for message in messages: - if isinstance(message.get("content"), list): - for content in message["content"]: - content_type = content.get("type", "") - - # 1. Vision Processing - if content_type == "image_url": - if not self.is_support_vision: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.") - - url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"] - media_items.append({"url": url, "type": "image"}) - - # 2. 
Audio Processing - elif content_type in ["audio", "audio_url", "input_audio"]: - if not self.is_support_audio: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") - - # Case A: Handle custom/forward-compatible audio_url format - if content_type == "audio_url" or content_type == "audio": - audio_url = content[content_type] - url = audio_url if isinstance(audio_url, str) else audio_url["url"] - media_items.append({"url": url, "type": "audio"}) - # Case B: Handle OpenAI standard input_audio format - elif content_type == "input_audio": - input_audio = content.get("input_audio", {}) - if isinstance(input_audio, dict) and "data" in input_audio: - # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic - # input_audio: { - # data: audio.base64Data, - # format: audio.mimeType.includes('wav') ? 'wav' : 'mp3' - # } - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - - # Strictly align with llama.cpp (require wav/mp3) - if audio_format not in ["wav", "mp3"]: - raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'") - - # Format as a Data URI to reuse the unified load_media logic - media_items.append({ - "url": f"data:audio/{audio_format};base64,{audio_data}", - "type": "audio" - }) - else: - # Just a raw base64 data - url = input_audio if isinstance(input_audio, str) else "" - if url: - media_items.append({"url": url, "type": "audio"}) - - # 3. Video Processing - elif content_type == "video_url": - if not self.is_support_video: - raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.") - - video_url = content["video_url"] - url = video_url if isinstance(video_url, str) else video_url["url"] - media_items.append({"url": url, "type": "video"}) - - # 4. 
Text & Unknown Types - elif content_type == "text": - continue - else: - if self.verbose: - print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr) - return media_items - - def _create_bitmap_from_bytes(self, media_bytes: bytes): - """ - Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. - - Supported formats: - - Images (via stb_image): jpg, png, bmp, etc. - - Audio (via miniaudio): wav, mp3, flac. - - Video: depends on whether MTMD_VIDEO was enabled at build time. - - Note: - - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. - - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. - - Args: - media_bytes (bytes): The raw byte content of the media file. - - Returns: - bitmap: mtmd_bitmap * - video_ctx: mtmd_helper_video * or NULL - """ - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] - if not media_bytes: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } - buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + stop = ( + [stop, "<|im_end|>"] + if isinstance(stop, str) + else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + ) - wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( - self.mtmd_ctx, - buf, - len(media_bytes), - False, + # Case 1: No 
tool choice by user + if ( + tool_choice is None + or (isinstance(tool_choice, str) and tool_choice == "none") + or tools is None + or len(tools) == 0 + ): + prompt = template_renderer.render( + messages=messages, + tools=[], + tool_calls=None, + add_generation_prompt=True, ) - if not wrapper.bitmap: - if wrapper.video_ctx: - self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) - - raise ValueError( - f"{self.log_prefix}(_create_bitmap_from_bytes): " - "Failed to load media from bytes " - "(unsupported media format, corrupted data, or missing helper support)." - ) - - return wrapper.bitmap, wrapper.video_ctx - - def _is_text_chunk(self, chunk_type: int) -> bool: - """Return True if `chunk_type` is the MTMD text chunk type enum value.""" - return ( - chunk_type - == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT - ) + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) - def _is_image_chunk(self, chunk_type: int) -> bool: - """Return True if `chunk_type` is the MTMD image chunk type enum value.""" - return ( - chunk_type - == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + present_penalty=present_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + top_n_sigma=top_n_sigma, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + xtc_threshold=xtc_threshold, + xtc_probability=xtc_probability, + dry_multiplier=dry_multiplier, + dry_base=dry_base, + dry_allowed_length=dry_allowed_length, + dry_penalty_last_n=dry_penalty_last_n, + dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_infill=use_infill, + 
model=model, + logits_processor=logits_processor, + grammar=grammar, + logprobs=top_logprobs if logprobs else None, + ), + stream=stream, ) - def _is_audio_chunk(self, chunk_type: int) -> bool: - """Return True if `chunk_type` is the MTMD audio chunk type enum value.""" - return ( - chunk_type - == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + # Case 2: Tool choice by user + if isinstance(tool_choice, dict): + tool_name = tool_choice["function"]["name"] + tool = next( + (tool for tool in tools if tool["function"]["name"] == tool_name), None ) - - def _process_mtmd_prompt( - self, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - add_generation_prompt: bool = True, - ) -> Tuple[List[int], List[tuple], Any, List[Any]]: - """ - Core multimodal preprocessing pipeline. - Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger. - - Features: - - Thread-safe concurrent media decoding to eliminate I/O bottlenecks. - - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens. - - Strict RAII-style C++ memory management to prevent leaks on failure. - - Returns: - full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching. - chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id). - chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller). - bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation. - """ - # 1. 
Inject default system prompt if omitted by the user - system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "") - if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: - messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages - - media_items = self._get_media_items(messages) - media_marker = self.media_marker - - # 2. Render the chat template and replace actual URLs with C++ media markers - text = self.chat_template.render( + if tool is None: + raise ValueError(f"Tool with name '{tool_name}' not found in tools") + prompt = template_renderer.render( messages=messages, - add_generation_prompt=add_generation_prompt, - eos_token=self.mtmd_eos_token, - bos_token=self.mtmd_bos_token, - functions=functions, - function_call=function_call, tools=tools, - tool_choice=tool_choice, - **getattr(self, 'extra_template_arguments', {}) + tool_calls=True, + add_generation_prompt=True, ) - - for tag in self._chat_format_parser_tags: - if tag not in text: - continue - - text = text.replace(tag, media_marker) - - # Replace image_url by media_marker in text - for item in media_items: - text = text.replace(item["url"], media_marker) - - if self.verbose: - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr) - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) - - # 3. 
Pre-allocate bitmap array to guarantee chronological order during concurrent decoding - bitmaps = [None] * len(media_items) - bitmap_cleanup = [] - video_cleanup = [] - chunks = None - - try: - # Concurrent Media Decoding - import concurrent.futures - if media_items: - def _create_bitmap_func(idx: int, item: dict): - media_bytes = self.load_media(item["url"], item["type"]) - bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes) - return idx, bitmap, video_ctx - # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, - # which can be used in the future to process large numbers of video frames. - max_workers = min(llama.n_threads, len(media_items)) - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] - - for future in concurrent.futures.as_completed(futures): - idx, bitmap, video_ctx = future.result() - - bitmaps[idx] = bitmap - bitmap_cleanup.append(bitmap) - - if video_ctx: - video_cleanup.append(video_ctx) - - # Strict validation: Abort if any thread failed to decode its assigned media - if any(b is None for b in bitmaps): - raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") - else: - if self.verbose: - print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.") - else: - # If there are no images, set the bitmaps to empty. - bitmaps = [] - - # 4. Initialize mtmd_input_chunks - input_text = self._mtmd_cpp.mtmd_input_text() - input_text.text = text.encode('utf-8') - input_text.add_special = (llama.n_tokens == 0) - input_text.parse_special = True - - chunks = self._mtmd_cpp.mtmd_input_chunks_init() - if chunks is None: - raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.") - - # 5. 
Hybrid Tokenization (Text + Media binding) - if len(bitmaps) > 0: - bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps) - ) - else: - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0 - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") - - # Video helper contexts only need to stay alive until mtmd_tokenize() completes. - if video_cleanup: - for video_ctx in video_cleanup: - self._mtmd_cpp.mtmd_helper_video_free(video_ctx) - video_cleanup.clear() - - # 6. Virtual Token Ledger Construction - full_prompt_ids = [] - chunk_token_spans = [] - current_idx = 0 - n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) - - # Cursor to track the actual media contents (URLs or base64 data) provided by the user - media_items_count = len(media_items) - media_items_cur = 0 - last_media_id = None - - for i in range(n_chunks): - chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) - if chunk is None: continue - chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - - if self._is_text_chunk(chunk_type): - # Extract standard text token IDs - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) - if tokens_ptr and n_tokens_out.value > 0: - tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) - full_prompt_ids.extend(tokens) - current_idx += len(tokens) - elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): - # Extract media properties - # Note(JamePeng): - # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). 
- # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample. - # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - - if media_items_cur < media_items_count: - # The C++ parser only sees identical placeholders (e.g., "<__media__>"). - # We MUST inject the actual media content's identity here. - real_media_url = media_items[media_items_cur]["url"] - # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) - # Generate a deterministic, unique negative ID for this specific image/audio. - # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()). - # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with - # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). - # This empowers `longest_token_prefix` to correctly identify and reuse cached images, - # while instantly breaking the match if the image content changes. 
- # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 - media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 - last_media_id = media_id - media_items_cur += 1 - elif last_media_id is not None: - # video may expand into multiple image chunks from one media marker - media_id = last_media_id - else: - # Magic Negative Number as fallback :) - media_id = -314159 - - if self.verbose: - print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ") - - chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) - - # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache - full_prompt_ids.extend([media_id] * chunk_n_tokens) - current_idx += chunk_n_tokens - else: - raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.") - - return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup - - except Exception as e: - # Ensure no useless pointers remain upon any failure - # Free chunks - if chunks is not None: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Free bitmaps - if len(bitmap_cleanup) > 0: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup = None - # Free videos - if len(video_cleanup) > 0: - for video_ctx in video_cleanup: - self._mtmd_cpp.mtmd_helper_video_free(video_ctx) - video_cleanup = None - - bitmaps = None - - raise e - - def __call__( - self, - *, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - 
top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - seed: Optional[int] = None, - response_format: Optional[ - llama_types.ChatCompletionRequestResponseFormat - ] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, - add_generation_prompt: bool = True, - reasoning_budget: int = -1, - reasoning_start: str = "", - reasoning_end: str = "", - reasoning_budget_message: Optional[str] = None, - reasoning_start_in_prompt: bool = False, - reasoning_start_max_tokens: Optional[int] = 32, - **kwargs, # type: ignore - ) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: - # 1. Initialize mtmd context - self._init_mtmd_context(llama) - assert self.mtmd_ctx is not None - - # 2. 
Concurrent Preprocessing & Ledger Construction - full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( - llama=llama, - messages=messages, - functions=functions, - function_call=function_call, - tools=tools, - tool_choice=tool_choice, - add_generation_prompt=add_generation_prompt, - ) - - if self.verbose: - print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) - - try: - # 3. KV Cache Synchronization & State Rollback - # Compares the virtual ledger with physical history to prevent Cache Poisoning. - current_history = llama.input_ids[:llama.n_tokens].tolist() - longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) - - if longest_prefix < llama.n_tokens: - if llama.is_hybrid and llama._hybrid_cache_mgr is not None: - if llama._hybrid_cache_mgr.max_checkpoints > 0: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " - f"Searching for nearest checkpoint...", file=sys.stderr) - - best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) - if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): - llama.n_tokens = best_ckpt.pos - if self.verbose: - print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. 
Clearing cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) - llama._ctx.memory_seq_rm(0, longest_prefix, -1) - llama.n_tokens = longest_prefix - - n_past = llama.n_tokens - - for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: - # Skip previously matched chunks - if end_idx <= n_past: - continue - - if self._is_text_chunk(chunk_type): - unprocessed_start = max(start_idx, n_past) - start_idx - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) - - if tokens_ptr and n_tokens_out.value > 0: - all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - tokens_to_eval = all_tokens[unprocessed_start:] - - if tokens_to_eval: - if self.verbose: - print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - # Text evaluation delegates shift and chunking to native llama.eval - llama.eval(tokens_to_eval) - n_past = llama.n_tokens - - elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) - - if self.verbose: - media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" - print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - - # Stage 5: Multimodal Physical OOM Defense - if n_past + chunk_n_tokens > llama.n_ctx(): - if not llama._ctx.memory_can_shift(): - raise RuntimeError( - f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " - f"(n_pos_per_embd > 1 or incompatible M-RoPE). 
" - f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " - f"You MUST increase n_ctx to fit the dialogue." - ) - else: - # Safely discard oldest tokens while preserving system prompts - n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch - n_keep = min(llama.n_keep, n_past) - n_discard = min(n_discard, n_past - n_keep) - - if n_discard <= 0: - raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") - - if self.verbose: - print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) - - # Execute physical memory shift - llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) - llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) - - # Shift python virtual array to match - remaining_len = n_past - (n_keep + n_discard) - if remaining_len > 0: - llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] - - n_past -= n_discard - llama.n_tokens = n_past - - # Execute C++ Multimodal Black-box Extraction - new_n_past = llama_cpp_lib.llama_pos(0) - result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( - self.mtmd_ctx, - llama._ctx.ctx, - chunk_ptr, - llama_cpp_lib.llama_pos(n_past), - llama_cpp_lib.llama_seq_id(0), - llama.n_batch, - True, # logits_last = True, drastically saves computational overhead - ctypes.byref(new_n_past) - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.") - - # Update Ledger with "Negative Reverse Vocabulary" IDs - llama.input_ids[n_past : new_n_past.value] = media_id - n_past = new_n_past.value - llama.n_tokens = n_past - - # Extract the final, perfectly synchronized prompt sequence - prompt = llama.input_ids[: llama.n_tokens].tolist() - - # End-of-Turn Checkpoint - # Anchors the state ONLY after the 
entire multi-modal turn is processed - if ( - llama.is_hybrid - and llama._hybrid_cache_mgr is not None - and llama._hybrid_cache_mgr.max_checkpoints > 0 - ): - if self.verbose: - print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) - - llama._hybrid_cache_mgr.save_checkpoint( - current_pos=llama.n_tokens, - tokens=prompt, - seq_id=0 - ) - finally: - # Cleanup chunks - if chunks is not None: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Cleanup bitmaps - if bitmap_cleanup: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup.clear() - bitmap_array = None - - # Handle response format and tools (same as before) - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - tool = None - if ( - tool_choice is not None - and isinstance(tool_choice, dict) - and tools is not None - ): - name = tool_choice["function"]["name"] - tool = next((t for t in tools if t["function"]["name"] == name), None) - if tool is None: - raise ValueError(f"Tool choice '{name}' not found in tools.") - schema = tool["function"]["parameters"] - try: - # create grammar from json schema - grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(schema), verbose=llama.verbose - ) - except Exception as e: - if llama.verbose: - print(str(e), 
file=sys.stderr) - grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose - ) - - completion_or_chunks = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - logprobs=top_logprobs if logprobs else None, - stream=stream, - stop=stop, - seed=seed, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - grammar=grammar, - logit_bias=logit_bias, - reasoning_budget=reasoning_budget, - reasoning_start=reasoning_start, - reasoning_end=reasoning_end, - reasoning_budget_message=reasoning_budget_message, - reasoning_start_in_prompt=reasoning_start_in_prompt, - reasoning_start_max_tokens=reasoning_start_max_tokens, - ) - - if tool is not None: - tool_name = tool["function"]["name"] - return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream - ) - return _convert_completion_to_chat(completion_or_chunks, stream=stream) - - def load_media(self, media_url: str, media_type: str) -> bytes: - """ - Unified dispatcher for loading media payloads. - Routes the URL/URI to the specific image, audio, or video processor based on the media_type. 
- """ - if media_type == "image": - return self._load_image(media_url) - - elif media_type == "audio": - audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio") - try: - self.detect_audio_format(audio_bytes) - except ValueError as e: - raise ValueError(f"{self.log_prefix}(load_media): {e}") - return audio_bytes - - elif media_type == "video": - return self._load_bytes(media_url, timeout=30, kind="video") - - else: - raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") - - @staticmethod - def detect_audio_format(audio_bytes: bytes) -> str: - """ - Pure utility function: Detects the audio format from magic bytes. - Strictly translated from llama.cpp's `is_audio_file` to ensure 100% compatibility - and avoid false positives (e.g., AVI files disguised as RIFF). - """ - length = len(audio_bytes) - - if length < 12: - raise ValueError("Audio data is corrupted or too small (less than 12 bytes).") - - # RIFF & WAVE magic bytes verification - is_wav = audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE" - - # ID3 metadata or MPEG sync word verification - is_mp3 = length >= 3 and ( - audio_bytes.startswith(b"ID3") or - (audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0) - ) - - # FLAC magic bytes verification - is_flac = audio_bytes.startswith(b"fLaC") - - if is_wav: - return "wav" - elif is_mp3: - return "mp3" - elif is_flac: - return "flac" - else: - raise ValueError( - "Unsupported audio format detected via magic bytes. " - "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." - ) - - DEFAULT_HTTP_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/148.0.0.0 Safari/537.36" - ), - } - - @staticmethod - def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes: - """ - Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL. - """ - media_bytes = b"" - - # 1. 
Handle data URI - if media_url.strip().startswith("data:"): - comma_pos = media_url.find(",") - if comma_pos == -1: - raise ValueError("Invalid data URI: missing comma separator") - - base64_data = media_url[comma_pos + 1:] - media_bytes = base64.b64decode(base64_data) - - # 2. Handle local file path - elif os.path.exists(media_url): - with open(media_url, "rb") as f: - media_bytes = f.read() - - # 3. Handle remote URL via HTTP/HTTPS - else: - req = urllib.request.Request( - media_url, - headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, - ) - try: - with urllib.request.urlopen(req, timeout=timeout) as f: - media_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") - - if not media_bytes: - raise ValueError(f"Empty {kind} data received") - - return media_bytes - - @staticmethod - def _load_image(image_url: str) -> bytes: - """ - Load an image from either a URL or a data URI and return it as JPEG bytes. - - Supports: - - Remote images via HTTP/HTTPS (with proper User-Agent) - - Data URIs (base64-encoded, e.g., data:image/png;base64,...) - - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background - - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html - - Returns: - JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. - """ - # 1. Load image bytes from image_url - image_bytes = MTMDChatHandler._load_bytes( - image_url, - timeout=15, - kind="image", - ) - - # 2. Check if image_bytes is empty. - if not image_bytes: - raise ValueError("Empty image data received") - - # 3. Open image with Pillow - try: - from PIL import Image, ImageStat - except ImportError: - raise ImportError("Pillow is required for image processing. Install with: pip install pillow") - - import io - image = Image.open(io.BytesIO(image_bytes)) - - # 4. Handle transparency (RGBA, LA, P with transparency, etc.) 
- if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info): - # Use alpha channel as mask - if image.mode == "P": - image = image.convert("RGBA") - - alpha = image.split()[-1] # Last channel is alpha - # Compute average brightness of visible (non-transparent) pixels - stat = ImageStat.Stat(image.convert("L"), mask=alpha) - - # Choose background: white for dark content, black for bright content - bg_color = (255, 255, 255) # white - if stat.count[0] > 0 and stat.mean[0] > 127: - bg_color = (0, 0, 0) # black - - background = Image.new("RGB", image.size, bg_color) - background.paste(image, mask=alpha) - image = background - - # 5. Ensure RGB mode for formats like CMYK, palette, etc. - elif image.mode != "RGB": - image = image.convert("RGB") - - # 6. Save as high-quality JPEG, suitable for most vision models. - output = io.BytesIO() - image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) - return output.getvalue() - - @classmethod - def from_pretrained( - cls, - repo_id: str, - filename: Optional[str], - local_dir: Optional[Union[str, os.PathLike[str]]] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", - cache_dir: Optional[Union[str, os.PathLike[str]]] = None, - **kwargs: Any, - ) -> "MTMDChatHandler": - import fnmatch - from pathlib import Path - - try: - from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore - from huggingface_hub.utils import validate_repo_id # type: ignore - except ImportError: - raise ImportError( - "Llama.from_pretrained requires the huggingface_hub package. " - "You can install it with `pip install --upgrade huggingface_hub`." 
- ) - - validate_repo_id(repo_id) - - hffs = HfFileSystem() - - files = [ - file["name"] if isinstance(file, dict) else file - for file in hffs.ls(repo_id) # type: ignore - ] - - # split each file into repo_id, subfolder, filename - file_list: List[str] = [] - for file in files: - rel_path = Path(file).relative_to(repo_id) - file_list.append(str(rel_path)) - - matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore - - if len(matching_files) == 0: - raise ValueError( - f"No file found in {repo_id} that match {filename}\n\n" - f"Available Files:\n{json.dumps(file_list)}" - ) - - if len(matching_files) > 1: - raise ValueError( - f"Multiple files found in {repo_id} matching {filename}\n\n" - f"Available Files:\n{json.dumps(files)}" - ) - - (matching_file,) = matching_files - - subfolder = str(Path(matching_file).parent) - filename = Path(matching_file).name - - # download the file - hf_hub_download( - repo_id=repo_id, - filename=filename, - subfolder=subfolder, - local_dir=cast(Union[str, Path, None], local_dir), - local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), - ) - - if local_dir is None: - model_path = hf_hub_download( - repo_id=repo_id, - filename=filename, - subfolder=subfolder, - local_dir=local_dir, - local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), - local_files_only=True, - ) - else: - model_path = os.path.join(local_dir, filename) - - return cls( - mmproj_path=model_path, - **kwargs, - ) - -class GenericMTMDChatHandler(MTMDChatHandler): - KNOWN_MEDIA_TAGS = [ - "<|image_pad|>", - "<|audio_pad|>", - "<|video_pad|>", - "<|image|>", - "<|audio|>", - "<|video|>", - "[IMG]" - ] - - def __init__( - self, - chat_format: str, - mmproj_path: str, - verbose: bool = True, - **kwargs - ) -> None: - self.chat_format = chat_format - - if verbose: - print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = 
True) - - if self.chat_format is None: - raise ValueError("Failed to get model chat template automatically.") - - super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs) - - def __call__(self, **kwargs): - self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Llava15ChatHandler(MTMDChatHandler): - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - - "{% if message.role == 'user' %}" - "{% if message.content is string %}" - "\nUSER: {{ message.content }}" - "{% elif message.content is iterable %}" - "\nUSER: " - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% endif %}" - - "{% if message.role == 'assistant' and message.content is not none %}" - "\nASSISTANT: {{ message.content }}" - "{% endif %}" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "\nASSISTANT: " - "{% endif %}" - ) - - -class ObsidianChatHandler(MTMDChatHandler): - # Prompt Format - # The model followed ChatML format. However, with ### as the seperator - - # <|im_start|>user - # What is this sign about?\n - # ### - # <|im_start|>assistant - # The sign is about bullying, and it is placed on a black background with a red background. 
- # ### - - CHAT_FORMAT = ( - "{% for message in messages %}" - # System message - "{% if message.role == 'system' %}" - "<|im_start|>system\n" - "{{ message.content }}\n" - "###\n" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "<|im_start|>user\n" - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' and content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.type == 'image_url' and content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "###\n" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - "<|im_start|>assistant\n" - "{{ message.content }}" - "###\n" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class MoondreamChatHandler(MTMDChatHandler): - # Chat Format: - # f"\n\n{chat_history}Question: {question}\n\nAnswer:" - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'user' %}" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}\n\n" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}\n\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "Question: {{ content.text }}\n\n" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "Question: {{ message.content }}\n\n" - "{% endif %}" - "{% endif %}" - # 
Answer: - "{% if message.role == 'assistant' %}" - "Answer:{{ message.content }}\n\n" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "Answer:" - "{% endif %}" - ) - - -class Llava16ChatHandler(MTMDChatHandler): - # Example prompt - # "DEFAULT_SYSTEM_MESSAGE + USER: \nWhat is shown in this image? ASSISTANT:" - - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.role == 'user' %}" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}\n" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "{{ message.content }}" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "Answer:" - "{% endif %}" - ) - - -class NanoLlavaChatHandler(MTMDChatHandler): - # Prompt Format - # The model follow the ChatML standard, however, without \n at the end of <|im_end|>: - - # <|im_start|>system - # Answer the question<|im_end|><|im_start|>user - # - # What is the picture about?<|im_end|><|im_start|>assistant - DEFAULT_SYSTEM_MESSAGE = "Answer the question" - - CHAT_FORMAT = ( - "{% for message in messages %}" - # System message - "{% if message.role == 'system' %}" - "<|im_start|>system\n" - "{{ message.content }}" - "<|im_end|>" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "<|im_start|>user\n" - "{% if message.content is string %}" - "{{ 
message.content }}" - "{% endif %}" - "{% if message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' and content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.type == 'image_url' and content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "<|im_end|>" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - "<|im_start|>assistant\n" - "{{ message.content }}" - "<|im_end|>" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class Llama3VisionAlphaChatHandler(MTMDChatHandler): - # question = "" + q - - # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - - CHAT_FORMAT = ( - "{% for message in messages %}" - "<|start_header_id|>" - "{% if message.role == 'user' %}" - "user<|end_header_id|>\n\n" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "assistant<|end_header_id|>\n\n" - "{{ message.content }}" - "{% endif %}" - "<|eot_id|>" - "{% endfor %}" - # Generation prompt - "{% if 
add_generation_prompt %}" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - "{% endif %}" - ) - - -# alias -Llama3VisionAlpha = Llama3VisionAlphaChatHandler - - -class MiniCPMv26ChatHandler(MTMDChatHandler): - - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and messages[0]['role'] != 'system' %}" - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is iterable %}" - "{% for content in message['content'] %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - - "{% for content in message['content'] %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% if message['content'] is string %}" - "{{ message['content'] }}" - "{% endif %}" - "<|im_end|>\n" - "{% endfor %}" - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class MiniCPMv45ChatHandler(MTMDChatHandler): - """ - Handler for MiniCPM-V 4.5 models. - - Supports: - - Multi-step tool calls with and XML tags. - - Integrated reasoning (thinking) process with tags. - - Specialized system prompt handling with tool definitions. - - Global image numbering for multi-image processing. 
- """ - - # Model specific control tokens - MINICPMV_BOS_TOKEN = "<|im_start|>" - MINICPMV_EOS_TOKEN = "<|im_end|>" - MINICPMV_PAD_TOKEN = "<|endoftext|>" - - # Image placeholder tags - MINICPMV_IMAGE_START_TOKEN = "" - MINICPMV_IMAGE_END_TOKEN = "" - MINICPMV_IMAGE_ID_START_TOKEN = "" - MINICPMV_IMAGE_ID_END_TOKEN = "" - - CHAT_FORMAT = ( - # --- 1. First System Message & Tools Definitions --- - "{%- if tools %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}" - "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}" - "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}" - "{{- 'You are provided with function signatures within XML tags:\\n' }}" - "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}" - "{{- '\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\"name\": , \"arguments\": }\\n" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- elif messages[0].role == 'system' %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - - # --- 2. 
Message Stream Processing --- - "{% set image_count = namespace(value=0) %}" - "{%- for message in messages %}" - # --- Unified Role Handling (User, Assistant, and subsequent Systems) --- - "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}" - "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}" - - "{%- set content = message.content %}" - "{%- if content is not string %}" - "{%- set ns = namespace(content_str='') %}" - "{%- for item in content %}" - # --- Explicit image_url type and value checking --- - "{%- if item.type == 'image_url' %}" - "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}" - "{%- set image_count.value = image_count.value + 1 %}" - # Format: N: IMAGE_URL - "{%- set ns.content_str = ns.content_str + '' + (image_count.value | string) + ': ' + image_url + '' %}" - "{%- elif item.type == 'text' %}" - "{%- set ns.content_str = ns.content_str + item.text %}" - "{%- endif %}" - "{%- endfor %}" - "{%- set content = ns.content_str %}" - "{%- endif %}" - - "{{- content -}}" - - # Append tool_calls to assistant messages if they exist - "{%- if message.role == 'assistant' and message.tool_calls %}" - "{%- for tool_call in message.tool_calls %}" - "{%- set tc = tool_call.function if tool_call.function else tool_call %}" - "{{- '\\n\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}" - "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}" - "{{- '}\\n' }}" - "{%- endfor %}" - "{%- endif %}" - "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" - - # --- Specialized Tool Response Handling --- - # Group consecutive tool responses under a single user-like block - "{%- elif message.role == 'tool' %}" - "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}" - "{{- '" + MINICPMV_BOS_TOKEN + "user' }}" - "{%- endif %}" - "{{- '\\n\\n' + message.content + '\\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}" - "{{- '" + 
MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - "{%- endif %}" - "{%- endfor %}" - - # --- 3. Generation Prompt --- - "{%- if add_generation_prompt %}" - "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}" - # Handle thinking/reasoning block visibility based on configuration - "{%- if enable_thinking is defined and enable_thinking is false %}" - "{{- '\\n\\n\\n\\n' }}" - "{%- elif enable_thinking is defined and enable_thinking is true %}" - "{{- '\\n' }}" - "{%- endif %}" - "{%- endif %}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the MiniCPM-V 4.5 Handler. - - Args: - enable_thinking (bool): If True, model generates reasoning before the final answer. - **kwargs: Additional arguments for the base MTMDChatHandler. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject thinking control flag into the template - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Set stop token patch - kwargs['stop'] = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - return super().__call__(**kwargs) - - -class MiniCPMV46ChatHandler(MTMDChatHandler): - """ - Handler for MiniCPM-V-4.6 models. - - Features: - - Aligned with official tokenizer_config.json special tokens. - - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens. - - Integrated MTMD-style URL and Base64 injection for visual content. - - Specialized `` and `` block generation. - - Autonomously folds previous reasoning paths using `last_query_index`. - - Toggles `` block generation via `enable_thinking` (Defaults to False). 
- """ - - # Core tokens - MINICPM_BOS_TOKEN = "<|im_start|>" - MINICPM_EOS_TOKEN = "<|im_end|>" - MINICPM_PAD_TOKEN = "<|endoftext|>" - - # Vision tokens - MINICPM_VISION_BOS_TOKEN = "<|vision_start|>" - MINICPM_VISION_EOS_TOKEN = "<|vision_end|>" - MINICPM_IMAGE_TOKEN = "<|image_pad|>" - MINICPM_VIDEO_TOKEN = "<|video_pad|>" - - CHAT_FORMAT = ( - "{%- if enable_thinking is not defined -%}\n" - " {%- set enable_thinking = false -%}\n" - "{%- endif -%}\n" - "{%- macro render_content(content, is_system_content=false) -%}\n" - " {%- if content is string -%}\n" - " {{- content -}}\n" - " {%- elif content is iterable and content is not mapping -%}\n" - " {%- set ns = namespace(parts=[]) -%}\n" - " {%- for item in content -%}\n" - " {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n" - " {%- if is_system_content -%}\n" - " {{- raise_exception('System message cannot contain images.') -}}\n" - " {%- endif -%}\n" - " {%- set url_val = '' -%}\n" - " {%- if item.type == 'image_url' -%}\n" - " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" - " {%- endif -%}\n" - " {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n" - # " {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n" - # " {%- if is_system_content -%}\n" - # " {{- raise_exception('System message cannot contain videos.') -}}\n" - # " {%- endif -%}\n" - # " {%- set url_val = '' -%}\n" - # " {%- if item.type == 'video_url' -%}\n" - # " {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n" - # " {%- endif -%}\n" - # " {%- set ns.parts = ns.parts + ['<|video_pad|>' + url_val] -%}\n" - " {%- elif 'text' in item -%}\n" - " {%- set ns.parts = ns.parts + [item.text] -%}\n" - " {%- else -%}\n" - " {{- raise_exception('Unexpected item type in content.') -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- ns.parts | join('\\n') -}}\n" - " {%- elif content is none or content is 
undefined -%}\n" - " {{- '' -}}\n" - " {%- else -%}\n" - " {{- raise_exception('Unexpected content type.') -}}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "{%- if not messages %}\n" - " {{- raise_exception('No messages provided.') }}\n" - "{%- endif %}\n" - "{%- if tools and tools is iterable and tools is not mapping %}\n" - " {{- '<|im_start|>system\\n' }}\n" - " {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n' }}\n" - " {%- for tool in tools %}\n" - " {{- '\\n' }}\n" - " {{- tool | tojson }}\n" - " {%- endfor %}\n" - " {{- '\\n' }}\n" - " {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n\\n\\n\\nvalue_1\\n\\n\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n\\n\\n\\n\\n\\nReminder:\\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n' }}\n" - " {%- if messages[0].role == 'system' %}\n" - " {%- set content = render_content(messages[0].content, true)|trim %}\n" - " {%- if content %}\n" - " {{- '\\n\\n' + content }}\n" - " {%- endif %}\n" - " {%- endif %}\n" - " {{- '<|im_end|>\\n' }}\n" - "{%- else %}\n" - " {%- if messages[0].role == 'system' %}\n" - " {%- set content = render_content(messages[0].content, true)|trim %}\n" - " {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n" - " {%- endif %}\n" - "{%- endif %}\n" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n" - "{%- for message in messages[::-1] %}\n" - " {%- set index = (messages|length - 1) - loop.index0 %}\n" - " {%- if ns.multi_step_tool and message.role == 'user' %}\n" - " {%- set content = 
render_content(message.content)|trim %}\n" - " {%- if not(content.startswith('') and content.endswith('')) %}\n" - " {%- set ns.multi_step_tool = false %}\n" - " {%- set ns.last_query_index = index %}\n" - " {%- endif %}\n" - " {%- endif %}\n" - "{%- endfor %}\n" - "{%- if ns.multi_step_tool %}\n" - " {{- raise_exception('No user query found in messages.') }}\n" - "{%- endif %}\n" - "{%- for message in messages %}\n" - " {%- set content = render_content(message.content)|trim %}\n" - " {%- if message.role == 'system' %}\n" - " {%- if not loop.first %}\n" - " {{- raise_exception('System message must be at the beginning.') }}\n" - " {%- endif %}\n" - " {%- elif message.role == 'user' %}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n" - " {%- elif message.role == 'assistant' %}\n" - " {%- set reasoning_content = '' %}\n" - " {%- if message.reasoning_content is string %}\n" - " {%- set reasoning_content = message.reasoning_content %}\n" - " {%- else %}\n" - " {%- if '' in content %}\n" - " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n" - " {%- set content = content.split('')[-1].lstrip('\\n') %}\n" - " {%- endif %}\n" - " {%- endif %}\n" - " {%- set reasoning_content = reasoning_content|trim %}\n" - " {%- if loop.index0 > ns.last_query_index %}\n" - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n\\n' + content }}\n" - " {%- else %}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content }}\n" - " {%- endif %}\n" - " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n" - " {%- for tool_call in message.tool_calls %}\n" - " {%- if tool_call.function is defined %}\n" - " {%- set tool_call = tool_call.function %}\n" - " {%- endif %}\n" - " {%- if loop.first %}\n" - " {%- if content|trim %}\n" - " {{- '\\n\\n\\n\\n' }}\n" - " {%- else %}\n" - " {{- '\\n\\n' }}\n" - " {%- endif %}\n" - " {%- else %}\n" - " 
{{- '\\n\\n\\n' }}\n" - " {%- endif %}\n" - " {%- if tool_call.arguments is defined %}\n" - " {%- for args_name, args_value in tool_call.arguments|items %}\n" - " {{- '\\n' }}\n" - " {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n" - " {{- args_value }}\n" - " {{- '\\n\\n' }}\n" - " {%- endfor %}\n" - " {%- endif %}\n" - " {{- '\\n' }}\n" - " {%- endfor %}\n" - " {%- endif %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- elif message.role == 'tool' %}\n" - " {%- if loop.previtem and loop.previtem.role != 'tool' %}\n" - " {{- '<|im_start|>user' }}\n" - " {%- endif %}\n" - " {{- '\\n\\n' }}\n" - " {{- content }}\n" - " {{- '\\n' }}\n" - " {%- if not loop.last and loop.nextitem.role != 'tool' %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- elif loop.last %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- endif %}\n" - " {%- else %}\n" - " {{- raise_exception('Unexpected message role.') }}\n" - " {%- endif %}\n" - "{%- endfor %}\n" - "{%- if add_generation_prompt %}\n" - " {{- '<|im_start|>assistant\\n' }}\n" - " {%- if enable_thinking is defined and enable_thinking is false %}\n" - " {{- '\\n\\n\\n\\n' }}\n" - " {%- else %}\n" - " {{- '\\n' }}\n" - " {%- endif %}\n" - "{%- endif %}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the MiniCPM-V-4.6 Handler. - - Args: - enable_thinking (bool): Controls whether to open a `` block for reasoning. - Defaults to False as per the standard template logic. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject the thinking variable into the Jinja environment - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # MiniCPM uses standard <|im_end|> ChatML stop formatting - kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class Gemma3ChatHandler(MTMDChatHandler): - - GEMMA3_BOI_TOKEN = "" - GEMMA3_EOI_TOKEN = "" - GEMMA3_BOS_TOKEN = "" - GEMMA3_EOS_TOKEN = "" - - CHAT_FORMAT = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" - "{% if messages[0]['content'] is string %}" - "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}" - "{% else %}" - "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}" - "{% endif %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set first_user_prefix = '' %}" - "{% endif %}" - - "{% for message in loop_messages %}" - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" - "{% endif %}" - - "{% if message['role'] == 'assistant' %}" - "{% set role = 'model' %}" - "{% else %}" - "{% set role = message['role'] %}" - "{% endif %}" - - "{{ '' + role + '\n' + (first_user_prefix if loop.first else '') }}" - - "{% if message['content'] is string %}" - "{{ message['content'] | trim }}" - "{% elif message['content'] is iterable %}" - "{% for item in message['content'] %}" - "{% if item['type'] == 'image_url' and item['image_url'] is string %}" - "{{ '' + item['image_url'] + '' }}" - "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}" - "{{ '' + item['image_url']['url'] + '' }}" - "{% elif item['type'] == 'text' %}" - "{{ item['text'] | 
trim }}" - "{% endif %}" - "{% endfor %}" - "{% else %}" - "{{ raise_exception('Invalid content type') }}" - "{% endif %}" - - "\n" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "model\n" - "{% endif %}" - ) - - -class Gemma4ChatHandler(MTMDChatHandler): - """ - Handler for Gemma 4 models. - - Note on `enable_thinking`: - The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. - It is NOT supported by Gemma4 E2B and E4B models. - - [Important Note for Audio Processing!] - It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models. - Other quantizations are known to have degraded performance; - ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463 - """ - - # The special token in Gemma 4 - GEMMA4_BOI_TOKEN = "<|image>" - GEMMA4_EOI_TOKEN = "" - GEMMA4_BOA_TOKEN = "<|audio>" - GEMMA4_EOA_TOKEN = "" - GEMMA4_BOS_TOKEN = "" - GEMMA4_EOS_TOKEN = "" - GEMMA4_SOT_TOKEN = "<|turn>" - GEMMA4_EOT_TOKEN = "" - GEMMA4_SOC_TOKEN = "<|channel>" - GEMMA4_EOC_TOKEN = "" - GEMMA4_STC_TOKEN = "<|tool_call>" - GEMMA4_ETC_TOKEN = "" - GEMMA4_STD_TOKEN = "<|tool>" - GEMMA4_ETD_TOKEN = "" - GEMMA4_STR_TOKEN = "<|tool_response>" - GEMMA4_ETR_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n" - " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in properties | dictsort -%}\n" - " {%- set add_comma = false -%}\n" - " {%- if not filter_keys or key not in standard_keys -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {{ key }}:{\n" - " {%- if value['description'] -%}\n" - " description:<|\"|>{{ value['description'] }}<|\"|>\n" - " {%- set add_comma = true -%}\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'STRING' -%}\n" - " {%- if value['enum'] -%}\n" - " {%- if add_comma 
%},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " enum:{{ format_argument(value['enum']) }}\n" - " {%- endif -%}\n" - " {%- elif value['type'] | upper == 'ARRAY' -%}\n" - " {%- if value['items'] is mapping and value['items'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " items:{\n" - " {%- set ns_items = namespace(found_first=false) -%}\n" - " {%- for item_key, item_value in value['items'] | dictsort -%}\n" - " {%- if item_value is not none -%}\n" - " {%- if ns_items.found_first %},{% endif -%}\n" - " {%- set ns_items.found_first = true -%}\n" - " {%- if item_key == 'properties' -%}\n" - " properties:{\n" - " {%- if item_value is mapping -%}\n" - " {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n" - " {%- endif -%}\n" - " }\n" - " {%- elif item_key == 'required' -%}\n" - " required:[\n" - " {%- for req_item in item_value -%}\n" - " <|\"|>{{- req_item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- elif item_key == 'type' -%}\n" - " {%- if item_value is string -%}\n" - " type:{{ format_argument(item_value | upper) }}\n" - " {%- else -%}\n" - " type:{{ format_argument(item_value | map('upper') | list) }}\n" - " {%- endif -%}\n" - " {%- else -%}\n" - " {{ item_key }}:{{ format_argument(item_value) }}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " }\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if value['nullable'] %}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " nullable:true\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'OBJECT' -%}\n" - " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" - " }\n" - " {%- elif value is mapping 
-%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n" - " }\n" - " {%- endif -%}\n" - " {%- if value['required'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " required:[\n" - " {%- for item in value['required'] | default([]) -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - "{%- endmacro -%}\n" - "{%- macro format_function_declaration(tool_data) -%}\n" - " declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n" - " {%- set params = tool_data['function']['parameters'] -%}\n" - " {%- if params -%}\n" - " ,parameters:{\n" - " {%- if params.get('properties') -%}\n" - " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" - " {%- endif -%}\n" - " {%- if params.get('required') -%}\n" - " required:[\n" - " {%- for item in params['required'] -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {{- ',' if not loop.last -}}\n" - " {%- endfor -%}\n" - " ],\n" - " {%- endif -%}\n" - " {%- if params.get('type') -%}\n" - " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if 'response' in tool_data['function'] -%}\n" - " {%- set response_declaration = tool_data['function']['response'] -%}\n" - " ,response:{\n" - " {%- if response_declaration['description'] -%}\n" - " description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n" - " {%- endif -%}\n" - " {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n" - " type:<|\"|>{{- 
response_declaration['type'] | upper -}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " }\n" - "{%- endmacro -%}\n" - "{%- macro format_argument(argument, escape_keys=True) -%}\n" - " {%- if argument is string -%}\n" - " {{- '<|\"|>' + argument + '<|\"|>' -}}\n" - " {%- elif argument is boolean -%}\n" - " {{- 'true' if argument else 'false' -}}\n" - " {%- elif argument is mapping -%}\n" - " {{- '{' -}}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in argument | dictsort -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {%- if escape_keys -%}\n" - " {{- '<|\"|>' + key + '<|\"|>' -}}\n" - " {%- else -%}\n" - " {{- key -}}\n" - " {%- endif -%}\n" - " :{{- format_argument(value, escape_keys=escape_keys) -}}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- elif argument is sequence -%}\n" - " {{- '[' -}}\n" - " {%- for item in argument -%}\n" - " {{- format_argument(item, escape_keys=escape_keys) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- ']' -}}\n" - " {%- else -%}\n" - " {{- argument -}}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "{%- macro strip_thinking(text) -%}\n" - " {%- set ns = namespace(result='') -%}\n" - " {%- for part in text.split('') -%}\n" - " {%- if '<|channel>' in part -%}\n" - " {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n" - " {%- else -%}\n" - " {%- set ns.result = ns.result + part -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- ns.result | trim -}}\n" - "{%- endmacro -%}\n" - "\n" - "{%- macro format_tool_response_block(tool_name, response) -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- if response is mapping -%}\n" - " {{- 'response:' + tool_name + '{' -}}\n" - " {%- for key, value in response | dictsort -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- else -%}\n" 
- " {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n" - " {%- endif -%}\n" - " {{- '' -}}\n" - "{%- endmacro -%}\n" - "\n" - "{%- set ns = namespace(prev_message_type=None) -%}\n" - "{%- set loop_messages = messages -%}\n" - "{{- bos_token -}}\n" - "{#- Handle System/Tool Definitions Block -#}\n" - "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" - " {{- '<|turn>system\\n' -}}\n" - " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" - " {%- if enable_thinking is defined and enable_thinking -%}\n" - " {{- '<|think|>\\n' -}}\n" - " {%- set ns.prev_message_type = 'think' -%}\n" - " {%- endif -%}\n" - " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" - " {%- if messages[0]['content'] is string -%}\n" - " {{- messages[0]['content'] | trim -}}\n" - " {%- elif messages[0]['content'] is sequence -%}\n" - " {%- for item in messages[0]['content'] -%}\n" - " {{- item['text'] | trim + ' '-}}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- set loop_messages = messages[1:] -%}\n" - " {%- endif -%}\n" - " {%- if tools -%}\n" - " {%- for tool in tools %}\n" - " {{- '<|tool>' -}}\n" - " {{- format_function_declaration(tool) | trim -}}\n" - " {{- '' -}}\n" - " {%- endfor %}\n" - " {%- set ns.prev_message_type = 'tool' -%}\n" - " {%- endif -%}\n" - " {{- '\\n' -}}\n" - "{%- endif %}\n" - "\n" - "{#- Pre-scan: find last user message index for reasoning guard -#}\n" - "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" - "{%- for i in range(loop_messages | length) -%}\n" - " {%- if loop_messages[i]['role'] == 'user' -%}\n" - " {%- set ns_turn.last_user_idx = i -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{#- Loop through messages -#}\n" - "{%- for message in loop_messages -%}\n" - " {%- if message['role'] != 'tool' -%}\n" - " {%- set ns.prev_message_type = None -%}\n" - " {%- set role = 'model' if 
message['role'] == 'assistant' else message['role'] -%}\n" - " {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n" - " {%- set prev_nt = namespace(role=None, found=false) -%}\n" - " {%- if loop.index0 > 0 -%}\n" - " {%- for j in range(loop.index0 - 1, -1, -1) -%}\n" - " {%- if not prev_nt.found -%}\n" - " {%- if loop_messages[j]['role'] != 'tool' -%}\n" - " {%- set prev_nt.role = loop_messages[j]['role'] -%}\n" - " {%- set prev_nt.found = true -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n" - " {%- if not continue_same_model_turn -%}\n" - " {{- '<|turn>' + role + '\\n' }}\n" - " {%- endif -%}\n" - "\n" - " {#- Render reasoning/reasoning_content as thinking channel -#}\n" - " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" - " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" - " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" - " {%- endif -%}\n" - "\n" - " {%- if message.get('tool_calls') -%}\n" - " {%- for tool_call in message['tool_calls'] -%}\n" - " {%- set function = tool_call['function'] -%}\n" - " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" - " {%- if function['arguments'] is mapping -%}\n" - " {%- set ns_args = namespace(found_first=false) -%}\n" - " {%- for key, value in function['arguments'] | dictsort -%}\n" - " {%- if ns_args.found_first %},{% endif -%}\n" - " {%- set ns_args.found_first = true -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- endfor -%}\n" - " {%- elif function['arguments'] is string -%}\n" - " {{- function['arguments'] -}}\n" - " {%- endif -%}\n" - " {{- '}' -}}\n" - " {%- endfor -%}\n" - " {%- set ns.prev_message_type = 'tool_call' -%}\n" - " {%- endif -%}\n" - "\n" - " {%- set ns_tr_out = 
namespace(flag=false) -%}\n" - " {%- if message.get('tool_responses') -%}\n" - " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" - " {%- for tool_response in message['tool_responses'] -%}\n" - " {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endfor -%}\n" - " {%- elif message.get('tool_calls') -%}\n" - " {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n" - " {%- set ns_tool_scan = namespace(stopped=false) -%}\n" - " {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n" - " {%- if ns_tool_scan.stopped -%}\n" - " {%- elif loop_messages[k]['role'] != 'tool' -%}\n" - " {%- set ns_tool_scan.stopped = true -%}\n" - " {%- else -%}\n" - " {%- set follow = loop_messages[k] -%}\n" - " {#- Resolve tool_call_id to function name -#}\n" - " {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n" - " {%- for tc in message['tool_calls'] -%}\n" - " {%- if tc.get('id') == follow.get('tool_call_id') -%}\n" - " {%- set ns_tname.name = tc['function']['name'] -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {#- Handle content as string or content-parts array -#}\n" - " {%- set tool_body = follow.get('content') -%}\n" - " {%- if tool_body is string -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- elif tool_body is sequence and tool_body is not string -%}\n" - " {%- set ns_txt = namespace(s='') -%}\n" - " {%- for part in tool_body -%}\n" - " {%- if part.get('type') == 'text' -%}\n" - " {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" - " {%- for part in tool_body -%}\n" - " {%- if part.get('type') == 'image_url' -%}\n" - " {%- set url_val = 
part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n" - " {{- '<|image|>' + url_val -}}\n" - " {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n" - " {%- if part.get('type') == 'audio_url' -%}\n" - " {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- elif part.get('type') == 'input_audio' -%}\n" - " {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- endif -%}\n" - # " {%- elif part.get('type') == 'video_url' -%}\n" - # " {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n" - # " {{- '<|video|>' + video_val -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- else -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- endif -%}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "\n" - " {%- set captured_content -%}\n" - " {%- if message['content'] is string -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(message['content']) -}}\n" - " {%- else -%}\n" - " {{- message['content'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif message['content'] is sequence -%}\n" - " {%- for item in message['content'] -%}\n" - " {%- if item['type'] == 'text' -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(item['text']) -}}\n" - " {%- else -%}\n" - " {{- item['text'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif item['type'] == 'image_url' -%}\n" - " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {{- '<|image|>' + url_val -}}\n" - " {%- set ns.prev_message_type = 'image' -%}\n" - 
" {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n" - " {%- if item['type'] == 'audio_url' -%}\n" - " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- elif item['type'] == 'input_audio' -%}\n" - " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- endif -%}\n" - " {%- set ns.prev_message_type = 'audio' -%}\n" - " {%- endif -%}\n" - # " {%- elif item['type'] == 'video_url' -%}\n" - # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" - # " {{- '<|video|>' + video_val -}}\n" - # " {%- set ns.prev_message_type = 'video' -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- endset -%}\n" - "\n" - " {{- captured_content -}}\n" - " {%- set has_content = captured_content | trim | length > 0 -%}\n" - "\n" - " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- elif not (ns_tr_out.flag and not has_content) -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- if add_generation_prompt -%}\n" - " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" - " {{- '<|turn>model\\n' -}}\n" - " {%- if not enable_thinking | default(false) -%}\n" - " {{- '<|channel>thought\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Gemma 4 Handler. - - Args: - enable_thinking (bool): Controls whether the <|think|> tag is injected and - manages <|channel>thought behavior. - Note: ONLY supported on Gemma4 31B and 26BA4B models. - NOT supported on Gemma4 E2B and E4B models. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject the thinking variable into the Jinja environment - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Set the stop token based on Gemma 4's format () - # generation_config.json: "eos_token_id": [1, 106, 50] - kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GLM41VChatHandler(MTMDChatHandler): - # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. - - GLM41V_EOS_TOKEN = "<|endoftext|>" - GLM41V_PAD_TOKEN = "<|endoftext|>" - GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" - - CHAT_FORMAT = ( - "[gMASK]\n" - "{%- for msg in messages -%}" - "{%- if msg.role == 'system' -%}" - "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'user' -%}" - "<|user|>\n" - "{%- if msg.content is string -%}" - "{{ msg.content }}" - "{%- else -%}" - "{%- for item in msg.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'assistant' -%}" - "{%- if msg.metadata -%}" - "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- else -%}" - "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{%- endif -%}" - ) - - def __call__(self, **kwargs): - 
self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN - # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json - stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch - kwargs['stop'] = stop_tokens - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class GLM46VChatHandler(MTMDChatHandler): - GLM46V_EOS_TOKEN = "<|endoftext|>" - GLM46V_PAD_TOKEN = "<|endoftext|>" - GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" - - CHAT_FORMAT = ( - "[gMASK]" - "{%- if tools -%}" - "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" - "You are provided with function signatures within XML tags:\n\n" - "{%- for tool in tools -%}" - "{{ tool | tojson(ensure_ascii=False) }}\n" - "{%- endfor -%}" - "\n\nFor each function call, output the function name and arguments within the following XML format:\n" - "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" - "{%- endif -%}" - - "{%- for m in messages -%}" - "{%- if m.role == 'system' -%}" - "<|system|>\n{{ m.content }}" - "{%- elif m.role == 'user' -%}" - "<|user|>\n" - "{%- if m.content is string -%}" - "{{ m.content }}" - "{%- else -%}" - "{%- for item in m.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - # If enable_thinking is disabled, insert `/nothink` according to the source code logic. 
- "{{ '/nothink' if not enable_thinking else '' }}" - "{%- elif m.role == 'assistant' -%}" - "<|assistant|>" - "{%- if enable_thinking -%}" - "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" - "\n{{ reasoning.strip() }}" - "{%- else -%}" - "\n" - "{%- endif -%}" - "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" - "{%- endif -%}" - "{{ GLM46V_EOS_TOKEN }}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{{ '' if enable_thinking else '\n' }}" - "{%- endif -%}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - GLM-4.6V Handler - Parameters: - - enable_thinking (bool): Whether to enable the model's think process. The default is True. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN - - # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json - kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GraniteDoclingChatHandler(MTMDChatHandler): - """ - Handler for Granite-Docling models. - - Format(512x512): Content - - Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!! - Since the model does not have special tokens for the start and end of an image, - it is recommended to process only one image at a time. - You can iterate through the images individually for recognition. 
- - """ - GRANITE_BOS_TOKEN = "<|start_of_role|>" - GRANITE_EOS_TOKEN = "<|end_of_text|>" - GRANITE_PAD_TOKEN = "<|end_of_text|>" - GRANITE_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for part in message['content'] -%}" - "{%- if part['type'] == 'text' -%}" - "{{- part['text'] -}}" - "{%- elif part['type'] == 'image_url' -%}" - "{%- if part.image_url is string -%}" - "{{- part.image_url -}}" - "{%- else -%}" - "{{- part.image_url.url -}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '<|end_of_text|>\n' -}}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|start_of_role|>assistant' -}}" - # Support the 'controls' parameter if present in generation arguments - "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}" - "{{- '<|end_of_role|>' -}}" - "{%- endif -%}" - ) - - def __init__(self, controls: dict = None, **kwargs): - """ - Granite-Docling Handler - Args: - controls (dict, optional): Operational parameters passed to the assistant role. - - The 'controls' parameter is used to guide the model's behavior or output format. 
- Common examples for 'controls' include: - - Document Parsing: {"mode": "document_parsing", "format": "json"} - """ - self.controls = controls - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject controls into the template environment - self.extra_template_arguments["controls"] = self.controls - self.DEFAULT_SYSTEM_MESSAGE = None - kwargs['stop'] = [self.GRANITE_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - - return super().__call__(**kwargs) - - -class LFM2VLChatHandler(MTMDChatHandler): - LFM2VL_BOS_TOKEN = "<|startoftext|>" - LFM2VL_EOS_TOKEN = "<|im_end|>" - LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{ '<|im_start|>' + message['role'] + '\n' }}" - "{%- if message['content'] is string -%}" - "{{ message['content'] }}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if 'image_url' in content -%}" - "{%- if content.image_url is string -%}" - "<|image_start|>{{ content.image_url }}<|image_end|>" - "{%- else -%}" - "<|image_start|>{{ content.image_url.url }}<|image_end|>" - "{%- endif -%}" - "{%- elif content['type'] == 'text' -%}" - "{{ content['text'] }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{ '<|im_end|>\n' }}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{ '<|im_start|>assistant\n' }}" - "{%- endif -%}" - ) - - def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): - """ - LFM2-VL Handler - LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 - """ - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) - - 
def __call__(self, **kwargs): - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class LFM25VLChatHandler(MTMDChatHandler): - """ - Handler for LFM2.5-VL multimodal models. - - Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing. - """ - # Aligned with LFM2.5-VL tokenizer_config - LFM25VL_BOS_TOKEN = "<|startoftext|>" - LFM25VL_EOS_TOKEN = "<|im_end|>" - LFM25VL_PAD_TOKEN = "<|pad|>" - - # Image specific tokens - LFM25VL_IMAGE_TOKEN = "" - LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" - LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" - - CHAT_FORMAT = ( - "{{- bos_token -}}\n" - "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" - "{%- set ns = namespace(system_prompt='', content='') -%}\n" - "{%- if messages[0]['role'] == 'system' -%}\n" - " {%- set ns.system_prompt = messages[0]['content'] -%}\n" - " {%- set messages = messages[1:] -%}\n" - "{%- endif -%}\n" - "{%- if tools -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" - " {%- for tool in tools -%}\n" - " {%- if tool is not string -%}\n" - " {%- set tool = tool | tojson -%}\n" - " {%- endif -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" - " {%- if not loop.last -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" - "{%- endif -%}\n" - "{%- if ns.system_prompt -%}\n" - " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" - "{%- endif -%}\n" - "{%- set ns.last_assistant_index = -1 -%}\n" - "{%- for message in messages -%}\n" - " {%- if message['role'] == 'assistant' -%}\n" - " {%- set 
ns.last_assistant_index = loop.index0 -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- for message in messages -%}\n" - " {{- '<|im_start|>' + message['role'] + '\\n' -}}\n" - " {%- set content = message['content'] -%}\n" - " {%- if content is not string -%}\n" - " {%- set ns.content = '' -%}\n" - " {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" - " {%- for item in content -%}\n" - " {%- if item['type'] == 'image_url' -%}\n" - " {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {%- set ns.content = ns.content + img_val -%}\n" - " {%- elif item['type'] == 'text' -%}\n" - " {%- set ns.content = ns.content + item['text'] -%}\n" - " {%- else -%}\n" - " {%- set ns.content = ns.content + (item | tojson) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set content = ns.content -%}\n" - " {%- endif -%}\n" - " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" - " {%- if '' in content -%}\n" - " {%- set content = content.split('')[-1] | trim -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {{- content + '<|im_end|>\\n' -}}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, keep_past_thinking: bool = False, **kwargs): - self.keep_past_thinking = keep_past_thinking - super().__init__(**kwargs) - - - def __call__(self, **kwargs): - if self.image_min_tokens > 256: - if self.verbose: - print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. 
Please reset it to between 64 and 256.") - self.image_min_tokens = -1 - - self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking - - kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") - return super().__call__(**kwargs) - - -class PaddleOCRChatHandler(MTMDChatHandler): - """ - Handler for PaddleOCR 1.5/1.6 multimodal models. - """ - - PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" - PADDLEOCR_BOS_TOKEN = "" - PADDLEOCR_EOS_TOKEN = "" - PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>" - PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>" - PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>" - - CHAT_FORMAT = ( - "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}" - "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}" - "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}" - - "{{- cls_token -}}" - "{%- for message in messages -%}" - "{%- if message['role'] == 'user' -%}" - "{{- 'User: ' -}}" - - # Robust parsing: Check if content is string or list - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - # Pass 1: Render all images first - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'image_url' and 'image_url' in content -%}" - "{{- '<|IMAGE_START|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|IMAGE_END|>' -}}" - "{%- endif -%}" - "{%- endfor -%}" - - # Pass 2: Render all text second - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '\\n' -}}" - - "{%- elif message['role'] == 'assistant' -%}" - "{{- 
'Assistant:\\n' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- eos_token -}}" - - "{%- elif message['role'] == 'system' -%}" - "{%- if message['content'] is string -%}" - "{{- message['content'] + '\\n' -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] + '\\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "{{- 'Assistant:\\n' -}}" - "{%- endif -%}" - ) - - def __init__( - self, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - **kwargs - ): - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__( - image_min_tokens=self.image_min_tokens, - image_max_tokens=self.image_max_tokens, - **kwargs - ) - - def __call__(self, **kwargs): - # Set the specific stop token defined in the PaddleOCR template - kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class Qwen25VLChatHandler(MTMDChatHandler): - - QWEN25_VL_BOS_TOKEN = "<|endoftext|>" - QWEN25_VL_PAD_TOKEN = "<|endoftext|>" - QWEN25_VL_EOS_TOKEN = "<|im_end|>" - - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and message['role'] != 'system' %}" - "<|im_start|>system\n" - "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is string %}" - "{{ message['content'] }}<|im_end|>\n" - "{% else %}" - "{% for content in 
message['content'] %}" - "{% if content['type'] == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>" - "{% else %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>" - "{% endif %}" - "{% elif content['type'] == 'text' %}" - "{{ content['text'] }}" - "{% endif %}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endfor %}" - "<|im_start|>assistant\n" - ) - - def __call__(self, **kwargs): - kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Qwen3ASRChatHandler(MTMDChatHandler): - """ - Handler for Qwen 3 ASR (Automatic Speech Recognition) models. - - Features: - - Highly specialized for Speech-to-Text tasks. - - Aggregates all system text into a single cohesive system block. - - Drops user text entirely, extracting ONLY audio data into a unified user turn. - - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>. - - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url. - """ - - DEFAULT_SYSTEM_MESSAGE = """ - You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language. - You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization. 
- """ - - QWEN3_ASR_BOS_TOKEN = "<|im_start|>" - QWEN3_ASR_PAD_TOKEN = "<|endoftext|>" - QWEN3_ASR_EOS_TOKEN = "<|im_end|>" - - - QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>" - QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>" - QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>" - - CHAT_FORMAT = ( - "{%- set ns = namespace(system_text='') -%}\n" - "{%- for m in messages -%}\n" - " {%- if m.role == 'system' -%}\n" - " {%- if m.content is string -%}\n" - " {%- set ns.system_text = ns.system_text + m.content -%}\n" - " {%- else -%}\n" - " {%- for c in m.content -%}\n" - " {%- if c.type == 'text' and (c.text is defined) -%}\n" - " {%- set ns.system_text = ns.system_text + c.text -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- set ns2 = namespace(audio_tokens='') -%}\n" - "{%- for m in messages -%}\n" - " {%- if m.content is not string -%}\n" - " {%- for c in m.content -%}\n" - " {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n" - " {#- MTMD Audio Injection -#}\n" - " {%- set audio_val = '' -%}\n" - " {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n" - " {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n" - " {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n" - " {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n" - " {%- endif -%}\n" - " {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n" - "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif 
-%}\n" - ) - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token - kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") - - return super().__call__(**kwargs) - -class Qwen3VLChatHandler(MTMDChatHandler): - - QWEN3_VL_BOS_TOKEN = "<|endoftext|>" - QWEN3_VL_PAD_TOKEN = "<|endoftext|>" - QWEN3_VL_EOS_TOKEN = "<|im_end|>" - - CHAT_FORMAT = ( - "{{- '<|im_start|>system\n' -}}" - "{%- if messages[0].content is string and messages[0].role == 'system' -%}" - "{{- messages[0].content -}}" - "{%- elif messages[0].role == 'system' -%}" - "{%- if 'text' in messages[0].content -%}" - "{{- messages[0].content.text -}}" - "{%- else -%}" - "{{- 'You are a helpful assistant.' 
-}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- if tools -%}" - "{{- '\n\n' -}}" - "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' -}}" - "{%- for tool in tools -%}" - "{{- '\n' -}}" - "{{- tool | tojson -}}" - "{%- endfor -%}" - "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" - "{%- endif -%}" - "{{- '<|im_end|>\n' -}}" - "{%- set image_count = namespace(value=0) -%}" - #"{%- set video_count = namespace(value=0) -%}" - "{%- for message in messages -%}" - "{%- if message.role == 'tool' -%}" - "{{- '<|im_start|>user\n\n' -}}" - "{%- elif message.role != 'system' -%}" - "{{- '<|im_start|>' + message.role + '\n' -}}" - "{%- endif -%}" - "{%- if message.content is string and message.role != 'system' -%}" - "{{- message.content -}}" - "{%- elif message.role != 'system' -%}" - "{%- for content in message.content -%}" - "{%- if 'image_url' in content -%}" - "{%- set image_count.value = image_count.value + 1 -%}" - "{%- if add_vision_id -%}" - "{{- 'Picture ' -}}" - "{{- image_count.value | string -}}" - "{{- ': ' -}}" - "{%- endif -%}" - "{{- '<|vision_start|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|vision_end|>' -}}" - "{%- endif -%}" - # Video not supported yet - "{%- if 'text' in content -%}" - "{{- content.text -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- if message.role == 'assistant' -%}" - "{%- if message.tool_calls -%}" - "{%- for tool_call in message.tool_calls -%}" - "{%- if (loop.first and message.content) or (not loop.first) -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- if tool_call.function 
-%}" - "{%- set tool_call = tool_call.function -%}" - "{%- endif -%}" - "{{- '\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}" - "{%- if tool_call.arguments is string -%}" - "{{- tool_call.arguments -}}" - "{%- else -%}" - "{{- tool_call.arguments | tojson -}}" - "{%- endif -%}" - "{{- '}\n' -}}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- elif message.role == 'tool' -%}" - "{{- '' -}}" - "{%- endif -%}" - "{%- if message.role != 'system' -%}" - "{{- '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|im_start|>assistant\n' -}}" - "{%- if force_reasoning -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - force_reasoning: bool = False, - add_vision_id: bool = True, - **kwargs, - ): - """ - Parameters: - - force_reasoning (bool): - - True: Force the reasoning in the model by adding to the chat template. - - False (default): Don't force the reasoning. - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - """ - super().__init__(**kwargs) - self.force_reasoning = force_reasoning - self.extra_template_arguments["force_reasoning"] = force_reasoning - self.extra_template_arguments["add_vision_id"] = add_vision_id - - def __call__(self, **kwargs): - kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Qwen35ChatHandler(MTMDChatHandler): - """ - Handler for Qwen3.5/Qwen3.6 models. 
- """ - CHAT_FORMAT = ( - "{%- set image_count = namespace(value=0) -%}" - "{%- set video_count = namespace(value=0) -%}" - "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}" - " {%- if content is string -%}" - " {{- content -}}" - " {%- elif content is iterable and content is not mapping -%}" - " {%- for item in content -%}" - " {%- if 'image_url' in item or item.type == 'image_url' -%}" - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain images.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set image_count.value = image_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Picture ' -}}" - " {{- image_count.value | string -}}" - " {{- ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {%- if item.image_url is string -%}" - " {{- item.image_url -}}" - " {%- else -%}" - " {{- item.image_url.url -}}" - " {%- endif -%}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'video' in item -%}" - " {{- raise_exception('llama.cpp does not currently support video.') -}}" # Video not supported, raise exception - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain videos.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set video_count.value = video_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Video ' ~ video_count.value ~ ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {{- item.video -}}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'text' in item -%}" - " {{- item.text -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected item type in content.') -}}" - " {%- endif -%}" - " {%- endfor -%}" - " {%- elif content is none or content is undefined -%}" - " {{- '' -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected content type.') -}}" - " {%- endif -%}" - "{%- endmacro -%}" - "{%- if not messages -%}" - " {{- raise_exception('No messages provided.') 
-}}" - "{%- endif -%}" - "{%- if tools and tools is iterable and tools is not mapping -%}" - " {{- '<|im_start|>system\n' -}}" - " {{- '# Tools\n\nYou have access to the following functions:\n\n' -}}" - " {%- for tool in tools -%}" - " {{- '\n' -}}" - " {{- tool | tojson -}}" - " {%- endfor -%}" - " {{- '\n' -}}" - " {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' -}}" - " {%- if messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) | trim -%}" - " {%- if content -%}" - " {{- '\n\n' + content -}}" - " {%- endif -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - "{%- elif messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) -%}" - " {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}" - "{%- for message in messages[::-1] -%}" - " {%- set index = messages | length - 1 - loop.index0 -%}" - " {%- if ns.multi_step_tool and message.role == 'user' -%}" - " {%- set content = render_content(message.content, false) | trim -%}" - " {%- if not (content.startswith('') and content.endswith('')) -%}" - " {%- set ns.multi_step_tool = false -%}" - " {%- set ns.last_query_index = index -%}" - " {%- endif -%}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if ns.multi_step_tool -%}" - " {{- 
raise_exception('No user query found in messages.') -}}" - "{%- endif -%}" - "{%- for message in messages -%}" - " {%- set content = render_content(message.content, true) | trim -%}" - " {%- if message.role == 'system' -%}" - " {%- if not loop.first -%}" - " {{- raise_exception('System message must be at the beginning.') -}}" - " {%- endif -%}" - " {%- elif message.role == 'user' -%}" - " {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}" - " {%- elif message.role == 'assistant' -%}" - " {%- set reasoning_content = '' -%}" - " {%- if message.reasoning_content is string -%}" - " {%- set reasoning_content = message.reasoning_content -%}" - " {%- elif '' in content -%}" - " {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') -%}" - " {%- set content = content.split('')[-1].lstrip('\n') -%}" - " {%- endif -%}" - " {%- set reasoning_content = reasoning_content | trim -%}" - " {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}" - " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" - " {%- else -%}" - " {{- '<|im_start|>' + message.role + '\n' + content -}}" - " {%- endif -%}" - " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" - " {%- for tool_call in message.tool_calls -%}" - " {%- if tool_call.function is defined -%}" - " {%- set tool_call = tool_call.function -%}" - " {%- endif -%}" - " {%- if loop.first -%}" - " {%- if content | trim -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- '\n\n\n' -}}" - " {%- endif -%}" - " {%- if tool_call.arguments is defined -%}" - " {%- for (args_name, args_value) in tool_call.arguments | items -%}" - " {{- '\n' -}}" - " {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}" - " {{- args_value -}}" - " {{- '\n' -}}" - 
" {%- endfor -%}" - " {%- endif -%}" - " {{- '\n' -}}" - " {%- endfor -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif message.role == 'tool' -%}" - " {%- if loop.previtem and loop.previtem.role != 'tool' -%}" - " {{- '<|im_start|>user' -}}" - " {%- endif -%}" - " {{- '\n\n' -}}" - " {{- content -}}" - " {{- '\n' -}}" - " {%- if not loop.last and loop.nextitem.role != 'tool' -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif loop.last -%}" - " {{- '<|im_end|>\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- raise_exception('Unexpected message role.') -}}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - " {{- '<|im_start|>assistant\n' -}}" - " {%- if enable_thinking is defined and enable_thinking is false -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n' -}}" - " {%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - add_vision_id: bool = True, - enable_thinking: bool = True, - preserve_thinking: bool = False, - **kwargs, - ): - """ - Parameters: - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - - enable_thinking (bool): - - True (default): Enables reasoning for better results. - - False: Disables reasoning for faster results. - - preserve_thinking (bool): - - True: Keeps reasoning process for ALL historical conversational turns. - - False (default): Only keeps for the latest assistant reply to save tokens. 
- """ - super().__init__(**kwargs) - self.enable_thinking = enable_thinking - self.preserve_thinking = preserve_thinking - self.extra_template_arguments["add_vision_id"] = add_vision_id - self.extra_template_arguments["enable_thinking"] = enable_thinking - self.extra_template_arguments["preserve_thinking"] = preserve_thinking - - def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class Step3VLChatHandler(MTMDChatHandler): - """ - Handler for Step3-VL models. - """ - - STEP3VL_BOS_TOKEN = "<|im_start|>" - STEP3VL_EOS_TOKEN = "<|im_end|>" - STEP3VL_PAD_TOKEN = "<|endoftext|>" - STEP3VL_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro render_content(content) -%}\n" - " {%- if content is none -%}{{- '' -}}\n" - " {%- elif content is string -%}{{- content -}}\n" - " {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n" - " {%- elif content is iterable -%}\n" - " {%- for item in content -%}\n" - " {%- if item.type == 'text' -%}\n" - " {{- item['value'] if 'value' in item else item['text'] -}}\n" - " {%- elif item.type in ['image', 'image_url'] -%}\n" - " {%- set url_val = '' -%}\n" - " {%- if item.image_url -%}\n" - " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" - " {%- endif -%}\n" - " {{- '' + url_val -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "\n" - "{%- if tools -%}\n" - " {{- '<|im_start|>system\\n' -}}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- render_content(messages[0].content) + '\\n\\n' -}}\n" - " {%- endif -%}\n" - " {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou 
are provided with function signatures within XML tags:\\n' -}}\n" - " {%- for tool in tools -%}\n" - " {{- '\\n' -}}\n" - " {{- tool | tojson -}}\n" - " {%- endfor -%}\n" - " {{- '\\n\\n\\nAlways adhere to this exact format for tool use:\\n\\n\\n{\"name\": , \"arguments\": }\\n\\n{additional_tool_calls}\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within XML tags.\\n- `` must be an exact match to one of the available tools.\\n- `` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n" - "{%- else -%}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - "\n" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n" - "{%- for message in messages[::-1] -%}\n" - " {%- set index = (messages|length - 1) - loop.index0 -%}\n" - " {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) -%}\n" - " {%- set ns.multi_step_tool = false -%}\n" - " {%- set ns.last_query_index = index -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- for message in messages -%}\n" - " {%- set content = render_content(message.content) -%}\n" - " {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n" - " {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n" - " {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n" - " {%- elif message.role == 'assistant' -%}\n" - " {%- if message.reasoning_content is string -%}\n" - " {%- set reasoning_content = render_content(message.reasoning_content) -%}\n" - " {%- else -%}\n" - " {%- if '' in content -%}\n" - 
" {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') -%}\n" - " {%- set content = content.split('')[-1].lstrip('\\n') -%}\n" - " {%- else -%}\n" - " {%- set reasoning_content = '' -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if loop.index0 > ns.last_query_index -%}\n" - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n' + content -}}\n" - " {%- else -%}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content -}}\n" - " {%- endif -%}\n" - " {%- if message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- for tool_call in message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- if tool_call.function -%}\n" - " {%- set tool_call = tool_call.function -%}\n" - " {%- endif -%}\n" - " {{- '\\n{\"name\": \"' -}}\n" - " {{- tool_call.name -}}\n" - " {{- '\", \"arguments\": ' -}}\n" - " {%- if tool_call.arguments is string -%}\n" - " {{- tool_call.arguments -}}\n" - " {%- else -%}\n" - " {{- tool_call.arguments | tojson -}}\n" - " {%- endif -%}\n" - " {{- '}\\n' -}}\n" - " {%- endfor -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- elif message.role == 'tool' -%}\n" - " {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n" - " {{- '<|im_start|>tool_response' -}}\n" - " {%- endif -%}\n" - " {{- '\\n\\n' -}}\n" - " {{- content -}}\n" - " {{- '\\n' -}}\n" - " {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n\\n\\n\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Step3-VL Handler. - - Args: - enable_thinking (bool): If False, injects an empty block to bypass reasoning. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Pass thinking toggle into Jinja - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Step3 uses standard <|im_end|> ChatML stop formatting - kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -@register_chat_completion_handler("chatml-function-calling") -def chatml_function_calling( - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logprobs: 
Optional[bool] = None, - top_logprobs: Optional[int] = None, - **kwargs, # type: ignore -) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], -]: - function_calling_template = ( - "{% for message in messages %}" - "<|im_start|>{{ message.role }}\n" - # System message - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% if tool_calls %}" - "\n\nYou have access to the following functions:\n" - "{% for tool in tools %}" - "\nfunctions.{{ tool.function.name }}:\n" - "{{ tool.function.parameters | tojson }}" - "\n{% endfor %}" - "\n\nYou can respond to users messages with either a single message or one or more function calls." - "\n\nTo respond with a message begin the message with 'message:', use the following format:" - "\n\nmessage:" - "\n" - "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" - "\n\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "{% endif %}" - "<|im_end|>\n" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - ## Reglar message - "{% if message.content and message.content | length > 0 %}" - "{% if tool_calls %}" - "message:\n" - "{% endif %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - ## Function calls - "{% if 'tool_calls' in message %}" - "{% for tool_call in message.tool_calls %}" - "functions.{{ tool_call.function.name }}:\n" - "{{ tool_call.function.arguments }}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" - ) - template_renderer = ImmutableSandboxedEnvironment( - autoescape=jinja2.select_autoescape(["html", "xml"]), - undefined=jinja2.StrictUndefined, - 
).from_string(function_calling_template) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - stop = ( - [stop, "<|im_end|>"] - if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] - ) - - # Case 1: No tool choice by user - if ( - tool_choice is None - or (isinstance(tool_choice, str) and tool_choice == "none") - or tools is None - or len(tools) == 0 - ): - prompt = template_renderer.render( - messages=messages, - tools=[], - tool_calls=None, - add_generation_prompt=True, - ) - - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - return _convert_completion_to_chat( - llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - 
grammar=grammar, - logprobs=top_logprobs if logprobs else None, - ), - stream=stream, - ) - - # Case 2: Tool choice by user - if isinstance(tool_choice, dict): - tool_name = tool_choice["function"]["name"] - tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None - ) - if tool is None: - raise ValueError(f"Tool with name '{tool_name}' not found in tools") - prompt = template_renderer.render( - messages=messages, - tools=tools, - tool_calls=True, - add_generation_prompt=True, - ) - prompt += f"functions.{tool_name}:\n" + prompt += f"functions.{tool_name}:\n" try: grammar = llama_grammar.LlamaGrammar.from_json_schema( json.dumps(tool["function"]["parameters"]), verbose=llama.verbose @@ -6956,3 +3539,35 @@ def chatml_function_calling( } raise ValueError("Automatic streaming tool choice is not supported") + +# Backward compatibility re-exports. +# These multimodal chat handlers have been moved to `llama_multimodal`. +# New code should import them from `llama_cpp.llama_multimodal` instead of +# `llama_cpp.llama_chat_format`. 
+from llama_cpp.llama_multimodal import ( + MTMDChatHandler, + GenericMTMDChatHandler, + Llava15ChatHandler, + ObsidianChatHandler, + MoondreamChatHandler, + Llava16ChatHandler, + NanoLlavaChatHandler, + Llama3VisionAlphaChatHandler, + Llama3VisionAlpha, + MiniCPMv26ChatHandler, + MiniCPMv45ChatHandler, + MiniCPMV46ChatHandler, + Gemma3ChatHandler, + Gemma4ChatHandler, + GLM41VChatHandler, + GLM46VChatHandler, + GraniteDoclingChatHandler, + LFM2VLChatHandler, + LFM25VLChatHandler, + PaddleOCRChatHandler, + Qwen25VLChatHandler, + Qwen3ASRChatHandler, + Qwen3VLChatHandler, + Qwen35ChatHandler, + Step3VLChatHandler +) diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py new file mode 100644 index 0000000000..a055869543 --- /dev/null +++ b/llama_cpp/llama_multimodal.py @@ -0,0 +1,3473 @@ +from __future__ import annotations + +import base64 +import ctypes +import json +import os +import sys +import zlib + +from contextlib import ExitStack +from typing import ( + Any, + Dict, + Iterator, + List, + Literal, + Optional, + Tuple, + Union, + Protocol, + TYPE_CHECKING, + cast, +) + +import urllib.request +from urllib.error import URLError, HTTPError + +import llama_cpp.llama_cpp as llama_cpp_lib +import llama_cpp.llama_types as llama_types +import llama_cpp.llama_grammar as llama_grammar + +if TYPE_CHECKING: + import llama_cpp.llama as llama_core + +from ._logger import ggml_log_callback + +from llama_cpp.llama_chat_format import ( + _convert_completion_to_chat, + _convert_completion_to_chat_function, + _grammar_for_response_format, + ImmutableSandboxedEnvironment +) + +class MTMDChatHandler: + DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( +"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " +"while also answering every question accurately, clearly, and step-by-step when appropriate — always 
responding in the same language as the user's question, remaining polite, professional, and maximally helpful." + ) + + CHAT_FORMAT = ( + "{{ bos_token if bos_token is defined else '' }}" + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% elif message.role == 'user' %}" + "USER: " + "{% if message.content is string %}" + "{{ message.content }}" + "{% elif message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url if content.image_url is string else content.image_url.url }}" + "{% elif content.type == 'audio_url' %}" + "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" + "{% elif content.type == 'input_audio' %}" + "{% if content.input_audio is string %}" + "{{ content.input_audio }}" + "{% else %}" + "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" + "{% endif %}" + "{% elif content.type == 'video_url' %}" + "{{ content.video_url if content.video_url is string else content.video_url.url }}" + "{% elif content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + + "{% elif message.role == 'assistant' and message.content is not none %}" + "ASSISTANT: {{ message.content }}" + "{% endif %}" + "{{ \"\n\" }}" + "{% endfor %}" + + "{% if eos_token is defined %}" + "{{ eos_token }}" + "{% endif %}" + + "{% if add_generation_prompt %}" + "ASSISTANT: " + "{% endif %}" + ) + + def __init__( + self, + mmproj_path: Optional[str] = None, + verbose: bool = True, + use_gpu: bool = True, + image_min_tokens: int = -1, + image_max_tokens: int = -1, + chat_template_override: Optional[str] = None, + batch_max_tokens: int = 1024, + **kwargs + ): + + self.log_prefix = self.__class__.__name__ + self.verbose = verbose + + # Backward compatibility: `clip_model_path` was the old name for `mmproj_path`. 
+ # Accept it for existing user code, warn during initialization, and normalize + # all internal usage to `mmproj_path`. + clip_model_path = kwargs.pop("clip_model_path", None) + if mmproj_path is None and clip_model_path is not None: + mmproj_path = clip_model_path + if self.verbose: + print( + f"{self.log_prefix}(__init__): `clip_model_path` is deprecated; " + "please use `mmproj_path` instead.", + file=sys.stderr, + ) + + if kwargs: + unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) + raise TypeError( + f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" + f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." + ) + + if mmproj_path is None: + raise ValueError( + f"{self.log_prefix}(__init__): `mmproj_path` is required. " + "`clip_model_path` is accepted only as a deprecated compatibility alias." + ) + + self.mmproj_path = mmproj_path + if not os.path.exists(self.mmproj_path): + raise ValueError( + f"{self.log_prefix}(__init__): mmproj path does not exist: {self.mmproj_path}" + ) + + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + self.batch_max_tokens = batch_max_tokens + self.use_gpu = use_gpu + + import llama_cpp.mtmd_cpp as mtmd_cpp + self._mtmd_cpp = mtmd_cpp + self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None + self.extra_template_arguments: dict[str, Any] = {} + + self.is_support_vision = False + self.is_support_audio = False + self.is_support_video = False + + # Pre-compile Jinja template + if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: + self.chat_format = self.CHAT_FORMAT + elif chat_template_override is not None: + self.chat_format = chat_template_override + + self._chat_format_parser_tags = [] + self._change_chat_template(self.chat_format) + + self._exit_stack = ExitStack() + + def _change_chat_template(self, new_template: str): + 
self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True + ).from_string(new_template) + + def _init_mtmd_context(self, llama_model: llama_core.Llama): + """Initialize mtmd context with the llama model.""" + if self.mtmd_ctx is not None: + return # Already initialized + + self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) + + # Get default parameters + self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() + self.mctx_params.use_gpu = self.use_gpu + self.mctx_params.print_timings = self.verbose + self.mctx_params.n_threads = llama_model.n_threads + self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + self.mctx_params.warmup = True + if self.image_min_tokens > 0: + self.mctx_params.image_min_tokens = self.image_min_tokens + if self.image_max_tokens > 0: + self.mctx_params.image_max_tokens = self.image_max_tokens + if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! 
image_max_tokens ({self.image_max_tokens}) " + f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + self.mctx_params.batch_max_tokens = self.batch_max_tokens + + # Cache the model's eos token and bos token + self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') + self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') + + # Cache the mtmd_default_marker + self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + + # Initialize mtmd context + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.mmproj_path.encode(), + llama_model.model, + self.mctx_params + ) + + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") + + # Check if vision is supported + self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) + if self.is_support_vision: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if audio is supported + self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) + if self.is_support_audio: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if video is supported + self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) + if self.is_support_video: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT 
available in this build.", file=sys.stderr) + + def close(self) -> None: + """Explicitly free the mtmd context and vision model resources.""" + if getattr(self, "mtmd_ctx", None) is not None: + try: + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + except Exception: + pass + self.mtmd_ctx = None + self.mctx_params = None + self.chat_template = None + + if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): + self._exit_stack.close() + self._exit_stack = None + + def __del__(self) -> None: + self.close() + + def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]: + """ + Extracts all media payloads (images, audio) sequentially to maintain exact chronological order. + Strictly enforces capability checks, raising exceptions if unsupported media is passed. + + Returns: + media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio). + """ + media_items: List[Dict[str, str]] = [] + for message in messages: + if isinstance(message.get("content"), list): + for content in message["content"]: + content_type = content.get("type", "") + + # 1. Vision Processing + if content_type == "image_url": + if not self.is_support_vision: + raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.") + + url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"] + media_items.append({"url": url, "type": "image"}) + + # 2. 
Audio Processing + elif content_type in ["audio", "audio_url", "input_audio"]: + if not self.is_support_audio: + raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") + + # Case A: Handle custom/forward-compatible audio_url format + if content_type == "audio_url" or content_type == "audio": + audio_url = content[content_type] + url = audio_url if isinstance(audio_url, str) else audio_url["url"] + media_items.append({"url": url, "type": "audio"}) + # Case B: Handle OpenAI standard input_audio format + elif content_type == "input_audio": + input_audio = content.get("input_audio", {}) + if isinstance(input_audio, dict) and "data" in input_audio: + # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic + # input_audio: { + # data: audio.base64Data, + # format: audio.mimeType.includes('wav') ? 'wav' : 'mp3' + # } + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + + # Strictly align with llama.cpp (require wav/mp3) + if audio_format not in ["wav", "mp3"]: + raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'") + + # Format as a Data URI to reuse the unified load_media logic + media_items.append({ + "url": f"data:audio/{audio_format};base64,{audio_data}", + "type": "audio" + }) + else: + # Just a raw base64 data + url = input_audio if isinstance(input_audio, str) else "" + if url: + media_items.append({"url": url, "type": "audio"}) + + # 3. Video Processing + elif content_type == "video_url": + if not self.is_support_video: + raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.") + + video_url = content["video_url"] + url = video_url if isinstance(video_url, str) else video_url["url"] + media_items.append({"url": url, "type": "video"}) + + # 4. 
Text & Unknown Types + elif content_type == "text": + continue + else: + if self.verbose: + print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr) + return media_items + + def _create_bitmap_from_bytes(self, media_bytes: bytes): + """ + Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. + + Supported formats: + - Images (via stb_image): jpg, png, bmp, etc. + - Audio (via miniaudio): wav, mp3, flac. + - Video: depends on whether MTMD_VIDEO was enabled at build time. + + Note: + - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. + - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. + + Args: + media_bytes (bytes): The raw byte content of the media file. + + Returns: + bitmap: mtmd_bitmap * + video_ctx: mtmd_helper_video * or NULL + """ + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") + + if not media_bytes: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + + buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + + wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + self.mtmd_ctx, + buf, + len(media_bytes), + False, + ) + + if not wrapper.bitmap: + if wrapper.video_ctx: + self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) + + raise ValueError( + f"{self.log_prefix}(_create_bitmap_from_bytes): " + "Failed to load media from bytes " + "(unsupported media format, corrupted data, or missing helper support)." 
+ ) + + return wrapper.bitmap, wrapper.video_ctx + + def _is_text_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD text chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT + ) + + def _is_image_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD image chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE + ) + + def _is_audio_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD audio chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + ) + + def _process_mtmd_prompt( + self, + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + add_generation_prompt: bool = True, + ) -> Tuple[List[int], List[tuple], Any, List[Any]]: + """ + Core multimodal preprocessing pipeline. + Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger. + + Features: + - Thread-safe concurrent media decoding to eliminate I/O bottlenecks. + - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens. + - Strict RAII-style C++ memory management to prevent leaks on failure. + + Returns: + full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching. + chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id). + chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller). + bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation. + """ + # 1. 
Inject default system prompt if omitted by the user + system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "") + if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: + messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages + + media_items = self._get_media_items(messages) + media_marker = self.media_marker + + # 2. Render the chat template and replace actual URLs with C++ media markers + text = self.chat_template.render( + messages=messages, + add_generation_prompt=add_generation_prompt, + eos_token=self.mtmd_eos_token, + bos_token=self.mtmd_bos_token, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, + **getattr(self, 'extra_template_arguments', {}) + ) + + for tag in self._chat_format_parser_tags: + if tag not in text: + continue + + text = text.replace(tag, media_marker) + + # Replace image_url by media_marker in text + for item in media_items: + text = text.replace(item["url"], media_marker) + + if self.verbose: + print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr) + print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) + + # 3. Pre-allocate bitmap array to guarantee chronological order during concurrent decoding + bitmaps = [None] * len(media_items) + bitmap_cleanup = [] + video_cleanup = [] + chunks = None + + try: + # Concurrent Media Decoding + import concurrent.futures + if media_items: + def _create_bitmap_func(idx: int, item: dict): + media_bytes = self.load_media(item["url"], item["type"]) + bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes) + return idx, bitmap, video_ctx + # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, + # which can be used in the future to process large numbers of video frames. 
+ max_workers = min(llama.n_threads, len(media_items)) + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] + + for future in concurrent.futures.as_completed(futures): + idx, bitmap, video_ctx = future.result() + + bitmaps[idx] = bitmap + bitmap_cleanup.append(bitmap) + + if video_ctx: + video_cleanup.append(video_ctx) + + # Strict validation: Abort if any thread failed to decode its assigned media + if any(b is None for b in bitmaps): + raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") + else: + if self.verbose: + print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.") + else: + # If there are no images, set the bitmaps to empty. + bitmaps = [] + + # 4. Initialize mtmd_input_chunks + input_text = self._mtmd_cpp.mtmd_input_text() + input_text.text = text.encode('utf-8') + input_text.add_special = (llama.n_tokens == 0) + input_text.parse_special = True + + chunks = self._mtmd_cpp.mtmd_input_chunks_init() + if chunks is None: + raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.") + + # 5. Hybrid Tokenization (Text + Media binding) + if len(bitmaps) > 0: + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps) + ) + else: + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0 + ) + + if result != 0: + raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") + + # Video helper contexts only need to stay alive until mtmd_tokenize() completes. 
+ if video_cleanup: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup.clear() + + # 6. Virtual Token Ledger Construction + full_prompt_ids = [] + chunk_token_spans = [] + current_idx = 0 + n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + + # Cursor to track the actual media contents (URLs or base64 data) provided by the user + media_items_count = len(media_items) + media_items_cur = 0 + last_media_id = None + + for i in range(n_chunks): + chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) + if chunk is None: continue + chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) + + if self._is_text_chunk(chunk_type): + # Extract standard text token IDs + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) + if tokens_ptr and n_tokens_out.value > 0: + tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) + full_prompt_ids.extend(tokens) + current_idx += len(tokens) + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): + # Extract media properties + # Note(JamePeng): + # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). + # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample. + # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) + + if media_items_cur < media_items_count: + # The C++ parser only sees identical placeholders (e.g., "<__media__>"). + # We MUST inject the actual media content's identity here. 
+ real_media_url = media_items[media_items_cur]["url"] + # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) + # Generate a deterministic, unique negative ID for this specific image/audio. + # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()). + # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with + # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). + # This empowers `longest_token_prefix` to correctly identify and reuse cached images, + # while instantly breaking the match if the image content changes. + # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 + media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 + last_media_id = media_id + media_items_cur += 1 + elif last_media_id is not None: + # video may expand into multiple image chunks from one media marker + media_id = last_media_id + else: + # Magic Negative Number as fallback :) + media_id = -314159 + + if self.verbose: + print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ") + + chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) + + # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache + full_prompt_ids.extend([media_id] * chunk_n_tokens) + current_idx += chunk_n_tokens + else: + raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.") + + return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup + + except Exception as e: + # Ensure no useless pointers remain upon any failure + # Free chunks + if chunks is not None: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + chunks = None + # Free bitmaps + if len(bitmap_cleanup) > 0: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_cleanup = None + # Free videos + if len(video_cleanup) > 0: + for video_ctx in 
video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup = None + + bitmaps = None + + raise e + + def __call__( + self, + *, + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + seed: Optional[int] = None, + response_format: Optional[ + llama_types.ChatCompletionRequestResponseFormat + ] = None, + max_tokens: Optional[int] = None, + present_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_n_sigma: float = -1.00, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + xtc_threshold: float = 0.1, + xtc_probability: float = 0.0, + dry_multiplier: float = 0.0, + dry_base: float = 1.75, + dry_allowed_length: int = 2, + dry_penalty_last_n:int = 0, + dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_infill: bool = False, + model: Optional[str] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + add_generation_prompt: bool = True, + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, + **kwargs, # type: ignore + ) -> 
Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], + ]: + # 1. Initialize mtmd context + self._init_mtmd_context(llama) + assert self.mtmd_ctx is not None + + # 2. Concurrent Preprocessing & Ledger Construction + full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( + llama=llama, + messages=messages, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, + add_generation_prompt=add_generation_prompt, + ) + + if self.verbose: + print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) + + try: + # 3. KV Cache Synchronization & State Rollback + # Compares the virtual ledger with physical history to prevent Cache Poisoning. + current_history = llama.input_ids[:llama.n_tokens].tolist() + longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) + + if longest_prefix < llama.n_tokens: + if llama.is_hybrid and llama._hybrid_cache_mgr is not None: + if llama._hybrid_cache_mgr.max_checkpoints > 0: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " + f"Searching for nearest checkpoint...", file=sys.stderr) + + best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) + if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): + llama.n_tokens = best_ckpt.pos + if self.verbose: + print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. 
Clearing hybrid cache entirely.", file=sys.stderr) + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr) + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) + llama._ctx.memory_seq_rm(0, longest_prefix, -1) + llama.n_tokens = longest_prefix + + n_past = llama.n_tokens + + for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: + # Skip previously matched chunks + if end_idx <= n_past: + continue + + if self._is_text_chunk(chunk_type): + unprocessed_start = max(start_idx, n_past) - start_idx + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) + + if tokens_ptr and n_tokens_out.value > 0: + all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + tokens_to_eval = all_tokens[unprocessed_start:] + + if tokens_to_eval: + if self.verbose: + print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + # Text evaluation delegates shift and chunking to native llama.eval + llama.eval(tokens_to_eval) + n_past = llama.n_tokens + + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) + + if self.verbose: + media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" + print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + + # Stage 5: Multimodal Physical OOM Defense + if n_past + chunk_n_tokens > llama.n_ctx(): + if not 
llama._ctx.memory_can_shift(): + raise RuntimeError( + f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " + f"(n_pos_per_embd > 1 or incompatible M-RoPE). " + f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " + f"You MUST increase n_ctx to fit the dialogue." + ) + else: + # Safely discard oldest tokens while preserving system prompts + n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch + n_keep = min(llama.n_keep, n_past) + n_discard = min(n_discard, n_past - n_keep) + + if n_discard <= 0: + raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") + + if self.verbose: + print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) + + # Execute physical memory shift + llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) + llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) + + # Shift python virtual array to match + remaining_len = n_past - (n_keep + n_discard) + if remaining_len > 0: + llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] + + n_past -= n_discard + llama.n_tokens = n_past + + # Execute C++ Multimodal Black-box Extraction + new_n_past = llama_cpp_lib.llama_pos(0) + result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( + self.mtmd_ctx, + llama._ctx.ctx, + chunk_ptr, + llama_cpp_lib.llama_pos(n_past), + llama_cpp_lib.llama_seq_id(0), + llama.n_batch, + True, # logits_last = True, drastically saves computational overhead + ctypes.byref(new_n_past) + ) + + if result != 0: + raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.") + + # Update Ledger with "Negative Reverse Vocabulary" IDs + llama.input_ids[n_past : new_n_past.value] = media_id + n_past = new_n_past.value + llama.n_tokens = 
n_past + + # Extract the final, perfectly synchronized prompt sequence + prompt = llama.input_ids[: llama.n_tokens].tolist() + + # End-of-Turn Checkpoint + # Anchors the state ONLY after the entire multi-modal turn is processed + if ( + llama.is_hybrid + and llama._hybrid_cache_mgr is not None + and llama._hybrid_cache_mgr.max_checkpoints > 0 + ): + if self.verbose: + print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) + + llama._hybrid_cache_mgr.save_checkpoint( + current_pos=llama.n_tokens, + tokens=prompt, + seq_id=0 + ) + finally: + # Cleanup chunks + if chunks is not None: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + chunks = None + # Cleanup bitmaps + if bitmap_cleanup: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_cleanup.clear() + bitmap_array = None + + # Handle response format and tools (same as before) + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) + + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + tool = None + if ( + tool_choice is not None + and isinstance(tool_choice, dict) + and tools is not None + ): + name = tool_choice["function"]["name"] + tool = next((t for t in tools if t["function"]["name"] == name), None) + if tool is None: + raise ValueError(f"Tool choice '{name}' not found in tools.") + schema = tool["function"]["parameters"] + try: + # create 
grammar from json schema + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(schema), verbose=llama.verbose + ) + except Exception as e: + if llama.verbose: + print(str(e), file=sys.stderr) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + logprobs=top_logprobs if logprobs else None, + stream=stream, + stop=stop, + seed=seed, + max_tokens=max_tokens, + present_penalty=present_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + top_n_sigma=top_n_sigma, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + xtc_threshold=xtc_threshold, + xtc_probability=xtc_probability, + dry_multiplier=dry_multiplier, + dry_base=dry_base, + dry_allowed_length=dry_allowed_length, + dry_penalty_last_n=dry_penalty_last_n, + dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_infill=use_infill, + model=model, + logits_processor=logits_processor, + grammar=grammar, + logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, + ) + + if tool is not None: + tool_name = tool["function"]["name"] + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) + return _convert_completion_to_chat(completion_or_chunks, stream=stream) + + def load_media(self, media_url: str, media_type: str) -> bytes: + """ + Unified dispatcher for loading media payloads. + Routes the URL/URI to the specific image, audio, or video processor based on the media_type. 
+ """ + if media_type == "image": + return self._load_image(media_url) + + elif media_type == "audio": + audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio") + try: + self.detect_audio_format(audio_bytes) + except ValueError as e: + raise ValueError(f"{self.log_prefix}(load_media): {e}") + return audio_bytes + + elif media_type == "video": + return self._load_bytes(media_url, timeout=30, kind="video") + + else: + raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") + + @staticmethod + def detect_audio_format(audio_bytes: bytes) -> str: + """ + Pure utility function: Detects the audio format from magic bytes. + Strictly translated from llama.cpp's `is_audio_file` to ensure 100% compatibility + and avoid false positives (e.g., AVI files disguised as RIFF). + """ + length = len(audio_bytes) + + if length < 12: + raise ValueError("Audio data is corrupted or too small (less than 12 bytes).") + + # RIFF & WAVE magic bytes verification + is_wav = audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE" + + # ID3 metadata or MPEG sync word verification + is_mp3 = length >= 3 and ( + audio_bytes.startswith(b"ID3") or + (audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0) + ) + + # FLAC magic bytes verification + is_flac = audio_bytes.startswith(b"fLaC") + + if is_wav: + return "wav" + elif is_mp3: + return "mp3" + elif is_flac: + return "flac" + else: + raise ValueError( + "Unsupported audio format detected via magic bytes. " + "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." + ) + + DEFAULT_HTTP_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/148.0.0.0 Safari/537.36" + ), + } + + @staticmethod + def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes: + """ + Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL. + """ + media_bytes = b"" + + # 1. 
Handle data URI + if media_url.strip().startswith("data:"): + comma_pos = media_url.find(",") + if comma_pos == -1: + raise ValueError("Invalid data URI: missing comma separator") + + base64_data = media_url[comma_pos + 1:] + media_bytes = base64.b64decode(base64_data) + + # 2. Handle local file path + elif os.path.exists(media_url): + with open(media_url, "rb") as f: + media_bytes = f.read() + + # 3. Handle remote URL via HTTP/HTTPS + else: + req = urllib.request.Request( + media_url, + headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, + ) + try: + with urllib.request.urlopen(req, timeout=timeout) as f: + media_bytes = f.read() + except (URLError, HTTPError) as e: + raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") + + if not media_bytes: + raise ValueError(f"Empty {kind} data received") + + return media_bytes + + @staticmethod + def _load_image(image_url: str) -> bytes: + """ + Load an image from either a URL or a data URI and return it as JPEG bytes. + + Supports: + - Remote images via HTTP/HTTPS (with proper User-Agent) + - Data URIs (base64-encoded, e.g., data:image/png;base64,...) + - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background + - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html + + Returns: + JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. + """ + # 1. Load image bytes from image_url + image_bytes = MTMDChatHandler._load_bytes( + image_url, + timeout=15, + kind="image", + ) + + # 2. Check if image_bytes is empty. + if not image_bytes: + raise ValueError("Empty image data received") + + # 3. Open image with Pillow + try: + from PIL import Image, ImageStat + except ImportError: + raise ImportError("Pillow is required for image processing. Install with: pip install pillow") + + import io + image = Image.open(io.BytesIO(image_bytes)) + + # 4. Handle transparency (RGBA, LA, P with transparency, etc.) 
+ if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info): + # Use alpha channel as mask + if image.mode == "P": + image = image.convert("RGBA") + + alpha = image.split()[-1] # Last channel is alpha + # Compute average brightness of visible (non-transparent) pixels + stat = ImageStat.Stat(image.convert("L"), mask=alpha) + + # Choose background: white for dark content, black for bright content + bg_color = (255, 255, 255) # white + if stat.count[0] > 0 and stat.mean[0] > 127: + bg_color = (0, 0, 0) # black + + background = Image.new("RGB", image.size, bg_color) + background.paste(image, mask=alpha) + image = background + + # 5. Ensure RGB mode for formats like CMYK, palette, etc. + elif image.mode != "RGB": + image = image.convert("RGB") + + # 6. Save as high-quality JPEG, suitable for most vision models. + output = io.BytesIO() + image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) + return output.getvalue() + + @classmethod + def from_pretrained( + cls, + repo_id: str, + filename: Optional[str], + local_dir: Optional[Union[str, os.PathLike[str]]] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + cache_dir: Optional[Union[str, os.PathLike[str]]] = None, + **kwargs: Any, + ) -> "MTMDChatHandler": + import fnmatch + from pathlib import Path + + try: + from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore + from huggingface_hub.utils import validate_repo_id # type: ignore + except ImportError: + raise ImportError( + "Llama.from_pretrained requires the huggingface_hub package. " + "You can install it with `pip install --upgrade huggingface_hub`." 
+ ) + + validate_repo_id(repo_id) + + hffs = HfFileSystem() + + files = [ + file["name"] if isinstance(file, dict) else file + for file in hffs.ls(repo_id) # type: ignore + ] + + # split each file into repo_id, subfolder, filename + file_list: List[str] = [] + for file in files: + rel_path = Path(file).relative_to(repo_id) + file_list.append(str(rel_path)) + + matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore + + if len(matching_files) == 0: + raise ValueError( + f"No file found in {repo_id} that match {filename}\n\n" + f"Available Files:\n{json.dumps(file_list)}" + ) + + if len(matching_files) > 1: + raise ValueError( + f"Multiple files found in {repo_id} matching {filename}\n\n" + f"Available Files:\n{json.dumps(files)}" + ) + + (matching_file,) = matching_files + + subfolder = str(Path(matching_file).parent) + filename = Path(matching_file).name + + # download the file + hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + local_dir=cast(Union[str, Path, None], local_dir), + local_dir_use_symlinks=local_dir_use_symlinks, + cache_dir=cast(Union[str, Path, None], cache_dir), + ) + + if local_dir is None: + model_path = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + local_dir=local_dir, + local_dir_use_symlinks=local_dir_use_symlinks, + cache_dir=cast(Union[str, Path, None], cache_dir), + local_files_only=True, + ) + else: + model_path = os.path.join(local_dir, filename) + + return cls( + mmproj_path=model_path, + **kwargs, + ) + +# Experiments are not recommended for this purpose at this time. 
+class GenericMTMDChatHandler(MTMDChatHandler): + KNOWN_MEDIA_TAGS = [ + "<|image_pad|>", + "<|audio_pad|>", + "<|video_pad|>", + "<|image|>", + "<|audio|>", + "<|video|>", + "[IMG]" + ] + + def __init__( + self, + chat_format: str, + mmproj_path: str, + verbose: bool = True, + **kwargs + ) -> None: + + self.chat_format = chat_format + if self.chat_format is None: + raise ValueError("Failed to get model chat template automatically.") + + self.verbose = verbose + if self.verbose: + print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) + + super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs) + + def __call__(self, **kwargs): + self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + +class Llava15ChatHandler(MTMDChatHandler): + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% endif %}" + + "{% if message.role == 'user' %}" + "{% if message.content is string %}" + "\nUSER: {{ message.content }}" + "{% elif message.content is iterable %}" + "\nUSER: " + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url if content.image_url is string else content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "{% endif %}" + + "{% if message.role == 'assistant' and message.content is not none %}" + "\nASSISTANT: {{ message.content }}" + "{% endif %}" + "{% endfor %}" + + "{% if add_generation_prompt %}" + "\nASSISTANT: " + "{% endif %}" + ) + + +class ObsidianChatHandler(MTMDChatHandler): + # Prompt Format + # The model followed ChatML format. 
However, with ### as the seperator + + # <|im_start|>user + # What is this sign about?\n + # ### + # <|im_start|>assistant + # The sign is about bullying, and it is placed on a black background with a red background. + # ### + + CHAT_FORMAT = ( + "{% for message in messages %}" + # System message + "{% if message.role == 'system' %}" + "<|im_start|>system\n" + "{{ message.content }}\n" + "###\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "<|im_start|>user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "###\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + "<|im_start|>assistant\n" + "{{ message.content }}" + "###\n" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) + + +class MoondreamChatHandler(MTMDChatHandler): + # Chat Format: + # f"\n\n{chat_history}Question: {question}\n\nAnswer:" + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'user' %}" + "{% if message.content is iterable %}" + # + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}\n\n" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}\n\n" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + # Question: + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + 
"Question: {{ content.text }}\n\n" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + # Question: + "{% if message.content is string %}" + "Question: {{ message.content }}\n\n" + "{% endif %}" + "{% endif %}" + # Answer: + "{% if message.role == 'assistant' %}" + "Answer:{{ message.content }}\n\n" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "Answer:" + "{% endif %}" + ) + + +class Llava16ChatHandler(MTMDChatHandler): + # Example prompt + # "DEFAULT_SYSTEM_MESSAGE + USER: \nWhat is shown in this image? ASSISTANT:" + + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.role == 'user' %}" + "{% if message.content is iterable %}" + # + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}\n" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}\n" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + # Question: + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + # Question: + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% endif %}" + # Answer: + "{% if message.role == 'assistant' %}" + "{{ message.content }}" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "Answer:" + "{% endif %}" + ) + + +class NanoLlavaChatHandler(MTMDChatHandler): + # Prompt Format + # The model follow the ChatML standard, however, without \n at the end of <|im_end|>: + + # <|im_start|>system + # Answer the question<|im_end|><|im_start|>user + # + # What is the picture about?<|im_end|><|im_start|>assistant + DEFAULT_SYSTEM_MESSAGE = "Answer the question" + + CHAT_FORMAT = ( + "{% for message in messages %}" + # System message + "{% if message.role == 
'system' %}" + "<|im_start|>system\n" + "{{ message.content }}" + "<|im_end|>" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "<|im_start|>user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "<|im_end|>" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + "<|im_start|>assistant\n" + "{{ message.content }}" + "<|im_end|>" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) + + +class Llama3VisionAlphaChatHandler(MTMDChatHandler): + # question = "" + q + + # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + + CHAT_FORMAT = ( + "{% for message in messages %}" + "<|start_header_id|>" + "{% if message.role == 'user' %}" + "user<|end_header_id|>\n\n" + "{% if message.content is iterable %}" + # + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + # Question: + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + # Question: + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% 
endif %}" + # Answer: + "{% if message.role == 'assistant' %}" + "assistant<|end_header_id|>\n\n" + "{{ message.content }}" + "{% endif %}" + "<|eot_id|>" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + "{% endif %}" + ) + + +# alias +Llama3VisionAlpha = Llama3VisionAlphaChatHandler + + +class MiniCPMv26ChatHandler(MTMDChatHandler): + + CHAT_FORMAT = ( + "{% set image_count = namespace(value=0) %}" + "{% for message in messages %}" + "{% if loop.first and messages[0]['role'] != 'system' %}" + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "{% endif %}" + "<|im_start|>{{ message['role'] }}\n" + "{% if message['content'] is iterable %}" + "{% for content in message['content'] %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{% set image_count.value = image_count.value + 1 %}" + "{{ image_count.value }}: {{ content.image_url }}" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{% set image_count.value = image_count.value + 1 %}" + "{{ image_count.value }}: {{ content.image_url.url }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + + "{% for content in message['content'] %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "{% if message['content'] is string %}" + "{{ message['content'] }}" + "{% endif %}" + "<|im_end|>\n" + "{% endfor %}" + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) + + +class MiniCPMv45ChatHandler(MTMDChatHandler): + """ + Handler for MiniCPM-V 4.5 models. + + Supports: + - Multi-step tool calls with and XML tags. + - Integrated reasoning (thinking) process with tags. + - Specialized system prompt handling with tool definitions. + - Global image numbering for multi-image processing. 
+ """ + + # Model specific control tokens + MINICPMV_BOS_TOKEN = "<|im_start|>" + MINICPMV_EOS_TOKEN = "<|im_end|>" + MINICPMV_PAD_TOKEN = "<|endoftext|>" + + # Image placeholder tags + MINICPMV_IMAGE_START_TOKEN = "" + MINICPMV_IMAGE_END_TOKEN = "" + MINICPMV_IMAGE_ID_START_TOKEN = "" + MINICPMV_IMAGE_ID_END_TOKEN = "" + + CHAT_FORMAT = ( + # --- 1. First System Message & Tools Definitions --- + "{%- if tools %}" + "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}" + "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}" + "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}" + "{{- 'You are provided with function signatures within XML tags:\\n' }}" + "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}" + "{{- '\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\"name\": , \"arguments\": }\\n" + MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- elif messages[0].role == 'system' %}" + "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- endif %}" + + # --- 2. 
Message Stream Processing --- + "{% set image_count = namespace(value=0) %}" + "{%- for message in messages %}" + # --- Unified Role Handling (User, Assistant, and subsequent Systems) --- + "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}" + "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}" + + "{%- set content = message.content %}" + "{%- if content is not string %}" + "{%- set ns = namespace(content_str='') %}" + "{%- for item in content %}" + # --- Explicit image_url type and value checking --- + "{%- if item.type == 'image_url' %}" + "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}" + "{%- set image_count.value = image_count.value + 1 %}" + # Format: N: IMAGE_URL + "{%- set ns.content_str = ns.content_str + '' + (image_count.value | string) + ': ' + image_url + '' %}" + "{%- elif item.type == 'text' %}" + "{%- set ns.content_str = ns.content_str + item.text %}" + "{%- endif %}" + "{%- endfor %}" + "{%- set content = ns.content_str %}" + "{%- endif %}" + + "{{- content -}}" + + # Append tool_calls to assistant messages if they exist + "{%- if message.role == 'assistant' and message.tool_calls %}" + "{%- for tool_call in message.tool_calls %}" + "{%- set tc = tool_call.function if tool_call.function else tool_call %}" + "{{- '\\n\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}" + "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}" + "{{- '}\\n' }}" + "{%- endfor %}" + "{%- endif %}" + "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" + + # --- Specialized Tool Response Handling --- + # Group consecutive tool responses under a single user-like block + "{%- elif message.role == 'tool' %}" + "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}" + "{{- '" + MINICPMV_BOS_TOKEN + "user' }}" + "{%- endif %}" + "{{- '\\n\\n' + message.content + '\\n' }}" + "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}" + "{{- '" + 
MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + + # --- 3. Generation Prompt --- + "{%- if add_generation_prompt %}" + "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}" + # Handle thinking/reasoning block visibility based on configuration + "{%- if enable_thinking is defined and enable_thinking is false %}" + "{{- '\\n\\n\\n\\n' }}" + "{%- elif enable_thinking is defined and enable_thinking is true %}" + "{{- '\\n' }}" + "{%- endif %}" + "{%- endif %}" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the MiniCPM-V 4.5 Handler. + + Args: + enable_thinking (bool): If True, model generates reasoning before the final answer. + **kwargs: Additional arguments for the base MTMDChatHandler. + """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject thinking control flag into the template + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Set stop token patch + kwargs['stop'] = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + return super().__call__(**kwargs) + + +class MiniCPMV46ChatHandler(MTMDChatHandler): + """ + Handler for MiniCPM-V-4.6 models. + + Features: + - Aligned with official tokenizer_config.json special tokens. + - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens. + - Integrated MTMD-style URL and Base64 injection for visual content. + - Specialized `` and `` block generation. + - Autonomously folds previous reasoning paths using `last_query_index`. + - Toggles `` block generation via `enable_thinking` (Defaults to False). 
+ """ + + # Core tokens + MINICPM_BOS_TOKEN = "<|im_start|>" + MINICPM_EOS_TOKEN = "<|im_end|>" + MINICPM_PAD_TOKEN = "<|endoftext|>" + + # Vision tokens + MINICPM_VISION_BOS_TOKEN = "<|vision_start|>" + MINICPM_VISION_EOS_TOKEN = "<|vision_end|>" + MINICPM_IMAGE_TOKEN = "<|image_pad|>" + MINICPM_VIDEO_TOKEN = "<|video_pad|>" + + CHAT_FORMAT = ( + "{%- if enable_thinking is not defined -%}\n" + " {%- set enable_thinking = false -%}\n" + "{%- endif -%}\n" + "{%- macro render_content(content, is_system_content=false) -%}\n" + " {%- if content is string -%}\n" + " {{- content -}}\n" + " {%- elif content is iterable and content is not mapping -%}\n" + " {%- set ns = namespace(parts=[]) -%}\n" + " {%- for item in content -%}\n" + " {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n" + " {%- if is_system_content -%}\n" + " {{- raise_exception('System message cannot contain images.') -}}\n" + " {%- endif -%}\n" + " {%- set url_val = '' -%}\n" + " {%- if item.type == 'image_url' -%}\n" + " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" + " {%- endif -%}\n" + " {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n" + # " {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n" + # " {%- if is_system_content -%}\n" + # " {{- raise_exception('System message cannot contain videos.') -}}\n" + # " {%- endif -%}\n" + # " {%- set url_val = '' -%}\n" + # " {%- if item.type == 'video_url' -%}\n" + # " {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n" + # " {%- endif -%}\n" + # " {%- set ns.parts = ns.parts + ['<|video_pad|>' + url_val] -%}\n" + " {%- elif 'text' in item -%}\n" + " {%- set ns.parts = ns.parts + [item.text] -%}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected item type in content.') -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- ns.parts | join('\\n') -}}\n" + " {%- elif content is none or content is 
undefined -%}\n" + " {{- '' -}}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected content type.') -}}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "{%- if not messages %}\n" + " {{- raise_exception('No messages provided.') }}\n" + "{%- endif %}\n" + "{%- if tools and tools is iterable and tools is not mapping %}\n" + " {{- '<|im_start|>system\\n' }}\n" + " {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n' }}\n" + " {%- for tool in tools %}\n" + " {{- '\\n' }}\n" + " {{- tool | tojson }}\n" + " {%- endfor %}\n" + " {{- '\\n' }}\n" + " {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n\\n\\n\\nvalue_1\\n\\n\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n\\n\\n\\n\\n\\nReminder:\\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n' }}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {%- if content %}\n" + " {{- '\\n\\n' + content }}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {{- '<|im_end|>\\n' }}\n" + "{%- else %}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n" + " {%- endif %}\n" + "{%- endif %}\n" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n" + "{%- for message in messages[::-1] %}\n" + " {%- set index = (messages|length - 1) - loop.index0 %}\n" + " {%- if ns.multi_step_tool and message.role == 'user' %}\n" + " {%- set content = 
render_content(message.content)|trim %}\n" + " {%- if not(content.startswith('') and content.endswith('')) %}\n" + " {%- set ns.multi_step_tool = false %}\n" + " {%- set ns.last_query_index = index %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + "{%- endfor %}\n" + "{%- if ns.multi_step_tool %}\n" + " {{- raise_exception('No user query found in messages.') }}\n" + "{%- endif %}\n" + "{%- for message in messages %}\n" + " {%- set content = render_content(message.content)|trim %}\n" + " {%- if message.role == 'system' %}\n" + " {%- if not loop.first %}\n" + " {{- raise_exception('System message must be at the beginning.') }}\n" + " {%- endif %}\n" + " {%- elif message.role == 'user' %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n" + " {%- elif message.role == 'assistant' %}\n" + " {%- set reasoning_content = '' %}\n" + " {%- if message.reasoning_content is string %}\n" + " {%- set reasoning_content = message.reasoning_content %}\n" + " {%- else %}\n" + " {%- if '' in content %}\n" + " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n" + " {%- set content = content.split('')[-1].lstrip('\\n') %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {%- set reasoning_content = reasoning_content|trim %}\n" + " {%- if loop.index0 > ns.last_query_index %}\n" + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n\\n' + content }}\n" + " {%- else %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content }}\n" + " {%- endif %}\n" + " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n" + " {%- for tool_call in message.tool_calls %}\n" + " {%- if tool_call.function is defined %}\n" + " {%- set tool_call = tool_call.function %}\n" + " {%- endif %}\n" + " {%- if loop.first %}\n" + " {%- if content|trim %}\n" + " {{- '\\n\\n\\n\\n' }}\n" + " {%- else %}\n" + " {{- '\\n\\n' }}\n" + " {%- endif %}\n" + " {%- else %}\n" + " 
{{- '\\n\\n\\n' }}\n" + " {%- endif %}\n" + " {%- if tool_call.arguments is defined %}\n" + " {%- for args_name, args_value in tool_call.arguments|items %}\n" + " {{- '\\n' }}\n" + " {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n" + " {{- args_value }}\n" + " {{- '\\n\\n' }}\n" + " {%- endfor %}\n" + " {%- endif %}\n" + " {{- '\\n' }}\n" + " {%- endfor %}\n" + " {%- endif %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- elif message.role == 'tool' %}\n" + " {%- if loop.previtem and loop.previtem.role != 'tool' %}\n" + " {{- '<|im_start|>user' }}\n" + " {%- endif %}\n" + " {{- '\\n\\n' }}\n" + " {{- content }}\n" + " {{- '\\n' }}\n" + " {%- if not loop.last and loop.nextitem.role != 'tool' %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- elif loop.last %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- endif %}\n" + " {%- else %}\n" + " {{- raise_exception('Unexpected message role.') }}\n" + " {%- endif %}\n" + "{%- endfor %}\n" + "{%- if add_generation_prompt %}\n" + " {{- '<|im_start|>assistant\\n' }}\n" + " {%- if enable_thinking is defined and enable_thinking is false %}\n" + " {{- '\\n\\n\\n\\n' }}\n" + " {%- else %}\n" + " {{- '\\n' }}\n" + " {%- endif %}\n" + "{%- endif %}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the MiniCPM-V-4.6 Handler. + + Args: + enable_thinking (bool): Controls whether to open a `` block for reasoning. + Defaults to False as per the standard template logic. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject the thinking variable into the Jinja environment + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # MiniCPM uses standard <|im_end|> ChatML stop formatting + kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + +class Gemma3ChatHandler(MTMDChatHandler): + + GEMMA3_BOI_TOKEN = "" + GEMMA3_EOI_TOKEN = "" + GEMMA3_BOS_TOKEN = "" + GEMMA3_EOS_TOKEN = "" + + CHAT_FORMAT = ( + "{% if messages[0]['role'] == 'system' %}" + "{% set loop_messages = messages[1:] %}" + "{% if messages[0]['content'] is string %}" + "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}" + "{% else %}" + "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}" + "{% endif %}" + "{% else %}" + "{% set loop_messages = messages %}" + "{% set first_user_prefix = '' %}" + "{% endif %}" + + "{% for message in loop_messages %}" + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" + "{% endif %}" + + "{% if message['role'] == 'assistant' %}" + "{% set role = 'model' %}" + "{% else %}" + "{% set role = message['role'] %}" + "{% endif %}" + + "{{ '' + role + '\n' + (first_user_prefix if loop.first else '') }}" + + "{% if message['content'] is string %}" + "{{ message['content'] | trim }}" + "{% elif message['content'] is iterable %}" + "{% for item in message['content'] %}" + "{% if item['type'] == 'image_url' and item['image_url'] is string %}" + "{{ '' + item['image_url'] + '' }}" + "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}" + "{{ '' + item['image_url']['url'] + '' }}" + "{% elif item['type'] == 'text' %}" + "{{ item['text'] | 
trim }}" + "{% endif %}" + "{% endfor %}" + "{% else %}" + "{{ raise_exception('Invalid content type') }}" + "{% endif %}" + + "\n" + "{% endfor %}" + + "{% if add_generation_prompt %}" + "model\n" + "{% endif %}" + ) + + +class Gemma4ChatHandler(MTMDChatHandler): + """ + Handler for Gemma 4 models. + + Note on `enable_thinking`: + The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. + It is NOT supported by Gemma4 E2B and E4B models. + + [Important Note for Audio Processing!] + It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models. + Other quantizations are known to have degraded performance; + ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463 + """ + + # The special token in Gemma 4 + GEMMA4_BOI_TOKEN = "<|image>" + GEMMA4_EOI_TOKEN = "" + GEMMA4_BOA_TOKEN = "<|audio>" + GEMMA4_EOA_TOKEN = "" + GEMMA4_BOS_TOKEN = "" + GEMMA4_EOS_TOKEN = "" + GEMMA4_SOT_TOKEN = "<|turn>" + GEMMA4_EOT_TOKEN = "" + GEMMA4_SOC_TOKEN = "<|channel>" + GEMMA4_EOC_TOKEN = "" + GEMMA4_STC_TOKEN = "<|tool_call>" + GEMMA4_ETC_TOKEN = "" + GEMMA4_STD_TOKEN = "<|tool>" + GEMMA4_ETD_TOKEN = "" + GEMMA4_STR_TOKEN = "<|tool_response>" + GEMMA4_ETR_TOKEN = "" + + CHAT_FORMAT = ( + "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n" + " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" + " {%- set ns = namespace(found_first=false) -%}\n" + " {%- for key, value in properties | dictsort -%}\n" + " {%- set add_comma = false -%}\n" + " {%- if not filter_keys or key not in standard_keys -%}\n" + " {%- if ns.found_first %},{% endif -%}\n" + " {%- set ns.found_first = true -%}\n" + " {{ key }}:{\n" + " {%- if value['description'] -%}\n" + " description:<|\"|>{{ value['description'] }}<|\"|>\n" + " {%- set add_comma = true -%}\n" + " {%- endif -%}\n" + " {%- if value['type'] | upper == 'STRING' -%}\n" + " {%- if value['enum'] -%}\n" + " {%- if add_comma 
%},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " enum:{{ format_argument(value['enum']) }}\n" + " {%- endif -%}\n" + " {%- elif value['type'] | upper == 'ARRAY' -%}\n" + " {%- if value['items'] is mapping and value['items'] -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " items:{\n" + " {%- set ns_items = namespace(found_first=false) -%}\n" + " {%- for item_key, item_value in value['items'] | dictsort -%}\n" + " {%- if item_value is not none -%}\n" + " {%- if ns_items.found_first %},{% endif -%}\n" + " {%- set ns_items.found_first = true -%}\n" + " {%- if item_key == 'properties' -%}\n" + " properties:{\n" + " {%- if item_value is mapping -%}\n" + " {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n" + " {%- endif -%}\n" + " }\n" + " {%- elif item_key == 'required' -%}\n" + " required:[\n" + " {%- for req_item in item_value -%}\n" + " <|\"|>{{- req_item -}}<|\"|>\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " ]\n" + " {%- elif item_key == 'type' -%}\n" + " {%- if item_value is string -%}\n" + " type:{{ format_argument(item_value | upper) }}\n" + " {%- else -%}\n" + " type:{{ format_argument(item_value | map('upper') | list) }}\n" + " {%- endif -%}\n" + " {%- else -%}\n" + " {{ item_key }}:{{ format_argument(item_value) }}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " }\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if value['nullable'] %}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " nullable:true\n" + " {%- endif -%}\n" + " {%- if value['type'] | upper == 'OBJECT' -%}\n" + " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " properties:{\n" + " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" + " }\n" + " {%- elif value is mapping 
-%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " properties:{\n" + " {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n" + " }\n" + " {%- endif -%}\n" + " {%- if value['required'] -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " required:[\n" + " {%- for item in value['required'] | default([]) -%}\n" + " <|\"|>{{- item -}}<|\"|>\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " ]\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + "{%- endmacro -%}\n" + "{%- macro format_function_declaration(tool_data) -%}\n" + " declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n" + " {%- set params = tool_data['function']['parameters'] -%}\n" + " {%- if params -%}\n" + " ,parameters:{\n" + " {%- if params.get('properties') -%}\n" + " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" + " {%- endif -%}\n" + " {%- if params.get('required') -%}\n" + " required:[\n" + " {%- for item in params['required'] -%}\n" + " <|\"|>{{- item -}}<|\"|>\n" + " {{- ',' if not loop.last -}}\n" + " {%- endfor -%}\n" + " ],\n" + " {%- endif -%}\n" + " {%- if params.get('type') -%}\n" + " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if 'response' in tool_data['function'] -%}\n" + " {%- set response_declaration = tool_data['function']['response'] -%}\n" + " ,response:{\n" + " {%- if response_declaration['description'] -%}\n" + " description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n" + " {%- endif -%}\n" + " {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n" + " type:<|\"|>{{- 
response_declaration['type'] | upper -}}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " }\n" + "{%- endmacro -%}\n" + "{%- macro format_argument(argument, escape_keys=True) -%}\n" + " {%- if argument is string -%}\n" + " {{- '<|\"|>' + argument + '<|\"|>' -}}\n" + " {%- elif argument is boolean -%}\n" + " {{- 'true' if argument else 'false' -}}\n" + " {%- elif argument is mapping -%}\n" + " {{- '{' -}}\n" + " {%- set ns = namespace(found_first=false) -%}\n" + " {%- for key, value in argument | dictsort -%}\n" + " {%- if ns.found_first %},{% endif -%}\n" + " {%- set ns.found_first = true -%}\n" + " {%- if escape_keys -%}\n" + " {{- '<|\"|>' + key + '<|\"|>' -}}\n" + " {%- else -%}\n" + " {{- key -}}\n" + " {%- endif -%}\n" + " :{{- format_argument(value, escape_keys=escape_keys) -}}\n" + " {%- endfor -%}\n" + " {{- '}' -}}\n" + " {%- elif argument is sequence -%}\n" + " {{- '[' -}}\n" + " {%- for item in argument -%}\n" + " {{- format_argument(item, escape_keys=escape_keys) -}}\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " {{- ']' -}}\n" + " {%- else -%}\n" + " {{- argument -}}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "{%- macro strip_thinking(text) -%}\n" + " {%- set ns = namespace(result='') -%}\n" + " {%- for part in text.split('') -%}\n" + " {%- if '<|channel>' in part -%}\n" + " {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n" + " {%- else -%}\n" + " {%- set ns.result = ns.result + part -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- ns.result | trim -}}\n" + "{%- endmacro -%}\n" + "\n" + "{%- macro format_tool_response_block(tool_name, response) -%}\n" + " {{- '<|tool_response>' -}}\n" + " {%- if response is mapping -%}\n" + " {{- 'response:' + tool_name + '{' -}}\n" + " {%- for key, value in response | dictsort -%}\n" + " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " {{- '}' -}}\n" + " {%- else -%}\n" 
+ " {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n" + " {%- endif -%}\n" + " {{- '' -}}\n" + "{%- endmacro -%}\n" + "\n" + "{%- set ns = namespace(prev_message_type=None) -%}\n" + "{%- set loop_messages = messages -%}\n" + "{{- bos_token -}}\n" + "{#- Handle System/Tool Definitions Block -#}\n" + "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" + " {{- '<|turn>system\\n' -}}\n" + " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" + " {%- if enable_thinking is defined and enable_thinking -%}\n" + " {{- '<|think|>\\n' -}}\n" + " {%- set ns.prev_message_type = 'think' -%}\n" + " {%- endif -%}\n" + " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" + " {%- if messages[0]['content'] is string -%}\n" + " {{- messages[0]['content'] | trim -}}\n" + " {%- elif messages[0]['content'] is sequence -%}\n" + " {%- for item in messages[0]['content'] -%}\n" + " {{- item['text'] | trim + ' '-}}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- set loop_messages = messages[1:] -%}\n" + " {%- endif -%}\n" + " {%- if tools -%}\n" + " {%- for tool in tools %}\n" + " {{- '<|tool>' -}}\n" + " {{- format_function_declaration(tool) | trim -}}\n" + " {{- '' -}}\n" + " {%- endfor %}\n" + " {%- set ns.prev_message_type = 'tool' -%}\n" + " {%- endif -%}\n" + " {{- '\\n' -}}\n" + "{%- endif %}\n" + "\n" + "{#- Pre-scan: find last user message index for reasoning guard -#}\n" + "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" + "{%- for i in range(loop_messages | length) -%}\n" + " {%- if loop_messages[i]['role'] == 'user' -%}\n" + " {%- set ns_turn.last_user_idx = i -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{#- Loop through messages -#}\n" + "{%- for message in loop_messages -%}\n" + " {%- if message['role'] != 'tool' -%}\n" + " {%- set ns.prev_message_type = None -%}\n" + " {%- set role = 'model' if 
message['role'] == 'assistant' else message['role'] -%}\n" + " {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n" + " {%- set prev_nt = namespace(role=None, found=false) -%}\n" + " {%- if loop.index0 > 0 -%}\n" + " {%- for j in range(loop.index0 - 1, -1, -1) -%}\n" + " {%- if not prev_nt.found -%}\n" + " {%- if loop_messages[j]['role'] != 'tool' -%}\n" + " {%- set prev_nt.role = loop_messages[j]['role'] -%}\n" + " {%- set prev_nt.found = true -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n" + " {%- if not continue_same_model_turn -%}\n" + " {{- '<|turn>' + role + '\\n' }}\n" + " {%- endif -%}\n" + "\n" + " {#- Render reasoning/reasoning_content as thinking channel -#}\n" + " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" + " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" + " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" + " {%- endif -%}\n" + "\n" + " {%- if message.get('tool_calls') -%}\n" + " {%- for tool_call in message['tool_calls'] -%}\n" + " {%- set function = tool_call['function'] -%}\n" + " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" + " {%- if function['arguments'] is mapping -%}\n" + " {%- set ns_args = namespace(found_first=false) -%}\n" + " {%- for key, value in function['arguments'] | dictsort -%}\n" + " {%- if ns_args.found_first %},{% endif -%}\n" + " {%- set ns_args.found_first = true -%}\n" + " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" + " {%- endfor -%}\n" + " {%- elif function['arguments'] is string -%}\n" + " {{- function['arguments'] -}}\n" + " {%- endif -%}\n" + " {{- '}' -}}\n" + " {%- endfor -%}\n" + " {%- set ns.prev_message_type = 'tool_call' -%}\n" + " {%- endif -%}\n" + "\n" + " {%- set ns_tr_out = 
namespace(flag=false) -%}\n" + " {%- if message.get('tool_responses') -%}\n" + " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" + " {%- for tool_response in message['tool_responses'] -%}\n" + " {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n" + " {%- set ns_tr_out.flag = true -%}\n" + " {%- set ns.prev_message_type = 'tool_response' -%}\n" + " {%- endfor -%}\n" + " {%- elif message.get('tool_calls') -%}\n" + " {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n" + " {%- set ns_tool_scan = namespace(stopped=false) -%}\n" + " {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n" + " {%- if ns_tool_scan.stopped -%}\n" + " {%- elif loop_messages[k]['role'] != 'tool' -%}\n" + " {%- set ns_tool_scan.stopped = true -%}\n" + " {%- else -%}\n" + " {%- set follow = loop_messages[k] -%}\n" + " {#- Resolve tool_call_id to function name -#}\n" + " {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n" + " {%- for tc in message['tool_calls'] -%}\n" + " {%- if tc.get('id') == follow.get('tool_call_id') -%}\n" + " {%- set ns_tname.name = tc['function']['name'] -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {#- Handle content as string or content-parts array -#}\n" + " {%- set tool_body = follow.get('content') -%}\n" + " {%- if tool_body is string -%}\n" + " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" + " {%- elif tool_body is sequence and tool_body is not string -%}\n" + " {%- set ns_txt = namespace(s='') -%}\n" + " {%- for part in tool_body -%}\n" + " {%- if part.get('type') == 'text' -%}\n" + " {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" + " {%- for part in tool_body -%}\n" + " {%- if part.get('type') == 'image_url' -%}\n" + " {%- set url_val = 
part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n" + " {{- '<|image|>' + url_val -}}\n" + " {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n" + " {%- if part.get('type') == 'audio_url' -%}\n" + " {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif part.get('type') == 'input_audio' -%}\n" + " {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- endif -%}\n" + # " {%- elif part.get('type') == 'video_url' -%}\n" + # " {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n" + # " {{- '<|video|>' + video_val -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- else -%}\n" + " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" + " {%- endif -%}\n" + " {%- set ns_tr_out.flag = true -%}\n" + " {%- set ns.prev_message_type = 'tool_response' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "\n" + " {%- set captured_content -%}\n" + " {%- if message['content'] is string -%}\n" + " {%- if role == 'model' -%}\n" + " {{- strip_thinking(message['content']) -}}\n" + " {%- else -%}\n" + " {{- message['content'] | trim -}}\n" + " {%- endif -%}\n" + " {%- elif message['content'] is sequence -%}\n" + " {%- for item in message['content'] -%}\n" + " {%- if item['type'] == 'text' -%}\n" + " {%- if role == 'model' -%}\n" + " {{- strip_thinking(item['text']) -}}\n" + " {%- else -%}\n" + " {{- item['text'] | trim -}}\n" + " {%- endif -%}\n" + " {%- elif item['type'] == 'image_url' -%}\n" + " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" + " {{- '<|image|>' + url_val -}}\n" + " {%- set ns.prev_message_type = 'image' -%}\n" + 
" {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n" + " {%- if item['type'] == 'audio_url' -%}\n" + " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif item['type'] == 'input_audio' -%}\n" + " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- endif -%}\n" + " {%- set ns.prev_message_type = 'audio' -%}\n" + " {%- endif -%}\n" + # " {%- elif item['type'] == 'video_url' -%}\n" + # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" + # " {{- '<|video|>' + video_val -}}\n" + # " {%- set ns.prev_message_type = 'video' -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- endset -%}\n" + "\n" + " {{- captured_content -}}\n" + " {%- set has_content = captured_content | trim | length > 0 -%}\n" + "\n" + " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" + " {{- '<|tool_response>' -}}\n" + " {%- elif not (ns_tr_out.flag and not has_content) -%}\n" + " {{- '\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- if add_generation_prompt -%}\n" + " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" + " {{- '<|turn>model\\n' -}}\n" + " {%- if not enable_thinking | default(false) -%}\n" + " {{- '<|channel>thought\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endif -%}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the Gemma 4 Handler. + + Args: + enable_thinking (bool): Controls whether the <|think|> tag is injected and + manages <|channel>thought behavior. + Note: ONLY supported on Gemma4 31B and 26BA4B models. + NOT supported on Gemma4 E2B and E4B models. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject the thinking variable into the Jinja environment + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Set the stop token based on Gemma 4's format () + # generation_config.json: "eos_token_id": [1, 106, 50] + kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + +class GLM41VChatHandler(MTMDChatHandler): + # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. + + GLM41V_EOS_TOKEN = "<|endoftext|>" + GLM41V_PAD_TOKEN = "<|endoftext|>" + GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]\n" + "{%- for msg in messages -%}" + "{%- if msg.role == 'system' -%}" + "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'user' -%}" + "<|user|>\n" + "{%- if msg.content is string -%}" + "{{ msg.content }}" + "{%- else -%}" + "{%- for item in msg.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'assistant' -%}" + "{%- if msg.metadata -%}" + "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- else -%}" + "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{%- endif -%}" + ) + + def __call__(self, **kwargs): + 
self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN + # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json + stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch + kwargs['stop'] = stop_tokens + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + + +class GLM46VChatHandler(MTMDChatHandler): + GLM46V_EOS_TOKEN = "<|endoftext|>" + GLM46V_PAD_TOKEN = "<|endoftext|>" + GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]" + "{%- if tools -%}" + "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" + "You are provided with function signatures within XML tags:\n\n" + "{%- for tool in tools -%}" + "{{ tool | tojson(ensure_ascii=False) }}\n" + "{%- endfor -%}" + "\n\nFor each function call, output the function name and arguments within the following XML format:\n" + "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" + "{%- endif -%}" + + "{%- for m in messages -%}" + "{%- if m.role == 'system' -%}" + "<|system|>\n{{ m.content }}" + "{%- elif m.role == 'user' -%}" + "<|user|>\n" + "{%- if m.content is string -%}" + "{{ m.content }}" + "{%- else -%}" + "{%- for item in m.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + # If enable_thinking is disabled, insert `/nothink` according to the source code logic. 
+ "{{ '/nothink' if not enable_thinking else '' }}" + "{%- elif m.role == 'assistant' -%}" + "<|assistant|>" + "{%- if enable_thinking -%}" + "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" + "\n{{ reasoning.strip() }}" + "{%- else -%}" + "\n" + "{%- endif -%}" + "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" + "{%- endif -%}" + "{{ GLM46V_EOS_TOKEN }}" + "{%- endfor -%}" + + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{{ '' if enable_thinking else '\n' }}" + "{%- endif -%}" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + GLM-4.6V Handler + Parameters: + - enable_thinking (bool): Whether to enable the model's think process. The default is True. + """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN + + # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json + kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + +class GraniteDoclingChatHandler(MTMDChatHandler): + """ + Handler for Granite-Docling models. + + Format(512x512): Content + + Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!! + Since the model does not have special tokens for the start and end of an image, + it is recommended to process only one image at a time. + You can iterate through the images individually for recognition. 
+ + """ + GRANITE_BOS_TOKEN = "<|start_of_role|>" + GRANITE_EOS_TOKEN = "<|end_of_text|>" + GRANITE_PAD_TOKEN = "<|end_of_text|>" + GRANITE_IMAGE_TOKEN = "" + + CHAT_FORMAT = ( + "{%- for message in messages -%}" + "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}" + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + "{%- for part in message['content'] -%}" + "{%- if part['type'] == 'text' -%}" + "{{- part['text'] -}}" + "{%- elif part['type'] == 'image_url' -%}" + "{%- if part.image_url is string -%}" + "{{- part.image_url -}}" + "{%- else -%}" + "{{- part.image_url.url -}}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- '<|end_of_text|>\n' -}}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{- '<|start_of_role|>assistant' -}}" + # Support the 'controls' parameter if present in generation arguments + "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}" + "{{- '<|end_of_role|>' -}}" + "{%- endif -%}" + ) + + def __init__(self, controls: dict = None, **kwargs): + """ + Granite-Docling Handler + Args: + controls (dict, optional): Operational parameters passed to the assistant role. + + The 'controls' parameter is used to guide the model's behavior or output format. 
+ Common examples for 'controls' include: + - Document Parsing: {"mode": "document_parsing", "format": "json"} + """ + self.controls = controls + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject controls into the template environment + self.extra_template_arguments["controls"] = self.controls + self.DEFAULT_SYSTEM_MESSAGE = None + kwargs['stop'] = [self.GRANITE_EOS_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + + return super().__call__(**kwargs) + + +class LFM2VLChatHandler(MTMDChatHandler): + LFM2VL_BOS_TOKEN = "<|startoftext|>" + LFM2VL_EOS_TOKEN = "<|im_end|>" + LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" + LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" + + CHAT_FORMAT = ( + "{%- for message in messages -%}" + "{{ '<|im_start|>' + message['role'] + '\n' }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] }}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if 'image_url' in content -%}" + "{%- if content.image_url is string -%}" + "<|image_start|>{{ content.image_url }}<|image_end|>" + "{%- else -%}" + "<|image_start|>{{ content.image_url.url }}<|image_end|>" + "{%- endif -%}" + "{%- elif content['type'] == 'text' -%}" + "{{ content['text'] }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{ '<|im_end|>\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ '<|im_start|>assistant\n' }}" + "{%- endif -%}" + ) + + def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): + """ + LFM2-VL Handler + LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 + """ + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) + + 
def __call__(self, **kwargs): + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + return super().__call__(**kwargs) + + +class LFM25VLChatHandler(MTMDChatHandler): + """ + Handler for LFM2.5-VL multimodal models. + + Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing. + """ + # Aligned with LFM2.5-VL tokenizer_config + LFM25VL_BOS_TOKEN = "<|startoftext|>" + LFM25VL_EOS_TOKEN = "<|im_end|>" + LFM25VL_PAD_TOKEN = "<|pad|>" + + # Image specific tokens + LFM25VL_IMAGE_TOKEN = "" + LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" + LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" + LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" + + CHAT_FORMAT = ( + "{{- bos_token -}}\n" + "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" + "{%- set ns = namespace(system_prompt='', content='') -%}\n" + "{%- if messages[0]['role'] == 'system' -%}\n" + " {%- set ns.system_prompt = messages[0]['content'] -%}\n" + " {%- set messages = messages[1:] -%}\n" + "{%- endif -%}\n" + "{%- if tools -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" + " {%- for tool in tools -%}\n" + " {%- if tool is not string -%}\n" + " {%- set tool = tool | tojson -%}\n" + " {%- endif -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" + " {%- if not loop.last -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" + "{%- endif -%}\n" + "{%- if ns.system_prompt -%}\n" + " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" + "{%- endif -%}\n" + "{%- set ns.last_assistant_index = -1 -%}\n" + "{%- for message in messages -%}\n" + " {%- if message['role'] == 'assistant' -%}\n" + " {%- set 
ns.last_assistant_index = loop.index0 -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "{%- for message in messages -%}\n" + " {{- '<|im_start|>' + message['role'] + '\\n' -}}\n" + " {%- set content = message['content'] -%}\n" + " {%- if content is not string -%}\n" + " {%- set ns.content = '' -%}\n" + " {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" + " {%- for item in content -%}\n" + " {%- if item['type'] == 'image_url' -%}\n" + " {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" + " {%- set ns.content = ns.content + img_val -%}\n" + " {%- elif item['type'] == 'text' -%}\n" + " {%- set ns.content = ns.content + item['text'] -%}\n" + " {%- else -%}\n" + " {%- set ns.content = ns.content + (item | tojson) -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- set content = ns.content -%}\n" + " {%- endif -%}\n" + " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" + " {%- if '' in content -%}\n" + " {%- set content = content.split('')[-1] | trim -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {{- content + '<|im_end|>\\n' -}}\n" + "{%- endfor -%}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n' -}}\n" + "{%- endif -%}\n" + ) + + def __init__(self, keep_past_thinking: bool = False, **kwargs): + self.keep_past_thinking = keep_past_thinking + super().__init__(**kwargs) + + + def __call__(self, **kwargs): + if self.image_min_tokens > 256: + if self.verbose: + print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. 
Please reset it to between 64 and 256.") + self.image_min_tokens = -1 + + self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking + + kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") + return super().__call__(**kwargs) + + +class PaddleOCRChatHandler(MTMDChatHandler): + """ + Handler for PaddleOCR 1.5/1.6 multimodal models. + """ + + PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" + PADDLEOCR_BOS_TOKEN = "" + PADDLEOCR_EOS_TOKEN = "" + PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>" + PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>" + PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>" + + CHAT_FORMAT = ( + "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}" + "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}" + "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}" + + "{{- cls_token -}}" + "{%- for message in messages -%}" + "{%- if message['role'] == 'user' -%}" + "{{- 'User: ' -}}" + + # Robust parsing: Check if content is string or list + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + # Pass 1: Render all images first + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'image_url' and 'image_url' in content -%}" + "{{- '<|IMAGE_START|>' -}}" + "{%- if content.image_url is string -%}" + "{{- content.image_url -}}" + "{%- else -%}" + "{{- content.image_url.url -}}" + "{%- endif -%}" + "{{- '<|IMAGE_END|>' -}}" + "{%- endif -%}" + "{%- endfor -%}" + + # Pass 2: Render all text second + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- '\\n' -}}" + + "{%- elif message['role'] == 'assistant' -%}" + "{{- 
'Assistant:\\n' -}}" + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- eos_token -}}" + + "{%- elif message['role'] == 'system' -%}" + "{%- if message['content'] is string -%}" + "{{- message['content'] + '\\n' -}}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] + '\\n' -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + + "{%- if add_generation_prompt -%}" + "{{- 'Assistant:\\n' -}}" + "{%- endif -%}" + ) + + def __init__( + self, + image_min_tokens: int = -1, + image_max_tokens: int = -1, + **kwargs + ): + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + super().__init__( + image_min_tokens=self.image_min_tokens, + image_max_tokens=self.image_max_tokens, + **kwargs + ) + + def __call__(self, **kwargs): + # Set the specific stop token defined in the PaddleOCR template + kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + return super().__call__(**kwargs) + + +class Qwen25VLChatHandler(MTMDChatHandler): + + QWEN25_VL_BOS_TOKEN = "<|endoftext|>" + QWEN25_VL_PAD_TOKEN = "<|endoftext|>" + QWEN25_VL_EOS_TOKEN = "<|im_end|>" + + CHAT_FORMAT = ( + "{% set image_count = namespace(value=0) %}" + "{% for message in messages %}" + "{% if loop.first and message['role'] != 'system' %}" + "<|im_start|>system\n" + "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n" + "{% endif %}" + "<|im_start|>{{ message['role'] }}\n" + "{% if message['content'] is string %}" + "{{ message['content'] }}<|im_end|>\n" + "{% else %}" + "{% for content in 
message['content'] %}" + "{% if content['type'] == 'image_url' %}" + "{% if content.image_url is string %}" + "{% set image_count.value = image_count.value + 1 %}" + "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>" + "{% else %}" + "{% set image_count.value = image_count.value + 1 %}" + "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>" + "{% endif %}" + "{% elif content['type'] == 'text' %}" + "{{ content['text'] }}" + "{% endif %}" + "{% endfor %}" + "<|im_end|>\n" + "{% endif %}" + "{% endfor %}" + "<|im_start|>assistant\n" + ) + + def __call__(self, **kwargs): + kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + +class Qwen3ASRChatHandler(MTMDChatHandler): + """ + Handler for Qwen 3 ASR (Automatic Speech Recognition) models. + + Features: + - Highly specialized for Speech-to-Text tasks. + - Aggregates all system text into a single cohesive system block. + - Drops user text entirely, extracting ONLY audio data into a unified user turn. + - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>. + - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url. + """ + + DEFAULT_SYSTEM_MESSAGE = """ + You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language. + You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization. 
+ """ + + QWEN3_ASR_BOS_TOKEN = "<|im_start|>" + QWEN3_ASR_PAD_TOKEN = "<|endoftext|>" + QWEN3_ASR_EOS_TOKEN = "<|im_end|>" + + + QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>" + QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>" + QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>" + + CHAT_FORMAT = ( + "{%- set ns = namespace(system_text='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.role == 'system' -%}\n" + " {%- if m.content is string -%}\n" + " {%- set ns.system_text = ns.system_text + m.content -%}\n" + " {%- else -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'text' and (c.text is defined) -%}\n" + " {%- set ns.system_text = ns.system_text + c.text -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- set ns2 = namespace(audio_tokens='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.content is not string -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n" + " {#- MTMD Audio Injection -#}\n" + " {%- set audio_val = '' -%}\n" + " {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n" + " {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n" + " {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n" + " {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n" + " {%- endif -%}\n" + " {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n" + "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n' -}}\n" + "{%- endif 
-%}\n" + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token + kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") + + return super().__call__(**kwargs) + +class Qwen3VLChatHandler(MTMDChatHandler): + + QWEN3_VL_BOS_TOKEN = "<|endoftext|>" + QWEN3_VL_PAD_TOKEN = "<|endoftext|>" + QWEN3_VL_EOS_TOKEN = "<|im_end|>" + + CHAT_FORMAT = ( + "{{- '<|im_start|>system\n' -}}" + "{%- if messages[0].content is string and messages[0].role == 'system' -%}" + "{{- messages[0].content -}}" + "{%- elif messages[0].role == 'system' -%}" + "{%- if 'text' in messages[0].content -%}" + "{{- messages[0].content.text -}}" + "{%- else -%}" + "{{- 'You are a helpful assistant.' 
-}}" + "{%- endif -%}" + "{%- endif -%}" + "{%- if tools -%}" + "{{- '\n\n' -}}" + "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' -}}" + "{%- for tool in tools -%}" + "{{- '\n' -}}" + "{{- tool | tojson -}}" + "{%- endfor -%}" + "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" + "{%- endif -%}" + "{{- '<|im_end|>\n' -}}" + "{%- set image_count = namespace(value=0) -%}" + #"{%- set video_count = namespace(value=0) -%}" + "{%- for message in messages -%}" + "{%- if message.role == 'tool' -%}" + "{{- '<|im_start|>user\n\n' -}}" + "{%- elif message.role != 'system' -%}" + "{{- '<|im_start|>' + message.role + '\n' -}}" + "{%- endif -%}" + "{%- if message.content is string and message.role != 'system' -%}" + "{{- message.content -}}" + "{%- elif message.role != 'system' -%}" + "{%- for content in message.content -%}" + "{%- if 'image_url' in content -%}" + "{%- set image_count.value = image_count.value + 1 -%}" + "{%- if add_vision_id -%}" + "{{- 'Picture ' -}}" + "{{- image_count.value | string -}}" + "{{- ': ' -}}" + "{%- endif -%}" + "{{- '<|vision_start|>' -}}" + "{%- if content.image_url is string -%}" + "{{- content.image_url -}}" + "{%- else -%}" + "{{- content.image_url.url -}}" + "{%- endif -%}" + "{{- '<|vision_end|>' -}}" + "{%- endif -%}" + # Video not supported yet + "{%- if 'text' in content -%}" + "{{- content.text -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- if message.role == 'assistant' -%}" + "{%- if message.tool_calls -%}" + "{%- for tool_call in message.tool_calls -%}" + "{%- if (loop.first and message.content) or (not loop.first) -%}" + "{{- '\n' -}}" + "{%- endif -%}" + "{%- if tool_call.function 
-%}" + "{%- set tool_call = tool_call.function -%}" + "{%- endif -%}" + "{{- '\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}" + "{%- if tool_call.arguments is string -%}" + "{{- tool_call.arguments -}}" + "{%- else -%}" + "{{- tool_call.arguments | tojson -}}" + "{%- endif -%}" + "{{- '}\n' -}}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- elif message.role == 'tool' -%}" + "{{- '' -}}" + "{%- endif -%}" + "{%- if message.role != 'system' -%}" + "{{- '<|im_end|>\n' -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{- '<|im_start|>assistant\n' -}}" + "{%- if force_reasoning -%}" + "{{- '\n' -}}" + "{%- endif -%}" + "{%- endif -%}" + ) + + def __init__( + self, + force_reasoning: bool = False, + add_vision_id: bool = True, + **kwargs, + ): + """ + Parameters: + - force_reasoning (bool): + - True: Force the reasoning in the model by adding to the chat template. + - False (default): Don't force the reasoning. + - add_vision_id (bool): + - True (default): Count all the images. Recommended for multi-image. + - False: Doesn't count the images. Can save tokens with single-image. + """ + super().__init__(**kwargs) + self.force_reasoning = force_reasoning + self.extra_template_arguments["force_reasoning"] = force_reasoning + self.extra_template_arguments["add_vision_id"] = add_vision_id + + def __call__(self, **kwargs): + kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN] + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + +class Qwen35ChatHandler(MTMDChatHandler): + """ + Handler for Qwen3.5/Qwen3.6 models. 
+ """ + CHAT_FORMAT = ( + "{%- set image_count = namespace(value=0) -%}" + "{%- set video_count = namespace(value=0) -%}" + "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}" + " {%- if content is string -%}" + " {{- content -}}" + " {%- elif content is iterable and content is not mapping -%}" + " {%- for item in content -%}" + " {%- if 'image_url' in item or item.type == 'image_url' -%}" + " {%- if is_system_content -%}" + " {{- raise_exception('System message cannot contain images.') -}}" + " {%- endif -%}" + " {%- if do_vision_count -%}" + " {%- set image_count.value = image_count.value + 1 -%}" + " {%- endif -%}" + " {%- if add_vision_id -%}" + " {{- 'Picture ' -}}" + " {{- image_count.value | string -}}" + " {{- ': ' -}}" + " {%- endif -%}" + " {{- '<|vision_start|>' -}}" + " {%- if item.image_url is string -%}" + " {{- item.image_url -}}" + " {%- else -%}" + " {{- item.image_url.url -}}" + " {%- endif -%}" + " {{- '<|vision_end|>' -}}" + " {%- elif 'video' in item -%}" + " {{- raise_exception('llama.cpp does not currently support video.') -}}" # Video not supported, raise exception + " {%- if is_system_content -%}" + " {{- raise_exception('System message cannot contain videos.') -}}" + " {%- endif -%}" + " {%- if do_vision_count -%}" + " {%- set video_count.value = video_count.value + 1 -%}" + " {%- endif -%}" + " {%- if add_vision_id -%}" + " {{- 'Video ' ~ video_count.value ~ ': ' -}}" + " {%- endif -%}" + " {{- '<|vision_start|>' -}}" + " {{- item.video -}}" + " {{- '<|vision_end|>' -}}" + " {%- elif 'text' in item -%}" + " {{- item.text -}}" + " {%- else -%}" + " {{- raise_exception('Unexpected item type in content.') -}}" + " {%- endif -%}" + " {%- endfor -%}" + " {%- elif content is none or content is undefined -%}" + " {{- '' -}}" + " {%- else -%}" + " {{- raise_exception('Unexpected content type.') -}}" + " {%- endif -%}" + "{%- endmacro -%}" + "{%- if not messages -%}" + " {{- raise_exception('No messages provided.') 
-}}" + "{%- endif -%}" + "{%- if tools and tools is iterable and tools is not mapping -%}" + " {{- '<|im_start|>system\n' -}}" + " {{- '# Tools\n\nYou have access to the following functions:\n\n' -}}" + " {%- for tool in tools -%}" + " {{- '\n' -}}" + " {{- tool | tojson -}}" + " {%- endfor -%}" + " {{- '\n' -}}" + " {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' -}}" + " {%- if messages[0].role == 'system' -%}" + " {%- set content = render_content(messages[0].content, false, true) | trim -%}" + " {%- if content -%}" + " {{- '\n\n' + content -}}" + " {%- endif -%}" + " {%- endif -%}" + " {{- '<|im_end|>\n' -}}" + "{%- elif messages[0].role == 'system' -%}" + " {%- set content = render_content(messages[0].content, false, true) -%}" + " {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}" + "{%- endif -%}" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}" + "{%- for message in messages[::-1] -%}" + " {%- set index = messages | length - 1 - loop.index0 -%}" + " {%- if ns.multi_step_tool and message.role == 'user' -%}" + " {%- set content = render_content(message.content, false) | trim -%}" + " {%- if not (content.startswith('') and content.endswith('')) -%}" + " {%- set ns.multi_step_tool = false -%}" + " {%- set ns.last_query_index = index -%}" + " {%- endif -%}" + " {%- endif -%}" + "{%- endfor -%}" + "{%- if ns.multi_step_tool -%}" + " {{- 
raise_exception('No user query found in messages.') -}}" + "{%- endif -%}" + "{%- for message in messages -%}" + " {%- set content = render_content(message.content, true) | trim -%}" + " {%- if message.role == 'system' -%}" + " {%- if not loop.first -%}" + " {{- raise_exception('System message must be at the beginning.') -}}" + " {%- endif -%}" + " {%- elif message.role == 'user' -%}" + " {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}" + " {%- elif message.role == 'assistant' -%}" + " {%- set reasoning_content = '' -%}" + " {%- if message.reasoning_content is string -%}" + " {%- set reasoning_content = message.reasoning_content -%}" + " {%- elif '' in content -%}" + " {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') -%}" + " {%- set content = content.split('')[-1].lstrip('\n') -%}" + " {%- endif -%}" + " {%- set reasoning_content = reasoning_content | trim -%}" + " {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}" + " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" + " {%- else -%}" + " {{- '<|im_start|>' + message.role + '\n' + content -}}" + " {%- endif -%}" + " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" + " {%- for tool_call in message.tool_calls -%}" + " {%- if tool_call.function is defined -%}" + " {%- set tool_call = tool_call.function -%}" + " {%- endif -%}" + " {%- if loop.first -%}" + " {%- if content | trim -%}" + " {{- '\n\n\n\n' -}}" + " {%- else -%}" + " {{- '\n\n' -}}" + " {%- endif -%}" + " {%- else -%}" + " {{- '\n\n\n' -}}" + " {%- endif -%}" + " {%- if tool_call.arguments is defined -%}" + " {%- for (args_name, args_value) in tool_call.arguments | items -%}" + " {{- '\n' -}}" + " {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}" + " {{- args_value -}}" + " {{- '\n' -}}" + 
" {%- endfor -%}" + " {%- endif -%}" + " {{- '\n' -}}" + " {%- endfor -%}" + " {%- endif -%}" + " {{- '<|im_end|>\n' -}}" + " {%- elif message.role == 'tool' -%}" + " {%- if loop.previtem and loop.previtem.role != 'tool' -%}" + " {{- '<|im_start|>user' -}}" + " {%- endif -%}" + " {{- '\n\n' -}}" + " {{- content -}}" + " {{- '\n' -}}" + " {%- if not loop.last and loop.nextitem.role != 'tool' -%}" + " {{- '<|im_end|>\n' -}}" + " {%- elif loop.last -%}" + " {{- '<|im_end|>\n' -}}" + " {%- endif -%}" + " {%- else -%}" + " {{- raise_exception('Unexpected message role.') -}}" + " {%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + " {{- '<|im_start|>assistant\n' -}}" + " {%- if enable_thinking is defined and enable_thinking is false -%}" + " {{- '\n\n\n\n' -}}" + " {%- else -%}" + " {{- '\n' -}}" + " {%- endif -%}" + "{%- endif -%}" + ) + + def __init__( + self, + add_vision_id: bool = True, + enable_thinking: bool = True, + preserve_thinking: bool = False, + **kwargs, + ): + """ + Parameters: + - add_vision_id (bool): + - True (default): Count all the images. Recommended for multi-image. + - False: Doesn't count the images. Can save tokens with single-image. + - enable_thinking (bool): + - True (default): Enables reasoning for better results. + - False: Disables reasoning for faster results. + - preserve_thinking (bool): + - True: Keeps reasoning process for ALL historical conversational turns. + - False (default): Only keeps for the latest assistant reply to save tokens. 
+ """ + super().__init__(**kwargs) + self.enable_thinking = enable_thinking + self.preserve_thinking = preserve_thinking + self.extra_template_arguments["add_vision_id"] = add_vision_id + self.extra_template_arguments["enable_thinking"] = enable_thinking + self.extra_template_arguments["preserve_thinking"] = preserve_thinking + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + + +class Step3VLChatHandler(MTMDChatHandler): + """ + Handler for Step3-VL models. + """ + + STEP3VL_BOS_TOKEN = "<|im_start|>" + STEP3VL_EOS_TOKEN = "<|im_end|>" + STEP3VL_PAD_TOKEN = "<|endoftext|>" + STEP3VL_IMAGE_TOKEN = "" + + CHAT_FORMAT = ( + "{%- macro render_content(content) -%}\n" + " {%- if content is none -%}{{- '' -}}\n" + " {%- elif content is string -%}{{- content -}}\n" + " {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n" + " {%- elif content is iterable -%}\n" + " {%- for item in content -%}\n" + " {%- if item.type == 'text' -%}\n" + " {{- item['value'] if 'value' in item else item['text'] -}}\n" + " {%- elif item.type in ['image', 'image_url'] -%}\n" + " {%- set url_val = '' -%}\n" + " {%- if item.image_url -%}\n" + " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" + " {%- endif -%}\n" + " {{- '' + url_val -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "\n" + "{%- if tools -%}\n" + " {{- '<|im_start|>system\\n' -}}\n" + " {%- if messages[0].role == 'system' -%}\n" + " {{- render_content(messages[0].content) + '\\n\\n' -}}\n" + " {%- endif -%}\n" + " {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou 
are provided with function signatures within XML tags:\\n' -}}\n" + " {%- for tool in tools -%}\n" + " {{- '\\n' -}}\n" + " {{- tool | tojson -}}\n" + " {%- endfor -%}\n" + " {{- '\\n\\n\\nAlways adhere to this exact format for tool use:\\n\\n\\n{\"name\": , \"arguments\": }\\n\\n{additional_tool_calls}\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within XML tags.\\n- `` must be an exact match to one of the available tools.\\n- `` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n" + "{%- else -%}\n" + " {%- if messages[0].role == 'system' -%}\n" + " {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n" + " {%- endif -%}\n" + "{%- endif -%}\n" + "\n" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n" + "{%- for message in messages[::-1] -%}\n" + " {%- set index = (messages|length - 1) - loop.index0 -%}\n" + " {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) -%}\n" + " {%- set ns.multi_step_tool = false -%}\n" + " {%- set ns.last_query_index = index -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- for message in messages -%}\n" + " {%- set content = render_content(message.content) -%}\n" + " {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n" + " {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n" + " {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n" + " {%- elif message.role == 'assistant' -%}\n" + " {%- if message.reasoning_content is string -%}\n" + " {%- set reasoning_content = render_content(message.reasoning_content) -%}\n" + " {%- else -%}\n" + " {%- if '' in content -%}\n" + 
" {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') -%}\n" + " {%- set content = content.split('')[-1].lstrip('\\n') -%}\n" + " {%- else -%}\n" + " {%- set reasoning_content = '' -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if loop.index0 > ns.last_query_index -%}\n" + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n' + content -}}\n" + " {%- else -%}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content -}}\n" + " {%- endif -%}\n" + " {%- if message.tool_calls -%}\n" + " {{- '\\n' -}}\n" + " {%- for tool_call in message.tool_calls -%}\n" + " {{- '\\n' -}}\n" + " {%- if tool_call.function -%}\n" + " {%- set tool_call = tool_call.function -%}\n" + " {%- endif -%}\n" + " {{- '\\n{\"name\": \"' -}}\n" + " {{- tool_call.name -}}\n" + " {{- '\", \"arguments\": ' -}}\n" + " {%- if tool_call.arguments is string -%}\n" + " {{- tool_call.arguments -}}\n" + " {%- else -%}\n" + " {{- tool_call.arguments | tojson -}}\n" + " {%- endif -%}\n" + " {{- '}\\n' -}}\n" + " {%- endfor -%}\n" + " {{- '\\n' -}}\n" + " {%- endif -%}\n" + " {{- '<|im_end|>\\n' -}}\n" + " {%- elif message.role == 'tool' -%}\n" + " {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n" + " {{- '<|im_start|>tool_response' -}}\n" + " {%- endif -%}\n" + " {{- '\\n\\n' -}}\n" + " {{- content -}}\n" + " {{- '\\n' -}}\n" + " {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n" + " {{- '<|im_end|>\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n\\n\\n\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n' -}}\n" + "{%- endif -%}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the Step3-VL Handler. + + Args: + enable_thinking (bool): If False, injects an empty block to bypass reasoning. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Pass thinking toggle into Jinja + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Step3 uses standard <|im_end|> ChatML stop formatting + kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) From d84b0c21fa4a131df5c17ddd1b2447929dc1973f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 22:08:32 +0800 Subject: [PATCH 126/139] fix(model): handle missing chat templates - Update LlamaModel.model_chat_template() to return Optional[str] and accept name=None for the default model chat template. - llama_model_chat_template() may return nullptr when no chat template is available. Handle that case explicitly instead of decoding a null pointer, and return None so callers can apply their own fallback logic. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 434921e6bd..91befb2247 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -152,12 +152,17 @@ def model_size(self) -> int: """ return llama_cpp.llama_model_size(self.model) - def model_chat_template(self, name: bytes) -> str: + def model_chat_template(self, name: Optional[bytes] = None) -> Optional[str]: """ - Get the default chat template. Returns nullptr if not available - If name is NULL, returns the default chat template + Get a chat template from the model. + + If name is None, returns the default chat template. + Returns None if no chat template is available. 
""" - return llama_cpp.llama_model_chat_template(self.model, name).decode("utf-8") + template = llama_cpp.llama_model_chat_template(self.model, name) + if template is None: + return None + return template.decode("utf-8") def n_params(self) -> int: """ From c9745316d748cec408b07c2f3a43fd97fa921e73 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 23:43:33 +0800 Subject: [PATCH 127/139] feat(mtmd): enhance generic chat template support - Enhance GenericMTMDChatHandler to better support model-provided chat templates. - Allow the generic handler to accept an optional named chat template, load it from the model at call time via llama_model_chat_template(), fall back to the model's default chat template, and finally use the built-in MTMD CHAT_FORMAT when no model template is available. - Also expand the generic media placeholder list for common multimodal templates and document the handler as a template-driven MTMD implementation. This prepares the generic path for a later render-driven placeholder replacement pass. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 2 + llama_cpp/llama_multimodal.py | 118 +++++++++++++++++++++++++++++++--- 2 files changed, 110 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index dbc60eaf76..b6a2c8d5a7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -174,6 +174,7 @@ def __init__( log_filters: Optional[Sequence[str]] = None, log_filters_case_sensitive: bool = True, # Extra Params + chat_template_name: Optional[str] = None, chat_handler_kwargs: Dict[str, Any] = {}, **kwargs, # type: ignore ): @@ -721,6 +722,7 @@ def __init__( chat_format = self.metadata.get("tokenizer.chat_template", None), mmproj_path = mmproj_path, verbose = self.verbose, + chat_template_name=chat_template_name, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py index a055869543..a0f7e594e4 100644 --- a/llama_cpp/llama_multimodal.py +++ b/llama_cpp/llama_multimodal.py @@ -91,6 +91,8 @@ class MTMDChatHandler: "{% endif %}" ) + KNOWN_MEDIA_TAGS: List[str] = [] + def __init__( self, mmproj_path: Optional[str] = None, @@ -1189,41 +1191,137 @@ def from_pretrained( **kwargs, ) -# Experiments are not recommended for this purpose at this time. +# Generic template-driven MTMD handler. class GenericMTMDChatHandler(MTMDChatHandler): + """ + Generic MTMD chat handler backed by the model-provided chat template. + + This handler is intentionally template-driven. It renders the model's + tokenizer.chat_template first, then normalizes rendered media URLs or + placeholder tokens into MTMD media markers before tokenization. + + It is designed for model templates that emit media placeholders such as + <|image_pad|>, <|image|>, , [IMG], or Kimi-style <|media_pad|>. + Model-specific handlers may still be preferable when a model requires + special stop tokens, generation flags, or non-standard template arguments. + """ + KNOWN_MEDIA_TAGS = [ + # Pad placeholders inside model-specific wrappers. 
"<|image_pad|>", "<|audio_pad|>", "<|video_pad|>", + + # Direct placeholders inside Gemma/Llama/GLM-style wrappers. "<|image|>", "<|audio|>", "<|video|>", - "[IMG]" + + # LLaVA / LFM / Mistral-style placeholders. + "", + "