From a81b6dc14e2c6f2e72cd0605f024befec81573f7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 25 Nov 2025 19:23:59 +0800 Subject: [PATCH 001/518] Update Submodule vendor/llama.cpp 23bc779..064c90d --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 23bc779a6e..064c90d843 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 23bc779a6e58762ea892eca1801b2ea1b9050c00 +Subproject commit 064c90d84396644c8568e3fccdc26d1f3915bbfd From 7354e166f7c06fbb0eb8c556b7a30473fe3b5164 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 25 Nov 2025 19:43:01 +0800 Subject: [PATCH 002/518] Sync llama: introduce support for model-embedded sampling parameters --- llama_cpp/llama.py | 8 ++++---- llama_cpp/llama_cpp.py | 46 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2bb1c8769a..9c815539d9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -273,17 +273,17 @@ def __init__( if isinstance(v, bool): self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL + ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_BOOL.value self._kv_overrides_array[i].value.val_bool = v elif isinstance(v, int): self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT + ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_INT.value self._kv_overrides_array[i].value.val_i64 = v elif isinstance(v, float): self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT + ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_FLOAT.value self._kv_overrides_array[i].value.val_f64 = v elif isinstance(v, str): # type: ignore v_bytes = v.encode("utf-8") @@ -292,7 +292,7 @@ def __init__( v_bytes = v_bytes.ljust(128, b"\0") self._kv_overrides_array[ i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR + ].tag = 
llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_STR.value # copy min(v_bytes, 128) to str_value address = typing.cast( int, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d3890c0c1b..9b67621518 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -584,10 +584,40 @@ class llama_batch(ctypes.Structure): # LLAMA_KV_OVERRIDE_TYPE_BOOL, # LLAMA_KV_OVERRIDE_TYPE_STR, # }; -LLAMA_KV_OVERRIDE_TYPE_INT = 0 -LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1 -LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 -LLAMA_KV_OVERRIDE_TYPE_STR = 3 +class LlamaModelKVOverrideType(enum.IntEnum): + LLAMA_KV_OVERRIDE_TYPE_INT = 0 + LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1 + LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 + LLAMA_KV_OVERRIDE_TYPE_STR = 3 + + +# enum llama_model_meta_key { +# LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE, +# LLAMA_MODEL_META_KEY_SAMPLING_TOP_K, +# LLAMA_MODEL_META_KEY_SAMPLING_TOP_P, +# LLAMA_MODEL_META_KEY_SAMPLING_MIN_P, +# LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY, +# LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD, +# LLAMA_MODEL_META_KEY_SAMPLING_TEMP, +# LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N, +# LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA, +# }; +class LlamaModelMetaKey(enum.IntEnum): + LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0 + LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1 + LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2 + LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3 + LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4 + LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5 + LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6 + LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7 + LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8 + LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9 + LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10 + LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11 # struct llama_model_kv_override { @@ -1511,6 +1541,14 @@ def llama_model_meta_count(model: 
llama_model_p, /) -> int: ... +# // Get sampling metadata key name. Returns nullptr if the key is invalid +# LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key); +@ctypes_function("llama_model_meta_key_str", [ctypes.c_int], ctypes.c_char_p) +def llama_model_meta_key_str(key: int, /) -> ctypes.c_char_p: + """Get sampling metadata key name. Returns nullptr if the key is invalid""" + ... + + # // Get metadata key name by index # LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size); @ctypes_function( From c278dd4b0efa07308b9a3af79f90cb7730ebd8e5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 28 Nov 2025 20:53:47 +0800 Subject: [PATCH 003/518] Update Submodule vendor/llama.cpp 064c90d..7d2add5 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 064c90d843..7d2add51d8 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 064c90d84396644c8568e3fccdc26d1f3915bbfd +Subproject commit 7d2add51d8e3759020d70f2ff3a76b5795ff67bc From 66c7d75edecbfd25f18ad89defe4e0e18b39c26f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 30 Nov 2025 00:01:46 +0800 Subject: [PATCH 004/518] Enhanced text-based bootstrapping for Vulkan compilation and improved recognition of Win32 Vulkan libraries. Signed-off-by: JamePeng --- README.md | 6 +++++- llama_cpp/_ctypes_extensions.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 287bf3a34e..581e4f387d 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,11 @@ CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python
Vulkan -To install with Vulkan support, set the `GGML_VULKAN=on` environment variable before installing: +- For Windows User: Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings. + +- For Linux User: Follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide. + +Then install with Vulkan support by set the `GGML_VULKAN=on` environment variable before installing: ```bash CMAKE_ARGS="-DGGML_VULKAN=on" pip install llama-cpp-python diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index 3df9b353c6..4ad6b0d1ba 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -67,6 +67,11 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path): if "HIP_PATH" in os.environ: os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin")) os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib")) + + if "VULKAN_SDK" in os.environ: + os.add_dll_directory(os.path.join(os.environ["VULKAN_SDK"], "Bin")) + os.add_dll_directory(os.path.join(os.environ["VULKAN_SDK"], "Lib")) + cdll_args["winmode"] = ctypes.RTLD_GLOBAL # Try to load the shared library, handling potential errors From 2942ccf6448c0501c88a77a412cbac389911ebd8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 30 Nov 2025 10:07:56 +0800 Subject: [PATCH 005/518] Update Submodule vendor/llama.cpp 7d2add5..c7af376 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7d2add51d8..c7af376c29 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7d2add51d8e3759020d70f2ff3a76b5795ff67bc +Subproject commit c7af376c298b7d09c280233548668ba6fcc17deb From e10e36e6e2e41fe9eec6abd9a2a36a0ca04ff8bd Mon Sep 17 00:00:00 2001 From: JamePeng 
Date: Sun, 30 Nov 2025 10:12:17 +0800 Subject: [PATCH 006/518] Update llama_grammar.py from vendor/llama.cpp/examples/json-schema-to-grammar.py --- llama_cpp/llama_grammar.py | 867 ++++++++++++++++++++----------------- 1 file changed, 464 insertions(+), 403 deletions(-) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 1079c1d2ee..7759082f38 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -2,13 +2,15 @@ # flake8: noqa from pathlib import Path - -from itertools import groupby +import itertools +import json +import re +import sys from typing import ( Any, - Set, List, Optional, + Set, Tuple, Union, ) @@ -242,78 +244,190 @@ def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGramm """ """llama.cpp json-schema to grammar converter from vendor/llama.cpp/examples/json-schema-to-grammar.py""" -import json -import re -from typing import List, Optional -# whitespace is constrained to a single space char to prevent model "running away" in -# whitespace. Also maybe improves generation quality? -SPACE_RULE = '" "?' +def _build_repetition(item_rule, min_items, max_items, separator_rule=None): + if max_items == 0: + return "" -INVALID_RULE_CHARS_RE = re.compile(r"[^a-zA-Z0-9-]+") -GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]') -GRAMMAR_LITERAL_ESCAPES = {"\r": "\\r", "\n": "\\n", '"': '\\"'} - -# whitespace is constrained to a single space char to prevent model "running away" in -# whitespace. Also maybe improves generation quality? -SPACE_RULE = '" "?' + if min_items == 0 and max_items == 1: + return f'{item_rule}?' - -def _build_repetition( - item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False -): if not separator_rule: - if min_items == 0 and max_items == 1: - return f"{item_rule}?" 
- elif min_items == 1 and max_items is None: - return f"{item_rule}+" - - result = "" - - if min_items > 0: - if item_rule_is_literal and separator_rule is None: - result = '"' + (item_rule[1:-1] * min_items) + '"' - else: - result = (f" {separator_rule} " if separator_rule else " ").join( - [item_rule] * min_items - ) - - def opt_repetitions(up_to_n, prefix_with_sep=False): - """ - - n=4, no sep: '(a (a (a (a)?)?)?)?' - - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?' - - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?' - """ - - content = ( - f"{separator_rule} {item_rule}" - if prefix_with_sep and separator_rule - else item_rule - ) - if up_to_n == 0: - return "" - elif up_to_n == 1: - return f"({content})?" - elif separator_rule and not prefix_with_sep: - return f"({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?" + if min_items == 1 and max_items is None: + return f'{item_rule}+' + elif min_items == 0 and max_items is None: + return f'{item_rule}*' else: - return (f"({content} " * up_to_n).rstrip() + (")?" * up_to_n) + return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}' - if min_items > 0 and max_items != min_items: - result += " " + result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None) + return f'({result})?' if min_items == 0 else result - if max_items is not None: - result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0) - else: - item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})' +def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): + has_min = min_value != None + has_max = max_value != None - if min_items == 0 and separator_rule: - result = f"({item_rule} {item_operator}*)?" 
+ def digit_range(from_char: str, to_char: str): + out.append("[") + if from_char == to_char: + out.append(from_char) else: - result += f"{item_operator}*" + out.append(from_char) + out.append("-") + out.append(to_char) + out.append("]") + + def more_digits(min_digits: int, max_digits: int): + out.append("[0-9]") + if min_digits == max_digits and min_digits == 1: + return + out.append("{") + out.append(str(min_digits)) + if max_digits != min_digits: + out.append(",") + if max_digits != sys.maxsize: + out.append(str(max_digits)) + out.append("}") + + def uniform_range(from_str: str, to_str: str): + i = 0 + while i < len(from_str) and from_str[i] == to_str[i]: + i += 1 + if i > 0: + out.append("\"") + out.append(from_str[:i]) + out.append("\"") + if i < len(from_str): + if i > 0: + out.append(" ") + sub_len = len(from_str) - i - 1 + if sub_len > 0: + from_sub = from_str[i+1:] + to_sub = to_str[i+1:] + sub_zeros = "0" * sub_len + sub_nines = "9" * sub_len + + to_reached = False + out.append("(") + if from_sub == sub_zeros: + digit_range(from_str[i], chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + else: + out.append("[") + out.append(from_str[i]) + out.append("] ") + out.append("(") + uniform_range(from_sub, sub_nines) + out.append(")") + if ord(from_str[i]) < ord(to_str[i]) - 1: + out.append(" | ") + if to_sub == sub_nines: + digit_range(chr(ord(from_str[i]) + 1), to_str[i]) + to_reached = True + else: + digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + if not to_reached: + out.append(" | ") + digit_range(to_str[i], to_str[i]) + out.append(" ") + uniform_range(sub_zeros, to_sub) + out.append(")") + else: + out.append("[") + out.append(from_str[i]) + out.append("-") + out.append(to_str[i]) + out.append("]") + + if has_min and has_max: + if min_value < 0 and max_value < 0: + out.append("\"-\" (") + _generate_min_max_int(-max_value, -min_value, out, decimals_left, 
top_level=True) + out.append(")") + return + + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True) + out.append(") | ") + min_value = 0 + + min_s = str(min_value) + max_s = str(max_value) + min_digits = len(min_s) + max_digits = len(max_s) + + for digits in range(min_digits, max_digits): + uniform_range(min_s, "9" * digits) + min_s = "1" + "0" * digits + out.append(" | ") + uniform_range(min_s, max_s) + return + + less_decimals = max(decimals_left - 1, 1) + + if has_min: + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) + out.append(") | [0] | [1-9] ") + more_digits(0, decimals_left - 1) + elif min_value == 0: + if top_level: + out.append("[0] | [1-9] ") + more_digits(0, less_decimals) + else: + more_digits(1, decimals_left) + elif min_value <= 9: + c = str(min_value) + range_start = '1' if top_level else '0' + if c > range_start: + digit_range(range_start, chr(ord(c) - 1)) + out.append(" ") + more_digits(1, less_decimals) + out.append(" | ") + digit_range(c, "9") + out.append(" ") + more_digits(0, less_decimals) + else: + min_s = str(min_value) + length = len(min_s) + c = min_s[0] + + if c > "1": + digit_range("1" if top_level else "0", chr(ord(c) - 1)) + out.append(" ") + more_digits(length, less_decimals) + out.append(" | ") + digit_range(c, c) + out.append(" (") + _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False) + out.append(")") + if c < "9": + out.append(" | ") + digit_range(chr(ord(c) + 1), "9") + out.append(" ") + more_digits(length - 1, less_decimals) + return + + if has_max: + if max_value >= 0: + if top_level: + out.append("\"-\" [1-9] ") + more_digits(0, less_decimals) + out.append(" | ") + _generate_min_max_int(0, max_value, out, decimals_left, top_level=True) + else: + out.append("\"-\" (") + _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False) + 
out.append(")") + return - return result + raise RuntimeError("At least one of min_value or max_value must be set") class BuiltinRule: @@ -321,70 +435,46 @@ def __init__(self, content: str, deps: list = None): self.content = content self.deps = deps or [] - -_up_to_15_digits = _build_repetition("[0-9]", 0, 15) +# Constraining spaces to prevent model "running away". +SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}' PRIMITIVE_RULES = { - "boolean": BuiltinRule('("true" | "false") space', []), - "decimal-part": BuiltinRule("[0-9] " + _up_to_15_digits, []), - "integral-part": BuiltinRule("[0-9] | [1-9] " + _up_to_15_digits, []), - "number": BuiltinRule( - '("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', - ["integral-part", "decimal-part"], - ), - "integer": BuiltinRule('("-"? integral-part) space', ["integral-part"]), - "value": BuiltinRule( - "object | array | string | number | boolean | null", - ["object", "array", "string", "number", "boolean", "null"], - ), - "object": BuiltinRule( - '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', - ["string", "value"], - ), - "array": BuiltinRule( - '"[" space ( value ("," space value)* )? "]" space', ["value"] - ), - "uuid": BuiltinRule( - r'"\"" ' - + ' "-" '.join("[0-9a-fA-F]" * n for n in [8, 4, 4, 4, 12]) - + r' "\"" space', - [], - ), - "char": BuiltinRule( - r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', - [], - ), - "string": BuiltinRule(r'"\"" char* "\"" space', ["char"]), - "null": BuiltinRule('"null" space', []), + 'boolean' : BuiltinRule('("true" | "false") space', []), + 'decimal-part' : BuiltinRule('[0-9]{1,16}', []), + 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), + 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), + 'integer' : BuiltinRule('("-"? 
integral-part) space', ['integral-part']), + 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), + 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), + 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), + 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), + 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []), + 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), + 'null' : BuiltinRule('"null" space', []), } # TODO: support "uri", "email" string formats STRING_FORMAT_RULES = { - "date": BuiltinRule( - '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )', - [], - ), - "time": BuiltinRule( - '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', - [], - ), - "date-time": BuiltinRule('date "T" time', ["date", "time"]), - "date-string": BuiltinRule('"\\"" date "\\"" space', ["date"]), - "time-string": BuiltinRule('"\\"" time "\\"" space', ["time"]), - "date-time-string": BuiltinRule('"\\"" date-time "\\"" space', ["date-time"]), + 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), + 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? 
( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), + 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), + 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), + 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), + 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']), } -DOTALL = "[\\U00000000-\\U0010FFFF]" -DOT = "[^\\x0A\\x0D]" +DOTALL = '[\\U00000000-\\U0010FFFF]' +DOT = '[^\\x0A\\x0D]' -RESERVED_NAMES = set( - ["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()] -) +RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()]) +INVALID_RULE_CHARS_RE = re.compile(r"[^a-zA-Z0-9-]+") +GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n\"\\\\]') +GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n\"\\]\\-\\\\]') +GRAMMAR_LITERAL_ESCAPES = {"\r": "\\r", "\n": "\\n", '"': '\\"', "-": "\\-", "]": "\\]", "\\": "\\\\"} NON_LITERAL_SET = set("|.()[]{}*+?") -ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set("[]()|{}*+?") - +ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set("^$.[]()|{}*+?") class SchemaConverter: def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): @@ -393,101 +483,142 @@ def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): self._dotall = dotall self._raw_pattern = raw_pattern self._rules = { - "space": SPACE_RULE, + 'space': SPACE_RULE, } self._refs = {} self._refs_being_resolved = set() def _format_literal(self, literal): escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( - lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal + lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal ) return f'"{escaped}"' - def not_literal( - self, literal: str, dotall: bool = True, maybe_escaped_underscores=False - ) -> str: - """ - not_literal('a') -> '[^a]' - not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' 
- """ - assert len(literal) > 0, "Empty literal not supported" - + def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str: + ''' + not_literal('a') -> '[^a]' + not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' + ''' + assert len(literal) > 0, 'Empty literal not supported' def recurse(i: int): c = literal[i] - if maybe_escaped_underscores and c == "_": - yield f"[^{c}\\\\]" - yield " | " + if maybe_escaped_underscores and c == '_': + yield f'[^{c}\\\\]' + yield ' | ' yield f'"\\\\"? "{c}"' else: - yield f"[^{c}]" + yield f'[^{c}]' if i < len(literal) - 1: - yield " | " + yield ' | ' yield self._format_literal(c) - yield " (" + yield ' (' yield from recurse(i + 1) - yield ")?" - - return "".join(("(", *recurse(0), ")")) + yield ')?' + + return ''.join(('(', *recurse(0), ')')) + + def _not_strings(self, strings): + class TrieNode: + def __init__(self): + self.children = {} + self.is_end_of_string = False + + def insert(self, string): + node = self + for c in string: + node = node.children.setdefault(c, TrieNode()) + node.is_end_of_string = True + + trie = TrieNode() + for s in strings: + trie.insert(s) + + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + out = ['["] ( '] + + def visit(node): + rejects = [] + first = True + for c in sorted(node.children.keys()): + child = node.children[c] + rejects.append(c) + if first: + first = False + else: + out.append(' | ') + out.append(f'[{c}]') + if child.children: + out.append(f' (') + visit(child) + out.append(')') + elif child.is_end_of_string: + out.append(f' {char_rule}+') + if node.children: + if not first: + out.append(' | ') + out.append(f'[^"{"".join(rejects)}] {char_rule}*') + visit(trie) + + out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + return ''.join(out) def _add_rule(self, name, rule): - esc_name = INVALID_RULE_CHARS_RE.sub("-", name) + esc_name = INVALID_RULE_CHARS_RE.sub('-', name) if esc_name not in self._rules or 
self._rules[esc_name] == rule: key = esc_name else: i = 0 - while ( - f"{esc_name}{i}" in self._rules - and self._rules[f"{esc_name}{i}"] != rule - ): + while f'{esc_name}{i}' in self._rules and self._rules[f'{esc_name}{i}'] != rule: i += 1 - key = f"{esc_name}{i}" + key = f'{esc_name}{i}' self._rules[key] = rule return key def resolve_refs(self, schema: dict, url: str): - """ - Resolves all $ref fields in the given schema, fetching any remote schemas, - replacing $ref with absolute reference URL and populating self._refs with the - respective referenced (sub)schema dictionaries. - """ - + ''' + Resolves all $ref fields in the given schema, fetching any remote schemas, + replacing $ref with absolute reference URL and populating self._refs with the + respective referenced (sub)schema dictionaries. + ''' def visit(n: dict): if isinstance(n, list): return [visit(x) for x in n] elif isinstance(n, dict): - ref = n.get("$ref") + ref = n.get('$ref') if ref is not None and ref not in self._refs: - if ref.startswith("https://"): - assert ( - self._allow_fetch - ), "Fetching remote schemas is not allowed (use --allow-fetch for force)" + if ref.startswith('https://'): + assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)' import requests - frag_split = ref.split("#") + frag_split = ref.split('#') base_url = frag_split[0] target = self._refs.get(base_url) if target is None: - target = self.resolve_refs( - requests.get(ref).json(), base_url - ) + target = self.resolve_refs(requests.get(ref).json(), base_url) self._refs[base_url] = target - if len(frag_split) == 1 or frag_split[-1] == "": + if len(frag_split) == 1 or frag_split[-1] == '': return target - elif ref.startswith("#/"): + elif ref.startswith('#/'): target = schema - ref = f"{url}{ref}" - n["$ref"] = ref + ref = f'{url}{ref}' + n['$ref'] = ref else: - raise ValueError(f"Unsupported ref {ref}") - - for sel in ref.split("#")[-1].split("/")[1:]: - assert ( - target is not None 
and sel in target - ), f"Error resolving ref {ref}: {sel} not in {target}" - target = target[sel] + raise ValueError(f'Unsupported ref {ref}') + + for sel in ref.split('#')[-1].split('/')[1:]: + assert target is not None, f'Error resolving ref {ref}: {sel} not in {target}' + if isinstance(target, list): + try: + sel_index = int(sel) + except ValueError: + raise ValueError(f'Error resolving ref {ref}: {sel} not in {target}') + assert 0 <= sel_index < len(target), f'Error resolving ref {ref}: {sel} not in {target}' + target = target[sel_index] + else: + assert sel in target, f'Error resolving ref {ref}: {sel} not in {target}' + target = target[sel] self._refs[ref] = target else: @@ -495,47 +626,42 @@ def visit(n: dict): visit(v) return n - return visit(schema) def _generate_union_rule(self, name, alt_schemas): - return " | ".join( - ( - self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}') - for i, alt_schema in enumerate(alt_schemas) - ) - ) + return ' | '.join(( + self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}') + for i, alt_schema in enumerate(alt_schemas) + )) def _visit_pattern(self, pattern, name): - """ - Transforms a regular expression pattern into a GBNF rule. + ''' + Transforms a regular expression pattern into a GBNF rule. - Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions - Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md + Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions + Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md - Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers. + Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers. - Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which - we define sub-rules to keep the output lean. 
- """ + Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which + we define sub-rules to keep the output lean. + ''' - assert pattern.startswith("^") and pattern.endswith( - "$" - ), 'Pattern must start with "^" and end with "$"' + assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"' pattern = pattern[1:-1] sub_rule_ids = {} i = 0 length = len(pattern) - def to_rule(s: Tuple[str, bool]) -> str: + def to_rule(s: tuple[str, bool]) -> str: (txt, is_literal) = s - return '"' + txt + '"' if is_literal else txt + return "\"" + txt + "\"" if is_literal else txt - def transform() -> Tuple[str, bool]: - """ - Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. - """ + def transform() -> tuple[str, bool]: + ''' + Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. + ''' nonlocal i nonlocal pattern nonlocal sub_rule_ids @@ -545,7 +671,7 @@ def transform() -> Tuple[str, bool]: # We only need a flat structure here to apply repetition operators to the last item, and # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially # (GBNF's syntax is luckily very close to regular expressions!) - seq: list[Tuple[str, bool]] = [] + seq: list[tuple[str, bool]] = [] def get_dot(): if self._dotall: @@ -553,72 +679,64 @@ def get_dot(): else: # Accept any character... 
except \n and \r line break chars (\x0A and \xOD) rule = DOT - return self._add_rule(f"dot", rule) + return self._add_rule(f'dot', rule) def join_seq(): nonlocal seq ret = [] - for is_literal, g in groupby(seq, lambda x: x[1]): + for is_literal, g in itertools.groupby(seq, lambda x: x[1]): if is_literal: - ret.append(("".join(x[0] for x in g), True)) + ret.append((''.join(x[0] for x in g), True)) else: ret.extend(g) if len(ret) == 1: return ret[0] - return (" ".join(to_rule(x) for x in seq), False) + return (' '.join(to_rule(x) for x in seq), False) while i < length: c = pattern[i] - if c == ".": + if c == '.': seq.append((get_dot(), False)) i += 1 - elif c == "(": + elif c == '(': i += 1 if i < length: - assert ( - pattern[i] != "?" - ), f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/' - seq.append((f"({to_rule(transform())})", False)) - elif c == ")": + assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/' + seq.append((f'({to_rule(transform())})', False)) + elif c == ')': i += 1 - assert ( - start > 0 and pattern[start - 1] == "(" - ), f"Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}" + assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}' return join_seq() - elif c == "[": + elif c == '[': square_brackets = c i += 1 - while i < length and pattern[i] != "]": - if pattern[i] == "\\": - square_brackets += pattern[i : i + 2] + while i < length and pattern[i] != ']': + if pattern[i] == '\\': + square_brackets += pattern[i:i+2] i += 2 else: square_brackets += pattern[i] i += 1 - assert ( - i < length - ), f"Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}" - square_brackets += "]" + assert i < length, f'Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}' + square_brackets += ']' i += 1 seq.append((square_brackets, False)) - elif c == "|": - seq.append(("|", False)) 
+ elif c == '|': + seq.append(('|', False)) i += 1 - elif c in ("*", "+", "?"): + elif c in ('*', '+', '?'): seq[-1] = (to_rule(seq[-1]) + c, False) i += 1 - elif c == "{": + elif c == '{': curly_brackets = c i += 1 - while i < length and pattern[i] != "}": + while i < length and pattern[i] != '}': curly_brackets += pattern[i] i += 1 - assert ( - i < length - ), f"Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}" - curly_brackets += "}" + assert i < length, f'Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}' + curly_brackets += '}' i += 1 - nums = [s.strip() for s in curly_brackets[1:-1].split(",")] + nums = [s.strip() for s in curly_brackets[1:-1].split(',')] min_times = 0 max_times = None try: @@ -630,49 +748,35 @@ def join_seq(): min_times = int(nums[0]) if nums[0] else 0 max_times = int(nums[1]) if nums[1] else None except ValueError: - raise ValueError( - f"Invalid quantifier {curly_brackets} in /{pattern}/" - ) + raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/') (sub, sub_is_literal) = seq[-1] if not sub_is_literal: id = sub_rule_ids.get(sub) if id is None: - id = self._add_rule(f"{name}-{len(sub_rule_ids) + 1}", sub) + id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub) sub_rule_ids[sub] = id sub = id - seq[-1] = ( - _build_repetition( - f'"{sub}"' if sub_is_literal else sub, - min_times, - max_times, - item_rule_is_literal=sub_is_literal, - ), - False, - ) + seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False) else: - literal = "" + literal = '' while i < length: - if pattern[i] == "\\" and i < length - 1: + if pattern[i] == '\\' and i < length - 1: next = pattern[i + 1] if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS: i += 1 literal += pattern[i] i += 1 else: - literal += pattern[i : i + 2] + literal += pattern[i:i+2] i += 2 elif pattern[i] == '"' and not self._raw_pattern: literal += '\\"' i += 1 - elif pattern[i] not in 
NON_LITERAL_SET and ( - i == length - 1 - or literal == "" - or pattern[i + 1] == "." - or pattern[i + 1] not in NON_LITERAL_SET - ): + elif pattern[i] not in NON_LITERAL_SET and \ + (i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET): literal += pattern[i] i += 1 else: @@ -684,15 +788,13 @@ def join_seq(): return self._add_rule( name, - ( - to_rule(transform()) - if self._raw_pattern - else '"\\"" ' + to_rule(transform()) + ' "\\"" space' - ), - ) + to_rule(transform()) if self._raw_pattern \ + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") + def _resolve_ref(self, ref): - ref_name = ref.split("/")[-1] + ref_fragment = ref.split('#')[-1] + ref_name = 'ref' + re.sub(r'[^a-zA-Z0-9-]+', '-', ref_fragment) if ref_name not in self._rules and ref not in self._refs_being_resolved: self._refs_being_resolved.add(ref) resolved = self._refs[ref] @@ -704,203 +806,166 @@ def _generate_constant_rule(self, value): return self._format_literal(json.dumps(value)) def visit(self, schema, name): - schema_type = schema.get("type") - schema_format = schema.get("format") - rule_name = name + "-" if name in RESERVED_NAMES else name or "root" + schema_type = schema.get('type') + schema_format = schema.get('format') + rule_name = name + '-' if name in RESERVED_NAMES else name or 'root' - if (ref := schema.get("$ref")) is not None: + if (ref := schema.get('$ref')) is not None: return self._add_rule(rule_name, self._resolve_ref(ref)) - elif "oneOf" in schema or "anyOf" in schema: - return self._add_rule( - rule_name, - self._generate_union_rule(name, schema.get("oneOf") or schema["anyOf"]), - ) + elif 'oneOf' in schema or 'anyOf' in schema: + return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf'])) elif isinstance(schema_type, list): - return self._add_rule( - rule_name, - self._generate_union_rule(name, [{"type": t} for t in schema_type]), - ) + return self._add_rule(rule_name, 
self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type])) - elif "const" in schema: - return self._add_rule( - rule_name, self._generate_constant_rule(schema["const"]) - ) + elif 'const' in schema: + return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') - elif "enum" in schema: - rule = " | ".join((self._generate_constant_rule(v) for v in schema["enum"])) + elif 'enum' in schema: + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space' return self._add_rule(rule_name, rule) - elif schema_type in (None, "object") and ( - "properties" in schema - or ( - "additionalProperties" in schema - and schema["additionalProperties"] is not True - ) - ): - required = set(schema.get("required", [])) - properties = list(schema.get("properties", {}).items()) - return self._add_rule( - rule_name, - self._build_object_rule( - properties, required, name, schema.get("additionalProperties") - ), - ) + elif schema_type in (None, 'object') and \ + ('properties' in schema or \ + ('additionalProperties' in schema and schema['additionalProperties'] is not True)): + required = set(schema.get('required', [])) + properties = list(schema.get('properties', {}).items()) + return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties'))) - elif schema_type in (None, "object") and "allOf" in schema: + elif schema_type in (None, 'object', 'string') and 'allOf' in schema: required = set() properties = [] + enum_sets = [] hybrid_name = name - def add_component(comp_schema, is_required): - if (ref := comp_schema.get("$ref")) is not None: + if (ref := comp_schema.get('$ref')) is not None: comp_schema = self._refs[ref] - if "properties" in comp_schema: - for prop_name, prop_schema in comp_schema["properties"].items(): + if 'properties' in comp_schema: + for prop_name, prop_schema in comp_schema['properties'].items(): properties.append((prop_name, 
prop_schema)) if is_required: required.add(prop_name) - for t in schema["allOf"]: - if "anyOf" in t: - for tt in t["anyOf"]: + if 'enum' in comp_schema: + enum_sets.append(set(comp_schema['enum'])) + + for t in schema['allOf']: + if 'anyOf' in t: + for tt in t['anyOf']: add_component(tt, is_required=False) else: add_component(t, is_required=True) - return self._add_rule( - rule_name, - self._build_object_rule( - properties, required, hybrid_name, additional_properties=[] - ), - ) + if enum_sets: + enum_intersection = enum_sets[0] + for s in enum_sets[1:]: + enum_intersection &= s + + if enum_intersection: + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space' + return self._add_rule(rule_name, rule) + + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) - elif schema_type in (None, "array") and ( - "items" in schema or "prefixItems" in schema - ): - items = schema.get("items") or schema["prefixItems"] + elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): + items = schema.get('items') or schema['prefixItems'] if isinstance(items, list): return self._add_rule( rule_name, - '"[" space ' - + ' "," space '.join( + '"[" space ' + + ' "," space '.join( self.visit(item, f'{name}{"-" if name else ""}tuple-{i}') - for i, item in enumerate(items) - ) - + ' "]" space', - ) + for i, item in enumerate(items)) + + ' "]" space') else: item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') min_items = schema.get("minItems", 0) max_items = schema.get("maxItems") - return self._add_rule( - rule_name, - '"[" space ' - + _build_repetition( - item_rule_name, min_items, max_items, separator_rule='"," space' - ) - + ' "]" space', - ) + return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space') - elif schema_type in (None, 
"string") and "pattern" in schema: - return self._visit_pattern(schema["pattern"], rule_name) + elif schema_type in (None, 'string') and 'pattern' in schema: + return self._visit_pattern(schema['pattern'], rule_name) - elif schema_type in (None, "string") and re.match( - r"^uuid[1-5]?$", schema_format or "" - ): + elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''): return self._add_primitive( - "root" if rule_name == "root" else schema_format, - PRIMITIVE_RULES["uuid"], - ) - - elif ( - schema_type in (None, "string") - and f"{schema_format}-string" in STRING_FORMAT_RULES - ): - prim_name = f"{schema_format}-string" - return self._add_rule( - rule_name, - self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]), - ) - - elif schema_type == "string" and ( - "minLength" in schema or "maxLength" in schema - ): - char_rule = self._add_primitive("char", PRIMITIVE_RULES["char"]) - min_len = schema.get("minLength", 0) - max_len = schema.get("maxLength") - - return self._add_rule( - rule_name, - r'"\"" ' - + _build_repetition(char_rule, min_len, max_len) - + r' "\"" space', + 'root' if rule_name == 'root' else schema_format, + PRIMITIVE_RULES['uuid'] ) - elif (schema_type == "object") or (len(schema) == 0): - return self._add_rule( - rule_name, self._add_primitive("object", PRIMITIVE_RULES["object"]) - ) + elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES: + prim_name = f'{schema_format}-string' + return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name])) + + elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema): + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + min_len = schema.get('minLength', 0) + max_len = schema.get('maxLength') + + return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + + elif schema_type in (None, 'integer') and \ + ('minimum' in 
schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): + min_value = None + max_value = None + if 'minimum' in schema: + min_value = schema['minimum'] + elif 'exclusiveMinimum' in schema: + min_value = schema['exclusiveMinimum'] + 1 + if 'maximum' in schema: + max_value = schema['maximum'] + elif 'exclusiveMaximum' in schema: + max_value = schema['exclusiveMaximum'] - 1 + + out = ["("] + _generate_min_max_int(min_value, max_value, out) + out.append(") space") + return self._add_rule(rule_name, ''.join(out)) + + elif (schema_type == 'object') or (len(schema) == 0): + return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object'])) else: - assert schema_type in PRIMITIVE_RULES, f"Unrecognized schema: {schema}" + assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}' # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero - return self._add_primitive( - "root" if rule_name == "root" else schema_type, - PRIMITIVE_RULES[schema_type], - ) + return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type]) def _add_primitive(self, name: str, rule: BuiltinRule): n = self._add_rule(name, rule.content) for dep in rule.deps: dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep) - assert dep_rule, f"Rule {dep} not known" + assert dep_rule, f'Rule {dep} not known' if dep not in self._rules: self._add_primitive(dep, dep_rule) return n - def _build_object_rule( - self, - properties: List[Tuple[str, Any]], - required: Set[str], - name: str, - additional_properties: Union[bool, Any], - ): + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]): prop_order = self._prop_order # sort by position in prop_order (if specified) then by original order - sorted_props = [ - kv[0] - for _, kv in sorted( - enumerate(properties), - 
key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]), - ) - ] + sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))] prop_kv_rule_names = {} for prop_name, prop_schema in properties: - prop_rule_name = self.visit( - prop_schema, f'{name}{"-" if name else ""}{prop_name}' - ) + prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}') prop_kv_rule_names[prop_name] = self._add_rule( f'{name}{"-" if name else ""}{prop_name}-kv', - rf'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}', + fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}' ) required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] - if additional_properties == True or isinstance(additional_properties, dict): + if additional_properties is not None and additional_properties != False: sub_name = f'{name}{"-" if name else ""}additional' - value_rule = self.visit( - {} if additional_properties == True else additional_properties, - f"{sub_name}-value", - ) + value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \ + self._add_primitive('value', PRIMITIVE_RULES['value']) + key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \ + else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props)) + prop_kv_rule_names["*"] = self._add_rule( - f"{sub_name}-kv", - self._add_primitive("string", PRIMITIVE_RULES["string"]) - + f' ":" space {value_rule}', + f'{sub_name}-kv', + f'{key_rule} ":" space {value_rule}' ) optional_props.append("*") @@ -908,44 +973,40 @@ def _build_object_rule( rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props) if optional_props: - rule += " (" + rule += ' (' if required_props: rule += ' "," space ( ' def get_recursive_refs(ks, 
first_is_optional): [k, *rest] = ks kv_rule_name = prop_kv_rule_names[k] - if k == "*": - res = self._add_rule( - f'{name}{"-" if name else ""}additional-kvs', - f'{kv_rule_name} ( "," space ' + kv_rule_name + " )*", - ) - elif first_is_optional: - res = f'( "," space {kv_rule_name} )?' + comma_ref = f'( "," space {kv_rule_name} )' + if first_is_optional: + res = comma_ref + ('*' if k == '*' else '?') else: - res = kv_rule_name + res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '') if len(rest) > 0: - res += " " + self._add_rule( + res += ' ' + self._add_rule( f'{name}{"-" if name else ""}{k}-rest', - get_recursive_refs(rest, first_is_optional=True), + get_recursive_refs(rest, first_is_optional=True) ) return res - rule += " | ".join( + rule += ' | '.join( get_recursive_refs(optional_props[i:], first_is_optional=False) for i in range(len(optional_props)) ) if required_props: - rule += " )" - rule += " )?" + rule += ' )' + rule += ' )?' rule += ' "}" space' return rule def format_grammar(self): - return "\n".join( - f"{name} ::= {rule}" + return '\n'.join( + f'{name} ::= {rule}' for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0]) ) From 7ce56f6a79e030c93399dc4313f707a6a1e38d66 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 30 Nov 2025 15:06:57 +0800 Subject: [PATCH 007/518] feat: Add specific exception messages for llama_decode failure codes --- llama_cpp/_internals.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 91eb801f6b..596053fb97 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -436,12 +436,19 @@ def encode(self, batch: LlamaBatch): raise RuntimeError(f"llama_encode returned {return_code}") def decode(self, batch: LlamaBatch): - return_code = llama_cpp.llama_decode( - self.ctx, - batch.batch, - ) - if return_code != 0: - raise RuntimeError(f"llama_decode returned {return_code}") + return_code = 
llama_cpp.llama_decode(self.ctx, batch.batch) + + if return_code == 0: + return + + error_map = { + 1: "No KV slot available: try reducing batch size or increasing context window", + 2: "Decoding aborted", + -1: "Invalid input batch", + } + + msg = error_map.get(return_code, "Fatal internal error") + raise RuntimeError(f"llama_decode failed (code {return_code}): {msg}") def set_n_threads(self, n_threads: int, n_threads_batch: int): llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) From ebe395c25e1169839255fabf057ce38bf3f17656 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 30 Nov 2025 15:47:03 +0800 Subject: [PATCH 008/518] Remove add_grammar_lazy from _internals.py --- llama_cpp/_internals.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 596053fb97..fae8b50d50 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1010,27 +1010,6 @@ def convert_list_str_to_char_array_ptr(self, str_list: List[str]): # Return the char** pointer and the number of strings return char_array_ptr, num_byte_list - def add_grammar_lazy( - self, - model: LlamaModel, - grammar: LlamaGrammar, - trigger_tokens:list[llama_cpp.llama_token], - num_trigger_tokens: int, - trigger_words: list[str]=[] - ): - trigger_words_char_array_ptr, num_trigger_words = self.convert_list_str_to_char_array_ptr(trigger_words) - - sampler = llama_cpp.llama_sampler_init_grammar_lazy( - model.vocab, - grammar._grammar.encode("utf-8"), - grammar._root.encode("utf-8"), - trigger_words_char_array_ptr, - num_trigger_words, - trigger_tokens, - num_trigger_tokens - ) - self._add_sampler(sampler) - def add_grammar_lazy_patterns( self, model: LlamaModel, From ed221d988484eae07093e6a041b22e12aa2bd26c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 30 Nov 2025 17:31:46 +0800 Subject: [PATCH 009/518] refactor: optimize LlamaGrammar class code - Add property accessor for current grammar. 
- Improve file handling robustness using `pathlib` and explicit encoding. - Update `from_json_schema` to support `prop_order` - Fix typos and improve naming conventions. Signed-off-by: JamePeng --- llama_cpp/llama_grammar.py | 44 +++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 7759082f38..46ae4ba1ce 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -23,30 +23,50 @@ def __init__(self, *args, _grammar: str, **kwargs): self._grammar = _grammar self._root = LLAMA_GRAMMAR_DEFAULT_ROOT + @property + def grammar(self) -> str: + return self._grammar + @classmethod def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar": return cls(_grammar=grammar) @classmethod def from_file(cls, file: Union[str, Path], verbose: bool = True) -> "LlamaGrammar": + file_path = Path(file) + + if not file_path.exists(): + raise FileNotFoundError(f"{cls.__name__}.from_file: file not found: {file_path}") + try: - with open(file) as f: - grammar = f.read() + grammar_content = file_path.read_text(encoding='utf-8') except Exception as err: - raise Exception( - f"{cls.from_file.__name__}: error reading grammar file: {err}" - ) + raise IOError(f"{cls.__name__}.from_file: error reading grammar file: {err}") - if grammar: - return cls.from_string(grammar, verbose=verbose) + if not grammar_content.strip(): + raise ValueError(f"{cls.__name__}.from_file: grammar file is empty") - raise ValueError( - f"{cls.from_file.__name__}: error parsing grammar file: params_grammer is empty" - ) + return cls.from_string(grammar_content, verbose=verbose) @classmethod - def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGrammar": - return cls.from_string(json_schema_to_gbnf(json_schema), verbose=verbose) + def from_json_schema( + cls, + json_schema: str, + prop_order: Optional[List[str]] = None, + verbose: bool = True + ) -> 
"LlamaGrammar": + """ + Create a syntax object from a JSON Schema. + + json_schema: A JSON Schema string or dictionary. + prop_order: Specifies the order in which fields are generated (helps improve the stability of small models). + verbose: Whether to log. + """ + try: + gbnf_grammar_str = json_schema_to_gbnf(json_schema, prop_order=prop_order) + return cls.from_string(gbnf_grammar_str, verbose=verbose) + except Exception as e: + raise ValueError(f"{cls.__name__}.from_json_schema: conversion failed: {e}") """llama.cpp gbnf rules from vendor/llama.cpp/grammars""" From 155245549d614704a3ce754d8ef48de9dfec2862 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 30 Nov 2025 23:46:01 +0800 Subject: [PATCH 010/518] Update Submodule vendor/llama.cpp c7af376..7f8ef50 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c7af376c29..7f8ef50cce 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c7af376c298b7d09c280233548668ba6fcc17deb +Subproject commit 7f8ef50cce40e3e7e4526a3696cb45658190e69a From 83054c40995c2198f996e243fa4bc8172acf3580 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 30 Nov 2025 23:46:49 +0800 Subject: [PATCH 011/518] build: Improve CMakeLists target logic - Deduplicate target installation logic using loops. - Encapsulate Windows DLL installation within the install helper. - Optimize macOS architecture detection using CMake built-ins. 
--- CMakeLists.txt | 142 +++++++++++++++++++++---------------------------- 1 file changed, 61 insertions(+), 81 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 54afdca424..9549b37ea4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,43 +5,49 @@ project(llama_cpp) option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON) option(MTMD_BUILD "Build mtmd shared library and install alongside python package" ON) +# Helper function to install targets to Python package directories function(llama_cpp_python_install_target target) if(NOT TARGET ${target}) return() endif() - install( - TARGETS ${target} - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) - install( - TARGETS ${target} - LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ) - set_target_properties(${target} PROPERTIES - INSTALL_RPATH "$ORIGIN" - BUILD_WITH_INSTALL_RPATH TRUE + # Define install destinations to avoid code duplication + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" ) + + foreach(DIR ${INSTALL_DIRS}) + install( + TARGETS ${target} + LIBRARY DESTINATION ${DIR} + RUNTIME DESTINATION ${DIR} + ARCHIVE DESTINATION ${DIR} + FRAMEWORK DESTINATION ${DIR} + RESOURCE DESTINATION ${DIR} + ) + + # Automatically handle Windows DLL installation for each target + if (WIN32) + install( + FILES $ + DESTINATION ${DIR} + OPTIONAL # Prevent errors if the target has no DLLs + ) + endif() + 
endforeach() + + # Configure RPATH if(UNIX) + set(INSTALL_RPATH_VAL "$ORIGIN") if(APPLE) - set_target_properties(${target} PROPERTIES - INSTALL_RPATH "@loader_path" - BUILD_WITH_INSTALL_RPATH TRUE - ) - else() - set_target_properties(${target} PROPERTIES - INSTALL_RPATH "$ORIGIN" - BUILD_WITH_INSTALL_RPATH TRUE - ) + set(INSTALL_RPATH_VAL "@loader_path") endif() + + set_target_properties(${target} PROPERTIES + INSTALL_RPATH "${INSTALL_RPATH_VAL}" + BUILD_WITH_INSTALL_RPATH TRUE + ) endif() endfunction() @@ -72,19 +78,12 @@ if (LLAMA_BUILD) # Architecture detection and settings for Apple platforms if (APPLE) - # Get the target architecture - execute_process( - COMMAND uname -m - OUTPUT_VARIABLE HOST_ARCH - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - # If CMAKE_OSX_ARCHITECTURES is not set, use the host architecture if(NOT CMAKE_OSX_ARCHITECTURES) - set(CMAKE_OSX_ARCHITECTURES ${HOST_ARCH} CACHE STRING "Build architecture for macOS" FORCE) + set(CMAKE_OSX_ARCHITECTURES ${CMAKE_HOST_SYSTEM_PROCESSOR} CACHE STRING "Build architecture for macOS" FORCE) endif() - message(STATUS "Host architecture: ${HOST_ARCH}") + message(STATUS "Host architecture: ${CMAKE_HOST_SYSTEM_PROCESSOR}") message(STATUS "Target architecture: ${CMAKE_OSX_ARCHITECTURES}") # Configure based on target architecture @@ -109,45 +108,31 @@ if (LLAMA_BUILD) endif() endif() - llama_cpp_python_install_target(llama) - llama_cpp_python_install_target(ggml) - - llama_cpp_python_install_target(ggml-base) - - llama_cpp_python_install_target(ggml-blas) - llama_cpp_python_install_target(ggml-cann) - llama_cpp_python_install_target(ggml-cpu) - llama_cpp_python_install_target(ggml-cuda) - llama_cpp_python_install_target(ggml-hexagon) - llama_cpp_python_install_target(ggml-hip) - llama_cpp_python_install_target(ggml-metal) - llama_cpp_python_install_target(ggml-musa) - llama_cpp_python_install_target(ggml-opencl) - llama_cpp_python_install_target(ggml-rpc) - llama_cpp_python_install_target(ggml-sycl) - 
llama_cpp_python_install_target(ggml-vulkan) - llama_cpp_python_install_target(ggml-webgpu) - llama_cpp_python_install_target(ggml-zdnn) - - # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563 - if (WIN32) - install( - FILES $ - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) - install( - FILES $ - DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ) - install( - FILES $ - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) - install( - FILES $ - DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib - ) - endif() + # Define list of GGML targets to install + set(GGML_TARGETS + llama + ggml + ggml-base + ggml-blas + ggml-cann + ggml-cpu + ggml-cuda + ggml-hexagon + ggml-hip + ggml-metal + ggml-musa + ggml-opencl + ggml-rpc + ggml-sycl + ggml-vulkan + ggml-webgpu + ggml-zdnn + ) + + # Loop through targets to avoid repetitive function calls + foreach(TARGET_NAME ${GGML_TARGETS}) + llama_cpp_python_install_target(${TARGET_NAME}) + endforeach() if (MTMD_BUILD) if (NOT DEFINED LLAMA_BUILD_NUMBER) @@ -172,10 +157,5 @@ if (LLAMA_BUILD) endif() llama_cpp_python_install_target(mtmd) - - if (WIN32) - install(FILES $ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib) - install(FILES $ DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib) - endif() endif() endif() From 4a5baa6aa0314dad09ee92f1b0d8e39acb88c491 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Dec 2025 08:11:39 +0800 Subject: [PATCH 012/518] Update Submodule vendor/llama.cpp 7f8ef50..746f9ee --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7f8ef50cce..746f9ee889 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7f8ef50cce40e3e7e4526a3696cb45658190e69a +Subproject commit 746f9ee88941c2f259268c484fe8278375387081 From 6ee17c06556a6019683ab2e9704015689f0d4737 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Dec 2025 08:37:52 +0800 Subject: [PATCH 
013/518] Sync mtmd: add mtmd_context_params::warmup option --- llama_cpp/llama_chat_format.py | 1 + llama_cpp/mtmd_cpp.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 62d604ce7d..dfdbc1c507 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2828,6 +2828,7 @@ def _init_mtmd_context(self, llama_model: llama.Llama): mctx_params.print_timings = self.verbose mctx_params.n_threads = llama_model.n_threads mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + mctx_params.warmup = True if self.image_min_tokens > 0: mctx_params.image_min_tokens = self.image_min_tokens if self.image_max_tokens > 0: diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 89008043cf..e2ee004dea 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -127,6 +127,7 @@ class clip_flash_attn_type (enum.IntEnum): # enum clip_flash_attn_type flash_attn_type; # int image_min_tokens; # int image_max_tokens; +# bool warmup; # }; class clip_context_params(Structure): _fields_ = [ @@ -134,6 +135,7 @@ class clip_context_params(Structure): ("flash_attn_type", c_int), ("image_min_tokens", c_int), ("image_max_tokens", c_int), + ("warmup", c_bool), ] # struct mtmd_context_params { @@ -143,6 +145,7 @@ class clip_context_params(Structure): # const char * image_marker; // deprecated, use media_marker instead # const char * media_marker; # enum llama_flash_attn_type flash_attn_type; +# bool warmup; // whether to run a warmup encode pass after initialization # // limit number of image tokens, only for vision models with dynamic resolution # int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) @@ -156,6 +159,7 @@ class mtmd_context_params(Structure): ("image_marker", c_char_p), ("media_marker", c_char_p), ("flash_attn_type", c_int), + ("warmup", c_bool), ("image_min_tokens", c_int), ("image_max_tokens", 
c_int), ] From 5eac2342514b16695a8381776aa2a460415208a7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Dec 2025 08:42:34 +0800 Subject: [PATCH 014/518] Replace forced BoringSSL build with OpenSSL option Previously BoringSSL was forced on Windows/macOS platforms. Now enable OpenSSL support by default instead, allowing use of system OpenSSL and avoiding large BoringSSL downloads. Signed-off-by: JamePeng --- CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9549b37ea4..6b0a3ac692 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,10 +71,8 @@ if (LLAMA_BUILD) # Disable building curl support set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) - if (WIN32 OR APPLE) - # Enable build and link BoringSSL only on Windows and macOS - set(LLAMA_BUILD_BORINGSSL ON CACHE BOOL "llama.cpp: build and link BoringSSL" FORCE) - endif() + # Enable build and link OpenSSL + set(LLAMA_OPENSSL ON CACHE BOOL "llama.cpp: build and link OpenSSL" FORCE) # Architecture detection and settings for Apple platforms if (APPLE) From 22034f5eb09c66051a3b7f9832c117422d7485ae Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Dec 2025 09:03:03 +0800 Subject: [PATCH 015/518] Add new workflow with Basic options for Linux platform (CUDA 12.4, 12.6, 12.8) --- .../build-wheels-cu124-linux-basic.yml | 116 ++++++++++++++++++ .../workflows/build-wheels-cu124-linux.yml | 2 +- .../build-wheels-cu126-linux-basic.yml | 116 ++++++++++++++++++ .../workflows/build-wheels-cu126-linux.yml | 2 +- .../build-wheels-cu128-linux-basic.yml | 116 ++++++++++++++++++ 5 files changed, 350 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/build-wheels-cu124-linux-basic.yml create mode 100644 .github/workflows/build-wheels-cu126-linux-basic.yml create mode 100644 .github/workflows/build-wheels-cu128-linux-basic.yml diff --git a/.github/workflows/build-wheels-cu124-linux-basic.yml 
b/.github/workflows/build-wheels-cu124-linux-basic.yml new file mode 100644 index 0000000000..2d36e5da46 --- /dev/null +++ b/.github/workflows/build-wheels-cu124-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU124) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["12.4.1"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v4 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find 
/usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 124) from the full version (e.g., 12.4.1) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v<TAG_VERSION>-cu<CUDA_VERSION>-<AVXVER>-linux-<BUILD_DATE> + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 20d4abc360..fdbccb474d 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | apt update - apt install -y build-essential cmake ccache curl git libgomp1 libjpeg-dev libssl-dev + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - uses: actions/checkout@v4 # Checkout code with: diff --git a/.github/workflows/build-wheels-cu126-linux-basic.yml b/.github/workflows/build-wheels-cu126-linux-basic.yml new file mode 100644 index 0000000000..f61ea50ea2 --- /dev/null +++ b/.github/workflows/build-wheels-cu126-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU126) for Linux(Basic) + +on: + workflow_dispatch: #
Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["12.6.3"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v4 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + 
CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 126) from the full version (e.g., 12.6.3) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v<TAG_VERSION>-cu<CUDA_VERSION>-<AVXVER>-linux-<BUILD_DATE> + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index b6f8311d9a..599391dcc2 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | apt update - apt install -y build-essential cmake ccache curl git libgomp1 libjpeg-dev libssl-dev + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - uses: actions/checkout@v4 # Checkout code with: diff --git a/.github/workflows/build-wheels-cu128-linux-basic.yml b/.github/workflows/build-wheels-cu128-linux-basic.yml new file mode 100644 index 0000000000..83bd0b4632 --- /dev/null +++ b/.github/workflows/build-wheels-cu128-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU128) for Linux(Basic) + +on: + workflow_dispatch: #
Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["12.8.1"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v4 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on 
${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 128) from the full version (e.g., 12.8.1) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v<TAG_VERSION>-cu<CUDA_VERSION>-<AVXVER>-linux-<BUILD_DATE> + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file From 5d4928788a1a3d8d24f61a3c186a9d5ec5621f7d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Dec 2025 13:05:23 +0800 Subject: [PATCH 016/518] Fixed Basic workflow typos --- .github/workflows/build-wheels-cu124-linux-basic.yml | 2 +- .github/workflows/build-wheels-cu124-linux.yml | 2 +- .github/workflows/build-wheels-cu126-linux-basic.yml | 2 +- .github/workflows/build-wheels-cu126-linux.yml | 2 +- .github/workflows/build-wheels-cu128-linux-basic.yml | 2 +- .github/workflows/build-wheels-cu128-linux.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-linux-basic.yml b/.github/workflows/build-wheels-cu124-linux-basic.yml index 2d36e5da46..98f50fe474 100644 --- a/.github/workflows/build-wheels-cu124-linux-basic.yml +++ b/.github/workflows/build-wheels-cu124-linux-basic.yml @@ -67,7 +67,7 @@ jobs: # Basic options
for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index fdbccb474d..571402ff02 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -79,7 +79,7 @@ jobs: # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu126-linux-basic.yml b/.github/workflows/build-wheels-cu126-linux-basic.yml index f61ea50ea2..78d1471c76 100644 --- a/.github/workflows/build-wheels-cu126-linux-basic.yml +++ b/.github/workflows/build-wheels-cu126-linux-basic.yml @@ -67,7 +67,7 @@ jobs: # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 599391dcc2..969272ea02 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ 
b/.github/workflows/build-wheels-cu126-linux.yml @@ -79,7 +79,7 @@ jobs: # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu128-linux-basic.yml b/.github/workflows/build-wheels-cu128-linux-basic.yml index 83bd0b4632..8c527c7187 100644 --- a/.github/workflows/build-wheels-cu128-linux-basic.yml +++ b/.github/workflows/build-wheels-cu128-linux-basic.yml @@ -67,7 +67,7 @@ jobs: # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index ffe030f745..ec10872b62 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -79,7 +79,7 @@ jobs: # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS = "${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it From 83224819a33593e82c89d8e1241fe7873c27191c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 5 Dec 2025 20:44:59 +0800 Subject: [PATCH 017/518] Update 
Submodule vendor/llama.cpp 746f9ee..1be9783 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 746f9ee889..1be97831e4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 746f9ee88941c2f259268c484fe8278375387081 +Subproject commit 1be97831e44a6335aca9c3f4f3edbb0e35bea98f From 8213c19b0e164780ffffa3e64b5fc033cdbe4974 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 5 Dec 2025 23:00:06 +0800 Subject: [PATCH 018/518] perf: optimize LlamaModel.metadata reading performance - Increase initial buffer size to 16KB to eliminate re-allocations for large chat templates. - Cache ctypes function references to reduce loop overhead. - Repeated model loading can result in a cumulative speed improvement of 1-3%. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 42 +++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index fae8b50d50..c4254ae10e 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -225,32 +225,38 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: # Extra def metadata(self) -> Dict[str, str]: metadata: Dict[str, str] = {} - buffer_size = 1024 + # Pre-allocate a 16KB buffer. This is large enough to handle almost all + # metadata values (including gpt-oss large chat templates ~15KB) in a single pass, + # eliminating the need for resize-and-retry in most cases. + buffer_size = 16384 buffer = ctypes.create_string_buffer(buffer_size) - # zero the buffer - buffer.value = b"\0" * buffer_size + + # Caching function references reduces the overhead of property lookups within loops. 
+ get_key_by_index = llama_cpp.llama_model_meta_key_by_index + get_val_by_index = llama_cpp.llama_model_meta_val_str_by_index + metadata_count = llama_cpp.llama_model_meta_count(self.model) # iterate over model keys - for i in range(llama_cpp.llama_model_meta_count(self.model)): - nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size - ) + for i in range(metadata_count): + # 1. Get Key + nbytes = get_key_by_index(self.model, i, buffer, buffer_size) + # Handle buffer resize if the key exceeds current size if nbytes > buffer_size: - buffer_size = nbytes + 1 + buffer_size = nbytes + 1024 buffer = ctypes.create_string_buffer(buffer_size) - nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size - ) + # Retry with the larger buffer + nbytes = get_key_by_index(self.model, i, buffer, buffer_size) key = buffer.value.decode("utf-8") - nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size - ) + + # 2. 
Get Value + nbytes = get_val_by_index(self.model, i, buffer, buffer_size) + # Handle buffer resize if the value exceeds current size if nbytes > buffer_size: - buffer_size = nbytes + 1 + buffer_size = nbytes + 1024 buffer = ctypes.create_string_buffer(buffer_size) - nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size - ) + # Retry with the larger buffer + nbytes = get_val_by_index(self.model, i, buffer, buffer_size) value = buffer.value.decode("utf-8") + metadata[key] = value return metadata From a5a14a64d87fde89d0d5b329479c91db8062b19e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Dec 2025 09:46:19 +0800 Subject: [PATCH 019/518] Update Submodule vendor/llama.cpp 1be9783..d9e03db --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1be97831e4..d9e03db1e7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1be97831e44a6335aca9c3f4f3edbb0e35bea98f +Subproject commit d9e03db1e701e34ed0b764615025110041729864 From be1363c4e861f06ea12f5e99ee913a72412546bb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Dec 2025 09:56:05 +0800 Subject: [PATCH 020/518] Sync ggml-zendnn : add ZenDNN backend for AMD CPUs --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b0a3ac692..ca53b2ff63 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,6 +125,7 @@ if (LLAMA_BUILD) ggml-vulkan ggml-webgpu ggml-zdnn + ggml-zendnn ) # Loop through targets to avoid repetitive function calls From 0be9424d8a86b861b1c8ba310f9be48ba13542cd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Dec 2025 22:38:08 +0800 Subject: [PATCH 021/518] Update Submodule vendor/llama.cpp d9e03db..0a540f9 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d9e03db1e7..0a540f9abd 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp 
@@ -1 +1 @@ -Subproject commit d9e03db1e701e34ed0b764615025110041729864 +Subproject commit 0a540f9abd98915edb99fed47d80078ed8d2f343 From 61e2346b03d595484a8f6e1f7197c4a5bfe7512a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Dec 2025 22:53:25 +0800 Subject: [PATCH 022/518] Remove outdated CUDA workflows --- .github/workflows/build-wheels-cuda.yaml | 136 ----------------------- 1 file changed, 136 deletions(-) delete mode 100644 .github/workflows/build-wheels-cuda.yaml diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml deleted file mode 100644 index 4620cdd76d..0000000000 --- a/.github/workflows/build-wheels-cuda.yaml +++ /dev/null @@ -1,136 +0,0 @@ -name: Build Wheels (CUDA) - -on: workflow_dispatch - -permissions: - contents: write - -jobs: - define_matrix: - name: Define Build Matrix - runs-on: ubuntu-22.04 - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - defaults: - run: - shell: pwsh - - steps: - - name: Define Job Output - id: set-matrix - run: | - $matrix = @{ - 'os' = @('ubuntu-22.04', 'windows-2022') - 'pyver' = @("3.9", "3.10", "3.11", "3.12") - 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") #, "12.5.1", "12.6.1") - 'releasetag' = @("basic") - } - - $matrixOut = ConvertTo-Json $matrix -Compress - Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT - - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - needs: define_matrix - runs-on: ${{ matrix.os }} - strategy: - matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} - defaults: - run: - shell: pwsh - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - - steps: - - name: Add MSBuild to PATH - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 - with: - vs-version: '[16.11,16.12)' - - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - uses: actions/setup-python@v5 - with: 
- python-version: ${{ matrix.pyver }} - cache: 'pip' - - - name: Setup Mamba - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - activate-environment: "llamacpp" - python-version: ${{ matrix.pyver }} - miniforge-version: latest - add-pip-as-python-dependency: true - auto-activate-base: false - - - name: VS Integration Cache - id: vs-integration-cache - if: runner.os == 'Windows' - uses: actions/cache@v4 - with: - path: ./MSBuildExtensions - key: cuda-${{ matrix.cuda }}-vs-integration - - - name: Get Visual Studio Integration - if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' - run: | - if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} - $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) - for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} - Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' - & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null - Remove-Item 'cudainstaller.zip' - - - name: Install Visual Studio Integration - if: runner.os == 'Windows' - run: | - $y = (gi '.\MSBuildExtensions').fullname + '\*' - (gi 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) - $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') - echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV - - - name: Install Dependencies - env: - MAMBA_DOWNLOAD_FAILFAST: "0" - MAMBA_NO_LOW_SPEED_LIMIT: "1" - run: | - $cudaVersion = $env:CUDAVER - mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion - python -m pip install build wheel - - - name: Build Wheel - run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') - $env:CUDA_PATH = $env:CONDA_PREFIX - 
$env:CUDA_HOME = $env:CONDA_PREFIX - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX - if ($IsLinux) { - $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH - } - $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" - # if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - # } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # if ($env:AVXVER -eq 'basic') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - # } - python -m build --wheel - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - - uses: softprops/action-gh-release@v2 - with: - files: dist/* - # Set tag_name to -cu - tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 2b97b7ea8c79de56f7b71cb5213363ddc2e253ac Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Dec 2025 23:02:19 +0800 Subject: [PATCH 023/518] workflow: Added workflows for compiling with CUDA 13.0.2 on Windows and Linux. 
--- .../build-wheels-cu130-linux-basic.yml | 116 ++++++++++++++++ .../workflows/build-wheels-cu130-linux.yml | 128 ++++++++++++++++++ .../build-wheels-cu130-win-basic.yml | 107 +++++++++++++++ .github/workflows/build-wheels-cu130-win.yml | 119 ++++++++++++++++ 4 files changed, 470 insertions(+) create mode 100644 .github/workflows/build-wheels-cu130-linux-basic.yml create mode 100644 .github/workflows/build-wheels-cu130-linux.yml create mode 100644 .github/workflows/build-wheels-cu130-win-basic.yml create mode 100644 .github/workflows/build-wheels-cu130-win.yml diff --git a/.github/workflows/build-wheels-cu130-linux-basic.yml b/.github/workflows/build-wheels-cu130-linux-basic.yml new file mode 100644 index 0000000000..618aa7c9a7 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-linux-basic.yml @@ -0,0 +1,116 @@ +name: Build Wheels(CU130) for Linux(Basic) + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["13.0.2"] + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v5 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + 
uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: "${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel filename + # Filename format is typically: 
package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 130) from the full version (e.g., 13.0.2) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v<TAG_VERSION>-cu<CUDA_VERSION>-<AVXVER>-linux-<BUILD_DATE> + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts.
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml new file mode 100644 index 0000000000..708ae462eb --- /dev/null +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -0,0 +1,128 @@ +name: Build Wheels(CU130) for Linux + +on: + workflow_dispatch: # Manual trigger + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + runs-on: ubuntu-22.04 + container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 + strategy: + matrix: # Define the build matrix directly here + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + cuda: ["13.0.2"] + releasetag: ["AVX2"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + cudaarch: ["all"] # Controls target CUDA architectures for nvcc + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + + steps: + - name: Install dependencies + run: | + apt update + apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev + + - uses: actions/checkout@v5 # Checkout code + with: + submodules: "recursive" + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - run: nvcc -V + + - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: 1 # Enable verbose build output + CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME + CUDA_PATH: 
"${PATH}" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + run: | + echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting + find /usr/ -name 'libcuda.so.*' + echo $LD_LIBRARY_PATH + + # Add project-specific and feature flags + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + + if [ "${AVXVER}" = "AVX" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + fi + if [ "${AVXVER}" = "AVX2" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + if [ "${AVXVER}" = "AVXVNNI" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + fi + # if [ "${AVXVER}" = "AVX512" ]; then + # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" + # fi + # Basic options for compiling without AVX instructions + if [ "${AVXVER}" = "Basic" ]; then + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + fi + + # Export CMAKE_ARGS environment variable so the python -m build command can use it + echo ${CMAKE_ARGS} + echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV + + # Run the Python build command to generate the wheel + uv pip install build setuptools wheel packaging + # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + + # --- Post-build steps to get info for release tag --- + + # Find the generated wheel file in the 'dist' directory using bash + # Assumes only one wheel is generated per build configuration run + wheel_file=$(ls dist/*.whl | head -n 1) + + # Extract the package version (e.g., 1.2.3) from the wheel 
filename + # Filename format is typically: package_name-version-tag-specificators.whl + # Using basename and cut to split by '-' and get the second field + tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) + echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + + # Extract the short CUDA version (e.g., 130) from the full version (e.g., 13.0.2) from the matrix variable + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + with: + files: dist/* # Upload the generated wheel files from the dist directory + # Define the release tag name using the collected environment variables + # Format: v-cu--linux- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux + # Note: This action will create a new release tag if it doesn't exist, + # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu130-win-basic.yml b/.github/workflows/build-wheels-cu130-win-basic.yml new file mode 100644 index 0000000000..386c7b5d1f --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win-basic.yml @@ -0,0 +1,107 @@ +name: Build Wheels (CU130) for Windows(Basic) + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['windows-2022'] + pyver: ["3.10", "3.11", "3.12", "3.13"] + cuda: ["13.0.2"] + releasetag: ["Basic"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list + # e.g. 
"all" "89" "90" "100" "120" + MAX_JOBS: 8 + + steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v2 + with: + msbuild-architecture: x64 + + - uses: actions/checkout@v5 + with: + submodules: "recursive" + + # from kingbri1/flash-attention build-wheels.yml + - name: Install CUDA ${{ matrix.cuda }} + uses: N-Storm/cuda-toolkit@v0.2.29 + id: cuda-toolkit + with: + cuda: "${{ matrix.cuda }}" + use-github-cache: false + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install Dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + + # Basic options for compiling without AVX instructions + if ($env:AVXVER -eq 'Basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + $wheel = (gi '.\dist\*.whl')[0] + $tagVer = $wheel.name.split('-')[1] + Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV + + - name: Get 
Current Date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create Release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v2 + with: + files: dist/* + # Set tag_name to -cu--win + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml new file mode 100644 index 0000000000..f714871631 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -0,0 +1,119 @@ +name: Build Wheels (CU130) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['windows-2022'] + pyver: ["3.10", "3.11", "3.12", "3.13"] + cuda: ["13.0.2"] + releasetag: ["AVX2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list + # e.g. 
"all" "89" "90" "100" "120" + MAX_JOBS: 8 + + steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v2 + with: + msbuild-architecture: x64 + + - uses: actions/checkout@v5 + with: + submodules: "recursive" + + # from kingbri1/flash-attention build-wheels.yml + - name: Install CUDA ${{ matrix.cuda }} + uses: N-Storm/cuda-toolkit@v0.2.29 + id: cuda-toolkit + with: + cuda: "${{ matrix.cuda }}" + use-github-cache: false + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install Dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + + if ($env:AVXVER -eq 'AVX') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' + } + if ($env:AVXVER -eq 'AVX2') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + if ($env:AVXVER -eq 'AVXVNNI') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off' + } + # if ($env:AVXVER -eq 'AVX512') { + # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' + # } + # Basic options for compiling without AVX instructions + if 
($env:AVXVER -eq 'Basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + + $wheel = (gi '.\dist\*.whl')[0] + $tagVer = $wheel.name.split('-')[1] + Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV + + - name: Get Current Date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create Release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v2 + with: + files: dist/* + # Set tag_name to -cu--win + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 4189786de5e401ecf04044f575a9da5f46346fbb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Dec 2025 23:13:24 +0800 Subject: [PATCH 024/518] workflow: Fixed nvcc fatal : Unsupported gpu architecture 'compute_101' --- .github/workflows/build-wheels-cu130-linux-basic.yml | 2 +- .github/workflows/build-wheels-cu130-linux.yml | 2 +- .github/workflows/build-wheels-cu130-win-basic.yml | 2 +- .github/workflows/build-wheels-cu130-win.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-wheels-cu130-linux-basic.yml b/.github/workflows/build-wheels-cu130-linux-basic.yml index 618aa7c9a7..0f03787a68 100644 --- a/.github/workflows/build-wheels-cu130-linux-basic.yml +++ b/.github/workflows/build-wheels-cu130-linux-basic.yml @@ -61,7 +61,7 @@ jobs: echo $LD_LIBRARY_PATH # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on 
-DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index 708ae462eb..59bfec7412 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -61,7 +61,7 @@ jobs: echo $LD_LIBRARY_PATH # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" + CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" diff --git a/.github/workflows/build-wheels-cu130-win-basic.yml b/.github/workflows/build-wheels-cu130-win-basic.yml index 386c7b5d1f..17b0fb6c72 100644 --- a/.github/workflows/build-wheels-cu130-win-basic.yml +++ b/.github/workflows/build-wheels-cu130-win-basic.yml @@ -16,7 +16,7 @@ jobs: pyver: ["3.10", "3.11", "3.12", "3.13"] cuda: ["13.0.2"] releasetag: ["Basic"] - cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] defaults: run: shell: pwsh diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index f714871631..95da62fc37 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -16,7 +16,7 @@ jobs: pyver: ["3.10", "3.11", "3.12", "3.13"] cuda: ["13.0.2"] 
releasetag: ["AVX2"] - cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] defaults: run: shell: pwsh From e5e19e8a80654ff28862590202a2888079730029 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Dec 2025 23:34:20 +0800 Subject: [PATCH 025/518] feat: Added the scan path for CUDA 13.0+ dynamic link libraries under Windows system ($env:CUDA_PATH\bin\x64) Signed-off-by: JamePeng --- llama_cpp/_ctypes_extensions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index 4ad6b0d1ba..0ba7f416d9 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -56,6 +56,7 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path): cuda_path = os.environ["CUDA_PATH"] sub_dirs_to_add = [ "bin", + os.path.join("bin", "x64"), # CUDA 13.0+ "lib", os.path.join("lib", "x64") ] From 588974d79e255add87379e7eee6b3a7b777147b5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Dec 2025 00:42:48 +0800 Subject: [PATCH 026/518] Update Submodule vendor/llama.cpp 0a540f9..2fa51c1 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 0a540f9abd..2fa51c19b0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 0a540f9abd98915edb99fed47d80078ed8d2f343 +Subproject commit 2fa51c19b028180b35d316e9ed06f5f0f7ada2c1 From fcfde244e90c3f4ab246715f8407a71b221cc2bd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Dec 2025 21:29:21 +0800 Subject: [PATCH 027/518] Attempting to fix the AVX2 workflow: Missing GGML_FMA and GGML_F16C may cause an OSError: [WinError -1073741795] Windows Error 0xc000001d error on processors that support AVX2 instructions. 
--- .github/workflows/build-wheels-cu124-cu126-win.yml | 6 +++--- .github/workflows/build-wheels-cu124-linux.yml | 6 +++--- .github/workflows/build-wheels-cu126-linux.yml | 6 +++--- .github/workflows/build-wheels-cu128-linux.yml | 6 +++--- .github/workflows/build-wheels-cu128-win.yml | 4 ++-- .github/workflows/build-wheels-cu130-linux.yml | 6 +++--- .github/workflows/build-wheels-cu130-win.yml | 6 +++--- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-cu126-win.yml b/.github/workflows/build-wheels-cu124-cu126-win.yml index 7ce77041eb..5d5a91efe0 100644 --- a/.github/workflows/build-wheels-cu124-cu126-win.yml +++ b/.github/workflows/build-wheels-cu124-cu126-win.yml @@ -72,13 +72,13 @@ jobs: $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' } if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 571402ff02..3feeeecfd7 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -66,13 +66,13 @@ jobs: CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off 
-DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 969272ea02..f9b566fab8 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -66,13 +66,13 @@ jobs: CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" diff --git a/.github/workflows/build-wheels-cu128-linux.yml 
b/.github/workflows/build-wheels-cu128-linux.yml index ec10872b62..b25128d6e8 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -66,13 +66,13 @@ jobs: CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index adf78e1e7f..40578c8b4c 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -75,10 +75,10 @@ jobs: $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' } if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' 
-DGGML_AVX512=on' diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index 59bfec7412..6451c7ee43 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -66,13 +66,13 @@ jobs: CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" fi if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" fi if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" fi # if [ "${AVXVER}" = "AVX512" ]; then # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index 95da62fc37..3c7d07caa1 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -72,13 +72,13 @@ jobs: $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' } if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if 
($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on -DGGML_FMA=on -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' From 46ad8c0c6131e6bbf6c417756e8eddaebfa1548d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Dec 2025 22:01:56 +0800 Subject: [PATCH 028/518] Update Submodule vendor/llama.cpp 2fa51c1..6b82eb7 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2fa51c19b0..6b82eb7883 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2fa51c19b028180b35d316e9ed06f5f0f7ada2c1 +Subproject commit 6b82eb7883d1babf5f6bd4bca70997f229691fed From b2aa65678ca07114feff08885eee110e75621c56 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 10 Dec 2025 07:25:09 +0800 Subject: [PATCH 029/518] Fixed typo --- llama_cpp/_internals.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index c4254ae10e..bdc8f3dc89 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -307,8 +307,8 @@ def __del__(self): def n_ctx(self) -> int: return llama_cpp.llama_n_ctx(self.ctx) - def n_batch(self) -> int: - return llama_cpp.llama_n_batch(self.ctx) + def n_ctx_seq(self) -> int: + return llama_cpp.llama_n_ctx_seq(self.ctx) def n_batch(self) -> int: return llama_cpp.llama_n_batch(self.ctx) From e4db2763ab06ea714cc9940bdcf02e7811eae861 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Dec 2025 09:38:43 +0800 Subject: [PATCH 030/518] Update Submodule vendor/llama.cpp 6b82eb7..4a4f7e6 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6b82eb7883..4a4f7e6550 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 
+1 @@ -Subproject commit 6b82eb7883d1babf5f6bd4bca70997f229691fed +Subproject commit 4a4f7e6550cf5b327ea0fb241ce7417ab46e1ace From c98ce3b7ce6808eb7a86e388e42553fc2ba1e6d3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 15 Dec 2025 20:45:16 +0800 Subject: [PATCH 031/518] Sync llama.cpp API 20251215 --- llama_cpp/llama_cpp.py | 77 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 9b67621518..98ec5db01d 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -712,6 +712,7 @@ class llama_model_tensor_buft_override(ctypes.Structure): # bool check_tensors; // validate model tensor data # bool use_extra_bufts; // use extra buffer types (used for weight repacking) # bool no_host; // bypass host buffer allowing extra buffers to be used +# bool no_alloc; // only load metadata and simulate memory allocations # }; class llama_model_params(ctypes.Structure): """Parameters for llama_model @@ -731,7 +732,8 @@ class llama_model_params(ctypes.Structure): use_mlock (bool): force system to keep model in RAM check_tensors (bool): validate model tensor data use_extra_bufts (bool): use extra buffer types (used for weight repacking) - no_host (bool): bypass host buffer allowing extra buffers to be used""" + no_host (bool): bypass host buffer allowing extra buffers to be used + no_alloc (bool): only load metadata and simulate memory allocations""" if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused @@ -749,6 +751,7 @@ class llama_model_params(ctypes.Structure): check_tensors: bool use_extra_bufts: bool no_host: bool + no_alloc: bool _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused @@ -766,8 +769,10 @@ class llama_model_params(ctypes.Structure): ("check_tensors", ctypes.c_bool), ("use_extra_bufts", ctypes.c_bool), ("no_host", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), ] +llama_model_params_p = ctypes.POINTER(llama_model_params) # // 
NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations # // https://github.com/ggml-org/llama.cpp/pull/7544 @@ -918,6 +923,7 @@ class llama_context_params(ctypes.Structure): ("kv_unified", ctypes.c_bool), ] +llama_context_params_p = ctypes.POINTER(llama_context_params) # // Signature for logging events # // Note that text includes the new line character at the end for most events. @@ -1306,6 +1312,51 @@ def llama_free(ctx: llama_context_p, /): ... +# // fits mparams and cparams to free device memory (assumes system memory is unlimited) +# // returns true if the parameters could be successfully modified to fit device memory +# // this function is NOT thread safe because it modifies the global llama logger state +# LLAMA_API bool llama_params_fit( +# const char * path_model, +# struct llama_model_params * mparams, +# struct llama_context_params * cparams, +# float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements +# struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements +# size_t margin, // margin of memory to leave per device in bytes +# uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use +# enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log +@ctypes_function( + "llama_params_fit", + [ + ctypes.c_char_p, + llama_model_params_p, + llama_context_params_p, + ctypes.POINTER(ctypes.c_float), + ctypes.POINTER(llama_model_tensor_buft_override), + ctypes.c_size_t, + ctypes.c_uint32, + ctypes.c_int, + ], + ctypes.c_bool, +) +def llama_params_fit( + path_model: ctypes.c_char_p, + mparams: llama_model_params_p, + cparams: llama_context_params_p, + tensor_split: ctypes.pointer(ctypes.c_float), + tensor_buft_overrides: ctypes.pointer(llama_model_tensor_buft_override), 
+ margin: ctypes.c_size_t, + n_ctx_min: ctypes.c_uint32, + log_level: int, + /, +) -> bool: + """ + fits mparams and cparams to free device memory (assumes system memory is unlimited) + returns true if the parameters could be successfully modified to fit device memory + this function is NOT thread safe because it modifies the global llama logger state + """ + ... + + # LLAMA_API int64_t llama_time_us(void); @ctypes_function( "llama_time_us", @@ -1328,6 +1379,12 @@ def llama_max_parallel_sequences() -> int: ... +# LLAMA_API size_t llama_max_tensor_buft_overrides(void); +@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t) +def llama_max_tensor_buft_overrides() -> int: + ... + + # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) def llama_supports_mmap() -> bool: @@ -4217,6 +4274,23 @@ def llama_print_system_info() -> bytes: # // Set callback for all future logging events. # // If this is not called, or NULL is supplied, everything is output on stderr. +# // The logger state is global so these functions are NOT thread safe. +# LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data); +@ctypes_function( + "llama_log_get", + [ctypes.POINTER(ggml_log_callback), ctypes.POINTER(ctypes.c_void_p)], + None, +) +def llama_log_get( + log_callback: Optional[ctypes.pointer(ggml_log_callback)], + user_data: ctypes.pointer(ctypes.c_void_p), + /, +): + """Get callback for all future logging events. + If this is not called, or NULL is supplied, everything is output on stderr.""" + ... + + # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "llama_log_set", @@ -4229,7 +4303,6 @@ def llama_log_set( /, ): """Set callback for all future logging events. - If this is not called, or NULL is supplied, everything is output on stderr.""" ... 
From 2789d345b0381735972fc3cf5afa7c94772fd5c2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 17 Dec 2025 20:28:20 +0800 Subject: [PATCH 032/518] Update Submodule vendor/llama.cpp 4a4f7e6..669696e --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4a4f7e6550..669696e00d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4a4f7e6550cf5b327ea0fb241ce7417ab46e1ace +Subproject commit 669696e00d60e7ffa0223ee61242318a51f33a79 From 236eff59badb7181e710b409869bbfe471fc7b54 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 18 Dec 2025 00:42:01 +0800 Subject: [PATCH 033/518] feat: implement `GLM41VChatHandler` for GLM-4.1V-9B-Thinking Model - Patch stop tokens in __call__ to handle and EOS truncation. Signed-off-by: JamePeng --- README.md | 1 + llama_cpp/llama_chat_format.py | 77 ++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/README.md b/README.md index 581e4f387d..015f37705f 100644 --- a/README.md +++ b/README.md @@ -496,6 +496,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | +| [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index dfdbc1c507..5ae364a2ca 100644 --- 
a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3717,6 +3717,83 @@ class Gemma3ChatHandler(Llava15ChatHandler): ) +class GLM41VChatHandler(Llava15ChatHandler): + # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. + + GLM41V_EOS_TOKEN = "<|endoftext|>" + GLM41V_PAD_TOKEN = "<|endoftext|>" + GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]\n" + "{%- for msg in messages -%}" + "{%- if msg.role == 'system' -%}" + "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'user' -%}" + "<|user|>\n" + "{%- if msg.content is string -%}" + "{{ msg.content }}" + "{%- else -%}" + "{%- for item in msg.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" + "{%- elif msg.role == 'assistant' -%}" + "{%- if msg.metadata -%}" + "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- else -%}" + "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{%- endif -%}" + ) + + def __call__(self, **kwargs): + self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN + stop_tokens = [self.GLM41V_EOS_TOKEN, ""] # Stop token patch + kwargs['stop'] = stop_tokens + + llama = kwargs['llama'] + + # Clear state for multiple runs + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + # Clear any handler state + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + 
self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"GLM4VChatHandler - Processing {image_count} images", file=sys.stderr) + except Exception: + print(f"GLM4VChatHandler - State reset", file=sys.stderr) + + # Use parent implementation + return super().__call__(**kwargs) + + class Qwen25VLChatHandler(Llava15ChatHandler): DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." From a266a0bc4f4ff64206d1bf9f5263209afaf8c260 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 18 Dec 2025 09:15:18 +0800 Subject: [PATCH 034/518] Update Submodule vendor/llama.cpp 669696e..4d1316c --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 669696e00d..4d1316c440 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 669696e00d60e7ffa0223ee61242318a51f33a79 +Subproject commit 4d1316c440e7ee7b854423aea8db6ffcd92caeaf From 9b9710a0d225bb70176c60e2032749f73d7ed8c1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 19 Dec 2025 09:08:04 +0800 Subject: [PATCH 035/518] Optimization: Improved batch token processing logic in Llava15ChatHandler. 
--- llama_cpp/_internals.py | 2 +- llama_cpp/llama_chat_format.py | 32 +++++++++++++++++++------------- llama_cpp/llama_cpp.py | 4 ++-- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index bdc8f3dc89..0c95a6132b 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -662,7 +662,7 @@ def n_tokens(self) -> int: def reset(self): self.batch.n_tokens = 0 - def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): + def set_batch(self, batch: Sequence[int], n_past: llama_cpp.llama_pos, logits_all: bool): n_tokens = len(batch) self.batch.n_tokens = n_tokens for i in range(n_tokens): diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 5ae364a2ca..3f03cb7ec3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2999,22 +2999,19 @@ def __call__( llama._ctx.memory_clear(True) # Process each chunk - n_past = llama_cpp.llama_pos(0) + n_past = 0 n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) for i in range(n_chunks): chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) - if chunk is None: - continue + if chunk is None: continue chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: # Handle text chunk n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text( - chunk, ctypes.byref(n_tokens_out) - ) + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) if tokens_ptr and n_tokens_out.value > 0: # Convert ctypes array to Python list @@ -3024,15 +3021,17 @@ def __call__( raise ValueError( f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" ) + llama.n_tokens = n_past llama.eval(tokens) + n_past = llama.n_tokens elif chunk_type in [self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, 
self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO]: # Handle image/audio chunk using helper chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - if llama.n_tokens + chunk_n_tokens > llama.n_ctx(): + if n_past + chunk_n_tokens > llama.n_ctx(): raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}" + f"Prompt exceeds n_ctx: {n_past + chunk_n_tokens} > {llama.n_ctx()}" ) new_n_past = llama_cpp.llama_pos(0) @@ -3040,7 +3039,7 @@ def __call__( self.mtmd_ctx, llama._ctx.ctx, chunk, - llama_cpp.llama_pos(llama.n_tokens), + llama_cpp.llama_pos(n_past), llama_cpp.llama_seq_id(0), llama.n_batch, False, # logits_last @@ -3051,8 +3050,15 @@ def __call__( raise ValueError(f"Failed to evaluate chunk: error code {result}") # Update llama's token count - llama.n_tokens = new_n_past.value - + n_past = new_n_past.value + llama.n_tokens = n_past + + n_past = llama.n_tokens + if n_past > 0: + llama._ctx.memory_seq_rm(0, n_past - 1, -1) + if llama._ctx.memory_seq_pos_min(0) == llama._ctx.memory_seq_pos_max(0): + n_past += 1 + llama.n_tokens = n_past # Get prompt tokens to avoid a cache miss prompt = llama.input_ids[: llama.n_tokens].tolist() @@ -3786,9 +3792,9 @@ def __call__(self, **kwargs): messages = kwargs.get('messages', []) try: image_count = len(self.get_image_urls(messages)) - print(f"GLM4VChatHandler - Processing {image_count} images", file=sys.stderr) + print(f"GLM4VChatHandler - Cleared state, processing {image_count} images", file=sys.stderr) except Exception: - print(f"GLM4VChatHandler - State reset", file=sys.stderr) + print(f"GLM4VChatHandler - Cleared state", file=sys.stderr) # Use parent implementation return super().__call__(**kwargs) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 98ec5db01d..85710277bd 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -550,7 +550,7 @@ class llama_batch(ctypes.Structure): The provided arrays (i.e. token, embd, pos, etc.) 
must have size of n_tokens Attributes: - n_tokens (int): number of tokens + n_tokens (ctypes.c_int32): number of tokens token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL) embd (ctypes.Array[ctypes.ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL) pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence @@ -559,7 +559,7 @@ class llama_batch(ctypes.Structure): """ if TYPE_CHECKING: - n_tokens: int + n_tokens: ctypes.c_int32 token: CtypesArray[llama_token] embd: CtypesArray[ctypes.c_float] pos: CtypesArray[CtypesArray[llama_pos]] From 060f06d2dcdd032283c2d00208c213c235824e7f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 19 Dec 2025 09:12:22 +0800 Subject: [PATCH 036/518] feat: implement LFM2VLChatHandler for LFM2-VL series models --- README.md | 1 + llama_cpp/llama_chat_format.py | 65 ++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/README.md b/README.md index 015f37705f..16608d95a6 100644 --- a/README.md +++ b/README.md @@ -497,6 +497,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | | [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | +| [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 3f03cb7ec3..bec4ebe03e 100644 --- 
a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3800,6 +3800,71 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class LFM2VLChatHandler(Llava15ChatHandler): + LFM2VL_BOS_TOKEN = "<|startoftext|>" + LFM2VL_EOS_TOKEN = "<|im_end|>" + LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" + LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" + + CHAT_FORMAT = ( + "{%- for message in messages -%}" + "{{ '<|im_start|>' + message['role'] + '\n' }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] }}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if 'image_url' in content -%}" + "{%- if content.image_url is string -%}" + "<|image_start|>{{ content.image_url }}<|image_end|>" + "{%- else -%}" + "<|image_start|>{{ content.image_url.url }}<|image_end|>" + "{%- endif -%}" + "{%- elif content['type'] == 'text' -%}" + "{{ content['text'] }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{ '<|im_end|>\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ '<|im_start|>assistant\n' }}" + "{%- endif -%}" + ) + + def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): + """ + LFM2-VL Handler + LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 + """ + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) + + def __call__(self, **kwargs): + + llama = kwargs['llama'] + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"LFM2VLChatHandler - 
Cleared state, Processing {image_count} images", file=sys.stderr) + except Exception: + print(f"LFM2VLChatHandler - Cleared state", file=sys.stderr) + + return super().__call__(**kwargs) + + class Qwen25VLChatHandler(Llava15ChatHandler): DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." From ceb2b07873294699b0ad030d8a4f8a23b3bdd70c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 20 Dec 2025 05:22:44 +0800 Subject: [PATCH 037/518] Update Submodule vendor/llama.cpp 4d1316c..ce734a8 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4d1316c440..ce734a8a2f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4d1316c440e7ee7b854423aea8db6ffcd92caeaf +Subproject commit ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787 From eecb7014c1037f442001d4c85ffc8c61de47973c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 20 Dec 2025 05:33:35 +0800 Subject: [PATCH 038/518] Update README.md CUDA version --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 16608d95a6..ade44d49ed 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements: -- CUDA Version is 12.4, 12.6 or 12.8 +- CUDA Version is 12.4, 12.6, 12.8 or 13.0 - Python Version is 3.10, 3.11, 3.12 or 3.13 - Basic version: A version compiled without using AVX instructions (for compatibility with CPU platforms lacking AVX instructions or with AVX instruction compatibility issues). - AVX2 version: A version compiled using AVX2 instructions. 
From 70d5844ddc61029fca2f33af60e8ce9a99ca92d5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 20 Dec 2025 06:01:21 +0800 Subject: [PATCH 039/518] More Stop token patch for GLM41VChatHandler --- llama_cpp/llama_chat_format.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index bec4ebe03e..666ec8faf5 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3770,7 +3770,8 @@ class GLM41VChatHandler(Llava15ChatHandler): def __call__(self, **kwargs): self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN - stop_tokens = [self.GLM41V_EOS_TOKEN, ""] # Stop token patch + # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json + stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch kwargs['stop'] = stop_tokens llama = kwargs['llama'] From d5131e2ff41e05f83fd847052b06938c7a551a6a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 20 Dec 2025 07:14:38 +0800 Subject: [PATCH 040/518] feat: implement GLM46VChatHandler for GLM-4.6V Series Model Signed-off-by: JamePeng --- README.md | 1 + llama_cpp/llama_chat_format.py | 100 +++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/README.md b/README.md index ade44d49ed..6c18a4a24f 100644 --- a/README.md +++ b/README.md @@ -497,6 +497,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | | [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | +| [glm4.6v](https://huggingface.co/unsloth/GLM-4.6V-Flash-GGUF) | `GLM46VChatHandler` | `glm4.6v` | | 
[lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 666ec8faf5..a1f55b5f5a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3801,6 +3801,106 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class GLM46VChatHandler(Llava15ChatHandler): + GLM46V_EOS_TOKEN = "<|endoftext|>" + GLM46V_PAD_TOKEN = "<|endoftext|>" + GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]" + "{%- if tools -%}" + "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" + "You are provided with function signatures within XML tags:\n\n" + "{%- for tool in tools -%}" + "{{ tool | tojson(ensure_ascii=False) }}\n" + "{%- endfor -%}" + "\n\nFor each function call, output the function name and arguments within the following XML format:\n" + "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" + "{%- endif -%}" + + "{%- for m in messages -%}" + "{%- if m.role == 'system' -%}" + "<|system|>\n{{ m.content }}" + "{%- elif m.role == 'user' -%}" + "<|user|>\n" + "{%- if m.content is string -%}" + "{{ m.content }}" + "{%- else -%}" + "{%- for item in m.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + # If enable_thinking is disabled, insert `/nothink` according to the source code logic. 
+ "{{ '/nothink' if not enable_thinking else '' }}" + "{%- elif m.role == 'assistant' -%}" + "<|assistant|>" + "{%- if enable_thinking -%}" + "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" + "\n{{ reasoning.strip() }}" + "{%- else -%}" + "\n" + "{%- endif -%}" + "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" + "{%- endif -%}" + "{{ GLM46V_EOS_TOKEN }}" + "{%- endfor -%}" + + "{%- if add_generation_prompt -%}" + "<|assistant|>\n" + "{{ '' if enable_thinking else '\n' }}" + "{%- endif -%}" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + GLM-4.6V Handler + Parameters: + - enable_thinking (bool): Whether to enable the model's think process. The default is True. + """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN + + # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json + kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch + + llama = kwargs['llama'] + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Processing {image_count} images", file=sys.stderr) + except Exception: + print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Cleared state", file=sys.stderr) + + return super().__call__(**kwargs) + + class LFM2VLChatHandler(Llava15ChatHandler): LFM2VL_BOS_TOKEN = "<|startoftext|>" LFM2VL_EOS_TOKEN = "<|im_end|>" From 
171bd1922ef8525c33f7cc94bb31bb51b1e248c1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 20 Dec 2025 07:39:33 +0800 Subject: [PATCH 041/518] Bump version to 0.3.18 Signed-off-by: JamePeng --- CHANGELOG.md | 20 +++++++++++++++++++- llama_cpp/__init__.py | 2 +- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 41b3d3209b..2ad6f16796 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.18] +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787](https://github.com/ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787) +- feat: Sync llama.cpp llama/mtmd API Binding 20251215 +- feat: **implement `GLM46VChatHandler` for GLM-4.6V Series Model** +- feat: **implement `LFM2VLChatHandler` for LFM2-VL series models** +- feat: **implement `GLM41VChatHandler` for GLM-4.1V-9B-Thinking Model** +- workflow: Added workflows for compiling with CUDA 13.0.2 on Windows and Linux. +- feat: Added the scan path for CUDA 13.0+ dynamic link libraries under Windows system ($env:CUDA_PATH\bin\x64) +- Optimization: Improved batch token processing logic in Llava15ChatHandler. +- [perf: optimize LlamaModel.metadata reading performance](https://github.com/JamePeng/llama-cpp-python/commit/8213c19b0e164780ffffa3e64b5fc033cdbe4974) + - Increase initial buffer size to 16KB to eliminate re-allocations for large chat templates. + - Cache ctypes function references to reduce loop overhead. + - Repeated model loading can result in a cumulative speed improvement of 1-3%. 
+- build: Improve CMakeLists target logic +- refactor: optimize LlamaGrammar class code + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/67421d546ddcaa07678ac7921a9f124e7e3de10e...d5131e2ff41e05f83fd847052b06938c7a551a6a + ## [0.3.17] - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/054a45c3d313387a4becd5eae982285932852b35](https://github.com/ggml-org/llama.cpp/commit/054a45c3d313387a4becd5eae982285932852b35) - feat: Sync llama.cpp llama/mtmd API Binding 20251121 @@ -20,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - feat: Optimize CUDA Wheel Build Workflow, now workflow action support python3.10-3.13 cu124-cu126-cu128 Basic(Non AVX)-AVX2 win-linux -More information see : https://github.com/JamePeng/llama-cpp-python/compare/e5392b52036bd2770ece5269352f5600a8db5639...fbb0ed2f089c663a5eb75aadcad08f768041ed72 +More information see: https://github.com/JamePeng/llama-cpp-python/compare/e5392b52036bd2770ece5269352f5600a8db5639...fbb0ed2f089c663a5eb75aadcad08f768041ed72 ## [0.3.16] diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index a7c40478bc..bdaefb9e01 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.17" +__version__ = "0.3.18" From 2efaa346bc0aa0d6648938a0dcdf8d12240a8bed Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 23 Dec 2025 06:13:33 +0800 Subject: [PATCH 042/518] Update Submodule vendor/llama.cpp ce734a8..8f48807 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ce734a8a2f..8f48807380 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787 +Subproject commit 8f48807380305a5985df78f67e29862664c9afec From eb4749cb67829ab272ffea2249e5d8eb4c9af31e Mon Sep 17 00:00:00 2001 From: AlcoftTAO Date: Fri, 26 Dec 2025 04:51:53 +0100 
Subject: [PATCH 043/518] Fixed a small bug in the Qwen3-VL chat template. --- llama_cpp/llama_chat_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a1f55b5f5a..166459edbc 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4046,7 +4046,7 @@ class Qwen3VLChatHandler(Llava15ChatHandler): "{{- '\n' -}}" "{{- tool | tojson -}}" "{%- endfor -%}" - "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\n\n\n{\"name\": , \"arguments\": }\n' -}}" + "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" "{%- endif -%}" "{{- '<|im_end|>\n' -}}" "{%- set image_count = namespace(value=0) -%}" @@ -4108,7 +4108,7 @@ class Qwen3VLChatHandler(Llava15ChatHandler): "{%- endif -%}" "{%- endfor -%}" "{%- if add_generation_prompt -%}" - "{{- 'assistant\n' -}}" + "{{- '<|im_start|>assistant\n' -}}" "{%- if force_reasoning -%}" "{{- '\n' -}}" "{%- endif -%}" From 9519b47e0a55f36b17064ac2ac78a1a8275c52bd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 24 Dec 2025 20:02:55 +0800 Subject: [PATCH 044/518] Update Submodule vendor/llama.cpp 8f48807..7ac8902 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8f48807380..7ac8902133 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8f48807380305a5985df78f67e29862664c9afec +Subproject commit 7ac8902133da6eb390c4d8368a7d252279123942 From a11d97a45ebca1a2bbbb91dde27a43a138731f39 Mon Sep 17 00:00:00 2001 From: JamePeng 
Date: Sat, 27 Dec 2025 14:59:56 +0800 Subject: [PATCH 045/518] Update llama.cpp API 20251227 --- llama_cpp/llama_cpp.py | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 85710277bd..aa4e40e05b 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1529,22 +1529,47 @@ def llama_model_n_head(model: llama_model_p, /) -> int: ... - # LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); -@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_swa(model: llama_model_p, /) -> int: - ... - - # LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); @ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_head_kv(model: llama_model_p, /) -> int: ... + # LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); +@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_swa(model: llama_model_p, /) -> int: + ... + + # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: + """ + Get the model's RoPE frequency scaling factor + """ + ... + + +# // Returns the number of classifier outputs (only valid for classifier models) +# // Undefined behavior for non-classifier models +# LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model); +@ctypes_function("llama_model_n_cls_out", [llama_model_p_ctypes], ctypes.c_uint32) +def llama_model_n_cls_out(model: llama_model_p, /) -> int: + """ + Returns the number of classifier outputs (only valid for classifier models) + Undefined behavior for non-classifier models + """ + ... 
+ + +# // Returns label of classifier output by index ( ctypes.c_char_p: + """ + Returns label of classifier output by index ( Date: Sat, 27 Dec 2025 18:21:08 +0800 Subject: [PATCH 046/518] Refactor: Extract embedding logic to `LlamaEmbedding` class, Rerank support and fix parallel batching - Decoupled embedding and rerank logic into `llama_embedding.py`. - Implemented streaming batching for constant memory usage. - Fixed parallel batching errors by enabling `kv_unified`. such as "multiple embeddings in a single call" - Added native `rank()` support for Reranker models. - Added advanced normalization support (Euclidean, Taxicab, MaxInt16). - Added `array`,`json+` output format for raw vector access. The legacy embedding implementation in `llama.py` is now superseded by this optimized approach. Signed-off-by: JamePeng --- llama_cpp/llama.py | 11 ++ llama_cpp/llama_embedding.py | 345 +++++++++++++++++++++++++++++++++++ 2 files changed, 356 insertions(+) create mode 100644 llama_cpp/llama_embedding.py diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 9c815539d9..c2628555b1 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1031,6 +1031,11 @@ def create_embedding( Returns: An embedding object. """ + warnings.warn( + "The `create_embedding` method in `Llama` class is deprecated. " + "Please migrate to `LlamaEmbedding.create_embedding` for better efficiency.", + DeprecationWarning, + ) model_name: str = model if model is not None else self.model_path input = input if isinstance(input, list) else [input] @@ -1075,6 +1080,12 @@ def embed( Returns: A list of embeddings """ + warnings.warn( + "The `embed` method in `Llama` class is deprecated and will be removed in future versions. 
" + "Please use the `LlamaEmbedding` class from `llama_embedding` module for optimized performance and reranking support.", + DeprecationWarning, + ) + n_embd = self.n_embd() n_batch = self.n_batch diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py new file mode 100644 index 0000000000..44d918e16c --- /dev/null +++ b/llama_cpp/llama_embedding.py @@ -0,0 +1,345 @@ +import numpy as np +from typing import Union, List, Optional, Dict, Any, Tuple +import llama_cpp.llama_cpp as llama_cpp +from .llama_types import Embedding +from .llama import Llama +# Pooling types from .llama_cpp +from .llama_cpp import ( + LLAMA_POOLING_TYPE_UNSPECIFIED, + LLAMA_POOLING_TYPE_NONE, + LLAMA_POOLING_TYPE_MEAN, + LLAMA_POOLING_TYPE_CLS, + LLAMA_POOLING_TYPE_LAST, + LLAMA_POOLING_TYPE_RANK, # Specifically for Reranking models +) + +# Normalization modes for embedding vectors +# See: https://github.com/ggml-org/llama.cpp/tree/master/examples/embedding#--embd-normalize-integer +NORM_MODE_NONE = -1 +NORM_MODE_MAX_INT16 = 0 +NORM_MODE_TAXICAB = 1 +NORM_MODE_EUCLIDEAN = 2 + +# TODO(JamePeng): Needs more extensive testing with various embedding and reranking models. +class LlamaEmbedding(Llama): + """ + A specialized class for high-performance Text Embedding and Reranking. + Inherits from the base Llama class but is optimized for vector operations. + + Key Features: + 1. Auto-configuration: Automatically sets embedding=True. + 2. Streaming Batch: Handles massive datasets without OOM (Out Of Memory). + 3. Native Reranking Support: Specifically handles `LLAMA_POOLING_TYPE_RANK` models (like BGE-Reranker). / + It correctly identifies classification heads to output scalar relevance scores instead of high-dimensional vectors. + 4. Advanced Normalization: Implements MaxInt16, Taxicab (L1), and Euclidean (L2) normalization strategies / + using NumPy for optimal performance and compatibility with various vector databases. 
+ """ + + def __init__(self, model_path: str, pooling_type: int = LLAMA_POOLING_TYPE_UNSPECIFIED, **kwargs): + """ + Initialize the embedding model with enforced configuration. + + Args: + model_path: Path to the GGUF model file. + pooling_type: The pooling strategy used by the model. + - Use `LLAMA_POOLING_TYPE_RANK` (4) for Reranker models. + - Use `LLAMA_POOLING_TYPE_UNSPECIFIED` (-1) to let the model metadata decide (for standard embeddings). + **kwargs: Additional arguments passed to the Llama base class (e.g., n_gpu_layers, n_batch, n_ctx). + """ + kwargs["embedding"] = True + + # Enable Unified KV Cache (Crucial for Batching) + # This allows us to assign arbitrary seq_ids in a batch, enabling the parallel / + # encoding of multiple unrelated documents without "invalid seq_id" errors. + kwargs["kv_unified"] = True + + # Set pooling type + kwargs["pooling_type"] = pooling_type + + super().__init__(model_path=model_path, **kwargs) + + if self.verbose: + print(f"LlamaEmbedding initialized with pooling_type: {self.pooling_type()}") + + def _normalize_vector(self, vector: List[float], mode: int) -> List[float]: + """ + Apply mathematical normalization to a vector. + Uses numpy for performance. 
+ """ + if mode == NORM_MODE_NONE: return vector + arr = np.array(vector, dtype=np.float32) + + # Mode 0: Max Absolute Int16 -> 32760 * x_i / max|x_i| + if mode == NORM_MODE_MAX_INT16: + max_abs = np.max(np.abs(arr)) + if max_abs == 0: return vector + return ((arr / max_abs) * 32760.0).tolist() + + # Mode 1: Taxicab (L1 Norm) -> x_i / sum|x_i| + elif mode == NORM_MODE_TAXICAB: + norm = np.sum(np.abs(arr)) + if norm == 0: return vector + return (arr / norm).tolist() + + # Mode 2: Euclidean (L2 Norm) -> x_i / sqrt(sum x_i^2) + elif mode == NORM_MODE_EUCLIDEAN: + norm = np.linalg.norm(arr) + if norm == 0: return vector + return (arr / norm).tolist() + + # Mode > 2: p-norm + elif mode > 2: + norm = np.sum(np.abs(arr) ** mode) ** (1.0 / mode) + if norm == 0: return vector + return (arr / norm).tolist() + + return vector + + def embed( + self, + input: Union[str, List[str], List[List[int]]], + normalize: int = NORM_MODE_EUCLIDEAN, + truncate: bool = True, + separator: Optional[str] = None, + return_count: bool = False, + ) -> Union[List[float], List[List[float]], Tuple[Any, int]]: + + ctx = self._ctx.ctx + n_batch = self.n_batch + n_ctx = self._n_ctx + n_ubatch = self.context_params.n_ubatch + + # Determine if it is in Rerank mode + try: + current_pooling = self.pooling_type() + except AttributeError: + current_pooling = LLAMA_POOLING_TYPE_UNSPECIFIED + is_rank = (current_pooling == LLAMA_POOLING_TYPE_RANK) + logits_all = current_pooling == llama_cpp.LLAMA_POOLING_TYPE_NONE + + # Determine the output dimension + if is_rank: + out_dim = llama_cpp.llama_model_n_cls_out(self._model.model) + else: + out_dim = self.n_embd() + + if self.verbose: + mode_str = "RANK (Score)" if is_rank else "EMBED (Vector)" + print(f"LlamaEmbedding Debug: Mode={mode_str} | Output Dimension={out_dim}") + + # Preprocess Input + inputs: List[Union[str, List[int]]] = [] + is_single = False + + if isinstance(input, str): + if separator: + inputs = input.split(separator) + is_single = False + else: + 
inputs = [input] + is_single = True + else: + inputs = input + is_single = False + + # Reset Context and Batch + if self.verbose: + llama_cpp.llama_perf_context_reset(ctx) + self._batch.reset() + llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(ctx), True) + + # Initialize State Variables + results: List[Any] = [] + batch_seq_lens: List[int] = [] + total_tokens_processed = 0 + + # --- Decode Current Batch --- + def _decode_batch(): + nonlocal batch_seq_lens + if not batch_seq_lens: return + + self._ctx.decode(self._batch) + + for i in range(len(batch_seq_lens)): + ptr = llama_cpp.llama_get_embeddings_seq(ctx, i) + data = ptr[:out_dim] + + if not is_rank: + data = self._normalize_vector(data, normalize) + + if is_rank and len(data) == 1: + results.append(data[0]) + else: + results.append(data) + + self._batch.reset() + llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(ctx), True) + batch_seq_lens = [] + + # Main Streaming Loop + idx_in_batch = 0 + + for item in inputs: + # Tokenize + tokens: List[int] = [] + if isinstance(item, list) and (not item or isinstance(item[0], int)): + tokens = item + elif isinstance(item, str): + tokens = self.tokenize(item.encode("utf-8")) + else: + raise ValueError("Input item must be str or List[int]") + + # Truncate + if truncate and len(tokens) > n_ctx: + tokens = tokens[:n_ctx] + + n_tokens = len(tokens) + total_tokens_processed += n_tokens + + if n_tokens == 0: + results.append(0.0 if is_rank else []) + continue + + # Check Batch Capacity + if (self._batch.n_tokens() + n_tokens > n_batch) or (idx_in_batch >= n_ubatch): + _decode_batch() + idx_in_batch = 0 + + # Add to Batch + self._batch.add_sequence(tokens, idx_in_batch, logits_all=logits_all) + batch_seq_lens.append(n_tokens) + idx_in_batch += 1 + + # Process Remaining Items + _decode_batch() + + if self.verbose: + llama_cpp.llama_perf_context_print(ctx) + + final_result = results[0] if is_single else results + + if return_count: + return final_result, 
total_tokens_processed + + return final_result + + def rank(self, query: str, documents: List[str]) -> List[float]: + """ + Calculate relevance scores for a list of documents against a query using a Reranking model. + + This method constructs a specific prompt structure ([BOS] Query [SEP] Doc [EOS]) + typically used by Cross-Encoders to estimate similarity. + + Args: + query: The search query string. + documents: A list of candidate document strings to be scored. + + Returns: + A list of float scores, where higher values indicate greater relevance. + """ + if self.pooling_type() != LLAMA_POOLING_TYPE_RANK: + raise ValueError(f"Model pooling_type is {self.pooling_type()}, but LLAMA_POOLING_TYPE_RANK is required.") + + # Prepare Special Tokens + sep_id = self.token_sep() + if sep_id == -1: sep_id = self.token_eos() + eos_id = self.token_eos() + + # Pre-process Query + q_tokens = self.tokenize(query.encode("utf-8"), add_bos=True, special=True) + # Remove the automatically added EOS token from the query + # because we need to append the separator and document tokens after it. + if q_tokens and q_tokens[-1] == eos_id: + q_tokens.pop() + + # Construct Batch Inputs + batch_inputs: List[List[int]] = [] + for doc in documents: + d_tokens = self.tokenize(doc.encode("utf-8"), add_bos=False, special=True) + full_seq = q_tokens + [sep_id] + d_tokens + # Ensure the sequence ends with an EOS token to mark the end of inference. + if not full_seq or full_seq[-1] != eos_id: + full_seq.append(eos_id) + batch_inputs.append(full_seq) + + # We use NORM_MODE_NONE because rerankers output raw logits/scores, not vectors that need normalization. + return self.embed(batch_inputs, normalize=NORM_MODE_NONE) + + def create_embedding( + self, + input: Union[str, List[str]], + model: Optional[str] = None, + normalize: int = NORM_MODE_EUCLIDEAN, + output_format: str = "json" + ) -> Union[Dict[str, Any], List[float], List[List[float]]]: + """ + High-level API compatible with OpenAI format. 
+ + Args: + output_format: + - 'json': OpenAI style dict (Default) + - 'json+': OpenAI style dict + cosineSimilarity matrix + - 'array': Raw python list (List[float] or List[List[float]]) + """ + model_name = model if model is not None else self.model_path + + # Normalize input to list + inputs_list = [input] if isinstance(input, str) else input + + # Generate Embeddings(and get token count) + embeddings, token_count = self.embed( + inputs_list, + normalize=normalize, + return_count=True + ) + + if output_format == "array": + return embeddings + + # Structure the OpenAI-style response ('json' or 'json+') + # Ensure embeddings is a list for iteration + # (If input was single string, embeddings is List[float], wrap it for the loop) + iter_embeddings = [embeddings] if isinstance(embeddings[0], float) else embeddings + + data: List[Embedding] = [ + { + "object": "embedding", + "embedding": emb, + "index": idx, + } + for idx, emb in enumerate(iter_embeddings) + ] + + response = { + "object": "list", + "data": data, + "model": model_name, + "usage": { + "prompt_tokens": token_count, # Input consumption + "completion_tokens": 0, # The Embedding task does not generate text, so the value is 0. 
+ "total_tokens": token_count, # Total consumption = Input consumption + Output + } + } + + # Calculate Cosine Similarity Matrix (Optimized via Numpy) + # Only if output_format is 'json+' and we have vectors + if output_format == "json+" and len(embeddings) > 1 and isinstance(embeddings[0], list): + try: + # Assuming embeddings are already L2 normalized if normalize=2 + mat = np.array(embeddings) + + # Safety check: Force normalize if not already done, to ensure Cosine (not Dot Product) + if normalize != NORM_MODE_EUCLIDEAN: + norm = np.linalg.norm(mat, axis=1, keepdims=True) + # Avoid division by zero + norm[norm == 0] = 1e-10 + mat = mat / norm + + # Matrix multiplication: A @ A.T + sim_matrix = np.dot(mat, mat.T) + response["cosineSimilarity"] = sim_matrix.tolist() + except Exception as e: + if self.verbose: + print(f"Warning: Failed to calculate similarity matrix: {e}") + + return response From e1005019ff89121a381c63a1f3a1c7e148123377 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 27 Dec 2025 18:54:26 +0800 Subject: [PATCH 047/518] Docs: Update README to feature LlamaEmbedding and Reranking workflows - Added usage guide for the new `LlamaEmbedding` class. - Included code snippets for Reranking, Batching, and Normalization. - Updated legacy examples to reflect current best practices. --- README.md | 138 ++++++++++++++++++++++++++++++----- llama_cpp/llama_embedding.py | 1 + 2 files changed, 121 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 6c18a4a24f..928e8477ee 100644 --- a/README.md +++ b/README.md @@ -730,46 +730,148 @@ print(res["choices"][0]["message"]["content"]) ``` +--- -### Speculative Decoding +## Embeddings & Reranking (GGUF) -`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model. +`llama-cpp-python` provides a high-performance, memory-efficient specialized class `LlamaEmbedding` for generating text embeddings and calculating reranking scores. 
-The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class. +**Key Features:** +* **Streaming Batch Processing:** Process massive datasets (e.g., Hundreds of documents) without running out of memory (OOM). +* **Native Reranking:** Built-in support for Cross-Encoder models (outputting relevance scores instead of vectors). +* **Optimized Performance:** Utilizes Unified KV Cache for parallel encoding of multiple documents. -Just pass this as a draft model to the `Llama` class during initialization. +### TODO(JamePeng): Needs more extensive testing with various embedding and rerank models. :) + +#### 1. Text Embeddings (Vector Search) + +To generate embeddings, use the `LlamaEmbedding` class. It automatically configures the model for vector generation. ```python -from llama_cpp import Llama -from llama_cpp.llama_speculative import LlamaPromptLookupDecoding +from llama_cpp.llama_embedding import LlamaEmbedding -llama = Llama( - model_path="path/to/model.gguf", - draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. +# Initialize the model (automatically sets embedding=True) +llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf") + +# 1. Simple usage (OpenAI-compatible format) +response = llm.create_embedding("Hello, world!") +print(response['data'][0]['embedding']) + +# 2. Batch processing (High Performance) +# You can pass a large list of strings; the streaming batcher handles memory automatically. 
+documents = ["Hello, world!", "Goodbye, world!", "Llama is cute."] * 100 +embeddings = llm.embed(documents) # Returns a list of lists (vectors) + +print(f"Generated {len(embeddings)} vectors.") +``` + +**Advanced Output Formats:** +You can request raw arrays or cosine similarity matrices directly: + +```python +# Returns raw List[float] instead of a dictionary wrapper +vector = llm.create_embedding("Text", output_format="array") + +# Returns a similarity matrix (A @ A.T) in the response +# Note: Requires numpy installed +response = llm.create_embedding( + ["apple", "fruit", "car"], + output_format="json+" ) +print(response["cosineSimilarity"]) ``` -### Embeddings +#### 2. Reranking (Cross-Encoder Scoring) -To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding) or [`embed`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly. +Reranking models (like `bge-reranker`) take a **Query** and a list of **Documents** as input and output a relevance score (scalar) for each document. + +> **Important:** You must explicitly set `pooling_type` to `LLAMA_POOLING_TYPE_RANK` (4) when initializing the model. ```python import llama_cpp +from llama_cpp.llama_embedding import LlamaEmbedding -llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True) +# Initialize a Reranking model +ranker = LlamaEmbedding( + model_path="path/to/bge-reranker-v2-m3.gguf", + pooling_type=llama_cpp.LLAMA_POOLING_TYPE_RANK # Crucial for Rerankers! +) -embeddings = llm.create_embedding("Hello, world!") +query = "What causes rain?" +docs = [ + "Clouds are made of water droplets...", # Relevant + "To bake a cake you need flour...", # Irrelevant + "Rain is liquid water in the form of droplets..." 
# Highly Relevant +] -# or create multiple embeddings at once +# Calculate relevance scores +# Logic: Constructs inputs like "[BOS] query [SEP] doc [EOS]" automatically +scores = ranker.rank(query, docs) -embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"]) +# Result: List of floats (higher means more relevant) +print(scores) +# e.g., [-0.15, -8.23, 5.67] -> The 3rd doc is the best match ``` -There are two primary notions of embeddings in a Transformer-style model: *token level* and *sequence level*. Sequence level embeddings are produced by "pooling" token level embeddings together, usually by averaging them or using the first token. +#### 3. Normalization + +The `embed` method supports various mathematical normalization strategies via the `normalize` parameter. + +| Normalization modes | $Integer$ | Description | Formula | +|---------------------|-----------|---------------------|---------| +| NORM_MODE_NONE | $-1$ | none | +| NORM_MODE_MAX_INT16 | $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$ +| NORM_MODE_TAXICAB | $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$ +| NORM_MODE_EUCLIDEAN | $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$ +| NORM_MODE_PNORM | $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$ + +This is useful for optimizing storage or preparing vectors for cosine similarity search (which requires L2 normalization). + +```python +from llama_cpp.llama_embedding import NORM_MODE_MAX_INT16, NORM_MODE_TAXICAB, NORM_MODE_EUCLIDEAN + +# Taxicab (L1) +vec_l1 = llm.embed("text", normalize=NORM_MODE_TAXICAB) -Models that are explicitly geared towards embeddings will usually return sequence level embeddings by default, one for each input string. Non-embedding models such as those designed for text generation will typically return only token level embeddings, one for each token in each sequence. 
Thus the dimensionality of the return type will be one higher for token level embeddings. +# Default is Euclidean (L2) - Standard for vector databases +vec_l2 = llm.embed("text", normalize=NORM_MODE_EUCLIDEAN) -It is possible to control pooling behavior in some cases using the `pooling_type` flag on model creation. You can ensure token level embeddings from any model using `LLAMA_POOLING_TYPE_NONE`. The reverse, getting a generation oriented model to yield sequence level embeddings is currently not possible, but you can always do the pooling manually. +# Max Absolute Int16 - Useful for quantization/compression +vec_int16 = llm.embed("text", normalize=NORM_MODE_MAX_INT16) + +# Raw Output (No Normalization) - Get the raw floating point values from the model +embeddings_raw = llm.embed(["search query", "document text"], normalize=NORM_MODE_NONE) +``` + +#### Legacy Usage (Deprecated) + +The standard `Llama` class still supports basic embedding generation, but it lacks the memory optimizations and reranking capabilities of `LlamaEmbedding`. + +```python +# Old method - Not recommended for large batches or reranking +llm = llama_cpp.Llama(model_path="...", embedding=True) +emb = llm.create_embedding("text") +``` + +--- + +### Speculative Decoding + +`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model. + +The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class. + +Just pass this as a draft model to the `Llama` class during initialization. + +```python +from llama_cpp import Llama +from llama_cpp.llama_speculative import LlamaPromptLookupDecoding + +llama = Llama( + model_path="path/to/model.gguf", + draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. 
+) +``` ### Adjusting the Context Window diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 44d918e16c..8f504ae6de 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -19,6 +19,7 @@ NORM_MODE_MAX_INT16 = 0 NORM_MODE_TAXICAB = 1 NORM_MODE_EUCLIDEAN = 2 +NORM_MODE_PNORM = 6 # TODO(JamePeng): Needs more extensive testing with various embedding and reranking models. class LlamaEmbedding(Llama): From 573dba7110928e9c54b5906ca8a47bc73d57154f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 27 Dec 2025 19:14:24 +0800 Subject: [PATCH 048/518] Added n_gpu_layers parameter into LlamaEmbedding --- README.md | 15 ++++++++------- llama_cpp/llama_embedding.py | 10 ++++++++-- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 928e8477ee..61e70e0879 100644 --- a/README.md +++ b/README.md @@ -751,7 +751,7 @@ To generate embeddings, use the `LlamaEmbedding` class. It automatically configu from llama_cpp.llama_embedding import LlamaEmbedding # Initialize the model (automatically sets embedding=True) -llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf") +llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1) # 1. Simple usage (OpenAI-compatible format) response = llm.create_embedding("Hello, world!") @@ -770,7 +770,7 @@ You can request raw arrays or cosine similarity matrices directly: ```python # Returns raw List[float] instead of a dictionary wrapper -vector = llm.create_embedding("Text", output_format="array") +vector = llm.create_embedding("Text", output_format="array", n_gpu_layers=-1) # Returns a similarity matrix (A @ A.T) in the response # Note: Requires numpy installed @@ -794,7 +794,8 @@ from llama_cpp.llama_embedding import LlamaEmbedding # Initialize a Reranking model ranker = LlamaEmbedding( model_path="path/to/bge-reranker-v2-m3.gguf", - pooling_type=llama_cpp.LLAMA_POOLING_TYPE_RANK # Crucial for Rerankers! 
+ pooling_type=llama_cpp.LLAMA_POOLING_TYPE_RANK, # Crucial for Rerankers! + n_gpu_layers=-1, ) query = "What causes rain?" @@ -831,16 +832,16 @@ This is useful for optimizing storage or preparing vectors for cosine similarity from llama_cpp.llama_embedding import NORM_MODE_MAX_INT16, NORM_MODE_TAXICAB, NORM_MODE_EUCLIDEAN # Taxicab (L1) -vec_l1 = llm.embed("text", normalize=NORM_MODE_TAXICAB) +vec_l1 = llm.embed("text", normalize=NORM_MODE_TAXICAB, n_gpu_layers=-1) # Default is Euclidean (L2) - Standard for vector databases -vec_l2 = llm.embed("text", normalize=NORM_MODE_EUCLIDEAN) +vec_l2 = llm.embed("text", normalize=NORM_MODE_EUCLIDEAN, n_gpu_layers=-1) # Max Absolute Int16 - Useful for quantization/compression -vec_int16 = llm.embed("text", normalize=NORM_MODE_MAX_INT16) +vec_int16 = llm.embed("text", normalize=NORM_MODE_MAX_INT16, n_gpu_layers=-1) # Raw Output (No Normalization) - Get the raw floating point values from the model -embeddings_raw = llm.embed(["search query", "document text"], normalize=NORM_MODE_NONE) +embeddings_raw = llm.embed(["search query", "document text"], normalize=NORM_MODE_NONE, n_gpu_layers=-1) ``` #### Legacy Usage (Deprecated) diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 8f504ae6de..3289fc43dd 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -36,7 +36,7 @@ class LlamaEmbedding(Llama): using NumPy for optimal performance and compatibility with various vector databases. """ - def __init__(self, model_path: str, pooling_type: int = LLAMA_POOLING_TYPE_UNSPECIFIED, **kwargs): + def __init__(self, model_path: str, pooling_type: int = LLAMA_POOLING_TYPE_UNSPECIFIED, n_gpu_layers: int = 0, **kwargs): """ Initialize the embedding model with enforced configuration. @@ -45,7 +45,10 @@ def __init__(self, model_path: str, pooling_type: int = LLAMA_POOLING_TYPE_UNSPE pooling_type: The pooling strategy used by the model. - Use `LLAMA_POOLING_TYPE_RANK` (4) for Reranker models. 
- Use `LLAMA_POOLING_TYPE_UNSPECIFIED` (-1) to let the model metadata decide (for standard embeddings). - **kwargs: Additional arguments passed to the Llama base class (e.g., n_gpu_layers, n_batch, n_ctx). + n_gpu_layers: Number of model layers to offload to GPU. + - Set to 0 for CPU only. + - Set to -1 for all layers (recommended for best performance). + **kwargs: Additional arguments passed to the Llama base class (e.g., n_batch, n_ctx, verbose). """ kwargs["embedding"] = True @@ -54,6 +57,9 @@ def __init__(self, model_path: str, pooling_type: int = LLAMA_POOLING_TYPE_UNSPE # encoding of multiple unrelated documents without "invalid seq_id" errors. kwargs["kv_unified"] = True + # Number of model layers to offload to GPU. + kwargs["n_gpu_layers"] = n_gpu_layers + # Set pooling type kwargs["pooling_type"] = pooling_type From fb0847525861eeed96b7b541c1460598d4bfe708 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 27 Dec 2025 19:35:45 +0800 Subject: [PATCH 049/518] Update LlamaModel api-call into _internals.py --- llama_cpp/_internals.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 0c95a6132b..b5ff5866b7 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -84,14 +84,23 @@ def n_vocab(self) -> int: return llama_cpp.llama_n_vocab(self.vocab) def n_ctx_train(self) -> int: - return llama_cpp.llama_n_ctx_train(self.model) + return llama_cpp.llama_model_n_ctx_train(self.model) + + def n_cls_out(self) -> int: + return llama_cpp.llama_model_n_cls_out(self.model) def n_embd(self) -> int: - return llama_cpp.llama_n_embd(self.model) + return llama_cpp.llama_model_n_embd(self.model) + + def n_head(self) -> int: + return llama_cpp.llama_model_n_head(self.model) def n_head_kv(self) -> int: return llama_cpp.llama_model_n_head_kv(self.model) + def n_swa(self) -> int: + return llama_cpp.llama_model_n_swa(self.model) + def n_params(self) -> int: return 
llama_cpp.llama_model_n_params(self.model) From fcdf9acbbe58b6b87193bbaca78483a5c2657590 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 27 Dec 2025 20:27:55 +0800 Subject: [PATCH 050/518] Added n_ctx,n_batch,n_ubatch parameter into LlamaEmbedding --- llama_cpp/llama_embedding.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 3289fc43dd..fd50c804da 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -36,12 +36,23 @@ class LlamaEmbedding(Llama): using NumPy for optimal performance and compatibility with various vector databases. """ - def __init__(self, model_path: str, pooling_type: int = LLAMA_POOLING_TYPE_UNSPECIFIED, n_gpu_layers: int = 0, **kwargs): + def __init__( + self, + model_path: str, + n_ctx: int = 1024, + n_batch: int = 512, + n_ubatch: int = 512, + pooling_type: int = LLAMA_POOLING_TYPE_UNSPECIFIED, + n_gpu_layers: int = 0, + **kwargs): """ Initialize the embedding model with enforced configuration. Args: model_path: Path to the GGUF model file. + n_ctx: Text context, 0 = from model + n_batch: Prompt processing maximum batch size + n_ubatch: Physical batch size pooling_type: The pooling strategy used by the model. - Use `LLAMA_POOLING_TYPE_RANK` (4) for Reranker models. - Use `LLAMA_POOLING_TYPE_UNSPECIFIED` (-1) to let the model metadata decide (for standard embeddings). @@ -51,15 +62,16 @@ def __init__(self, model_path: str, pooling_type: int = LLAMA_POOLING_TYPE_UNSPE **kwargs: Additional arguments passed to the Llama base class (e.g., n_batch, n_ctx, verbose). 
""" kwargs["embedding"] = True + kwargs["n_gpu_layers"] = n_gpu_layers + kwargs["n_ctx"] = n_ctx + kwargs["n_batch"] = n_batch + kwargs["n_ubatch"] = n_ubatch # Enable Unified KV Cache (Crucial for Batching) # This allows us to assign arbitrary seq_ids in a batch, enabling the parallel / # encoding of multiple unrelated documents without "invalid seq_id" errors. kwargs["kv_unified"] = True - # Number of model layers to offload to GPU. - kwargs["n_gpu_layers"] = n_gpu_layers - # Set pooling type kwargs["pooling_type"] = pooling_type From 52661043991f77010d2f6b5b0eb0ff83ef8ec2c3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 27 Dec 2025 20:44:57 +0800 Subject: [PATCH 051/518] Update README.md --- README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 61e70e0879..00226ae3e3 100644 --- a/README.md +++ b/README.md @@ -736,14 +736,22 @@ print(res["choices"][0]["message"]["content"]) `llama-cpp-python` provides a high-performance, memory-efficient specialized class `LlamaEmbedding` for generating text embeddings and calculating reranking scores. -**Key Features:** +### Key Features: * **Streaming Batch Processing:** Process massive datasets (e.g., Hundreds of documents) without running out of memory (OOM). * **Native Reranking:** Built-in support for Cross-Encoder models (outputting relevance scores instead of vectors). * **Optimized Performance:** Utilizes Unified KV Cache for parallel encoding of multiple documents. +### Support Embeddings & Rerank Model: + + +| Model | Type | Link | Status | +|--------------------|-----------|--------------------------------------------------------|--------------| +| `bge-m3` | Embedding |https://huggingface.co/gpustack/bge-m3-GGUF | Useful ✅ | +|`bge-reranker-v2-m3`| Rerank |https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF | Useful ✅ | + ### TODO(JamePeng): Needs more extensive testing with various embedding and rerank models. :) -#### 1. 
Text Embeddings (Vector Search) +### 1. Text Embeddings (Vector Search) To generate embeddings, use the `LlamaEmbedding` class. It automatically configures the model for vector generation. @@ -781,7 +789,7 @@ response = llm.create_embedding( print(response["cosineSimilarity"]) ``` -#### 2. Reranking (Cross-Encoder Scoring) +### 2. Reranking (Cross-Encoder Scoring) Reranking models (like `bge-reranker`) take a **Query** and a list of **Documents** as input and output a relevance score (scalar) for each document. @@ -814,7 +822,7 @@ print(scores) # e.g., [-0.15, -8.23, 5.67] -> The 3rd doc is the best match ``` -#### 3. Normalization +### 3. Normalization The `embed` method supports various mathematical normalization strategies via the `normalize` parameter. @@ -844,7 +852,7 @@ vec_int16 = llm.embed("text", normalize=NORM_MODE_MAX_INT16, n_gpu_layers=-1) embeddings_raw = llm.embed(["search query", "document text"], normalize=NORM_MODE_NONE, n_gpu_layers=-1) ``` -#### Legacy Usage (Deprecated) +### Legacy Usage (Deprecated) The standard `Llama` class still supports basic embedding generation, but it lacks the memory optimizations and reranking capabilities of `LlamaEmbedding`. 
From 71281d0a1c20ca944e2678990c06a2b09126417e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 27 Dec 2025 22:10:41 +0800 Subject: [PATCH 052/518] Improve Extract Embeddings Branch Code --- llama_cpp/llama_embedding.py | 58 +++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index fd50c804da..8ef372c787 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -39,7 +39,7 @@ class LlamaEmbedding(Llama): def __init__( self, model_path: str, - n_ctx: int = 1024, + n_ctx: int = 0, n_batch: int = 512, n_ubatch: int = 512, pooling_type: int = LLAMA_POOLING_TYPE_UNSPECIFIED, @@ -130,11 +130,12 @@ def embed( # Determine if it is in Rerank mode try: - current_pooling = self.pooling_type() + pooling_type = self.pooling_type() except AttributeError: - current_pooling = LLAMA_POOLING_TYPE_UNSPECIFIED - is_rank = (current_pooling == LLAMA_POOLING_TYPE_RANK) - logits_all = current_pooling == llama_cpp.LLAMA_POOLING_TYPE_NONE + pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED + is_rank = (pooling_type == LLAMA_POOLING_TYPE_RANK) + is_none = (pooling_type == LLAMA_POOLING_TYPE_NONE) # Token-level embedding + logits_all = True if is_none else False # Determine the output dimension if is_rank: @@ -143,8 +144,8 @@ def embed( out_dim = self.n_embd() if self.verbose: - mode_str = "RANK (Score)" if is_rank else "EMBED (Vector)" - print(f"LlamaEmbedding Debug: Mode={mode_str} | Output Dimension={out_dim}") + type_str = "TOKEN (None)" if is_none else ("RANK (Score)" if is_rank else "SEQ (Vector)") + print(f"LlamaEmbedding Debug: Mode={type_str} | Pooling={pooling_type} | Dim={out_dim}") # Preprocess Input inputs: List[Union[str, List[int]]] = [] @@ -179,17 +180,38 @@ def _decode_batch(): self._ctx.decode(self._batch) - for i in range(len(batch_seq_lens)): - ptr = llama_cpp.llama_get_embeddings_seq(ctx, i) - data = ptr[:out_dim] - - if not is_rank: - data = 
self._normalize_vector(data, normalize) - - if is_rank and len(data) == 1: - results.append(data[0]) - else: - results.append(data) + # Extract Embeddings + # Branch A: LLAMA_POOLING_TYPE_NONE (Token Level) + if is_none: + curr_token_idx = 0 + for seq_len in batch_seq_lens: + doc_tokens_embd = [] + for _ in range(seq_len): + # Get the vector of the i-th token + ptr = llama_cpp.llama_get_embeddings_ith(ctx, curr_token_idx) + data = ptr[:out_dim] + + # Normalization + data = self._normalize_vector(data, normalize) + + doc_tokens_embd.append(data) + curr_token_idx += 1 + results.append(doc_tokens_embd) + + # Branth B: Sequence Level (Mean, Cls, Rank, Unspecified) + else: + for i in range(len(batch_seq_lens)): + # Obtain the vector of the i-th sequence. + ptr = llama_cpp.llama_get_embeddings_seq(ctx, i) + data = ptr[:out_dim] + + if not is_rank: + data = self._normalize_vector(data, normalize) + + if is_rank and len(data) == 1: + results.append(data[0]) + else: + results.append(data) self._batch.reset() llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(ctx), True) From 0494d983b9f26a296ea3fdf7008cc4a4da17c9c7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 27 Dec 2025 22:17:36 +0800 Subject: [PATCH 053/518] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 00226ae3e3..0e956f324d 100644 --- a/README.md +++ b/README.md @@ -746,8 +746,8 @@ print(res["choices"][0]["message"]["content"]) | Model | Type | Link | Status | |--------------------|-----------|--------------------------------------------------------|--------------| -| `bge-m3` | Embedding |https://huggingface.co/gpustack/bge-m3-GGUF | Useful ✅ | -|`bge-reranker-v2-m3`| Rerank |https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF | Useful ✅ | +| `bge-m3` | Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`bge-reranker-v2-m3`| Rerank 
|[bge-reranker-v2-m3-GGUF](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF) | Useful ✅ | ### TODO(JamePeng): Needs more extensive testing with various embedding and rerank models. :) From eff3b2ba9bf2a32acc9de0f0ba7d37fd3a799d01 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 27 Dec 2025 22:25:32 +0800 Subject: [PATCH 054/518] Fix typos and zero-pad process --- llama_cpp/llama_embedding.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 8ef372c787..cbaca02483 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -44,6 +44,7 @@ def __init__( n_ubatch: int = 512, pooling_type: int = LLAMA_POOLING_TYPE_UNSPECIFIED, n_gpu_layers: int = 0, + verbose: bool = True, **kwargs): """ Initialize the embedding model with enforced configuration. @@ -66,6 +67,7 @@ def __init__( kwargs["n_ctx"] = n_ctx kwargs["n_batch"] = n_batch kwargs["n_ubatch"] = n_ubatch + kwargs["verbose"] = verbose # Enable Unified KV Cache (Crucial for Batching) # This allows us to assign arbitrary seq_ids in a batch, enabling the parallel / @@ -189,16 +191,19 @@ def _decode_batch(): for _ in range(seq_len): # Get the vector of the i-th token ptr = llama_cpp.llama_get_embeddings_ith(ctx, curr_token_idx) - data = ptr[:out_dim] + if ptr is None: + # Fallback: append zero vector or skip (here we zero-pad to keep shape) + doc_tokens_embd.append([0.0] * out_dim) + else: + data = ptr[:out_dim] + # Normalization + data = self._normalize_vector(data, normalize) + doc_tokens_embd.append(data) - # Normalization - data = self._normalize_vector(data, normalize) - - doc_tokens_embd.append(data) curr_token_idx += 1 results.append(doc_tokens_embd) - # Branth B: Sequence Level (Mean, Cls, Rank, Unspecified) + # Branch B: Sequence Level (Mean, Cls, Rank, Unspecified) else: for i in range(len(batch_seq_lens)): # Obtain the vector of the i-th sequence. 
From a6d0ebd0d614f374ce8752a44e4b73d947ff1b21 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 30 Dec 2025 19:25:01 +0800 Subject: [PATCH 055/518] Update Submodule vendor/llama.cpp 7ac8902..d77d7c5 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7ac8902133..d77d7c5c06 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7ac8902133da6eb390c4d8368a7d252279123942 +Subproject commit d77d7c5c0654dc52b51f03941b12ae85d7227608 From 3716d0aa3460c9e394e5f7babf7c39478a18d298 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 30 Dec 2025 19:32:40 +0800 Subject: [PATCH 056/518] Update llama.cpp API 20251230 --- llama_cpp/llama_cpp.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index aa4e40e05b..30b2de961a 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -681,7 +681,7 @@ class llama_model_tensor_buft_override(ctypes.Structure): # // NULL-terminated list of buffer types to use for tensors that match a pattern # const struct llama_model_tensor_buft_override * tensor_buft_overrides; # -# int32_t n_gpu_layers; // number of layers to store in VRAM +# int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers # enum llama_split_mode split_mode; // how to split the model across multiple GPUs # // main_gpu interpretation depends on split_mode: @@ -720,7 +720,7 @@ class llama_model_params(ctypes.Structure): Attributes: devices (ctypes.Array[ggml_backend_dev_t]): NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) tensor_buft_overrides(llama_model_tensor_buft_override): NULL-terminated list of buffer types to use for tensors that match a pattern - n_gpu_layers (int): number of layers to store in VRAM + n_gpu_layers (int): number of layers to store in VRAM, a negative value means all 
layers split_mode (int): how to split the model across multiple GPUs main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() @@ -1312,10 +1312,22 @@ def llama_free(ctx: llama_context_p, /): ... +# enum llama_params_fit_status { +# LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit +# LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit +# LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occured, e.g. because no model could be found at the specified path +# }; +class llama_params_fit_status(enum.IntEnum): + LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0 + LLAMA_PARAMS_FIT_STATUS_FAILURE = 1 + LLAMA_PARAMS_FIT_STATUS_ERROR = 2 + + # // fits mparams and cparams to free device memory (assumes system memory is unlimited) -# // returns true if the parameters could be successfully modified to fit device memory -# // this function is NOT thread safe because it modifies the global llama logger state -# LLAMA_API bool llama_params_fit( +# // - returns true if the parameters could be successfully modified to fit device memory +# // - this function is NOT thread safe because it modifies the global llama logger state +# // - only parameters that have the same value as in llama_default_model_params are modified +# LLAMA_API enum llama_params_fit_status llama_params_fit( # const char * path_model, # struct llama_model_params * mparams, # struct llama_context_params * cparams, @@ -1336,7 +1348,7 @@ def llama_free(ctx: llama_context_p, /): ctypes.c_uint32, ctypes.c_int, ], - ctypes.c_bool, + ctypes.c_int, ) def llama_params_fit( path_model: ctypes.c_char_p, @@ 
-1348,7 +1360,7 @@ def llama_params_fit( n_ctx_min: ctypes.c_uint32, log_level: int, /, -) -> bool: +) -> int: """ fits mparams and cparams to free device memory (assumes system memory is unlimited) returns true if the parameters could be successfully modified to fit device memory From 93c9f7e1b22d2141764c927a9a670d3c7e47872a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 30 Dec 2025 21:36:24 +0800 Subject: [PATCH 057/518] refactor(LlamaBatch): enhance safety checks and fix indexing logic - Add boundary validation and overflow protection to `__init__`, `set_batch`, and `add_sequence`. - Introduce `capacity()` and `space_left()` for state monitoring and consistency checks. - Fix incorrect logits index calculation in `add_sequence` (using absolute position). - Update error handling (MemoryError) and add docstrings. --- llama_cpp/_internals.py | 67 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b5ff5866b7..f90ee66f1f 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -636,18 +636,31 @@ def default_params(): class LlamaBatch: def __init__( - self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True + self, + *, + n_tokens: int, + embd: int, + n_seq_max: int, + verbose: bool = True ): - self._n_tokens = n_tokens + # logical validity of parameters + if n_tokens <= 0: + raise ValueError(f"n_tokens must be positive, got {n_tokens}") + if n_seq_max <= 0: + raise ValueError(f"n_seq_max must be positive, got {n_seq_max}") + + self.n_tokens_capacity = n_tokens self.embd = embd self.n_seq_max = n_seq_max self.verbose = verbose self._exit_stack = ExitStack() - batch = llama_cpp.llama_batch_init(self._n_tokens, self.embd, self.n_seq_max) + batch = llama_cpp.llama_batch_init(self.n_tokens_capacity, self.embd, self.n_seq_max) if batch is None: - raise ValueError("Failed to create llama_batch") + raise MemoryError( + f"Failed to 
allocate memory for llama_batch via llama_batch_init({n_tokens},{embd},{n_seq_max})" + ) self.batch = batch @@ -660,18 +673,51 @@ def free_batch(): self._exit_stack.callback(free_batch) def close(self): + """Manually free resources.""" self._exit_stack.close() def __del__(self): self.close() def n_tokens(self) -> int: + """ + Current number of tokens stored in the batch. + """ + if self.batch is None: return 0 return self.batch.n_tokens + def capacity(self) -> int: + """ + Total capacity of the batch. + """ + return self.n_tokens_capacity + + def space_left(self) -> int: + """ + Returns the number of empty slots remaining in the batch. + Throws a RuntimeError if internal state implies an overflow. + """ + if self.batch is None: return 0 + elif self.n_tokens_capacity >= self.batch.n_tokens: + return self.n_tokens_capacity - self.batch.n_tokens + else: + raise RuntimeError( + f"LlamaBatch Critical Error: n_tokens ({self.batch.n_tokens}) exceeds capacity ({self.n_tokens_capacity}). " + "This implies a buffer overflow or corrupted internal state." + ) + def reset(self): - self.batch.n_tokens = 0 + """ + Resets the batch counter to 0. Does not free memory, just resets the index. + Call this before starting a new decoding step. 
+ """ + if self.batch is not None: + self.batch.n_tokens = 0 def set_batch(self, batch: Sequence[int], n_past: llama_cpp.llama_pos, logits_all: bool): + if len(batch) > self.n_tokens_capacity: + raise IndexError(f"Input batch size {len(batch)} exceeds capacity {self.n_tokens_capacity}") + n_tokens = len(batch) self.batch.n_tokens = n_tokens for i in range(n_tokens): @@ -684,16 +730,21 @@ def set_batch(self, batch: Sequence[int], n_past: llama_cpp.llama_pos, logits_al def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): n_tokens = len(batch) - n_tokens0 = self.batch.n_tokens + current_count = self.batch.n_tokens + if current_count + n_tokens > self.n_tokens_capacity: + raise IndexError( + f"LlamaBatch overflow: Cannot add {n_tokens} tokens. " + f"Space left: {self.n_tokens_capacity - current_count}" + ) self.batch.n_tokens += n_tokens for i in range(n_tokens): - j = n_tokens0 + i + j = current_count + i self.batch.token[j] = batch[i] self.batch.pos[j] = i self.batch.seq_id[j][0] = seq_id self.batch.n_seq_id[j] = 1 self.batch.logits[j] = logits_all - self.batch.logits[n_tokens - 1] = True + self.batch.logits[current_count + n_tokens - 1] = True class LlamaTokenDataArray: From 3e00e8e47117ac6fdd7507a0c484f3aa41e24d97 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 30 Dec 2025 21:54:28 +0800 Subject: [PATCH 058/518] Remove unnecessary checks in LlamaContext Class --- llama_cpp/_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index f90ee66f1f..ac8e179212 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -340,7 +340,7 @@ def memory_clear(self, data: bool): llama_cpp.llama_memory_clear(self.get_memory(), data) def memory_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool: - if self.ctx is not None and seq_id >= 0: + if self.ctx is not None: return llama_cpp.llama_memory_seq_rm(self.get_memory(), seq_id, p0, p1) else: return False From 
8f7211cc4510a2374c1449152a5adf49f3ba4f3b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 30 Dec 2025 22:34:41 +0800 Subject: [PATCH 059/518] Correct incorrect assignments in llama.py --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c2628555b1..09346bad43 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -435,7 +435,7 @@ def __init__( internals.LlamaBatch( n_tokens=self.n_batch, embd=0, - n_seq_max=self.context_params.n_ctx, + n_seq_max=self.context_params.n_seq_max, verbose=self.verbose, ) ) From 6d31ab0879526ab8d45369c188bcb0c0c7754a5c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 31 Dec 2025 07:38:08 +0800 Subject: [PATCH 060/518] Update Submodule vendor/llama.cpp d77d7c5..4849661 --- llama_cpp/llama_cpp.py | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 30b2de961a..3c410412e2 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1810,6 +1810,8 @@ def llama_model_quantize( # // Load a LoRA adapter from file +# // The adapter is valid as long as the associated model is not freed +# // All adapters must be loaded before context creation # LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( # struct llama_model * model, # const char * path_lora); diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d77d7c5c06..4849661d98 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d77d7c5c0654dc52b51f03941b12ae85d7227608 +Subproject commit 4849661d9898ac3caf59ddd62044185805084370 From 06f64ae7efe79b01a78a9161e9ffce76c687d330 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 31 Dec 2025 09:05:34 +0800 Subject: [PATCH 061/518] refactor(Llama): enhance error handling and cleanup in `eval` method - Wrap `decode` in a try-except block to provide detailed error context (position, batch size) on failure. 
- Capture and log the result of `memory_seq_rm` to assist in debugging KV cache issues. - Add an early return for empty token lists. - Refactor loop variables and state updates for better clarity. - Remove dead code related to logits processing. --- llama_cpp/llama.py | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 09346bad43..88c06274df 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -667,36 +667,44 @@ def eval(self, tokens: Sequence[int]): Args: tokens: The list of tokens to evaluate. """ - self._ctx.memory_seq_rm(0, self.n_tokens, -1) - for i in range(0, len(tokens), self.n_batch): - batch = tokens[i : min(len(tokens), i + self.n_batch)] + if len(tokens) == 0: + return + n_eval = len(tokens) + current_pos = self.n_tokens + + if self._ctx: + is_success = self._ctx.memory_seq_rm(0, current_pos, -1) + + for i in range(0, n_eval, self.n_batch): + batch = tokens[i : min(n_eval, i + self.n_batch)] n_past = self.n_tokens - n_tokens = len(batch) + n_batch_tokens = len(batch) self._batch.set_batch( batch=batch, n_past=n_past, logits_all=self._logits_all ) - self._ctx.decode(self._batch) + try: + self._ctx.decode(self._batch) + except Exception as e: + raise RuntimeError( + f"Decode Failed at Pos {current_pos}. " + f"Batch size: {n_batch_tokens}. " + f"Result of memory_seq_rm: {is_success}. " + f"Error: {str(e)}." 
+ ) from e # Save tokens - self.input_ids[n_past : n_past + n_tokens] = batch + self.input_ids[n_past : n_past + n_batch_tokens] = batch # Save logits if self._logits_all: - rows = n_tokens + rows = n_batch_tokens cols = self._n_vocab logits = np.ctypeslib.as_array( self._ctx.get_logits(), shape=(rows * cols,) ) - self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits - else: - # rows = 1 - # cols = self._n_vocab - # logits = np.ctypeslib.as_array( - # self._ctx.get_logits(), shape=(rows * cols,) - # ) - # self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits - # NOTE: Now that sampling is done inside the sampler, logits are only needed for logprobs which requires logits_all - pass + self.scores[n_past : n_past + n_batch_tokens, :].reshape(-1)[::] = logits + # Update n_tokens - self.n_tokens += n_tokens + current_pos += n_batch_tokens + self.n_tokens = current_pos def _init_sampler( self, From aa653ea5c6a90505a7491e855cc16988293cedd5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 31 Dec 2025 21:16:22 +0800 Subject: [PATCH 062/518] Update Submodule vendor/llama.cpp 4849661..9b8329d --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4849661d98..9b8329de7a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4849661d9898ac3caf59ddd62044185805084370 +Subproject commit 9b8329de7a7200385aaac16ab4a2ab79ae14d829 From 2ff40cc73e23053a23f5755499295c1e09dbf944 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 31 Dec 2025 21:29:50 +0800 Subject: [PATCH 063/518] Bump version to 0.3.19 Signed-off-by: JamePeng --- CHANGELOG.md | 18 ++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ad6f16796..19bb7a7b01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 
[Unreleased] +## [0.3.19] +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/9b8329de7a7200385aaac16ab4a2ab79ae14d829](https://github.com/ggml-org/llama.cpp/commit/9b8329de7a7200385aaac16ab4a2ab79ae14d829) +- feat: Sync llama.cpp llama/mtmd API Binding 20251230 +- **refactor: Extract embedding logic to `LlamaEmbedding` class, `Rerank` support and fix parallel batching** + - Decoupled embedding and rerank logic into `llama_embedding.py`. + - Implemented streaming batching for constant memory usage. + - Fixed parallel batching errors by enabling `kv_unified`. such as `"multiple embeddings in a single call"` + - Added native `rank()` support for `Reranker models`. + - Added advanced normalization support (`Euclidean`, `Taxicab`, `MaxInt16`). + - Added `array`,`json+` output format for raw vector access. + - The legacy embedding implementation in `llama.py` is now superseded by this optimized approach. +- update README.md here: https://github.com/JamePeng/llama-cpp-python?tab=readme-ov-file#embeddings--reranking-gguf +- refactor(LlamaBatch): enhance safety checks and fix indexing logic +- refactor(Llama): enhance error handling and cleanup in eval method +- Fixed a small bug in the Qwen3-VL chat template (by @alcoftTAO) + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/2efaa346bc0aa0d6648938a0dcdf8d12240a8bed...aa653ea5c6a90505a7491e855cc16988293cedd5 + ## [0.3.18] - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787](https://github.com/ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787) - feat: Sync llama.cpp llama/mtmd API Binding 20251215 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index bdaefb9e01..72388c4e5d 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.18" +__version__ = "0.3.19" From 644a8a23e19ee24a8972ac96e78c63f2b1743854 Mon Sep 17 00:00:00 2001 From: 
JamePeng Date: Thu, 1 Jan 2026 06:05:22 +0800 Subject: [PATCH 064/518] fix(Llama): implement fallback to full cache clear in eval If memory_seq_rm fails to clear from current_pos, fallback to clearing the entire sequence to prevent invalid input batch errors. --- llama_cpp/llama.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 88c06274df..75eac766be 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -673,7 +673,13 @@ def eval(self, tokens: Sequence[int]): current_pos = self.n_tokens if self._ctx: + # Standard cleanup by current_pos is_success = self._ctx.memory_seq_rm(0, current_pos, -1) + # Fallback: Broad cleanup + if not is_success: + if self.verbose: + print(f"WARN: memory_seq_rm(0, {current_pos}, -1) failed. Executing fallback: memory_seq_rm(0, 0, -1)") + is_success = self._ctx.memory_seq_rm(0, 0, -1) for i in range(0, n_eval, self.n_batch): batch = tokens[i : min(n_eval, i + self.n_batch)] From 151c8f42fed0b6416553a4f761b57234b22d9392 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 1 Jan 2026 06:07:54 +0800 Subject: [PATCH 065/518] Append new fix(Llama) log in CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19bb7a7b01..5bc84bf490 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - refactor(LlamaBatch): enhance safety checks and fix indexing logic - refactor(Llama): enhance error handling and cleanup in eval method - Fixed a small bug in the Qwen3-VL chat template (by @alcoftTAO) +- fix(Llama): implement fallback to full cache clear in eval + - If `memory_seq_rm` fails to clear from `current_pos`, fallback to clearing the entire sequence to prevent invalid input batch errors. 
More information see: https://github.com/JamePeng/llama-cpp-python/compare/2efaa346bc0aa0d6648938a0dcdf8d12240a8bed...aa653ea5c6a90505a7491e855cc16988293cedd5 From 9a5f323bc038ae0d48f24f7976d44445842baf41 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 1 Jan 2026 20:07:27 +0800 Subject: [PATCH 066/518] Update Submodule vendor/llama.cpp 9b8329d..26831bd --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 9b8329de7a..26831bded9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 9b8329de7a7200385aaac16ab4a2ab79ae14d829 +Subproject commit 26831bded991d3fb31ca6b143af46eebb85f7e60 From b6bf564bb7d7cef8b28cf248d5b13ddc74dce72d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 1 Jan 2026 20:09:05 +0800 Subject: [PATCH 067/518] Update llama_context_params and fix the `embeddings` typo --- README.md | 2 +- llama_cpp/llama.py | 19 ++++++++++++++----- llama_cpp/llama_embedding.py | 2 +- llama_cpp/server/model.py | 4 +++- llama_cpp/server/settings.py | 8 +++++++- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0e956f324d..878c48bb78 100644 --- a/README.md +++ b/README.md @@ -858,7 +858,7 @@ The standard `Llama` class still supports basic embedding generation, but it lac ```python # Old method - Not recommended for large batches or reranking -llm = llama_cpp.Llama(model_path="...", embedding=True) +llm = llama_cpp.Llama(model_path="...", embeddings=True) emb = llm.create_embedding("text") ``` diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 75eac766be..2c7214d3f0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -79,6 +79,7 @@ def __init__( n_ctx: int = 512, n_batch: int = 512, n_ubatch: int = 512, + n_seq_max: int = 1, n_threads: Optional[int] = None, n_threads_batch: Optional[int] = None, rope_scaling_type: Optional[ @@ -95,13 +96,13 @@ def __init__( yarn_beta_slow: float = 1.0, yarn_orig_ctx: int = 0, 
logits_all: bool = False, - embedding: bool = False, + embeddings: bool = False, offload_kqv: bool = True, + no_perf: bool = False, op_offload: Optional[bool] = None, swa_full: Optional[bool] = None, kv_unified: Optional[bool] = None, # Sampling Params - no_perf: bool = False, last_n_tokens_size: int = 64, # LoRA Params lora_base: Optional[str] = None, @@ -168,6 +169,7 @@ def __init__( n_ctx: Text context, 0 = from model n_batch: Prompt processing maximum batch size n_ubatch: Physical batch size + n_seq_max: max number of sequences (i.e. distinct states for recurrent models) n_threads: Number of threads to use for generation n_threads_batch: Number of threads to use for batch processing rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggml-org/llama.cpp/pull/2054 @@ -182,12 +184,12 @@ def __init__( yarn_beta_slow: YaRN high correction dim yarn_orig_ctx: YaRN original context size logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs. - embedding: Embedding mode only. + embeddings: Embedding mode only. if true, extract embeddings (together with logits) offload_kqv: Offload K, Q, V to GPU. + no_perf: Measure performance timings. op_offload: whether to offload host tensor operations to device swa_full: whether to use full-size SWA cache kv_unified: use single unified KV buffer for the KV cache of all sequences - no_perf: Measure performance timings. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. @@ -314,6 +316,7 @@ def __init__( self.model_params.kv_overrides = self._kv_overrides_array self.n_batch = min(n_ctx, n_batch) # ??? 
+ self.n_seq_max = n_seq_max self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() @@ -325,6 +328,7 @@ def __init__( self.context_params.n_ctx = n_ctx self.context_params.n_batch = self.n_batch self.context_params.n_ubatch = min(self.n_batch, n_ubatch) + self.context_params.n_seq_max = self.n_seq_max self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch self.context_params.rope_scaling_type = ( @@ -366,10 +370,15 @@ def __init__( yarn_beta_slow if yarn_beta_slow != 0.0 else 0 ) self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 + self._logits_all = logits_all if draft_model is None else True - self.context_params.embeddings = embedding # TODO: Rename to embeddings + + self.context_params.embeddings = embeddings self.context_params.offload_kqv = offload_kqv + if no_perf is not None: + self.context_params.no_perf = no_perf + if op_offload is not None: self.context_params.op_offload = op_offload diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index cbaca02483..6b252934fe 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -62,7 +62,7 @@ def __init__( - Set to -1 for all layers (recommended for best performance). **kwargs: Additional arguments passed to the Llama base class (e.g., n_batch, n_ctx, verbose). 
""" - kwargs["embedding"] = True + kwargs["embeddings"] = True kwargs["n_gpu_layers"] = n_gpu_layers kwargs["n_ctx"] = n_ctx kwargs["n_batch"] = n_batch diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 513a640e97..a802fe3209 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -278,6 +278,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: n_ctx=settings.n_ctx, n_batch=settings.n_batch, n_ubatch=settings.n_ubatch, + n_seq_max=settings.n_seq_max, n_threads=settings.n_threads, n_threads_batch=settings.n_threads_batch, rope_scaling_type=settings.rope_scaling_type, @@ -293,8 +294,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: yarn_orig_ctx=settings.yarn_orig_ctx, mul_mat_q=settings.mul_mat_q, logits_all=settings.logits_all, - embedding=settings.embedding, + embeddings=settings.embeddings, offload_kqv=settings.offload_kqv, + no_perf=settings.no_perf, op_offload=settings.op_offload, swa_full=settings.swa_full, kv_unified=settings.kv_unified, diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 6098d30b9a..7527ad9757 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -73,6 +73,9 @@ class ModelSettings(BaseSettings): n_ubatch: int = Field( default=512, ge=1, description="The physical batch size used by llama.cpp" ) + n_seq_max: int = Field( + default=1, ge=1, description="max number of sequences (i.e. 
distinct states for recurrent models)" + ) n_threads: int = Field( default=max(multiprocessing.cpu_count() // 2, 1), ge=1, @@ -112,10 +115,13 @@ class ModelSettings(BaseSettings): default=True, description="if true, use experimental mul_mat_q kernels" ) logits_all: bool = Field(default=True, description="Whether to return logits.") - embedding: bool = Field(default=False, description="Whether to use embeddings.") + embeddings: bool = Field(default=False, description="Whether to use embeddings.") offload_kqv: bool = Field( default=True, description="Whether to offload kqv to the GPU." ) + no_perf: bool = Field( + default=False, description="measure performance timings" + ) op_offload: bool = Field( default=True, description="Whether to offload host tensor operations to device" ) From 3f347005f2040610a5b4e9f2ccb8a7d23af22282 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 1 Jan 2026 21:28:09 +0800 Subject: [PATCH 068/518] docker(cuda_simple): upgrade to CUDA 12.8.1 and switch to source install - Upgrade base image to nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 - Optimize deps - Switch to git clone + pip install . 
for source building - Add documentation comments Signed-off-by: JamePeng --- docker/cuda_simple/Dockerfile | 40 +++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile index 0bbf20ffe9..b7eecbc21b 100644 --- a/docker/cuda_simple/Dockerfile +++ b/docker/cuda_simple/Dockerfile @@ -1,27 +1,45 @@ -ARG CUDA_IMAGE="12.5.0-devel-ubuntu22.04" +# Define the base image version with CUDA 12.8 and cuDNN for Ubuntu 22.04 +ARG CUDA_IMAGE="12.8.1-cudnn-devel-ubuntu22.04" FROM nvidia/cuda:${CUDA_IMAGE} # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 +# Install system dependencies: RUN apt-get update && apt-get upgrade -y \ - && apt-get install -y git build-essential \ - python3 python3-pip gcc wget \ - ocl-icd-opencl-dev opencl-headers clinfo \ - libclblast-dev libopenblas-dev \ - && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd + && apt-get install -y build-essential \ + ccache cmake curl gcc git wget \ + python3 python3-pip python3-wheel \ + libgomp1 libjpeg-dev libssl-dev libcurl4-openssl-dev +# Set the working directory for the container +WORKDIR /app + +# Copy the current directory contents into the container (useful for local config files or models) COPY . . 
-# setting build related env vars -ENV CUDA_DOCKER_ARCH=all +# Set the target GPU architecture (default allows CMake to auto-detect or use common archs) +ENV CUDA_DOCKER_ARCH=default ENV GGML_CUDA=1 +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" +# Enable verbose build output +ENV VERBOSE=1 +# Specific CUDA paths to ensure the compiler (nvcc) is found +ENV CUDA_HOME="/usr/local/cuda/" +ENV CUDA_PATH="/usr/local/cuda/:${PATH}" +ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda/" + +# Install Python build tools and server dependencies +RUN python3 -m pip install --upgrade pip pytest wheel packaging scikit-build setuptools fastapi uvicorn pydantic-settings sse-starlette starlette-context huggingface-hub + +# Clone the source code. --recursive is CRITICAL to fetch the 'llama.cpp' submodule +RUN git clone --recursive https://github.com/JamePeng/llama-cpp-python.git -# Install depencencies -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context +# Switch context to the cloned repository directory +WORKDIR /app/llama-cpp-python # Install llama-cpp-python (build with cuda) -RUN CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python +RUN CMAKE_ARGS="-DGGML_CUDA=on -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}" pip install . # Run the server CMD python3 -m llama_cpp.server From 8a96113e592f4c4552d308049a7ff72eeafa8270 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 1 Jan 2026 21:34:55 +0800 Subject: [PATCH 069/518] Fixed `embeddings` typos --- README.md | 2 +- llama_cpp/llama.py | 2 +- llama_cpp/llama_embedding.py | 2 +- tests/test_llama.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 878c48bb78..9b3c66d3ca 100644 --- a/README.md +++ b/README.md @@ -758,7 +758,7 @@ To generate embeddings, use the `LlamaEmbedding` class. 
It automatically configu ```python from llama_cpp.llama_embedding import LlamaEmbedding -# Initialize the model (automatically sets embedding=True) +# Initialize the model (automatically sets embeddings=True) llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1) # 1. Simple usage (OpenAI-compatible format) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2c7214d3f0..7eada4741f 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1118,7 +1118,7 @@ def embed( if self.context_params.embeddings is False: raise RuntimeError( - "Llama model must be created with embedding=True to call this method" + "Llama model must be created with embeddings=True to call this method" ) if self.verbose: diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 6b252934fe..69179cb0b5 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -28,7 +28,7 @@ class LlamaEmbedding(Llama): Inherits from the base Llama class but is optimized for vector operations. Key Features: - 1. Auto-configuration: Automatically sets embedding=True. + 1. Auto-configuration: Automatically sets embeddings=True. 2. Streaming Batch: Handles massive datasets without OOM (Out Of Memory). 3. Native Reranking Support: Specifically handles `LLAMA_POOLING_TYPE_RANK` models (like BGE-Reranker). / It correctly identifies classification heads to output scalar relevance scores instead of high-dimensional vectors. 
diff --git a/tests/test_llama.py b/tests/test_llama.py index 195b425793..6e7b84495f 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -232,9 +232,9 @@ def test_real_llama_embeddings(llama_cpp_model_path): n_threads=multiprocessing.cpu_count(), n_threads_batch=multiprocessing.cpu_count(), logits_all=False, - swa_full=True, + embeddings=True, kv_unified=True, - embedding=True + swa_full=True, ) # Smoke test for now model.embed("Hello World") From 965584307ff6cb13a431ca71cce1856533ff06aa Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 2 Jan 2026 05:29:36 +0800 Subject: [PATCH 070/518] Update Submodule vendor/llama.cpp 26831bd..ced765b --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 26831bded9..ced765be44 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 26831bded991d3fb31ca6b143af46eebb85f7e60 +Subproject commit ced765be44ce173c374f295b3c6f4175f8fd109b From 6f7fa1153a9755b9967570cfede9cb5f02743d8e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 2 Jan 2026 05:41:44 +0800 Subject: [PATCH 071/518] Update llama_vocab_pre_type varriable --- llama_cpp/llama_cpp.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3c410412e2..2f8e332462 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -139,6 +139,7 @@ # NOTE: Deprecated and will be removed in the future. 
(already gone in llama.cpp) +# https://github.com/ggml-org/llama.cpp/blob/master/src/llama-vocab.h#L10 # // pre-tokenization types # enum llama_vocab_pre_type { # LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, @@ -184,6 +185,8 @@ # LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40, # LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41, # LLAMA_VOCAB_PRE_TYPE_AFMOE = 42, +# LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43, +# LLAMA_VOCAB_PRE_TYPE_YOUTU = 44, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -228,6 +231,8 @@ LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40 LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41 LLAMA_VOCAB_PRE_TYPE_AFMOE = 42 +LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43 +LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 # // note: these values should be synchronized with ggml_rope From 3d0f716040b51f730ff34e3e65ef9e52c05bd123 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 2 Jan 2026 07:22:15 +0800 Subject: [PATCH 072/518] Update docker/README.md for cuda simple --- docker/README.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/docker/README.md b/docker/README.md index 474503fdfc..4ed1552717 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,18 +15,28 @@ docker run --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model//` is the full path to the model file on the Docker host system. +-------------------------------------------------------------------------- + ### cuda_simple + > [!WARNING] -> Nvidia GPU CuBLAS support requires an Nvidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker Nvidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+> **NVIDIA Container Toolkit**: You must have the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) installed on the host. The `12.8.1-cudnn-devel-ubuntu22.04` images currently in use generally include the necessary NVCC compilation environment.
+> **VRAM**: Ensure your GPU has enough VRAM to load the model. -A simple Dockerfile for CUDA-accelerated CuBLAS, where the model is located outside the Docker image: +A Dockerfile that builds `llama-cpp-python` from source (with CUDA 12.8 support) and runs an OpenAI-compatible API server. -``` +#### 1. Build +Note: The build process will compile the llama.cpp C++ backend, which may take several tens of minutes. +```bash cd ./cuda_simple docker build -t cuda_simple . -docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple ``` -where `/` is the full path to the model file on the Docker host system. +#### 2. Run +```bash +docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/app/models/ -v /path/to/your/models:/app/models -t cuda_simple +``` +`--gpus=all`: Enables GPU access.
+`-e MODEL=...`: Specifies the path to the model inside the container. -------------------------------------------------------------------------- From 4e3367fbe735f2265825cd5dd2aa36ca9c20743c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 3 Jan 2026 11:18:02 +0800 Subject: [PATCH 073/518] Update Submodule vendor/llama.cpp ced765b..18ddaea --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ced765be44..18ddaea2ae 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ced765be44ce173c374f295b3c6f4175f8fd109b +Subproject commit 18ddaea2aecf7fbfe7acab77465808f3cf6200d3 From c7721b4b99b5edc098c0b2e9773d64a8dd21e297 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 3 Jan 2026 14:55:15 +0800 Subject: [PATCH 074/518] Small fix for Llava15ChatHandler class --- llama_cpp/llama_chat_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 166459edbc..b8ff84e984 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2972,7 +2972,7 @@ def __call__( # Create input text structure input_text = self._mtmd_cpp.mtmd_input_text() input_text.text = text.encode('utf-8') - input_text.add_special = True + input_text.add_special = (llama.n_tokens == 0) input_text.parse_special = True # Create input chunks @@ -3058,7 +3058,7 @@ def __call__( llama._ctx.memory_seq_rm(0, n_past - 1, -1) if llama._ctx.memory_seq_pos_min(0) == llama._ctx.memory_seq_pos_max(0): n_past += 1 - llama.n_tokens = n_past + llama.n_tokens = n_past # Get prompt tokens to avoid a cache miss prompt = llama.input_ids[: llama.n_tokens].tolist() From db98c735e81db0664ea67b93a3fac1d6c32cdf16 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 4 Jan 2026 21:37:04 +0800 Subject: [PATCH 075/518] Update Submodule vendor/llama.cpp 18ddaea..cef1d23 --- vendor/llama.cpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 18ddaea2ae..cef1d23c5a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 18ddaea2aecf7fbfe7acab77465808f3cf6200d3 +Subproject commit cef1d23c5a33156c44a206c1f4bc146f4db729f9 From c7d53ced09ec54687e0dbbd7176dd9e5f05a4d3e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 4 Jan 2026 21:52:31 +0800 Subject: [PATCH 076/518] Bump version to 0.3.20 --- CHANGELOG.md | 7 +++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bc84bf490..ad501993ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.20] +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/cef1d23c5a33156c44a206c1f4bc146f4db729f9](https://github.com/ggml-org/llama.cpp/commit/cef1d23c5a33156c44a206c1f4bc146f4db729f9) +- feat: Update llama_context_params and fixed some embeddings typo +- [docker(cuda_simple): upgrade to CUDA 12.8.1 and switch to source install](https://github.com/JamePeng/llama-cpp-python/commit/3f347005f2040610a5b4e9f2ccb8a7d23af22282) +- update docker/README.md for cuda simple +- [Small fix for Llava15ChatHandler class](https://github.com/JamePeng/llama-cpp-python/commit/c7721b4b99b5edc098c0b2e9773d64a8dd21e297) + ## [0.3.19] - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/9b8329de7a7200385aaac16ab4a2ab79ae14d829](https://github.com/ggml-org/llama.cpp/commit/9b8329de7a7200385aaac16ab4a2ab79ae14d829) - feat: Sync llama.cpp llama/mtmd API Binding 20251230 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 72388c4e5d..83177c065d 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.19" +__version__ = "0.3.20" From 6b31e8d753a37d162e9d656a08a68c248d20ac74 Mon Sep 17 00:00:00 
2001 From: JamePeng Date: Mon, 5 Jan 2026 19:58:09 +0800 Subject: [PATCH 077/518] Synchronize Pillow documents with supported image MIME types for conversion, and remove image formats that are not supported for reading and conversion. --- CHANGELOG.md | 2 +- README.md | 36 ++++++++++++++++++++++++---------- llama_cpp/llama_chat_format.py | 2 +- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad501993ab..4a3045fbc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,7 +61,7 @@ More information see: https://github.com/JamePeng/llama-cpp-python/compare/67421 - feat: **Better Qwen3VL chat template. (by @alcoftTAO)** - feat: [Implement LlamaTrieCache into llama_cache.py](https://github.com/JamePeng/llama-cpp-python/commit/2419dc2d9bb0a6be0cd381038ce00fcaea124c76): Optimize LlamaCache lookup from **O(N)** to **O(K)** using a Trie, **improves retrieval speed at least 40x compared to the original linear scan method of finding the longest prefix , thereby enhancing service responsiveness.** - feat: Update Llava15ChatHandler to accept use_gpu, image_min_tokens, and image_max_tokens.Now can pass the`image_min_tokens`parameter in Qwen3VLChatHandler to support bbox grounding tasks. -- feat: [Add Pillow process code in _load_image for VLM](https://github.com/JamePeng/llama-cpp-python/commit/3b0133365e25840c023aef6b6c8578073cd081e8): that can reliably handle common formats, Supports 20+ image formats (PNG, JPEG, WebP, AVIF, HEIC, SVG, BMP, ICO, TIFF, etc.). Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background(white for dark content, black for bright content) +- feat: [Add Pillow process code in _load_image for VLM](https://github.com/JamePeng/llama-cpp-python/commit/3b0133365e25840c023aef6b6c8578073cd081e8): that can reliably handle common formats, Supports 20+ image formats (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, etc.). Images with alpha channel (PNG, WebP, etc.) 
→ automatically composites on white/black background(white for dark content, black for bright content). Support image format see here: [image-file-formats](https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html) - feat: Optimize CUDA Wheel Build Workflow, now workflow action support python3.10-3.13 cu124-cu126-cu128 Basic(Non AVX)-AVX2 win-linux diff --git a/README.md b/README.md index 9b3c66d3ca..514186337e 100644 --- a/README.md +++ b/README.md @@ -626,7 +626,9 @@ llm = Llama( ) # Comprehensive MIME type mapping (updated as of 2025) +# Based on Pillow 10.x+ "Fully Supported" (Read & Write) formats # Reference: IANA official media types + common real-world usage +# See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html _IMAGE_MIME_TYPES = { # Most common formats '.png': 'image/png', @@ -634,25 +636,39 @@ _IMAGE_MIME_TYPES = { '.jpeg': 'image/jpeg', '.gif': 'image/gif', '.webp': 'image/webp', - '.svg': 'image/svg+xml', - '.svgz': 'image/svg+xml', # Next-generation formats '.avif': 'image/avif', - '.heic': 'image/heic', - '.heif': 'image/heif', - '.heics': 'image/heic-sequence', - '.heifs': 'image/heif-sequence', + '.jp2': 'image/jp2', + '.j2k': 'image/jp2', + '.jpx': 'image/jp2', # Legacy / Windows formats '.bmp': 'image/bmp', - '.dib': 'image/bmp', '.ico': 'image/x-icon', - '.cur': 'image/x-icon', + '.pcx': 'image/x-pcx', + '.tga': 'image/x-tga', + '.icns': 'image/icns', - # Professional imaging + # Professional / Scientific imaging '.tif': 'image/tiff', '.tiff': 'image/tiff', + '.eps': 'application/postscript', + '.dds': 'image/vnd-ms.dds', + '.dib': 'image/dib', + '.sgi': 'image/sgi', + + # Portable Map formats (PPM/PGM/PBM) + '.pbm': 'image/x-portable-bitmap', + '.pgm': 'image/x-portable-graymap', + '.ppm': 'image/x-portable-pixmap', + + # Miscellaneous / Older formats + '.xbm': 'image/x-xbitmap', + '.mpo': 'image/mpo', + '.msp': 'image/msp', + '.im': 'image/x-pillow-im', + '.qoi': 'image/qoi', } def 
image_to_base64_data_uri( @@ -663,7 +679,7 @@ def image_to_base64_data_uri( """ Convert a local image file to a base64-encoded data URI with the correct MIME type. - Supports 20+ image formats (PNG, JPEG, WebP, AVIF, HEIC, SVG, BMP, ICO, TIFF, etc.). + Supports 20+ image formats (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, etc.). Args: file_path: Path to the image file on disk. diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index b8ff84e984..d8295b8d56 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3169,7 +3169,7 @@ def _load_image(image_url: str) -> bytes: - Remote images via HTTP/HTTPS (with proper User-Agent) - Data URIs (base64-encoded, e.g., data:image/png;base64,...) - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background - - Any format that Pillow can open + - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html Returns: JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. 
From 33a62bb43bfebcf71cbd8dae1d7a7bfc105ee87e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 5 Jan 2026 20:11:06 +0800 Subject: [PATCH 078/518] Update Submodule vendor/llama.cpp cef1d23..f1768d8 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index cef1d23c5a..f1768d8f03 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit cef1d23c5a33156c44a206c1f4bc146f4db729f9 +Subproject commit f1768d8f03fe514794349790de0785eafded6c0b From ee8c925c188cb96b50574ca2e62c976c79a397d7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 5 Jan 2026 23:13:06 +0800 Subject: [PATCH 079/518] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 514186337e..2f72a3773b 100644 --- a/README.md +++ b/README.md @@ -775,7 +775,7 @@ To generate embeddings, use the `LlamaEmbedding` class. It automatically configu from llama_cpp.llama_embedding import LlamaEmbedding # Initialize the model (automatically sets embeddings=True) -llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1) +llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1, pooling_type=LLAMA_POOLING_TYPE_NONE) # 1. 
Simple usage (OpenAI-compatible format) response = llm.create_embedding("Hello, world!") @@ -855,6 +855,9 @@ This is useful for optimizing storage or preparing vectors for cosine similarity ```python from llama_cpp.llama_embedding import NORM_MODE_MAX_INT16, NORM_MODE_TAXICAB, NORM_MODE_EUCLIDEAN +# Initialize the model (automatically sets embeddings=True) +llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1, pooling_type=LLAMA_POOLING_TYPE_NONE) + # Taxicab (L1) vec_l1 = llm.embed("text", normalize=NORM_MODE_TAXICAB, n_gpu_layers=-1) From bf09ae5e5a698708ce133b33740fa177868795fd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 5 Jan 2026 23:13:50 +0800 Subject: [PATCH 080/518] Update llama.cpp API 20260105 --- llama_cpp/llama_cpp.py | 266 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 252 insertions(+), 14 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 2f8e332462..348ffb969e 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -779,6 +779,23 @@ class llama_model_params(ctypes.Structure): llama_model_params_p = ctypes.POINTER(llama_model_params) + +# struct llama_sampler_seq_config { +# llama_seq_id seq_id; +# struct llama_sampler * sampler; +# }; +class llama_sampler_seq_config(ctypes.Structure): + if TYPE_CHECKING: + seq_id: llama_seq_id + sampler: ctypes.c_void_p + _fields_ = [ + ("seq_id", llama_seq_id), + ("sampler", ctypes.c_void_p), + ] + +llama_sampler_seq_config_p = ctypes.POINTER(llama_sampler_seq_config) + + # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations # // https://github.com/ggml-org/llama.cpp/pull/7544 # struct llama_context_params { @@ -826,6 +843,11 @@ class llama_model_params(ctypes.Structure): # bool kv_unified; // use a unified buffer across the input sequences when computing the attention # // try to disable when n_seq_max > 1 for improved performance when the sequences 
do not share a large prefix # // ref: https://github.com/ggml-org/llama.cpp/pull/14363 +# // [EXPERIMENTAL] +# // backend sampler chain configuration (make sure the caller keeps the sampler chains alive) +# // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) +# struct llama_sampler_seq_config * samplers; +# size_t n_samplers; # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -861,6 +883,8 @@ class llama_context_params(ctypes.Structure): op_offload(bool): whether to offload host tensor operations to device swa_full(bool): whether to use full-size SWA cache kv_unified(bool): use a unified buffer across the input sequences when computing the attention + samplers(llama_sampler_seq_config *): the samplers must be sampler chains (i.e. use llama_sampler_chain_init) + n_samplers(size_t): number of sampler chains """ if TYPE_CHECKING: @@ -894,6 +918,8 @@ class llama_context_params(ctypes.Structure): op_offload:bool swa_full:bool kv_unified:bool + samplers: ctypes.c_void_p + n_samplers: int _fields_ = [ ("n_ctx", ctypes.c_uint32), @@ -926,6 +952,8 @@ class llama_context_params(ctypes.Structure): ("op_offload", ctypes.c_bool), ("swa_full", ctypes.c_bool), ("kv_unified", ctypes.c_bool), + ("samplers", llama_sampler_seq_config_p), + ("n_samplers", ctypes.c_size_t), ] llama_context_params_p = ctypes.POINTER(llama_context_params) @@ -1757,9 +1785,9 @@ def llama_model_has_decoder(model: llama_model_p, /) -> bool: # // to the decoder to start generating output sequence. For other models, it returns -1.
# LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model); @ctypes_function( - "llama_model_decoder_start_token", [llama_model_p_ctypes], ctypes.c_int32 + "llama_model_decoder_start_token", [llama_model_p_ctypes], llama_token ) -def llama_model_decoder_start_token(model: llama_model_p, /) -> int: +def llama_model_decoder_start_token(model: llama_model_p, /) -> ctypes.c_int32: """For encoder-decoder models, this function returns id of the token that must be provided to the decoder to start generating output sequence. For other models, it returns -1. """ @@ -3027,6 +3055,132 @@ def llama_get_embeddings_seq( ... +# // +# // backend sampling API [EXPERIMENTAL] +# // note: use only if the llama_context was created with at least one llama_sampler_seq_config +# // + +# // Get the backend sampled token for the ith token. +# // Returns LLAMA_TOKEN_NULL if no token was sampled. +# LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_token_ith", + [llama_context_p_ctypes, ctypes.c_int32], + llama_token, +) +def llama_get_sampled_token_ith( + ctx: llama_context_p, i: ctypes.c_int32, / +) -> ctypes.c_int32: + """ + Get the backend sampled token for the ith token. + Returns LLAMA_TOKEN_NULL if no token was sampled. + """ + ... + + +# // Get the backend sampled probabilites for the ith token +# // The index matches llama_get_sampled_token_ith(). +# // Returns NULL if no probabilites were generated. +# LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_probs_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.POINTER(ctypes.c_float), +) +def llama_get_sampled_probs_ith( + ctx: llama_context_p, i: ctypes.c_int32, / +) -> CtypesArray[ctypes.c_float]: + """ + Get the backend sampled probabilites for the ith token + The index matches llama_get_sampled_token_ith(). 
+ Returns NULL if no probabilites were generated. + """ + ... + + +# LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_probs_count_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.c_uint32, +) +def llama_get_sampled_probs_count_ith( + ctx: llama_context_p, i: ctypes.c_int32, / +) -> ctypes.c_uint32: + """ + Get the backend sampled probabilites count for the ith token + """ + ... + + +# // Get the backend sampled logits for the ith token +# // Returns NULL if no logits were sampled. +# LLAMA_API float * llama_get_sampled_logits_ith (struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_logits_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.POINTER(ctypes.c_float), +) +def llama_get_sampled_logits_ith( + ctx: llama_context_p, i: ctypes.c_int32, / +) -> CtypesArray[ctypes.c_float]: + """ + Get the backend sampled logits for the ith token + Returns NULL if no logits were sampled. + """ + ... + + +# LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_logits_count_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.c_uint32, +) +def llama_get_sampled_logits_count_ith( + ctx: llama_context_p, i: ctypes.c_int32, / +) -> ctypes.c_uint32: + """ + Get the backend sampled logits count for the ith token + """ + ... + + +# // Get the backend sampled candidates (token ids) for the ith token +# // These are needed to map probability/logit indices to vocab token ids. +# // Returns NULL if no candidates were sampled. 
+# LLAMA_API llama_token * llama_get_sampled_candidates_ith (struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_candidates_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.POINTER(llama_token), +) +def llama_get_sampled_candidates_ith( + ctx: llama_context_p, i: ctypes.c_int32, / +) -> CtypesArray[llama_token]: + """ + Get the backend sampled candidates (token ids) for the ith token + These are needed to map probability/logit indices to vocab token ids. + Returns NULL if no candidates were sampled. + """ + ... + + +# LLAMA_API uint32_t llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_candidates_count_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.c_uint32, +) +def llama_get_sampled_candidates_count_ith( + ctx: llama_context_p, i: ctypes.c_int32, / +) -> ctypes.c_uint32: + """ + Get the backend sampled candidates (token ids) count for the ith token + """ + ... + + # // # // Vocab # // @@ -3734,13 +3888,32 @@ def llama_chat_builtin_templates( # // # // llama_sampler_free(smpl); # // -# // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU). 
-# // # typedef void * llama_sampler_context_t; llama_sampler_context_t = ctypes.c_void_p +# struct llama_sampler_data { +# struct ggml_tensor * logits; +# struct ggml_tensor * probs; +# struct ggml_tensor * sampled; +# struct ggml_tensor * candidates; +# }; +class llama_sampler_data(ctypes.Structure): + if TYPE_CHECKING: + logits: ctypes.c_void_p + probs: ctypes.c_void_p + sampled: ctypes.c_void_p + candidates: ctypes.c_void_p + + _fields_ = [ + ("logits", ctypes.c_void_p), + ("probs", ctypes.c_void_p), + ("sampled", ctypes.c_void_p), + ("candidates", ctypes.c_void_p), + ] + + # // user code can implement the interface below in order to create custom llama_sampler # struct llama_sampler_i { # const char * (*name) (const struct llama_sampler * smpl); // can be NULL @@ -3749,17 +3922,38 @@ def llama_chat_builtin_templates( # void (*reset) ( struct llama_sampler * smpl); // can be NULL # struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL # void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL -# -# // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph -# //void (*apply_ggml) (struct llama_sampler * smpl, ...); + +# // [EXPERIMENTAL] +# // backend sampling interface: + +# // return true if the backend supports all ops needed by the sampler +# // note: call once per sampler +# bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft); + +# // call after .backend_apply() +# void (*backend_accept)( +# struct llama_sampler * smpl, +# struct ggml_context * ctx, +# struct ggml_cgraph * gf, +# struct ggml_tensor * selected_token); + +# // call after .backend_init() +# void (*backend_apply)( +# struct llama_sampler * smpl, +# struct ggml_context * ctx, +# struct ggml_cgraph * gf, +# struct llama_sampler_data * data); + +# // called before graph execution to set inputs for the current ubatch +# void (*backend_set_input)(struct llama_sampler * 
smpl); # }; class llama_sampler_i(ctypes.Structure): ... # struct llama_sampler { -# const struct llama_sampler_i * iface; -# llama_sampler_context_t ctx; +# struct llama_sampler_i * iface; +# llama_sampler_context_t ctx; # }; class llama_sampler(ctypes.Structure): _fields_ = [ @@ -3776,12 +3970,19 @@ class llama_sampler(ctypes.Structure): llama_sampler_i_name = ctypes.CFUNCTYPE(ctypes.c_char_p, llama_sampler_p_ctypes) llama_sampler_i_accept = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes, llama_token) llama_sampler_i_apply = ctypes.CFUNCTYPE( - None, llama_sampler_p_ctypes, llama_token_data_array_p -) + None, llama_sampler_p_ctypes, llama_token_data_array_p) llama_sampler_i_reset = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) llama_sampler_i_clone = ctypes.CFUNCTYPE(llama_sampler_p_ctypes, llama_sampler_p_ctypes) llama_sampler_i_free = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) +llama_sampler_i_backend_init = ctypes.CFUNCTYPE( + ctypes.c_bool, llama_sampler_p_ctypes, ctypes.c_void_p) +llama_sampler_i_backend_accept = ctypes.CFUNCTYPE( + None, llama_sampler_p_ctypes, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p) +llama_sampler_i_backend_apply = ctypes.CFUNCTYPE( + None, llama_sampler_p_ctypes, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p) +llama_sampler_i_backend_set_input = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) + llama_sampler_i._fields_ = [ ("name", llama_sampler_i_name), ("accept", llama_sampler_i_accept), @@ -3789,18 +3990,43 @@ class llama_sampler(ctypes.Structure): ("reset", llama_sampler_i_reset), ("clone", llama_sampler_i_clone), ("free", llama_sampler_i_free), + ("backend_init", llama_sampler_i_backend_init), + ("backend_accept", llama_sampler_i_backend_accept), + ("backend_apply", llama_sampler_i_backend_apply), + ("backend_set_input", llama_sampler_i_backend_set_input), ] +# // [EXPERIMENTAL] +# // attach a sampler to the context +# // note: prefer initializing the context with llama_context_params.samplers when possible +# // 
note: changing the samplers of a context can cause graph reallocations and degraded performance +# LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl); +@ctypes_function( + "llama_set_sampler", + [llama_context_p_ctypes, llama_seq_id, llama_sampler_p_ctypes], + ctypes.c_bool, +) +def llama_set_sampler( + ctx: llama_context_p, seq_id: llama_seq_id, smpl: llama_sampler_p, / +) -> ctypes.c_bool: + """ + attach a sampler to the context + note: prefer initializing the context with llama_context_params.samplers when possible + note: changing the samplers of a context can cause graph reallocations and degraded performance + """ + ... + + # // mirror of llama_sampler_i: -# LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx); +# LLAMA_API struct llama_sampler * llama_sampler_init ( struct llama_sampler_i * iface, llama_sampler_context_t ctx); @ctypes_function( "llama_sampler_init", [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t], llama_sampler_p_ctypes, ) def llama_sampler_init( - iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, / + iface: ctypes.pointer(llama_sampler_i), ctx: llama_sampler_context_t, / ) -> llama_sampler_p: ... @@ -3892,7 +4118,12 @@ def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): ... 
-# LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); +# // return NULL if: +# // - the sampler is NULL +# // - the sampler is not a llama_sampler_chain +# // - the index is out of bounds, unless i == -1 +# // - if i == -1, returns the chain itself (can be used to check if the sampler is a chain) +# LLAMA_API struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i); @ctypes_function( "llama_sampler_chain_get", [llama_sampler_p_ctypes, ctypes.c_int32], @@ -3901,6 +4132,13 @@ def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): def llama_sampler_chain_get( chain: llama_sampler_p, i: Union[ctypes.c_int32, int], / ) -> llama_sampler_p: + """ + return NULL if: + - the sampler is NULL + - the sampler is not a llama_sampler_chain + - the index is out of bounds, unless i == -1 + - if i == -1, returns the chain itself (can be used to check if the sampler is a chain) + """ ... From 626a7ed33ae917d0517d7088c14a800d8f029e8f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 5 Jan 2026 23:20:53 +0800 Subject: [PATCH 081/518] Update README.md --- README.md | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2f72a3773b..ef7d97d1bc 100644 --- a/README.md +++ b/README.md @@ -772,7 +772,7 @@ print(res["choices"][0]["message"]["content"]) To generate embeddings, use the `LlamaEmbedding` class. It automatically configures the model for vector generation. 
```python -from llama_cpp.llama_embedding import LlamaEmbedding +from llama_cpp.llama_embedding import LlamaEmbedding, LLAMA_POOLING_TYPE_NONE # Initialize the model (automatically sets embeddings=True) llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1, pooling_type=LLAMA_POOLING_TYPE_NONE) @@ -793,8 +793,13 @@ print(f"Generated {len(embeddings)} vectors.") You can request raw arrays or cosine similarity matrices directly: ```python +from llama_cpp.llama_embedding import LlamaEmbedding, LLAMA_POOLING_TYPE_NONE + +# Initialize the model (automatically sets embeddings=True) +llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1, pooling_type=LLAMA_POOLING_TYPE_NONE) + # Returns raw List[float] instead of a dictionary wrapper -vector = llm.create_embedding("Text", output_format="array", n_gpu_layers=-1) +vector = llm.create_embedding("Text", output_format="array") # Returns a similarity matrix (A @ A.T) in the response # Note: Requires numpy installed @@ -813,12 +818,12 @@ Reranking models (like `bge-reranker`) take a **Query** and a list of **Document ```python import llama_cpp -from llama_cpp.llama_embedding import LlamaEmbedding +from llama_cpp.llama_embedding import LlamaEmbedding, LLAMA_POOLING_TYPE_RANK # Initialize a Reranking model ranker = LlamaEmbedding( model_path="path/to/bge-reranker-v2-m3.gguf", - pooling_type=llama_cpp.LLAMA_POOLING_TYPE_RANK, # Crucial for Rerankers! + pooling_type=LLAMA_POOLING_TYPE_RANK, # Crucial for Rerankers! n_gpu_layers=-1, ) @@ -853,22 +858,27 @@ The `embed` method supports various mathematical normalization strategies via th This is useful for optimizing storage or preparing vectors for cosine similarity search (which requires L2 normalization). 
```python -from llama_cpp.llama_embedding import NORM_MODE_MAX_INT16, NORM_MODE_TAXICAB, NORM_MODE_EUCLIDEAN +from llama_cpp.llama_embedding import ( + LLAMA_POOLING_TYPE_NONE, + NORM_MODE_MAX_INT16, + NORM_MODE_TAXICAB, + NORM_MODE_EUCLIDEAN +) # Initialize the model (automatically sets embeddings=True) llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1, pooling_type=LLAMA_POOLING_TYPE_NONE) # Taxicab (L1) -vec_l1 = llm.embed("text", normalize=NORM_MODE_TAXICAB, n_gpu_layers=-1) +vec_l1 = llm.embed("text", normalize=NORM_MODE_TAXICAB) # Default is Euclidean (L2) - Standard for vector databases -vec_l2 = llm.embed("text", normalize=NORM_MODE_EUCLIDEAN, n_gpu_layers=-1) +vec_l2 = llm.embed("text", normalize=NORM_MODE_EUCLIDEAN) # Max Absolute Int16 - Useful for quantization/compression -vec_int16 = llm.embed("text", normalize=NORM_MODE_MAX_INT16, n_gpu_layers=-1) +vec_int16 = llm.embed("text", normalize=NORM_MODE_MAX_INT16) # Raw Output (No Normalization) - Get the raw floating point values from the model -embeddings_raw = llm.embed(["search query", "document text"], normalize=NORM_MODE_NONE, n_gpu_layers=-1) +embeddings_raw = llm.embed(["search query", "document text"], normalize=NORM_MODE_NONE) ``` ### Legacy Usage (Deprecated) From 8a2de8641372b8b66bbad3925661d65f61dd8728 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 6 Jan 2026 06:40:35 +0800 Subject: [PATCH 082/518] Update Submodule vendor/llama.cpp f1768d8..e443fbc --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f1768d8f03..e443fbcfa5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f1768d8f03fe514794349790de0785eafded6c0b +Subproject commit e443fbcfa51a8a27b15f949397ab94b5e87b2450 From 5ea1285e9d450428e17304273dcfc1d382a8b90b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 6 Jan 2026 08:43:32 +0800 Subject: [PATCH 083/518] Update llama.cpp API 20260106 --- 
llama_cpp/_internals.py | 9 +++++++++ llama_cpp/llama.py | 28 ++++++++++++++++++++++++++++ llama_cpp/llama_cpp.py | 6 ++++++ 3 files changed, 43 insertions(+) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index ac8e179212..4e68904c7e 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -92,6 +92,15 @@ def n_cls_out(self) -> int: def n_embd(self) -> int: return llama_cpp.llama_model_n_embd(self.model) + def n_embd_inp(self) -> int: + return llama_cpp.llama_model_n_embd_inp(self.model) + + def n_embd_out(self) -> int: + return llama_cpp.llama_model_n_embd_out(self.model) + + def n_layer(self) -> int: + return llama_cpp.llama_model_n_layer(self.model) + def n_head(self) -> int: return llama_cpp.llama_model_n_head(self.model) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7eada4741f..c3760b924a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2351,14 +2351,42 @@ def n_ctx(self) -> int: """Return the context window size.""" return self._ctx.n_ctx() + def n_ctx_train(self) -> int: + """Return the training context window size.""" + return self._model.n_ctx_train() + def n_embd(self) -> int: """Return the embedding size.""" return self._model.n_embd() + def n_embd_inp(self) -> int: + """Return the input embedding size.""" + return self._model.n_embd_inp() + + def n_embd_out(self) -> int: + """Return the output embedding size.""" + return self._model.n_embd_out() + + def n_layer(self) -> int: + """Return the n_layer value.""" + return self._model.n_layer() + + def n_head(self) -> int: + """Return the head size.""" + return self._model.n_head() + def n_head_kv(self) -> int: """Return the head_kv size.""" return self._model.n_head_kv() + def n_swa(self) -> int: + """Return the swa size.""" + return self._model.n_swa() + + def n_params(self) -> int: + """Returns the total number of parameters in the model""" + return self._model.n_params() + def n_vocab(self) -> int: """Return the vocabulary size.""" return 
self._model.n_vocab() diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 348ffb969e..16621c8a82 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1562,6 +1562,12 @@ def llama_model_n_embd_inp(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model); +@ctypes_function("llama_model_n_embd_out", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_embd_out(model: llama_model_p, /) -> int: + ... + + # LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_layer(model: llama_model_p, /) -> int: From 2125d02c5603ce98f98f4311de61b1c87fc6913d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 7 Jan 2026 03:05:42 +0800 Subject: [PATCH 084/518] Update Submodule vendor/llama.cpp e443fbc..68b4d51 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e443fbcfa5..68b4d516c3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e443fbcfa51a8a27b15f949397ab94b5e87b2450 +Subproject commit 68b4d516c305325d31e698c4673b691d2a9d879f From 80279c411ea47210122ad04a63305c07e4c71cd6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 7 Jan 2026 06:29:46 +0800 Subject: [PATCH 085/518] feat: Add Granite-Docling model support with 'GraniteDoclingChatHandler' - Add support for 'controls' parameter to guide model parsing behavior (e.g., Document Parsing: {"mode": "document_parsing", "format": "json"}). 
- Format(512x512): Content Signed-off-by: JamePeng --- README.md | 1 + llama_cpp/llama_chat_format.py | 87 ++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/README.md b/README.md index ef7d97d1bc..3f0f88ffef 100644 --- a/README.md +++ b/README.md @@ -498,6 +498,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | | [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | | [glm4.6v](https://huggingface.co/unsloth/GLM-4.6V-Flash-GGUF) | `GLM46VChatHandler` | `glm4.6v` | +| [granite-docling](https://huggingface.co/ibm-granite/granite-docling-258M-GGUF) | `GraniteDoclingChatHandler` | `granite-docling` | | [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index d8295b8d56..5370209395 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3901,6 +3901,93 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class GraniteDoclingChatHandler(Llava15ChatHandler): + """ + Handler for Granite-Docling models. + + Format(512x512): Content + + Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!! + Since the model does not have special tokens for the start and end of an image, + it is recommended to process only one image at a time. + You can iterate through the images individually for recognition. 
+ + """ + GRANITE_BOS_TOKEN = "<|start_of_role|>" + GRANITE_EOS_TOKEN = "<|end_of_text|>" + GRANITE_PAD_TOKEN = "<|end_of_text|>" + GRANITE_IMAGE_TOKEN = "" + + CHAT_FORMAT = ( + "{%- for message in messages -%}" + "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}" + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + "{%- for part in message['content'] -%}" + "{%- if part['type'] == 'text' -%}" + "{{- part['text'] -}}" + "{%- elif part['type'] == 'image_url' -%}" + "{%- if part.image_url is string -%}" + "{{- part.image_url -}}" + "{%- else -%}" + "{{- part.image_url.url -}}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- '<|end_of_text|>\n' -}}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{- '<|start_of_role|>assistant' -}}" + # Support the 'controls' parameter if present in generation arguments + "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}" + "{{- '<|end_of_role|>' -}}" + "{%- endif -%}" + ) + + def __init__(self, controls: dict = None, **kwargs): + """ + Granite-Docling Handler + Args: + controls (dict, optional): Operational parameters passed to the assistant role. + + The 'controls' parameter is used to guide the model's behavior or output format. 
+ Common examples for 'controls' include: + - Document Parsing: {"mode": "document_parsing", "format": "json"} + """ + self.controls = controls + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject controls into the template environment + self.extra_template_arguments["controls"] = self.controls + self.DEFAULT_SYSTEM_MESSAGE = None + kwargs['stop'] = [self.GRANITE_EOS_TOKEN] + + llama = kwargs['llama'] + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"GraniteDoclingChatHandler - Cleared state, processing {image_count} images", file=sys.stderr) + except Exception: + print(f"GraniteDoclingChatHandler - Cleared state", file=sys.stderr) + + return super().__call__(**kwargs) + + class LFM2VLChatHandler(Llava15ChatHandler): LFM2VL_BOS_TOKEN = "<|startoftext|>" LFM2VL_EOS_TOKEN = "<|im_end|>" From e9d94e7a785423ee9b19b63b57bce6b8d2331bba Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 8 Jan 2026 05:29:43 +0800 Subject: [PATCH 086/518] Update Submodule vendor/llama.cpp 68b4d51..55abc39 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 68b4d516c3..55abc39355 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 68b4d516c305325d31e698c4673b691d2a9d879f +Subproject commit 55abc393552f3f2097f168cb6db4dc495a514d56 From 4d0fe249fd3ff1454c5c3e35360f4368ffafbda8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 8 Jan 2026 21:23:28 +0800 Subject: [PATCH 087/518] Update llama.cpp API 20260108 --- examples/low_level_api/common.py | 1 + .../low_level_api/low_level_api_chat_cpp.py | 1 + llama_cpp/llama.py | 3 +++ llama_cpp/llama_cpp.py | 4 ++++ 
llama_cpp/server/model.py | 4 ++++ llama_cpp/server/settings.py | 24 +++++++++++++++---- tests/test_llama.py | 1 + 7 files changed, 34 insertions(+), 4 deletions(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 0cad6c980c..599c78e13b 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -62,6 +62,7 @@ class GptParams: penalize_nl: bool = True perplexity: bool = False use_mmap: bool = True + use_direct_io: bool = True use_mlock: bool = False mem_test: bool = False verbose_prompt: bool = False diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index f055232a55..7aa80ccec4 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -78,6 +78,7 @@ def __init__(self, params: GptParams) -> None: self.lparams.memory_f16 = self.params.memory_f16 self.lparams.use_mlock = self.params.use_mlock self.lparams.use_mmap = self.params.use_mmap + self.lparams.use_direct_io = self.params.use_direct_io self.model = llama_cpp.llama_load_model_from_file( self.params.model.encode("utf8"), self.lparams diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c3760b924a..d5a0d06e4b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -69,6 +69,7 @@ def __init__( tensor_split: Optional[List[float]] = None, vocab_only: bool = False, use_mmap: bool = True, + use_direct_io: bool = True, use_mlock: bool = False, check_tensors: bool = False, use_extra_bufts: bool = False, @@ -256,6 +257,7 @@ def __init__( self.model_params.tensor_split = self._c_tensor_split self.model_params.vocab_only = vocab_only self.model_params.use_mmap = use_mmap if lora_path is None else False + self.model_params.use_direct_io = use_direct_io self.model_params.use_mlock = use_mlock self.model_params.check_tensors = check_tensors self.model_params.use_extra_bufts = use_extra_bufts @@ -2248,6 +2250,7 @@ def 
__getstate__(self): tensor_split=self.tensor_split, vocab_only=self.model_params.vocab_only, use_mmap=self.model_params.use_mmap, + use_direct_io=self.model_params.use_direct_io, use_mlock=self.model_params.use_mlock, check_tensors=self.model_params.check_tensors, use_extra_bufts=self.model_params.use_extra_bufts, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 16621c8a82..95abec62ba 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -713,6 +713,7 @@ class llama_model_tensor_buft_override(ctypes.Structure): # // Keep the booleans together to avoid misalignment during copy-by-value. # bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible +# bool use_direct_io; // use direct io, takes precedence over use_mmap # bool use_mlock; // force system to keep model in RAM # bool check_tensors; // validate model tensor data # bool use_extra_bufts; // use extra buffer types (used for weight repacking) @@ -734,6 +735,7 @@ class llama_model_params(ctypes.Structure): kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data vocab_only (bool): only load the vocabulary, no weights use_mmap (bool): use mmap if possible + use_direct_io(bool): use direct io, takes precedence over use_mmap use_mlock (bool): force system to keep model in RAM check_tensors (bool): validate model tensor data use_extra_bufts (bool): use extra buffer types (used for weight repacking) @@ -752,6 +754,7 @@ class llama_model_params(ctypes.Structure): kv_overrides: CtypesArray[llama_model_kv_override] vocab_only: bool use_mmap: bool + use_direct_io: bool use_mlock: bool check_tensors: bool use_extra_bufts: bool @@ -770,6 +773,7 @@ class llama_model_params(ctypes.Structure): ("kv_overrides", ctypes.POINTER(llama_model_kv_override)), ("vocab_only", ctypes.c_bool), ("use_mmap", ctypes.c_bool), + ("use_direct_io", ctypes.c_bool), ("use_mlock", ctypes.c_bool), ("check_tensors", ctypes.c_bool), 
("use_extra_bufts", ctypes.c_bool), diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index a802fe3209..62d491a76f 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -270,7 +270,11 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: tensor_split=settings.tensor_split, vocab_only=settings.vocab_only, use_mmap=settings.use_mmap, + use_direct_io=settings.use_direct_io, use_mlock=settings.use_mlock, + check_tensors=settings.check_tensors, + use_extra_bufts=settings.use_extra_bufts, + no_host=settings.no_host, kv_overrides=kv_overrides, rpc_servers=settings.rpc_servers, # Context Params diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 7527ad9757..a4b67391e0 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -47,12 +47,28 @@ class ModelSettings(BaseSettings): default=False, description="Whether to only return the vocabulary." ) use_mmap: bool = Field( - default=llama_cpp.llama_supports_mmap(), - description="Use mmap.", + default=True, + description="Enable mmap to use filesystem cache.", + ) + use_direct_io: bool = Field( + default=True, + description="Use direct io, takes precedence over use_mmap.", ) use_mlock: bool = Field( - default=llama_cpp.llama_supports_mlock(), - description="Use mlock.", + default=False, + description="Use mlock for force system to keep model in RAM", + ) + check_tensors: bool = Field( + default=False, + description="Validate model tensor data.", + ) + use_extra_bufts: bool = Field( + default=True, + description="Use extra buffer types (used for weight repacking).", + ) + no_host: bool = Field( + default=False, + description="Bypass host buffer allowing extra buffers to be used.", ) kv_overrides: Optional[List[str]] = Field( default=None, diff --git a/tests/test_llama.py b/tests/test_llama.py index 6e7b84495f..ad2651b562 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -75,6 +75,7 @@ def 
test_real_model(llama_cpp_model_path): params = llama_cpp.llama_model_default_params() params.use_mmap = llama_cpp.llama_supports_mmap() + params.use_direct_io = True params.use_mlock = llama_cpp.llama_supports_mlock() params.check_tensors = False From 1fecb0555a1b5fe97bfcca3a949aa6351a041c54 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 10 Jan 2026 00:20:50 +0800 Subject: [PATCH 088/518] Update Submodule vendor/llama.cpp 55abc39..ec8fd78 --- llama_cpp/llama_cpp.py | 1 + vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 95abec62ba..dec0c60a05 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -4183,6 +4183,7 @@ def llama_sampler_init_greedy() -> llama_sampler_p: ... +# /// seed == LLAMA_DEFAULT_SEED to use a random seed. # LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes) def llama_sampler_init_dist(seed: int) -> llama_sampler_p: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 55abc39355..ec8fd7876b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 55abc393552f3f2097f168cb6db4dc495a514d56 +Subproject commit ec8fd7876b6195bd8582eba4f1debb23d13fde81 From 457d711a343e63d4ba05c6eccb189e7dc5af9125 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 10 Jan 2026 07:13:33 +0800 Subject: [PATCH 089/518] Update llama_cpp.llama_detokenize API and remove unused comment --- llama_cpp/llama_cpp.py | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index dec0c60a05..f1b1871d37 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -3710,23 +3710,6 @@ def llama_token_to_piece( ... 
-# # // check if token0 is contained as a prefix in token1 -# # LLAMA_API bool llama_token_is_prefix( -# # const struct llama_model * model, -# # llama_token token0, -# # llama_token token1); -# @ctypes_function( -# "llama_token_is_prefix", -# [llama_model_p_ctypes, llama_token, llama_token], -# ctypes.c_bool, -# ) -# def llama_token_is_prefix( -# model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], / -# ) -> bool: -# """Check if token0 is contained as a prefix in token1""" -# ... - - # /// @details Convert the provided tokens into text (inverse of llama_tokenize()). # /// @param text The char pointer must be large enough to hold the resulting text. # /// @return Returns the number of chars/bytes on success, no more than text_len_max. @@ -3734,17 +3717,17 @@ def llama_token_to_piece( # /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so. # /// @param unparse_special If true, special tokens are rendered in the output. # LLAMA_API int32_t llama_detokenize( -# const struct llama_model * model, -# const llama_token * tokens, -# int32_t n_tokens, +# const struct llama_vocab * vocab, +# const llama_token * tokens, +# int32_t n_tokens, # char * text, -# int32_t text_len_max, +# int32_t text_len_max, # bool remove_special, # bool unparse_special); @ctypes_function( "llama_detokenize", [ - llama_model_p_ctypes, + llama_vocab_p_ctypes, llama_token_p, ctypes.c_int32, ctypes.c_char_p, @@ -3755,7 +3738,7 @@ def llama_token_to_piece( ctypes.c_int32, ) def llama_detokenize( - model: llama_model_p, + vocab: llama_vocab_p, tokens: CtypesArray[llama_token], n_tokens: Union[ctypes.c_int, int], text: bytes, @@ -3767,13 +3750,14 @@ def llama_detokenize( """Convert the provided tokens into text (inverse of llama_tokenize()). Args: - model: The model to use for tokenization. + vocab: The model vocab to use for tokenization. tokens: The tokens to convert. n_tokens: The number of tokens. 
text: The buffer to write the text to. text_len_max: The length of the buffer. remove_special: Allow to remove BOS and EOS tokens if model is configured to do so. - unparse_special: If true, special tokens are rendered in the output.""" + unparse_special: If true, special tokens are rendered in the output. + """ ... From 79f31a974a07124b11c4257bea9d84c5b6d59abc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 10 Jan 2026 07:59:04 +0800 Subject: [PATCH 090/518] perf: optimize tokenization and detokenization logic Refactor `tokenize`, `token_to_piece`, and `detokenize` methods in `_internals.py` to significantly reduce Python overhead and improve stability. Key changes: - Replace O(N) Python loops in `detokenize` with `llama.cpp` native batch C-API (`llama_detokenize`). - Implement dynamic buffer allocation to safely handle arbitrary token lengths (removing the hardcoded 32-byte limit). - Add automatic buffer resizing for `tokenize` to prevent overflow errors. Performance observations (based on user benchmarks): - Small Batch Processing (127 tokens): Latency reduced from ~117ms to ~37ms (approx. 3.1x speedup in processing loop). - Large Batch Processing (2420 tokens): Throughput improved from ~6905 t/s to ~8258 t/s. - General Latency: Total execution time for standard chat scenarios reduced by ~1.1s (from 8.4s to 7.3s). Signed-off-by: JamePeng --- llama_cpp/_internals.py | 87 +++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 4e68904c7e..f07d6377cc 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -200,46 +200,85 @@ def get_add_sep(self) -> bool: # Tokenization def tokenize(self, text: bytes, add_bos: bool, special: bool): - n_ctx = self.n_ctx_train() - tokens = (llama_cpp.llama_token * n_ctx)() + """ + Tokenize a string. + Optimized to use dynamic buffer allocation. 
+ """ + n_tokens_alloc = len(text) + 2 + tokens = (llama_cpp.llama_token * n_tokens_alloc)() + n_tokens = llama_cpp.llama_tokenize( - self.vocab, text, len(text), tokens, n_ctx, add_bos, special + self.vocab, text, len(text), tokens, n_tokens_alloc, add_bos, special ) + + # If the buffer is insufficient (returns a negative number), reallocate the buffer. if n_tokens < 0: - n_tokens = abs(n_tokens) - tokens = (llama_cpp.llama_token * n_tokens)() + n_tokens_alloc = -n_tokens + tokens = (llama_cpp.llama_token * n_tokens_alloc)() n_tokens = llama_cpp.llama_tokenize( - self.vocab, text, len(text), tokens, n_tokens, add_bos, special + self.vocab, text, len(text), tokens, n_tokens_alloc, add_bos, special ) if n_tokens < 0: raise RuntimeError( f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' ) + + # return a buffer of n_tokens size. return list(tokens[:n_tokens]) def token_to_piece(self, token: int, special: bool = False) -> bytes: - buf = ctypes.create_string_buffer(32) - llama_cpp.llama_token_to_piece(self.vocab, token, buf, 32, 0, special) - return bytes(buf) + """ + Convert a single token to bytes. + Optimized to handle dynamic resizing for ultra-long tokens. + """ + size = 32 + buf = (ctypes.c_char * size)() + n = llama_cpp.llama_token_to_piece(self.vocab, token, buf, size, 0, special) + + # If the token is very long (returns a negative number), redistribute it according to the returned size. + if n < 0: + size = -n + buf = (ctypes.c_char * size)() + n = llama_cpp.llama_token_to_piece(self.vocab, token, buf, size, 0, special) + if n < 0: + raise RuntimeError(f"Failed to get piece for token {token}") + + # return a buffer of n size. 
+ return bytes(buf[:n]) def detokenize(self, tokens: List[int], special: bool = False) -> bytes: - output = b"" - size = 32 - buffer = (ctypes.c_char * size)() - for token in tokens: - n = llama_cpp.llama_token_to_piece( - self.vocab, llama_cpp.llama_token(token), buffer, size, 0, special - ) - assert n <= size - output += bytes(buffer[:n]) - # NOTE: Llama1 models automatically added a space at the start of the prompt - # this line removes a leading space if the first token is a beginning of sentence token - return ( - output[1:] - if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b" " - else output + """ + Convert a list of tokens to bytes. + Optimized to handle dynamic resizing for ultra-long tokens. + """ + if not tokens: + return b"" + + n_tokens = len(tokens) + # Convert a Python list to a C int array + tokens_array = (llama_cpp.llama_token * n_tokens)(*tokens) + + # Initial buffer size estimation + buffer_size = max(n_tokens, 64) + buffer = (ctypes.c_char * buffer_size)() + + n_chars = llama_cpp.llama_detokenize( + self.vocab, tokens_array, n_tokens, buffer, buffer_size, False, special ) + # If the buffer is insufficient, expand it and retry. 
+ if n_chars < 0: + buffer_size = -n_chars + buffer = (ctypes.c_char * buffer_size)() + n_chars = llama_cpp.llama_detokenize( + self.vocab, tokens_array, n_tokens, buffer, buffer_size, False, special + ) + if n_chars < 0: + raise RuntimeError("Failed to detokenize") + + return bytes(buffer[:n_chars]) + + # Extra def metadata(self) -> Dict[str, str]: metadata: Dict[str, str] = {} From 6d3cfc34f8c95357d366aec3929d377a87fa65bd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 11 Jan 2026 08:29:18 +0800 Subject: [PATCH 091/518] Fixed test_llama.py --- tests/test_llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_llama.py b/tests/test_llama.py index ad2651b562..93719b4b2e 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -36,14 +36,14 @@ def test_llama_cpp_tokenization(): assert tokens[0] == llama.token_bos() assert tokens == [1, 15043, 2787] detokenized = llama.detokenize(tokens) - assert detokenized == text + assert detokenized[1:] == text tokens = llama.tokenize(text, add_bos=False) assert tokens[0] != llama.token_bos() assert tokens == [15043, 2787] detokenized = llama.detokenize(tokens) - assert detokenized != text + assert detokenized == text text = b"Hello World" tokens = llama.tokenize(text) From 3a2e49c97b8a9fc7e719d8a861d47f4d8b04de8c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 11 Jan 2026 08:29:35 +0800 Subject: [PATCH 092/518] Update Submodule vendor/llama.cpp ec8fd78..b137718 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ec8fd7876b..b137718878 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ec8fd7876b6195bd8582eba4f1debb23d13fde81 +Subproject commit b1377188784f9aea26b8abde56d4aee8c733eec7 From d92d2796a8162dfd7fcb0d3bfef1ccd7b92da1bc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 11 Jan 2026 10:49:41 +0800 Subject: [PATCH 093/518] Bump version to 0.3.21 Signed-off-by: JamePeng 
--- CHANGELOG.md | 28 +++++++++++++++++++++++++--- llama_cpp/__init__.py | 2 +- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a3045fbc5..d9456371fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.21] +- perf: optimize tokenization and detokenization logic + - Refactor `tokenize`, `token_to_piece`, and `detokenize` methods in `_internals.py` to significantly reduce Python loop overhead and improve the batch-processing performance and stability of `load`/`prompt-eval`. + + - Key changes: + - Replace `token_to_piece` O(N) Python loops in `detokenize` with `llama.cpp` native batch C-API (`llama_detokenize`). + - Implement dynamic buffer allocation to safely handle arbitrary token lengths (removing the hardcoded 32-byte limit). + - Add automatic buffer resizing for `tokenize` to prevent overflow errors. + + - Performance observations (based on simple benchmarks): + - Small Batch Processing (127 tokens): + Latency reduced from ~117ms to ~37ms (approx. 3.1x speedup in processing loop). + - Large Batch Processing (2420 tokens): + Throughput improved from ~6905 t/s to ~8258 t/s. + - General Latency: + Total execution time for standard chat scenarios reduced by ~1.1s (from 8.4s to 7.3s). 
+ - The comparative test results are here: https://github.com/JamePeng/llama-cpp-python/issues/47#issuecomment-3731055087 + +- feat: Add `Granite-Docling` multimodal support with `GraniteDoclingChatHandler` +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/b1377188784f9aea26b8abde56d4aee8c733eec7](https://github.com/ggml-org/llama.cpp/commit/b1377188784f9aea26b8abde56d4aee8c733eec7) +- feat: Sync llama.cpp llama/mtmd API Binding 20260110 + ## [0.3.20] - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/cef1d23c5a33156c44a206c1f4bc146f4db729f9](https://github.com/ggml-org/llama.cpp/commit/cef1d23c5a33156c44a206c1f4bc146f4db729f9) - feat: Update llama_context_params and fixed some embeddings typo @@ -37,9 +59,9 @@ More information see: https://github.com/JamePeng/llama-cpp-python/compare/2efaa ## [0.3.18] - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787](https://github.com/ggml-org/llama.cpp/commit/ce734a8a2f9fb6eb4f0383ab1370a1b0014ab787) - feat: Sync llama.cpp llama/mtmd API Binding 20251215 -- feat: **implement `GLM46VChatHandler` for GLM-4.6V Series Model** -- feat: **implement `LFM2VLChatHandler` for LFM2-VL series models** -- feat: **implement `GLM41VChatHandler` for GLM-4.1V-9B-Thinking Model** +- feat: **implement `GLM46VChatHandler` for GLM-4.6V Series Multimodal Model** +- feat: **implement `LFM2VLChatHandler` for LFM2-VL series Multimodal Model** +- feat: **implement `GLM41VChatHandler` for GLM-4.1V-9B-Thinking Multimodal Model** - workflow: Added workflows for compiling with CUDA 13.0.2 on Windows and Linux. - feat: Added the scan path for CUDA 13.0+ dynamic link libraries under Windows system ($env:CUDA_PATH\bin\x64) - Optimization: Improved batch token processing logic in Llava15ChatHandler.
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 83177c065d..fbad5c28b2 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.20" +__version__ = "0.3.21" From 1aabc16ae98852323c967b72a4f4242541d0d00f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 12 Jan 2026 00:11:26 +0800 Subject: [PATCH 094/518] Update README.md --- README.md | 83 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 3f0f88ffef..b297ade462 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) [![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) -[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) +![GitHub Tag](https://img.shields.io/github/v/tag/JamePeng/llama-cpp-python) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Downloads](https://static.pepy.tech/badge/llama-cpp-python/month)](https://pepy.tech/projects/llama-cpp-python) @@ -35,31 +35,31 @@ Requirements: - Python 3.9+ - C compiler - Linux: gcc or clang - - Windows: Visual Studio or MinGW + - Windows: [`Visual Studio 2022 Build Tools`](https://download.visualstudio.microsoft.com/download/pr/6efb3484-905b-485c-8b5f-9d3a5f39e731/07908cd6d91e75b8ea4339d8f2cfa6e8d8bb4fd706af7b918ae391cd6fc2a066/vs_BuildTools.exe) or `MinGW` - MacOS: Xcode + - CMake 3.21+ + - Git To install the package, run: - -```bash -pip 
install llama-cpp-python -``` +- Method 1: + ```bash + pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" + ``` +- Method 2: + ```bash + git clone https://github.com/JamePeng/llama-cpp-python --recursive + cd llama-cpp-python + python -m pip install -U pip + pip install . + ``` This will also build `llama.cpp` from source and install it alongside this python package. If this fails, add `--verbose` to the `pip install` see the full cmake build log. -**Pre-built Wheel (New)** - -It is also possible to install a pre-built wheel with basic CPU support. - -```bash -pip install llama-cpp-python \ - --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu -``` - ### Installation Configuration -`llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list. +`llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp build docs](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) for a full list. All `llama.cpp` cmake build options can be set via the `CMAKE_ARGS` environment variable or via the `--config-settings / -C` cli flag during installation. @@ -69,13 +69,13 @@ All `llama.cpp` cmake build options can be set via the `CMAKE_ARGS` environment ```bash # Linux and Mac CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \ - pip install llama-cpp-python + pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` ```powershell # Windows $env:CMAKE_ARGS = "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" -pip install llama-cpp-python +pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ```
@@ -86,7 +86,7 @@ They can also be set via `pip install -C / --config-settings` command and saved ```bash pip install --upgrade pip # ensure pip is up to date -pip install llama-cpp-python \ +pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" \ -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS" ``` @@ -103,22 +103,25 @@ llama-cpp-python -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS" Below are some common backends, their build commands and any additional environment variables required.
-OpenBLAS (CPU) +CUDA -To install with OpenBLAS, set the `GGML_BLAS` and `GGML_BLAS_VENDOR` environment variables before installing: +Installing a CUDA-supported version requires the `CUDA Toolkit` environment to be installed first. -```bash -CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python -``` -
+**Note: Select and install the CUDA Toolkit version that matches your operating system and graphics card model, so that the build targets your local environment.** -
-CUDA +See here: https://developer.nvidia.com/cuda-toolkit-archive -To install with CUDA support, set the `GGML_CUDA=on` environment variable before installing: +Then, set the `GGML_CUDA=on` environment variable before installing: ```bash -CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python +# Linux +CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +```powershell +# Windows +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` **Pre-built Wheel (New)** @@ -135,13 +138,23 @@ https://github.com/JamePeng/llama-cpp-python/releases
+
+OpenBLAS (CPU) + +To install with OpenBLAS, set the `GGML_BLAS` and `GGML_BLAS_VENDOR` environment variables before installing: + +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` +
+
Metal To install with Metal (MPS), set the `GGML_METAL=on` environment variable before installing: ```bash -CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python +CMAKE_ARGS="-DGGML_METAL=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` **Pre-built Wheel (New)** @@ -164,7 +177,7 @@ pip install llama-cpp-python \ To install with hipBLAS / ROCm support for AMD cards, set the `GGML_HIPBLAS=on` environment variable before installing: ```bash -CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python +CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ```
@@ -179,7 +192,7 @@ CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python Then install with Vulkan support by set the `GGML_VULKAN=on` environment variable before installing: ```bash -CMAKE_ARGS="-DGGML_VULKAN=on" pip install llama-cpp-python +CMAKE_ARGS="-DGGML_VULKAN=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` @@ -191,7 +204,7 @@ To install with SYCL support, set the `GGML_SYCL=on` environment variable before ```bash source /opt/intel/oneapi/setvars.sh -CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python +CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` @@ -202,7 +215,7 @@ To install with RPC support, set the `GGML_RPC=on` environment variable before i ```bash source /opt/intel/oneapi/setvars.sh -CMAKE_ARGS="-DGGML_RPC=on" pip install llama-cpp-python +CMAKE_ARGS="-DGGML_RPC=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` @@ -245,7 +258,7 @@ Otherwise, while installing it will build the llama.cpp x86 version which will b Try installing with ```bash -CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir llama-cpp-python +CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` From b5492ad0100c099bd92ccbe3c4321348ae209efd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 13 Jan 2026 06:32:12 +0800 Subject: [PATCH 095/518] Update Submodule vendor/llama.cpp b137718..e4832e3 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 
b137718878..e4832e3ae4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b1377188784f9aea26b8abde56d4aee8c733eec7 +Subproject commit e4832e3ae4d58ac0ecbdbf4ae055424d6e628c9f From ec59b63a9eee73a5a0daed55693e10bd7e787478 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 14 Jan 2026 02:07:48 +0800 Subject: [PATCH 096/518] Update LlamaModel API in _internals.py --- llama_cpp/_internals.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index f07d6377cc..03a762e995 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -113,6 +113,24 @@ def n_swa(self) -> int: def n_params(self) -> int: return llama_cpp.llama_model_n_params(self.model) + def has_encoder(self) -> bool: + return llama_cpp.llama_model_has_encoder(self.model) + + def has_decoder(self) -> bool: + return llama_cpp.llama_model_has_decoder(self.model) + + def decoder_start_token(self) -> int: + return llama_cpp.llama_model_decoder_start_token(self.model) + + def is_recurrent(self) -> bool: + return llama_cpp.llama_model_is_recurrent(self.model) + + def is_hybrid(self) -> bool: + return llama_cpp.llama_model_is_hybrid(self.model) + + def is_diffusion(self) -> bool: + return llama_cpp.llama_model_is_diffusion(self.model) + def rope_freq_scale_train(self) -> float: return llama_cpp.llama_model_rope_freq_scale_train(self.model) From a43d904dbac45b87d4086b096779b139fb52a34e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 14 Jan 2026 02:19:28 +0800 Subject: [PATCH 097/518] Add initialization checks to the Encoder-Decoder architecture. 
--- llama_cpp/llama.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d5a0d06e4b..3bba707028 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -420,6 +420,25 @@ def __init__( ) ) + # Check for Encoder-Decoder architecture + self._has_encoder = self._model.has_encoder() + self._has_decoder = self._model.has_decoder() + self._decoder_start_token_id = -1 + + if self._has_encoder: + try: + self._decoder_start_token_id = self._model.decoder_start_token() + except AttributeError: + # LLAMA_TOKEN_NULL = -1 + self._decoder_start_token_id = -1 + + if self._decoder_start_token_id == -1: + # Fallback to BOS if specific start token is not defined + self._decoder_start_token_id = self.token_bos() + + if self.verbose: + print(f"Model is Encoder-Decoder. Decoder start token: {self._decoder_start_token_id}", file=sys.stderr) + # Override tokenizer self.tokenizer_ = tokenizer or LlamaTokenizer(self) From 5a926525adf66acbb8e1ad296f2fe5801733dd1d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 14 Jan 2026 02:43:59 +0800 Subject: [PATCH 098/518] CMakelists.txt: Disable LLAMA_BUILD_EXAMPLES and LLAMA_BUILD_TESTS to reduce unnecessary options and speed up compilation. 
--- CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca53b2ff63..555fb3c522 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,10 +66,16 @@ if (LLAMA_BUILD) set(CMAKE_SKIP_RPATH FALSE) # Enable building of the common library - set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE) + set(LLAMA_BUILD_COMMON ON CACHE BOOL "llama.cpp: build common utils library" FORCE) + + # Disable building of examples + set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama.cpp: build examples" FORCE) + + # Disable building of tests + set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama.cpp: build tests" FORCE) # Disable building curl support - set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) + set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: use libcurl to download model from an URL" FORCE) # Enable build and link OpenSSL set(LLAMA_OPENSSL ON CACHE BOOL "llama.cpp: build and link OpenSSL" FORCE) From 4053cde0e0464cdf09895f3fde72b862680109e4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 14 Jan 2026 21:01:27 +0800 Subject: [PATCH 099/518] Update Submodule vendor/llama.cpp e4832e3..3e4bb29 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e4832e3ae4..3e4bb29666 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e4832e3ae4d58ac0ecbdbf4ae055424d6e628c9f +Subproject commit 3e4bb2966685facd549ac99bde1e02633e024920 From fb062808533079cd4d51aac17cf2b690e5832756 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 14 Jan 2026 21:04:59 +0800 Subject: [PATCH 100/518] Update llama_vocab_pre_type enum variables --- llama_cpp/llama_cpp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index f1b1871d37..8dbbd1b455 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -187,6 +187,7 @@ # 
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42, # LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43, # LLAMA_VOCAB_PRE_TYPE_YOUTU = 44, +# LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -231,8 +232,9 @@ LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40 LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41 LLAMA_VOCAB_PRE_TYPE_AFMOE = 42 -LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43, -LLAMA_VOCAB_PRE_TYPE_YOUTU = 44, +LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43 +LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 +LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 # // note: these values should be synchronized with ggml_rope From b89ee564d6aaa79bc348ac6c6e997def1a8a6ae4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 15 Jan 2026 19:56:35 +0800 Subject: [PATCH 101/518] Update Submodule vendor/llama.cpp 3e4bb29..be8e3d9 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 3e4bb29666..be8e3d9515 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 3e4bb2966685facd549ac99bde1e02633e024920 +Subproject commit be8e3d9515870a63b250c203d263f9dd2232eb3c From 736f2a12d181eaf504dd39f7fbc6a666ab7e61cd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 16 Jan 2026 03:19:41 +0800 Subject: [PATCH 102/518] Update llama.cpp API 20260116 --- llama_cpp/llama_cpp.py | 47 +++++++++++++++++++++++++++++++++++++++++- llama_cpp/mtmd_cpp.py | 12 ++++++++--- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 8dbbd1b455..114577d219 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1960,7 +1960,8 @@ def llama_adapter_meta_val_str_by_index( # // Manually free a LoRA adapter # // Note: loaded adapters will be free when the associated model is deleted -# LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); +# LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter), +# "adapters are now 
freed together with the associated model"); @ctypes_function( "llama_adapter_lora_free", [llama_adapter_lora_p_ctypes], @@ -4418,6 +4419,50 @@ def llama_sampler_init_dry( ... +# /// adaptive-p: select tokens near a configurable target probability over time. +# /// +# /// the adaptive-p sampler transforms the token probability distribution to favor tokens +# /// that fall near a user-configurable probability target. +# /// +# /// internally, the sampler maintains an exponential moving average of the *ORIGINAL* +# /// probabilities of selected tokens at each sampling step. it uses this EMA to compute an +# /// adapted target probability at each sampling step, thus maintaining the desired target +# /// probability over time. +# /// +# /// adaptive-p selects a token ID rather than just mutating candidates, so it must be last +# /// in the sampler chain (like mirostat, dist, greedy). +# /// +# /// only mild truncation before this sampler is recommended. we suggest applying min-p +# /// before adaptive-p as the only other active sampler in the chain. +# /// +# /// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) +# /// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) +# /// @param seed RNG seed +# /// +# /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 +# /// +# LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p( +# float target, +# float decay, +# uint32_t seed); +@ctypes_function( + "llama_sampler_init_adaptive_p", + [ + ctypes.c_float, + ctypes.c_float, + ctypes.c_uint32, + ], + llama_sampler_p_ctypes, +) +def llama_sampler_init_adaptive_p( + target: float, + decay: float, + seed: int, + /, +) -> llama_sampler_p: + ... 
+ + # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( # int32_t n_vocab, # int32_t n_logit_bias, diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index e2ee004dea..758678a8de 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -33,7 +33,8 @@ ) from llama_cpp._ggml import ( - ggml_log_callback + ggml_backend_sched_eval_callback, + ggml_log_callback, ) if TYPE_CHECKING: @@ -146,10 +147,14 @@ class clip_context_params(Structure): # const char * media_marker; # enum llama_flash_attn_type flash_attn_type; # bool warmup; // whether to run a warmup encode pass after initialization - +# # // limit number of image tokens, only for vision models with dynamic resolution # int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) # int image_max_tokens; // maximum number of tokens for image input (default: read from metadata) +# +# // callback function passed over to mtmd proper +# ggml_backend_sched_eval_callback cb_eval; +# void * cb_eval_user_data; # }; class mtmd_context_params(Structure): _fields_ = [ @@ -162,9 +167,10 @@ class mtmd_context_params(Structure): ("warmup", c_bool), ("image_min_tokens", c_int), ("image_max_tokens", c_int), + ("cb_eval", ggml_backend_sched_eval_callback), + ("cb_eval_user_data", c_void_p), ] -mtmd_context_params_p = NewType("mtmd_context_params_p", int) mtmd_context_params_p_ctypes = POINTER(mtmd_context_params) # MTMD_API const char * mtmd_default_marker(void); From 99e7ece91a9765ae31922c2bc79f5be1e22bf61e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 16 Jan 2026 04:00:00 +0800 Subject: [PATCH 103/518] Add support for `adaptive_p` and `infill` samplers and optimize the sampler logic. 
Signed-off-by: JamePeng --- llama_cpp/_internals.py | 17 +++++ llama_cpp/llama.py | 110 ++++++++++++++++++++++++++++----- llama_cpp/llama_chat_format.py | 72 +++++++++++++++++++++ 3 files changed, 182 insertions(+), 17 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 03a762e995..0e7d152bf7 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1208,6 +1208,19 @@ def add_dry( ) self._add_sampler(sampler) + def add_adaptive_p( + self, + target: float, + decay: float, + seed: int, + ): + sampler = llama_cpp.llama_sampler_init_adaptive_p( + target, + decay, + seed + ) + self._add_sampler(sampler) + def add_logit_bias( self, n_vocab: int, logit_bias: Dict[int, float] ): @@ -1221,6 +1234,10 @@ def add_logit_bias( sampler = llama_cpp.llama_sampler_init_logit_bias(n_vocab, len(logit_bias), logit_bias_array) self._add_sampler(sampler) + def add_infill(self, model: LlamaModel): + sampler = llama_cpp.llama_sampler_init_infill(model.vocab) + self._add_sampler(sampler) + def add_custom( self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] ): diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 3bba707028..2c62fd86e9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -764,6 +764,10 @@ def _init_sampler( dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], penalize_nl: bool = True, + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, @@ -773,18 +777,6 @@ def _init_sampler( if logit_bias is not None: sampler.add_logit_bias(self.n_vocab(), logit_bias) - sampler.add_penalties( - n_vocab=self._n_vocab, - special_eos_id=self._token_eos, - linefeed_id=self._token_nl, - penalty_last_n=self.last_n_tokens_size, - penalty_repeat=repeat_penalty, - 
penalty_freq=frequency_penalty, - penalty_present=presence_penalty, - penalize_nl=penalize_nl, - ignore_eos=False, - ) - if grammar is not None: sampler.add_grammar(self._model, grammar) @@ -794,6 +786,7 @@ def _init_sampler( sampler.add_greedy() else: if mirostat_mode == 1: + sampler.add_temp(temp) mirostat_m = 100 sampler.add_mirostat( self._n_vocab, @@ -803,6 +796,7 @@ def _init_sampler( mirostat_m, ) elif mirostat_mode == 2: + sampler.add_temp(temp) sampler.add_mirostat_v2( self._seed, mirostat_tau, @@ -811,15 +805,33 @@ def _init_sampler( else: n_probs = 0 min_keep = max(1, n_probs) + sampler.add_dry(self._model, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, dry_seq_breakers) sampler.add_top_k(top_k) - sampler.add_typical(typical_p, min_keep) - sampler.add_top_n_sigma(top_n_sigma) sampler.add_top_p(top_p, min_keep) + sampler.add_top_n_sigma(top_n_sigma) sampler.add_min_p(min_p, min_keep) - sampler.add_temp(temp) - sampler.add_dist(self._seed) sampler.add_xtc(xtc_probability, xtc_threshold, min_keep, self._seed) - sampler.add_dry(self._model, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, dry_seq_breakers) + sampler.add_typical(typical_p, min_keep) + sampler.add_temp(temp) + if use_infill: + sampler.add_infill(self._model) + sampler.add_penalties( + n_vocab=self._n_vocab, + special_eos_id=self._token_eos, + linefeed_id=self._token_nl, + penalty_last_n=self.last_n_tokens_size, + penalty_repeat=repeat_penalty, + penalty_freq=frequency_penalty, + penalty_present=presence_penalty, + penalize_nl=penalize_nl, + ignore_eos=False, + ) + if use_adaptive_p: + # only if user explicitly included adaptive-p sampler + sampler.add_adaptive_p(adaptive_target,adaptive_decay, self._seed) + else: + # default: sample from distribution + sampler.add_dist(self._seed) return sampler def sample( @@ -844,6 +856,10 @@ def sample( dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], penalize_nl: bool = True, + 
adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, @@ -887,6 +903,10 @@ def sample( dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, penalize_nl=penalize_nl, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, @@ -924,6 +944,10 @@ def generate( dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], penalize_nl: bool = True, + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, @@ -971,6 +995,10 @@ def generate( dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, penalize_nl=penalize_nl, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, @@ -1034,6 +1062,10 @@ def generate( logits_processor=logits_processor, grammar=grammar, penalize_nl=penalize_nl, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, idx=sample_idx, ) @@ -1265,6 +1297,10 @@ def _create_completion( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, model: Optional[str] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, 
logit_bias: Optional[Dict[int, float]] = None, @@ -1464,6 +1500,10 @@ def logit_bias_processor( presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, stopping_criteria=stopping_criteria, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, @@ -1900,6 +1940,10 @@ def create_completion( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, model: Optional[str] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, logit_bias: Optional[Dict[int, float]] = None, @@ -1936,6 +1980,10 @@ def create_completion( dry_allowed_length: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2` dry_penalty_last_n: How many tokens to scan for repetitions. Default: `0`, where `0` is disabled and `-1` is context size. dry_seq_breakers: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']` + adaptive_target: Adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled). Default: `-1.0` [(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) + adaptive_decay: Adaptive-p: decay rate for target adaptation over time. Lower values are more reactive, higher values are more stable (valid range 0.0 to 0.99). Default: `0.9` + use_adaptive_p: If true, use the adaptive-p sampler instead of the default `dist` sampler for the final token selection. Default: `False`
+ use_infill: Determines whether to activate the specialized fill-in-the-middle sampler that consolidates probabilities of tokens sharing common prefixes to ensure the generated text coherently bridges the gap between the prefix and suffix. model: The name to use for the model in the completion object. stopping_criteria: A list of stopping criteria to use. logit_bias: A logit bias to use. @@ -1977,6 +2025,10 @@ def create_completion( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, stopping_criteria=stopping_criteria, logit_bias=logit_bias, @@ -2018,6 +2070,10 @@ def __call__( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, model: Optional[str] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, logit_bias: Optional[Dict[int, float]] = None, @@ -2054,6 +2110,10 @@ def __call__( dry_allowed_length: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2` dry_penalty_last_n: How many tokens to scan for repetitions. Default: `0`, where `0` is disabled and `-1` is context size. dry_seq_breakers: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']` + adaptive_target: Adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled). Default: `-1.0` [(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) + adaptive_decay: Adaptive-p: decay rate for target adaptation over time. Lower values are more reactive, higher values are more stable
(valid range 0.0 to 0.99). Default: `0.9` + use_adaptive_p: If true, use the adaptive-p sampler instead of the default `dist` sampler for the final token selection. Default: `False` + use_infill: Determines whether to activate the specialized fill-in-the-middle sampler that consolidates probabilities of tokens sharing common prefixes to ensure the generated text coherently bridges the gap between the prefix and suffix. model: The name to use for the model in the completion object. stopping_criteria: A list of stopping criteria to use. logit_bias: A logit bias to use. @@ -2095,6 +2155,10 @@ def __call__( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, stopping_criteria=stopping_criteria, logit_bias=logit_bias, @@ -2133,6 +2197,10 @@ def create_chat_completion( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, model: Optional[str] = None, logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, @@ -2174,6 +2242,10 @@ def create_chat_completion( dry_allowed_length: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2` dry_penalty_last_n: How many tokens to scan for repetitions. Default: `0`, where `0` is disabled and `-1` is context size. dry_seq_breakers: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted.
Default: `['\n', ':', '"', '*']` + adaptive_target: Adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled). Default: `-1.0` [(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) + adaptive_decay: Adaptive-p: decay rate for target adaptation over time. Lower values are more reactive, higher values are more stable (valid range 0.0 to 0.99). Default: `0.9` + use_adaptive_p: If true, use the adaptive-p sampler instead of the default `dist` sampler for the final token selection. Default: `False` + use_infill: Determines whether to activate the specialized fill-in-the-middle sampler that consolidates probabilities of tokens sharing common prefixes to ensure the generated text coherently bridges the gap between the prefix and suffix. model: The name to use for the model in the completion object. logit_bias: A logit bias to use. logits_processor: A list of logits processors to use. @@ -2220,6 +2292,10 @@ def create_chat_completion( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logit_bias=logit_bias, logits_processor=logits_processor, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 5370209395..627560fe93 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -117,6 +117,10 @@ def __call__( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, logprobs: Optional[bool] = None, @@ -610,6 +614,10 @@ def chat_completion_handler( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers:
list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, @@ -720,6 +728,10 @@ def chat_completion_handler( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, stopping_criteria=stopping_criteria, @@ -1483,6 +1495,10 @@ def functionary_chat_handler( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, @@ -1696,6 +1712,10 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=grammar, @@ -1784,6 +1804,10 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, ) # type: ignore @@ -1863,6 +1887,10 @@ def functionary_v1_v2_chat_handler( dry_allowed_length: int = 2, dry_penalty_last_n:int 
= 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, @@ -2086,6 +2114,10 @@ def prepare_messages_for_inference( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=grammar, @@ -2156,6 +2188,10 @@ def create_completion(prompt, stop, grammar): dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=grammar, @@ -2914,6 +2950,10 @@ def __call__( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, @@ -3147,6 +3187,10 @@ def __call__( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=grammar, @@ -4289,6 +4333,10 @@ def chatml_function_calling( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float 
= -1.0, + adaptive_decay : float = 0.9, + use_adaptive_p: bool = False, + use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, @@ -4427,6 +4475,10 @@ def chatml_function_calling( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=grammar, @@ -4487,6 +4539,10 @@ def chatml_function_calling( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=grammar, @@ -4538,6 +4594,10 @@ def chatml_function_calling( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=llama_grammar.LlamaGrammar.from_string( @@ -4573,6 +4633,10 @@ def chatml_function_calling( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=llama_grammar.LlamaGrammar.from_string( @@ -4627,6 +4691,10 @@ def chatml_function_calling( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + 
use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=grammar, @@ -4663,6 +4731,10 @@ def chatml_function_calling( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_adaptive_p=use_adaptive_p, + use_infill=use_infill, model=model, logits_processor=logits_processor, grammar=llama_grammar.LlamaGrammar.from_string( From be6277e2f5a57ac2c2013a8cceec18acc403d180 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 16 Jan 2026 20:33:21 +0800 Subject: [PATCH 104/518] Update Submodule vendor/llama.cpp be8e3d9..6ba6a3c --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index be8e3d9515..6ba6a3c76f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit be8e3d9515870a63b250c203d263f9dd2232eb3c +Subproject commit 6ba6a3c76f1981017bff8a8c0b8857e88db1cdb9 From ed502c568cee106fbc67feace9e31481ca954895 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 17 Jan 2026 01:25:44 +0800 Subject: [PATCH 105/518] perf (TFFT): Optimize `longest_token_prefix` with Numpy SIMD and fast-fail probe - Vectorization: Replaced standard Python zip loop with Numpy SIMD comparison for high-performance context matching. - Fast Exit: Added an O(1) probe check (a[0] != b[0]) to eliminate Numpy conversion overhead on mismatches. Making the "Time To First Token" virtually instantaneous for cached sessions. - Memory Optimization: Only the intersection of the two sequences (`[:min_len]`) is converted to Numpy arrays, minimizing memory allocation. - Result: Achieved ~5x speedup (129ms -> 25ms) in KV cache reuse scenarios and ~2.5x speedup (554.23ms -> 201.62ms) in load time while maintaining stability on long contexts. This change significantly reduces latency in RAG and chat applications on long contexts. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 66 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2c62fd86e9..16ff9c53b4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1006,12 +1006,7 @@ def generate( # Check for kv cache prefix match if reset and self.n_tokens > 0: - longest_prefix = 0 - for a, b in zip(self._input_ids, tokens[:-1]): - if a == b: - longest_prefix += 1 - else: - break + longest_prefix = self.longest_token_prefix(self._input_ids.tolist(), tokens[:-1]) if longest_prefix > 0: reset = False tokens = tokens[longest_prefix:] @@ -2551,14 +2546,57 @@ def logits_to_logprobs( return subtract_maxs - out @staticmethod - def longest_token_prefix(a: Sequence[int], b: Sequence[int]): - longest_prefix = 0 - for _a, _b in zip(a, b): - if _a == _b: - longest_prefix += 1 - else: - break - return longest_prefix + def longest_token_prefix(current_ids: Sequence[int], new_tokens: Sequence[int]) -> int: + """ + Calculates the length of the longest common prefix between two token sequences. + + This implementation uses NumPy for vectorized comparison (SIMD), which offers + significant performance improvements (up to 2x~100x+ speedup) over standard Python + loops for long contexts (e.g., RAG or chat history). + + Args: + current_ids: The existing token sequence (e.g., KV cache). + new_tokens: The new input token sequence. + + Returns: + int: The number of matching tokens from the start. + """ + # Fast exit for empty sequences to avoid unnecessary processing + if not current_ids or not new_tokens: + return 0 + + # Probe inspection: Use Python to quickly compare the first token + # If the tokens are different from the beginning, return immediately to avoid any NumPy overhead. 
+ if current_ids[0] != new_tokens[0]: + return 0 + + # Determine the comparison range (limited by the shorter sequence) + min_len = min(len(current_ids), len(new_tokens)) + if min_len == 0: + return 0 + + # Accelerating SIMD for Large Data Volumes + # Only transform necessary slices, avoid processing irrelevant data + # Use asarray to ensure zero-copy (if the input is already an array) + current_ids_array = np.asarray(current_ids[:min_len], dtype=np.int32) + new_tokens_array = np.asarray(new_tokens[:min_len], dtype=np.int32) + + # Perform vectorized element-wise comparison (SIMD instruction set usage) + # Creates a boolean array where True indicates a match (e.g., [True, True, False, ...]) + matches = (current_ids_array == new_tokens_array) + + # Find the index of the first mismatch efficiently + # np.argmin returns the index of the minimum value. Since False (0) < True (1), + # this locates the first False value (mismatch). + idx = np.argmin(matches) + + # Handle the "Full Match" edge case + # This means that the match between the two arrays will still result in True in the end. + if matches[idx]: + return int(min_len) + + # Otherwise, idx is the position of the first mismatch, which equals the prefix length. + return int(idx) @classmethod def from_pretrained( From 1dd9190c59e46987e5a08fc5f694614f8cfaf514 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 17 Jan 2026 02:00:02 +0800 Subject: [PATCH 106/518] Fixed: Only compare the 0 index after ensuring that the content exists. 
--- llama_cpp/llama.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 16ff9c53b4..829ca82b97 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2565,16 +2565,16 @@ def longest_token_prefix(current_ids: Sequence[int], new_tokens: Sequence[int]) if not current_ids or not new_tokens: return 0 - # Probe inspection: Use Python to quickly compare the first token - # If the tokens are different from the beginning, return immediately to avoid any NumPy overhead. - if current_ids[0] != new_tokens[0]: - return 0 - # Determine the comparison range (limited by the shorter sequence) min_len = min(len(current_ids), len(new_tokens)) if min_len == 0: return 0 + # Probe inspection: Use Python to quickly compare the first token + # If the tokens are different from the beginning, return immediately to avoid any NumPy overhead. + if current_ids[0] != new_tokens[0]: + return 0 + # Accelerating SIMD for Large Data Volumes # Only transform necessary slices, avoid processing irrelevant data # Use asarray to ensure zero-copy (if the input is already an array) From 16dba8cb70b5f6dc1d2a6c728ce5f4986ec071b1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 18 Jan 2026 07:40:08 +0800 Subject: [PATCH 107/518] Update Submodule vendor/llama.cpp 6ba6a3c..10c98cb --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6ba6a3c76f..10c98cbdf6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 6ba6a3c76f1981017bff8a8c0b8857e88db1cdb9 +Subproject commit 10c98cbdf623d982f7491e8de5711e916a913192 From 5a0391e804f26ae48862cd0688f06058eb270564 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 18 Jan 2026 08:22:04 +0800 Subject: [PATCH 108/518] Bump version to 0.3.22 --- CHANGELOG.md | 21 +++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index d9456371fe..9a236b58e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.22] +- perf (TFFT): Optimize longest_token_prefix with Numpy SIMD and fast-fail probe + - Vectorization: Replaced standard Python zip loop with Numpy SIMD comparison for high-performance context matching. + + - Fast Exit: Added an O(1) probe check (a[0] != b[0]) to eliminate Numpy conversion overhead on mismatches. Making the "Time To First Token" virtually instantaneous for cached sessions. + + - Memory Optimization: Only the intersection of the two sequences (`[:min_len]`) is converted to Numpy arrays, minimizing memory allocation. + + - Result: Achieved ~5x speedup (129ms -> 25ms) in KV cache reuse scenarios and ~2.5x speedup (554.23ms -> 201.62ms) in load time while maintaining stability on long contexts. + + - The comparative test results are here: https://github.com/JamePeng/llama-cpp-python/issues/47#issuecomment-3761094840 + + - This change significantly reduces latency in RAG and chat applications on long contexts. 
+ +- feat: [Add support for adaptive_p and infill samplers and optimize the sampler logic.](https://github.com/JamePeng/llama-cpp-python/commit/99e7ece91a9765ae31922c2bc79f5be1e22bf61e) + +- feat: [Add initialization checks to the Encoder-Decoder architecture.](https://github.com/JamePeng/llama-cpp-python/commit/a43d904dbac45b87d4086b096779b139fb52a34e) + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/10c98cbdf623d982f7491e8de5711e916a913192](https://github.com/ggml-org/llama.cpp/commit/10c98cbdf623d982f7491e8de5711e916a913192) +- feat: Sync llama.cpp llama/mtmd API Binding 20260116 + ## [0.3.21] - perf: optimize tokenization and detokenization logic - Refactor `tokenize`, `token_to_piece`, and `detokenize` methods in `_internals.py` to significantly reduce Python loop overhead and improve the batch-processing performance and stability of `load`/`prompt-eval`. diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index fbad5c28b2..78292de302 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.21" +__version__ = "0.3.22" From 87f73be3b8c5888a338f210b3bfa80682b1ef8b6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 20 Jan 2026 08:59:37 +0800 Subject: [PATCH 109/518] Update Submodule vendor/llama.cpp 10c98cb..c301172 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 10c98cbdf6..c301172f66 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 10c98cbdf623d982f7491e8de5711e916a913192 +Subproject commit c301172f660a1fe0b42023da990bf7385d69adb4 From 4e244a807eea88c33bf93c3d091995a8bd8a4670 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 22 Jan 2026 06:03:37 +0800 Subject: [PATCH 110/518] Update FAQ in README.md --- README.md | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/README.md 
b/README.md index b297ade462..0a85a5af7d 100644 --- a/README.md +++ b/README.md @@ -1076,15 +1076,55 @@ Using pre-built binaries would require disabling these optimizations or supporti That being said there are some pre-built binaries available through the Releases as well as some community provided wheels. In the future, I would like to provide pre-built binaries and wheels for common platforms and I'm happy to accept any useful contributions in this area. -This is currently being tracked in [#741](https://github.com/abetlen/llama-cpp-python/issues/741) ### How does this compare to other Python bindings of `llama.cpp`? I originally wrote this package for my own use with two goals in mind: -- Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python +- Provide a simple process to install `llama.cpp` and access the full C API in `llama.h`and `mtmd.h` from Python + - Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp` +- Provide a high-throughput, relatively low-latency Python library by continuously optimizing (reducing unnecessary CPU processing or algorithm tuning) and accepting feedback (issues or pull requests), making loading and running GGUF files via Python simpler and more controllable. + +- Provides clearer code comments and error code analysis feedback in llama.cpp, based on common usage feedback and code execution flow, to help more users who are learning LLM through this project understand the project's operation and subsequent feedback optimization. 
+ +### OSError: libcudart.so.XX/cudart64_XX.dll: cannot open shared object file: No such file or directory +This error is primarily caused by the following reasons: + +- Missing Installation or Configuration: The CUDA Toolkit is either not installed, or the environment variables were not correctly configured after installation, preventing the system from locating the required dynamic link libraries. You can try running `nvidia-smi` or `nvcc` in your terminal to check if they output results correctly. + +- Version Mismatch: The CUDA Toolkit environment is installed and configured, but it does not match the CUDA version of the pre-compiled llama-cpp-python wheel you are using. For example, your local environment might be running CUDA 12.1, but you installed a version compiled for CUDA 12.6. + +- Recommendation (Build from Source): It is recommended to fully configure your local CUDA Toolkit environment (ensuring the PATH for dynamic libraries is set and the nvcc compiler is recognized). Then, clone the code and compile it locally. Remember to enable the -DGGML_CUDA=on CMake option during compilation. This ensures the installation achieves maximum compatibility with your local system. + +### FileNotFoundError: Could not find module (like ggml.dll, ggml-cpu.dll, ggml-cuda.dll) + +**Step 1:** Locate the `lib` folder of the `llama-cpp-python` library within your current Python runtime environment: `Python3XX\Lib\site-packages\llama_cpp\lib\` + +**Step 2:** Verify that the missing DLL mentioned in the error is correctly present in this directory. Developers often have multiple Python environments locally, or projects like ComfyUI may use embedded virtual environments. Please ensure that you are installing the library and running the code in the exact same environment. + +This error is primarily caused by the following reasons: + +1. **Environment Mismatch:** The Python environment used for installation is different from the one being used for execution. + +2. 
**Instruction Set Incompatibility:** Regarding `ggml.dll` and `ggml-cpu.dll`, the instruction sets (such as AVX) supported by the pre-compiled version may be incompatible with your local processor. (This typically manifests as `OSError: [WinError -1073741795] Windows Error 0xc000001d` after execution). + +3. **CUDA Version Mismatch:** Regarding `ggml-cuda.dll`, the CUDA version of the pre-compiled library does not match your local CUDA Toolkit version (e.g., a mismatch between CUDA 12.X and CUDA 13.X). It is recommended to fully configure your local CUDA Toolkit environment (ensuring the PATH for dynamic libraries is set and the nvcc compiler is recognized). Then, clone the code and compile it locally. + +### Why are libraries compiled by other authors only around 100MB, while your pre-compiled versions range from 300MB to 900MB? + +My GitHub Actions script is configured to compile against **all supported CUDA compute architectures** for each specific CUDA version I maintain. + +For example: + +* **CUDA 13.0.2:** Currently supports architectures from SM75 (Turing) up to SM120a (Blackwell). +* **CUDA 12.4.1 and 12.6.3:** Support older architectures as well, such as SM70. +* *(Note: The Windows versions are built to support every architecture compatible with the respective CUDA version).* + +The reason libraries from other authors are smaller is that they often **only compile for a single architecture** (e.g., targeting only the RTX 30 series [SM86] or the RTX 40 series [SM89]). To maximize convenience, I provide an **integrated compilation** covering a wide range of hardware; you simply need to select the CUDA version that matches your environment to load and run it. + + Any contributions and changes to this package will be made with these goals in mind. 
## License From b0511e189b2aeea415f493de4e466ccdfd25b458 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 23 Jan 2026 04:56:12 +0800 Subject: [PATCH 111/518] Update Submodule vendor/llama.cpp c301172..51fa458 --- llama_cpp/mtmd_cpp.py | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 758678a8de..111345706e 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -598,7 +598,7 @@ def mtmd_encode_chunk( # // get output embeddings from the last encode pass # // the reading size (in bytes) is equal to: -# // llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) +# // llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) # MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); @ctypes_function_mtmd( "mtmd_get_output_embd", [mtmd_context_p_ctypes], POINTER(c_float)) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c301172f66..51fa458a92 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c301172f660a1fe0b42023da990bf7385d69adb4 +Subproject commit 51fa458a92d6a3f305f8fd76fc8f702e3e87ddb5 From 02ae101d36fa06b379197f17637e83f32b520f7d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 23 Jan 2026 06:32:43 +0800 Subject: [PATCH 112/518] Simplify test.yaml and add tests for Python 3.14. 
--- .github/workflows/test.yaml | 117 +++++++++++++++++------------------- 1 file changed, 54 insertions(+), 63 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index d354b07b85..82bf930cf5 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -2,11 +2,9 @@ name: Tests on: pull_request: - branches: - - main + branches: ["main"] push: - branches: - - main + branches: ["main"] # Auto-cancel stale runs on the same PR/branch concurrency: @@ -16,6 +14,7 @@ concurrency: env: REPO_ID: Qwen/Qwen2-0.5B-Instruct-GGUF MODEL_FILE: qwen2-0_5b-instruct-q8_0.gguf + HF_HOME: ${{ github.workspace }}/hf_cache jobs: # Combined job for Linux, Windows, and macOS (non-Metal) @@ -25,93 +24,85 @@ jobs: # Don't cancel other jobs in the matrix if one fails fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-14] - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + os: [ubuntu-latest, windows-latest] + python-version: ["3.9", "3.13", "3.14"] + include: + # macOS Non-Metal + - os: macos-14 + python-version: "3.9" + cmake_args: "-DLLAMA_METAL=off" + metal_status: "(No Metal)" + - os: macos-14 + python-version: "3.13" + cmake_args: "-DLLAMA_METAL=off" + metal_status: "(No Metal)" + - os: macos-14 + python-version: "3.14" + cmake_args: "-DLLAMA_METAL=off" + metal_status: "(No Metal)" + + # macOS Metal + - os: macos-14 + python-version: "3.9" + cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" + metal_status: "(Metal)" + - os: macos-14 + python-version: "3.13" + cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" + metal_status: "(Metal)" + - os: macos-14 + python-version: "3.14" + cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" + metal_status: "(Metal)" steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: "recursive" + - name: Install the latest version 
of uv and set the python version + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - cache: 'pip' - name: Cache HuggingFace model id: model-cache - uses: actions/cache@v4 + uses: actions/cache@v5 with: - path: ~/.cache/huggingface/hub + path: ${{ env.HF_HOME }} key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} - name: Download model if not cached # Only run this step if the cache was not found if: steps.model-cache.outputs.cache-hit != 'true' run: | - pip install huggingface-hub + uv pip install --system huggingface-hub hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }} shell: bash - - name: Install dependencies + - name: Install dependencies and Build env: - CMAKE_ARGS: ${{ runner.os == 'macOS' && '-DLLAMA_METAL=off' || '' }} - run: | - python -m pip install --upgrade pip - python -m pip install uv - python -m uv pip install -e .[all] --verbose - shell: bash - - - name: Test with pytest - run: python -m pytest - shell: bash - - # Dedicated job for macOS with Metal support - build-macos-metal: - runs-on: macos-14 - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - name: Set up Python 3.9 - uses: actions/setup-python@v5 - with: - python-version: "3.9" - cache: 'pip' - - - name: System Info + CMAKE_ARGS: ${{ matrix.cmake_args }} run: | - uname -a - sysctl -n machdep.cpu.brand_string - python -c "import platform; print(platform.machine(), platform.architecture())" - shell: bash - - - name: Cache HuggingFace model - id: model-cache - uses: actions/cache@v4 - with: - path: ~/.cache/huggingface/hub - key: ${{ runner.os }}-metal-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} - - - name: Download model if not cached - # Only run this step if the cache was not found - if: steps.model-cache.outputs.cache-hit != 'true' - run: | 
- pip install huggingface-hub - hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }} + echo "Building with CMAKE_ARGS: $CMAKE_ARGS" + uv pip install --system -e .[all] --verbose shell: bash - - name: Install dependencies (macOS Metal) + - name: System Info and llama-cpp-python version run: | - python -m pip install --upgrade pip - python -m pip install uv - CMAKE_ARGS="-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" python -m uv pip install -e .[all] --verbose + python -c "import platform; print('Machine:', platform.machine(), 'Arch:', platform.architecture())" + if [[ "${{ runner.os }}" == "macOS" ]]; then + sysctl -n machdep.cpu.brand_string + fi + python -c "import llama_cpp; print('llama_cpp_python:', llama_cpp.__version__)" shell: bash - - name: Test with pytest + - name: Test with pytest by python ${{ matrix.python-version }} run: python -m pytest shell: bash \ No newline at end of file From 6d3642c2b43e5e14472f694dcc7f9cbb96d9145f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 24 Jan 2026 16:25:19 +0800 Subject: [PATCH 113/518] Update Submodule vendor/llama.cpp 51fa458..8f91ca5 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 51fa458a92..8f91ca54ec 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 51fa458a92d6a3f305f8fd76fc8f702e3e87ddb5 +Subproject commit 8f91ca54ec0b22f3ff3a495f32be8e8300638cdf From c3ec230f2fd959982e0a8e63db0c1f87b3a3487d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 24 Jan 2026 20:10:59 +0800 Subject: [PATCH 114/518] Increased the n_batch parameter in Llama model initialization from 512 to 2048 --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 829ca82b97..ffab84671b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -78,7 +78,7 @@ def __init__( # Context Params seed: int = 
llama_cpp.LLAMA_DEFAULT_SEED, n_ctx: int = 512, - n_batch: int = 512, + n_batch: int = 2048, n_ubatch: int = 512, n_seq_max: int = 1, n_threads: Optional[int] = None, From ecff4822e423b9c6173f01ccbb2ddd3b683ab0a6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 24 Jan 2026 22:42:10 +0800 Subject: [PATCH 115/518] feat: enhance default system prompt with strong multimodal + same-language capabilities --- llama_cpp/llama_chat_format.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 627560fe93..0edb1559bb 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2790,7 +2790,8 @@ def generate_streaming(tools, functions, function_call, prompt): class Llava15ChatHandler: DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( - "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." +"""You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, +while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful.""" ) CHAT_FORMAT = ( @@ -3512,10 +3513,8 @@ class MoondreamChatHandler(Llava15ChatHandler): class Llava16ChatHandler(Llava15ChatHandler): - DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. " - # Example prompt - # "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? 
ASSISTANT:" + # "DEFAULT_SYSTEM_MESSAGE + USER: \nWhat is shown in this image? ASSISTANT:" CHAT_FORMAT = ( "{% for message in messages %}" @@ -3618,7 +3617,6 @@ class Llama3VisionAlphaChatHandler(Llava15ChatHandler): # question = "" + q # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - DEFAULT_SYSTEM_MESSAGE = None CHAT_FORMAT = ( "{% for message in messages %}" @@ -3668,7 +3666,6 @@ class Llama3VisionAlphaChatHandler(Llava15ChatHandler): class MiniCPMv26ChatHandler(Llava15ChatHandler): - DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." CHAT_FORMAT = ( "{% set image_count = namespace(value=0) %}" @@ -3709,7 +3706,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): class Gemma3ChatHandler(Llava15ChatHandler): - DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." GEMMA3_BOI_TOKEN = "" GEMMA3_EOI_TOKEN = "" @@ -4098,8 +4094,6 @@ def __call__(self, **kwargs): class Qwen25VLChatHandler(Llava15ChatHandler): - DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." - CHAT_FORMAT = ( "{% set image_count = namespace(value=0) %}" "{% for message in messages %}" From 9a004bdf2271e102bfeb74b8ffc875874b9cdba1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 24 Jan 2026 22:46:37 +0800 Subject: [PATCH 116/518] Fix: catch TemplateSyntaxError when parsing metadata chat templates Some models (e.g., LLaVA 1.5) contain non-standard Jinja2 tags (like {% generation %}) in their metadata. This commit adds a try-except block to prevent initialization crashes, allowing the model to load even if the metadata template is invalid. 
--- llama_cpp/llama.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ffab84671b..5d9b3a182d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -565,13 +565,25 @@ def free_lora_adapter(): file=sys.stderr, ) + # Iterate through all the chat templates found in the model's metadata for name, template in template_choices.items(): - self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter( - template=template, - eos_token=eos_token, - bos_token=bos_token, - stop_token_ids=[eos_token_id], - ).to_chat_handler() + try: + # Attempt to parse and register the template as a valid chat handler. + # We wrap this in a try-block because some models (like LLaVA) contain + # non-standard Jinja2 tags (e.g., {% generation %}) that cause the + # standard parser to crash. + self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter( + template=template, + eos_token=eos_token, + bos_token=bos_token, + stop_token_ids=[eos_token_id], + ).to_chat_handler() + except Exception as e: + # If parsing fails (e.g., TemplateSyntaxError), log a warning but do not crash. + # This ensures the model still loads even if one metadata template is broken. 
+ if self.verbose: + print(f"Warning: Failed to parse chat template '{name}': {e}", file=sys.stderr) + pass if ( self.chat_format is None From f0c9633f6f8c477650485c8f25e1ec81de3ed73f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 24 Jan 2026 23:02:54 +0800 Subject: [PATCH 117/518] Better Llava1.5 Chat Format --- llama_cpp/llama_chat_format.py | 55 +++++++++++++++++----------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0edb1559bb..1c4759c821 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2796,36 +2796,35 @@ class Llava15ChatHandler: CHAT_FORMAT = ( "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.role == 'user' %}" - "{% if message.content is string %}" - "\nUSER: {{ message.content }}" - "{% endif %}" - "{% if message.content is iterable %}" - "\nUSER: " - "{% for content in message.content %}" - "{% if content.type == 'image_url' and content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.type == 'image_url' and content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% endif %}" - "{% if message.role == 'assistant' and message.content is not none %}" - "\nASSISTANT: {{ message.content }}" - "{% endif %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% endif %}" + + "{% if message.role == 'user' %}" + "{% if message.content is string %}" + "\nUSER: {{ message.content }}" + "{% elif message.content is iterable %}" + "\nUSER: " + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url if content.image_url is string else content.image_url.url }}" + "{% 
endif %}" + "{% endfor %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "{% endif %}" + + "{% if message.role == 'assistant' and message.content is not none %}" + "\nASSISTANT: {{ message.content }}" + "{% endif %}" "{% endfor %}" + "{% if add_generation_prompt %}" - "\nASSISTANT: " + "\nASSISTANT: " "{% endif %}" ) From 8468ac2d55388106eb2fc20ca2eee98802aa4719 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 25 Jan 2026 22:43:22 +0800 Subject: [PATCH 118/518] Update Submodule vendor/llama.cpp 8f91ca5..142cbe2 --- llama_cpp/llama_cpp.py | 21 +++++++++++---------- vendor/llama.cpp | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 114577d219..294c42ca54 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1366,6 +1366,7 @@ class llama_params_fit_status(enum.IntEnum): # // - returns true if the parameters could be successfully modified to fit device memory # // - this function is NOT thread safe because it modifies the global llama logger state # // - only parameters that have the same value as in llama_default_model_params are modified +# // with the exception of the context size which is modified if and only if equal to 0 # LLAMA_API enum llama_params_fit_status llama_params_fit( # const char * path_model, # struct llama_model_params * mparams, @@ -4550,18 +4551,18 @@ def llama_sampler_sample( # /// @details Build a split GGUF final path for this chunk. # /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" # // Returns the split_path length. 
-# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); +# LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count); @ctypes_function( "llama_split_path", - [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int], - ctypes.c_int, + [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int32, ctypes.c_int32], + ctypes.c_int32, ) def llama_split_path( split_path: bytes, maxlen: Union[ctypes.c_size_t, int], path_prefix: bytes, - split_no: Union[ctypes.c_int, int], - split_count: Union[ctypes.c_int, int], + split_no: Union[ctypes.c_int32, int], + split_count: Union[ctypes.c_int32, int], /, ) -> int: """Build a split GGUF final path for this chunk.""" @@ -4571,18 +4572,18 @@ def llama_split_path( # /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. # /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" # // Returns the split_prefix length. 
-# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); +# LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count); @ctypes_function( "llama_split_prefix", - [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int], - ctypes.c_int, + [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int32, ctypes.c_int32], + ctypes.c_int32, ) def llama_split_prefix( split_prefix: bytes, maxlen: Union[ctypes.c_size_t, int], split_path: bytes, - split_no: Union[ctypes.c_int, int], - split_count: Union[ctypes.c_int, int], + split_no: Union[ctypes.c_int32, int], + split_count: Union[ctypes.c_int32, int], /, ) -> int: """Extract the path prefix from the split_path if and only if the split_no and split_count match.""" diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8f91ca54ec..142cbe2ac6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8f91ca54ec0b22f3ff3a495f32be8e8300638cdf +Subproject commit 142cbe2ac68978e5dec3a2e19c1b64ef1c5740b1 From 83d5839b136a62a2ccac3feabe4eec1dbced961b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 27 Jan 2026 01:20:11 +0800 Subject: [PATCH 119/518] feat: Implement `MiniCPMv45ChatHandler` for MiniCPM-V 4.5 with multi-image tracking Signed-off-by: JamePeng --- README.md | 1 + llama_cpp/llama_chat_format.py | 138 +++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) diff --git a/README.md b/README.md index 0a85a5af7d..69287a53e5 100644 --- a/README.md +++ b/README.md @@ -508,6 +508,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` | | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | 
[minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | +| [minicpm-v-4.5](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) | `MiniCPMv45ChatHandler` | `minicpm-v-4.5` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | | [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | | [glm4.6v](https://huggingface.co/unsloth/GLM-4.6V-Flash-GGUF) | `GLM46VChatHandler` | `glm4.6v` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 1c4759c821..27c0437626 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3704,6 +3704,144 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): ) +class MiniCPMv45ChatHandler(Llava15ChatHandler): + """ + Handler for MiniCPM-V 4.5 models. + + Supports: + - Multi-step tool calls with and XML tags. + - Integrated reasoning (thinking) process with tags. + - Specialized system prompt handling with tool definitions. + - Global image numbering for multi-image processing. + """ + + # Model specific control tokens + MINICPMV_BOS_TOKEN = "<|im_start|>" + MINICPMV_EOS_TOKEN = "<|im_end|>" + MINICPMV_PAD_TOKEN = "<|endoftext|>" + + # Image placeholder tags + MINICPMV_IMAGE_START_TOKEN = "" + MINICPMV_IMAGE_END_TOKEN = "" + MINICPMV_IMAGE_ID_START_TOKEN = "" + MINICPMV_IMAGE_ID_END_TOKEN = "" + + CHAT_FORMAT = ( + # --- 1. 
First System Message & Tools Definitions --- + "{%- if tools %}" + "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}" + "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}" + "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}" + "{{- 'You are provided with function signatures within XML tags:\\n' }}" + "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}" + "{{- '\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\"name\": , \"arguments\": }\\n" + MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- elif messages[0].role == 'system' %}" + "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- endif %}" + + # --- 2. Message Stream Processing --- + "{% set image_count = namespace(value=0) %}" + "{%- for message in messages %}" + # --- Unified Role Handling (User, Assistant, and subsequent Systems) --- + "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}" + "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}" + + "{%- set content = message.content %}" + "{%- if content is not string %}" + "{%- set ns = namespace(content_str='') %}" + "{%- for item in content %}" + # --- Explicit image_url type and value checking --- + "{%- if item.type == 'image_url' %}" + "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}" + "{%- set image_count.value = image_count.value + 1 %}" + # Format: N: IMAGE_URL + "{%- set ns.content_str = ns.content_str + '' + (image_count.value | string) + ': ' + image_url + '' %}" + "{%- elif item.type == 'text' %}" + "{%- set ns.content_str = ns.content_str + item.text %}" + "{%- endif %}" + "{%- endfor %}" + "{%- set content = ns.content_str %}" + "{%- endif %}" + + "{{- content -}}" + + # Append tool_calls to assistant messages if they exist + "{%- if message.role == 
'assistant' and message.tool_calls %}" + "{%- for tool_call in message.tool_calls %}" + "{%- set tc = tool_call.function if tool_call.function else tool_call %}" + "{{- '\\n\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}" + "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}" + "{{- '}\\n' }}" + "{%- endfor %}" + "{%- endif %}" + "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" + + # --- Specialized Tool Response Handling --- + # Group consecutive tool responses under a single user-like block + "{%- elif message.role == 'tool' %}" + "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}" + "{{- '" + MINICPMV_BOS_TOKEN + "user' }}" + "{%- endif %}" + "{{- '\\n\\n' + message.content + '\\n' }}" + "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}" + "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + + # --- 3. Generation Prompt --- + "{%- if add_generation_prompt %}" + "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}" + # Handle thinking/reasoning block visibility based on configuration + "{%- if enable_thinking is defined and enable_thinking is false %}" + "{{- '\\n\\n\\n\\n' }}" + "{%- elif enable_thinking is defined and enable_thinking is true %}" + "{{- '\\n' }}" + "{%- endif %}" + "{%- endif %}" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the MiniCPM-V 4.5 Handler. + + Args: + enable_thinking (bool): If True, model generates reasoning before the final answer. + **kwargs: Additional arguments for the base Llava15ChatHandler. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject thinking control flag into the template + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Set stop token patch + kwargs['stop'] = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN] + + llama = kwargs['llama'] + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"MiniCPMV45ChatHandler(enable_thinking={self.enable_thinking}) - Processing {image_count} images", file=sys.stderr) + except Exception: + print(f"MiniCPMV45ChatHandler - Cleared state", file=sys.stderr) + + return super().__call__(**kwargs) + + class Gemma3ChatHandler(Llava15ChatHandler): GEMMA3_BOI_TOKEN = "" From 791b21e76056d260337ac9eb4a13b2252a787391 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 27 Jan 2026 22:50:00 +0800 Subject: [PATCH 120/518] Update Submodule vendor/llama.cpp 142cbe2..68ac3ac --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 142cbe2ac6..68ac3acb43 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 142cbe2ac68978e5dec3a2e19c1b64ef1c5740b1 +Subproject commit 68ac3acb435450d5ba1e62748e17671815313dc3 From 7db5e171aad1d246dea9a803a28da9d803457096 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 27 Jan 2026 22:51:02 +0800 Subject: [PATCH 121/518] build: update project metadata, pin numpy, and add Py3.14 support - Update repository URLs (Homepage, Issues, Changelog) to point to JamePeng/llama-cpp-python. - Restrict numpy version to <=2.3.2 to ensure compatibility. - Add Python 3.14 to project classifiers. 
- Add FAQ link to project URLs. Signed-off-by: JamePeng --- pyproject.toml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 00dbab1ed1..2e439c0685 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "typing-extensions>=4.8.0", - "numpy>=1.21.6", + "numpy>=1.21.6,<=2.3.2", "diskcache>=5.6.2", "jinja2>=2.11.3", "Pillow>=9.5.0", @@ -26,6 +26,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", ] @@ -73,10 +74,11 @@ provider = "scikit_build_core.metadata.regex" input = "llama_cpp/__init__.py" [project.urls] -Homepage = "https://github.com/abetlen/llama-cpp-python" -Issues = "https://github.com/abetlen/llama-cpp-python/issues" +Homepage = "https://github.com/JamePeng/llama-cpp-python" +Issues = "https://github.com/JamePeng/llama-cpp-python/issues" Documentation = "https://llama-cpp-python.readthedocs.io/en/latest/" -Changelog = "https://llama-cpp-python.readthedocs.io/en/latest/changelog/" +Changelog = "https://github.com/JamePeng/llama-cpp-python/blob/main/CHANGELOG.md" +FAQ = "https://github.com/JamePeng/llama-cpp-python?tab=readme-ov-file#faq" [tool.pytest.ini_options] testpaths = "tests" From f7e5307f316331bdc248e350ea6765afdfea2d3b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 27 Jan 2026 23:12:07 +0800 Subject: [PATCH 122/518] Adding Python 3.14 to the GitHub action workflow - Now only the Basic version is provided by default to maintain compatibility and make maintenance easier. 
--- .../build-wheels-cu124-cu126-win-basic.yml | 107 ---------------- .../build-wheels-cu124-cu126-win.yml | 4 +- .../build-wheels-cu124-linux-basic.yml | 116 ------------------ .../workflows/build-wheels-cu124-linux.yml | 4 +- .../build-wheels-cu126-linux-basic.yml | 116 ------------------ .../workflows/build-wheels-cu126-linux.yml | 4 +- .../build-wheels-cu128-linux-basic.yml | 116 ------------------ .../workflows/build-wheels-cu128-linux.yml | 4 +- .../build-wheels-cu128-win-basic.yml | 107 ---------------- .github/workflows/build-wheels-cu128-win.yml | 4 +- .../build-wheels-cu130-linux-basic.yml | 116 ------------------ .../workflows/build-wheels-cu130-linux.yml | 4 +- .../build-wheels-cu130-win-basic.yml | 107 ---------------- .github/workflows/build-wheels-cu130-win.yml | 4 +- 14 files changed, 14 insertions(+), 799 deletions(-) delete mode 100644 .github/workflows/build-wheels-cu124-cu126-win-basic.yml delete mode 100644 .github/workflows/build-wheels-cu124-linux-basic.yml delete mode 100644 .github/workflows/build-wheels-cu126-linux-basic.yml delete mode 100644 .github/workflows/build-wheels-cu128-linux-basic.yml delete mode 100644 .github/workflows/build-wheels-cu128-win-basic.yml delete mode 100644 .github/workflows/build-wheels-cu130-linux-basic.yml delete mode 100644 .github/workflows/build-wheels-cu130-win-basic.yml diff --git a/.github/workflows/build-wheels-cu124-cu126-win-basic.yml b/.github/workflows/build-wheels-cu124-cu126-win-basic.yml deleted file mode 100644 index 9776287173..0000000000 --- a/.github/workflows/build-wheels-cu124-cu126-win-basic.yml +++ /dev/null @@ -1,107 +0,0 @@ -name: Build Wheels (CU124-126) for Windows(Basic) - -on: - workflow_dispatch: - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: ['windows-2022'] - pyver: ["3.10", "3.11", "3.12", "3.13"] - cuda: 
["12.4.1","12.6.3"] - releasetag: ["Basic"] - cudaarch: ["all"] - defaults: - run: - shell: pwsh - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. "all" "89" "90" "100" "120" - MAX_JOBS: 8 - - steps: - - name: Add MSBuild to PATH - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 - with: - msbuild-architecture: x64 - - - uses: actions/checkout@v5 - with: - submodules: "recursive" - - # from kingbri1/flash-attention build-wheels.yml - - name: Install CUDA ${{ matrix.cuda }} - uses: N-Storm/cuda-toolkit@v0.2.28 - id: cuda-toolkit - with: - cuda: "${{ matrix.cuda }}" - use-github-cache: false - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - name: Install Dependencies - run: | - git config --system core.longpaths true - uv pip install --upgrade build setuptools wheel packaging - - - name: Build Wheel(Basic) - run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') - $env:CUDA_HOME = $env:CUDA_PATH - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH - $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" - - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - python -m build --wheel - - # 
Check if wheel was built - if (!(Test-Path '.\dist\*.whl')) { - Write-Error "No wheel built in dist/ directory" - exit 1 - } - - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - $wheel = (gi '.\dist\*.whl')[0] - $tagVer = $wheel.name.split('-')[1] - Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV - - - name: Get Current Date - id: get-date - run: | - $currentDate = Get-Date -UFormat "%Y%m%d" - Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - - name: Create Release - if: always() && env.TAG_VERSION != '' - uses: softprops/action-gh-release@v2 - with: - files: dist/* - # Set tag_name to -cu--win - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu124-cu126-win.yml b/.github/workflows/build-wheels-cu124-cu126-win.yml index 5d5a91efe0..de44a533e8 100644 --- a/.github/workflows/build-wheels-cu124-cu126-win.yml +++ b/.github/workflows/build-wheels-cu124-cu126-win.yml @@ -13,9 +13,9 @@ jobs: strategy: matrix: os: ['windows-2022'] - pyver: ["3.10", "3.11", "3.12", "3.13"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.4.1","12.6.3"] - releasetag: ["AVX2"] + releasetag: ["Basic"] cudaarch: ["all"] defaults: run: diff --git a/.github/workflows/build-wheels-cu124-linux-basic.yml b/.github/workflows/build-wheels-cu124-linux-basic.yml deleted file mode 100644 index 98f50fe474..0000000000 --- a/.github/workflows/build-wheels-cu124-linux-basic.yml +++ /dev/null @@ -1,116 +0,0 @@ -name: Build Wheels(CU124) for Linux(Basic) - -on: - workflow_dispatch: # Manual trigger - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - runs-on: ubuntu-22.04 - container: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 - 
strategy: - matrix: # Define the build matrix directly here - os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions - cuda: ["12.4.1"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc - - defaults: - run: - shell: bash - - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - - steps: - - name: Install dependencies - run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v4 # Checkout code - with: - submodules: "recursive" - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - run: nvcc -V - - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel - env: - LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR - run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" - - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export 
CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - - # --- Post-build steps to get info for release tag --- - - # Find the generated wheel file in the 'dist' directory using bash - # Assumes only one wheel is generated per build configuration run - wheel_file=$(ls dist/*.whl | head -n 1) - - # Extract the package version (e.g., 1.2.3) from the wheel filename - # Filename format is typically: package_name-version-tag-specificators.whl - # Using basename and cut to split by '-' and get the second field - tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) - echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step - - # Extract the short CUDA version (e.g., 124) from the full version (e.g., 12.4.1) from the matrix variable - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - - - - name: Get Current Date # Step to get current date for the release tag - id: get-date - run: | - # Get date in YYYYMMDD format using bash date command - currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release - with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 3feeeecfd7..f1a4114114 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -14,9 +14,9 @@ jobs: strategy: matrix: # Define the build matrix directly here os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.4.1"] - releasetag: ["AVX2"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) cudaarch: ["all"] # Controls target CUDA architectures for nvcc defaults: diff --git a/.github/workflows/build-wheels-cu126-linux-basic.yml b/.github/workflows/build-wheels-cu126-linux-basic.yml deleted file mode 
100644 index 78d1471c76..0000000000 --- a/.github/workflows/build-wheels-cu126-linux-basic.yml +++ /dev/null @@ -1,116 +0,0 @@ -name: Build Wheels(CU126) for Linux(Basic) - -on: - workflow_dispatch: # Manual trigger - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - runs-on: ubuntu-22.04 - container: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 - strategy: - matrix: # Define the build matrix directly here - os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions - cuda: ["12.6.3"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc - - defaults: - run: - shell: bash - - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - - steps: - - name: Install dependencies - run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v4 # Checkout code - with: - submodules: "recursive" - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - run: nvcc -V - - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel - env: - LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR - run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - echo $LD_LIBRARY_PATH - - # Add 
project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" - - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - - # --- Post-build steps to get info for release tag --- - - # Find the generated wheel file in the 'dist' directory using bash - # Assumes only one wheel is generated per build configuration run - wheel_file=$(ls dist/*.whl | head -n 1) - - # Extract the package version (e.g., 1.2.3) from the wheel filename - # Filename format is typically: package_name-version-tag-specificators.whl - # Using basename and cut to split by '-' and get the second field - tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) - echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step - - # Extract the short CUDA version (e.g., 126) from the full version (e.g., 12.6.3) from the matrix variable - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - - - - name: Get Current Date # Step to get current date for the release tag - id: get-date - run: | - # Get date in YYYYMMDD format using bash date command - currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release - with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index f9b566fab8..2fb52a0a4b 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -14,9 +14,9 @@ jobs: strategy: matrix: # Define the build matrix directly here os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.6.3"] - releasetag: ["AVX2"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) cudaarch: ["all"] # Controls target CUDA architectures for nvcc defaults: diff --git a/.github/workflows/build-wheels-cu128-linux-basic.yml b/.github/workflows/build-wheels-cu128-linux-basic.yml deleted file mode 
100644 index 8c527c7187..0000000000 --- a/.github/workflows/build-wheels-cu128-linux-basic.yml +++ /dev/null @@ -1,116 +0,0 @@ -name: Build Wheels(CU128) for Linux(Basic) - -on: - workflow_dispatch: # Manual trigger - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - runs-on: ubuntu-22.04 - container: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 - strategy: - matrix: # Define the build matrix directly here - os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions - cuda: ["12.8.1"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc - - defaults: - run: - shell: bash - - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - - steps: - - name: Install dependencies - run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v4 # Checkout code - with: - submodules: "recursive" - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - run: nvcc -V - - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel - env: - LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR - run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - echo $LD_LIBRARY_PATH - - # Add 
project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" - - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - - # --- Post-build steps to get info for release tag --- - - # Find the generated wheel file in the 'dist' directory using bash - # Assumes only one wheel is generated per build configuration run - wheel_file=$(ls dist/*.whl | head -n 1) - - # Extract the package version (e.g., 1.2.3) from the wheel filename - # Filename format is typically: package_name-version-tag-specificators.whl - # Using basename and cut to split by '-' and get the second field - tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) - echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step - - # Extract the short CUDA version (e.g., 128) from the full version (e.g., 12.8.1) from the matrix variable - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - - - - name: Get Current Date # Step to get current date for the release tag - id: get-date - run: | - # Get date in YYYYMMDD format using bash date command - currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release - with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index b25128d6e8..e54f674790 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -14,9 +14,9 @@ jobs: strategy: matrix: # Define the build matrix directly here os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.8.1"] - releasetag: ["AVX2"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) cudaarch: ["all"] # Controls target CUDA architectures for nvcc defaults: diff --git a/.github/workflows/build-wheels-cu128-win-basic.yml b/.github/workflows/build-wheels-cu128-win-basic.yml deleted file mode 100644 
index 2d78084dbc..0000000000 --- a/.github/workflows/build-wheels-cu128-win-basic.yml +++ /dev/null @@ -1,107 +0,0 @@ -name: Build Wheels (CU128) for Windows(Basic) - -on: - workflow_dispatch: - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: ['windows-2022'] - pyver: ["3.10", "3.11", "3.12", "3.13"] - cuda: ["12.8.1"] - releasetag: ["Basic"] - cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] - defaults: - run: - shell: pwsh - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. "all" "89" "90" "100" "120" - MAX_JOBS: 8 - - steps: - - name: Add MSBuild to PATH - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 - with: - msbuild-architecture: x64 - - - uses: actions/checkout@v5 - with: - submodules: "recursive" - - # from kingbri1/flash-attention build-wheels.yml - - name: Install CUDA ${{ matrix.cuda }} - uses: N-Storm/cuda-toolkit@v0.2.28 - id: cuda-toolkit - with: - cuda: "${{ matrix.cuda }}" - use-github-cache: false - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - name: Install Dependencies - run: | - git config --system core.longpaths true - uv pip install --upgrade build setuptools wheel packaging - - - name: Build Wheel - run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') - $env:CUDA_HOME = $env:CUDA_PATH - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH - $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on 
-DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" - - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - python -m build --wheel - - # Check if wheel was built - if (!(Test-Path '.\dist\*.whl')) { - Write-Error "No wheel built in dist/ directory" - exit 1 - } - - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - $wheel = (gi '.\dist\*.whl')[0] - $tagVer = $wheel.name.split('-')[1] - Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV - - - name: Get Current Date - id: get-date - run: | - $currentDate = Get-Date -UFormat "%Y%m%d" - Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - - name: Create Release - if: always() && env.TAG_VERSION != '' - uses: softprops/action-gh-release@v2 - with: - files: dist/* - # Set tag_name to -cu--win - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index 40578c8b4c..306dc194a2 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -13,9 +13,9 @@ jobs: strategy: matrix: os: ['windows-2022'] - pyver: ["3.10", "3.11", "3.12", "3.13"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.8.1"] - releasetag: ["AVX2"] + releasetag: ["Basic"] cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] defaults: run: diff --git a/.github/workflows/build-wheels-cu130-linux-basic.yml 
b/.github/workflows/build-wheels-cu130-linux-basic.yml deleted file mode 100644 index 0f03787a68..0000000000 --- a/.github/workflows/build-wheels-cu130-linux-basic.yml +++ /dev/null @@ -1,116 +0,0 @@ -name: Build Wheels(CU130) for Linux(Basic) - -on: - workflow_dispatch: # Manual trigger - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - runs-on: ubuntu-22.04 - container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 - strategy: - matrix: # Define the build matrix directly here - os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions - cuda: ["13.0.2"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc - - defaults: - run: - shell: bash - - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - - steps: - - name: Install dependencies - run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v5 # Checkout code - with: - submodules: "recursive" - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - run: nvcc -V - - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel - env: - LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR - run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - 
find /usr/ -name 'libcuda.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" - - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - - # --- Post-build steps to get info for release tag --- - - # Find the generated wheel file in the 'dist' directory using bash - # Assumes only one wheel is generated per build configuration run - wheel_file=$(ls dist/*.whl | head -n 1) - - # Extract the package version (e.g., 1.2.3) from the wheel filename - # Filename format is typically: package_name-version-tag-specificators.whl - # Using basename and cut to split by '-' and get the second field - tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) - echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step - - # Extract the short CUDA version (e.g., 130) from the full version (e.g., 13.0.2) from the matrix variable - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - - - - name: Get Current Date # Step to get current date for the release tag - id: get-date - run: | - # Get date in YYYYMMDD format using bash date command - currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release - with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index 6451c7ee43..5c78f595bf 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -14,9 +14,9 @@ jobs: strategy: matrix: # Define the build matrix directly here os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13"] # Python versions + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["13.0.2"] - releasetag: ["AVX2"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) + releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) cudaarch: ["all"] # Controls target CUDA architectures for nvcc defaults: diff --git a/.github/workflows/build-wheels-cu130-win-basic.yml b/.github/workflows/build-wheels-cu130-win-basic.yml deleted file mode 100644 
index 17b0fb6c72..0000000000 --- a/.github/workflows/build-wheels-cu130-win-basic.yml +++ /dev/null @@ -1,107 +0,0 @@ -name: Build Wheels (CU130) for Windows(Basic) - -on: - workflow_dispatch: - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: ['windows-2022'] - pyver: ["3.10", "3.11", "3.12", "3.13"] - cuda: ["13.0.2"] - releasetag: ["Basic"] - cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] - defaults: - run: - shell: pwsh - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. "all" "89" "90" "100" "120" - MAX_JOBS: 8 - - steps: - - name: Add MSBuild to PATH - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 - with: - msbuild-architecture: x64 - - - uses: actions/checkout@v5 - with: - submodules: "recursive" - - # from kingbri1/flash-attention build-wheels.yml - - name: Install CUDA ${{ matrix.cuda }} - uses: N-Storm/cuda-toolkit@v0.2.29 - id: cuda-toolkit - with: - cuda: "${{ matrix.cuda }}" - use-github-cache: false - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - name: Install Dependencies - run: | - git config --system core.longpaths true - uv pip install --upgrade build setuptools wheel packaging - - - name: Build Wheel - run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') - $env:CUDA_HOME = $env:CUDA_PATH - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH - $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on 
-DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" - - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - python -m build --wheel - - # Check if wheel was built - if (!(Test-Path '.\dist\*.whl')) { - Write-Error "No wheel built in dist/ directory" - exit 1 - } - - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - $wheel = (gi '.\dist\*.whl')[0] - $tagVer = $wheel.name.split('-')[1] - Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV - - - name: Get Current Date - id: get-date - run: | - $currentDate = Get-Date -UFormat "%Y%m%d" - Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - - name: Create Release - if: always() && env.TAG_VERSION != '' - uses: softprops/action-gh-release@v2 - with: - files: dist/* - # Set tag_name to -cu--win - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index 3c7d07caa1..c6f43a3b7e 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -13,9 +13,9 @@ jobs: strategy: matrix: os: ['windows-2022'] - pyver: ["3.10", "3.11", "3.12", "3.13"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["13.0.2"] - releasetag: ["AVX2"] + releasetag: ["Basic"] cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] defaults: run: From 6c536a051fd8151c7cb70cc5a55f43e25b914b28 Mon Sep 17 00:00:00 2001 From: 
JamePeng Date: Wed, 28 Jan 2026 00:27:04 +0800 Subject: [PATCH 123/518] ci: Customize wheel filename to improve version identification - Parse generated wheel filenames in the build step. - Append CUDA version (cuXXX) and AVX level (basic/avx2) to the version string. - New format: package-ver+cuXXX.avxver-pyver-plat.whl (e.g., llama_cpp_python-0.3.22+cu130.basic-cp310-win_amd64.whl). - Keep the git release tag clean (without local version identifiers). --- .../build-wheels-cu124-cu126-win.yml | 25 ++++++++++++---- .../workflows/build-wheels-cu124-linux.yml | 30 +++++++++++-------- .../workflows/build-wheels-cu126-linux.yml | 30 +++++++++++-------- .../workflows/build-wheels-cu128-linux.yml | 30 +++++++++++-------- .github/workflows/build-wheels-cu128-win.yml | 25 ++++++++++++---- .../workflows/build-wheels-cu130-linux.yml | 30 +++++++++++-------- .github/workflows/build-wheels-cu130-win.yml | 25 ++++++++++++---- 7 files changed, 128 insertions(+), 67 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-cu126-win.yml b/.github/workflows/build-wheels-cu124-cu126-win.yml index de44a533e8..19d07f9dee 100644 --- a/.github/workflows/build-wheels-cu124-cu126-win.yml +++ b/.github/workflows/build-wheels-cu124-cu126-win.yml @@ -95,12 +95,27 @@ jobs: exit 1 } + $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 + + # Split wheel filename: name-ver-py-abi-plat.whl + $parts = $wheelFile.Name.Split('-') + $distName = $parts[0] + $version = $parts[1] + $pyTag = $parts[2] + $abiTag = $parts[3] + $platTag = $parts[4] + + $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" + + $newName = "$distName-$newVersion-$pyTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + # write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - $wheel = (gi '.\dist\*.whl')[0] - $tagVer = $wheel.name.split('-')[1] - Write-Output 
"TAG_VERSION=$tagVer" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - name: Get Current Date id: get-date @@ -113,7 +128,7 @@ jobs: uses: softprops/action-gh-release@v2 with: files: dist/* - # Set tag_name to -cu--win + # Set tag_name to -cu--win- tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index f1a4114114..254eda05e5 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -58,6 +58,7 @@ jobs: run: | echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting find /usr/ -name 'libcuda.so.*' + find /usr/ -name 'libcudart.so.*' echo $LD_LIBRARY_PATH # Add project-specific and feature flags @@ -88,25 +89,28 @@ jobs: # Run the Python build command to generate the wheel uv pip install build setuptools wheel packaging - # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - # --- Post-build steps to get info for release tag --- + # --- Post-build steps to get info for rename wheel file and release tag --- - # Find the generated wheel file in the 'dist' directory using bash - # Assumes only one wheel is generated per build configuration run - wheel_file=$(ls dist/*.whl | head -n 1) + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') + avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') - # Extract the package version (e.g., 1.2.3) from the wheel filename - # Filename format is typically: package_name-version-tag-specificators.whl - # Using basename and cut to split by '-' and get the second field - tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) - echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + wheel_path=$(ls dist/*.whl | head -n 1) + filename=$(basename "$wheel_path") - # Extract the short CUDA version (e.g., 124) from the full version (e.g., 12.4.1) from the matrix variable - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + # Split wheel filename + IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" + new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + new_filename="${dist_name}-${new_version}-${py_tag}-${plat_tag}" + + # Rename wheel file + mv "$wheel_path" "dist/$new_filename" + echo "Renamed wheel to: $new_filename" + + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step - name: Get Current Date # Step to get current date for the release tag id: get-date diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 2fb52a0a4b..8d0b6f25a9 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -58,6 +58,7 @@ jobs: run: | echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting find /usr/ -name 'libcuda.so.*' + find /usr/ -name 'libcudart.so.*' echo $LD_LIBRARY_PATH # Add project-specific and feature flags @@ -88,25 +89,28 @@ jobs: # Run the Python build command to generate the wheel uv pip install build setuptools wheel packaging - # 
uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - # --- Post-build steps to get info for release tag --- + # --- Post-build steps to get info for rename wheel file and release tag --- - # Find the generated wheel file in the 'dist' directory using bash - # Assumes only one wheel is generated per build configuration run - wheel_file=$(ls dist/*.whl | head -n 1) + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') + avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') - # Extract the package version (e.g., 1.2.3) from the wheel filename - # Filename format is typically: package_name-version-tag-specificators.whl - # Using basename and cut to split by '-' and get the second field - tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) - echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + wheel_path=$(ls dist/*.whl | head -n 1) + filename=$(basename "$wheel_path") - # Extract the short CUDA version (e.g., 126) from the full version (e.g., 12.6.3) from the matrix variable - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + # Split wheel filename + IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" + new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + new_filename="${dist_name}-${new_version}-${py_tag}-${plat_tag}" + + # Rename wheel file + mv "$wheel_path" "dist/$new_filename" + echo "Renamed wheel to: $new_filename" + + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step - name: Get Current Date # Step to get current date for the release tag id: get-date diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index e54f674790..3d3af3b210 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -58,6 +58,7 @@ jobs: run: | echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting find /usr/ -name 'libcuda.so.*' + find /usr/ -name 'libcudart.so.*' echo $LD_LIBRARY_PATH # Add project-specific and feature flags @@ -88,25 +89,28 @@ jobs: # Run the Python build command to generate the wheel uv pip install build setuptools wheel packaging - # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - # --- Post-build steps to get info for release tag --- + # --- Post-build steps to get info for rename wheel file and release tag --- - # Find the generated wheel file in the 'dist' directory using bash - # Assumes only one wheel is generated per build configuration run - wheel_file=$(ls dist/*.whl | head -n 1) + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') + avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') - # Extract the package version (e.g., 1.2.3) from the wheel filename - # Filename format is typically: package_name-version-tag-specificators.whl - # Using basename and cut to split by '-' and get the second field - tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) - echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + wheel_path=$(ls dist/*.whl | head -n 1) + filename=$(basename "$wheel_path") - # Extract the short CUDA version (e.g., 128) from the full version (e.g., 12.8.1) from the matrix variable - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + # Split wheel filename + IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" + new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + new_filename="${dist_name}-${new_version}-${py_tag}-${plat_tag}" + + # Rename wheel file + mv "$wheel_path" "dist/$new_filename" + echo "Renamed wheel to: $new_filename" + + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step - name: Get Current Date # Step to get current date for the release tag id: get-date diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index 306dc194a2..77b73d710e 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -95,12 +95,27 @@ jobs: exit 1 } + $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 + + # Split file name: name-ver-py-abi-plat.whl + $parts = $wheelFile.Name.Split('-') + $distName = $parts[0] + $version = $parts[1] + $pyTag = $parts[2] + $abiTag = $parts[3] + $platTag = $parts[4] + + $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" + + $newName = 
"$distName-$newVersion-$pyTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + # write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - $wheel = (gi '.\dist\*.whl')[0] - $tagVer = $wheel.name.split('-')[1] - Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - name: Get Current Date id: get-date @@ -113,7 +128,7 @@ jobs: uses: softprops/action-gh-release@v2 with: files: dist/* - # Set tag_name to -cu--win + # Set tag_name to -cu--win- tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index 5c78f595bf..d50133d14a 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -58,6 +58,7 @@ jobs: run: | echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting find /usr/ -name 'libcuda.so.*' + find /usr/ -name 'libcudart.so.*' echo $LD_LIBRARY_PATH # Add project-specific and feature flags @@ -88,25 +89,28 @@ jobs: # Run the Python build command to generate the wheel uv pip install build setuptools wheel packaging - # uv pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - # --- Post-build steps to get info for release tag --- + # --- Post-build steps to get info for rename wheel file and release tag --- - # Find the generated wheel file in the 'dist' directory using bash - # Assumes only one wheel is generated per build configuration run - wheel_file=$(ls dist/*.whl | head -n 1) + cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' 
-f 1,2 | sed 's/\.//g') + avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') - # Extract the package version (e.g., 1.2.3) from the wheel filename - # Filename format is typically: package_name-version-tag-specificators.whl - # Using basename and cut to split by '-' and get the second field - tag_ver=$(basename "$wheel_file" | cut -d'-' -f 2) - echo "TAG_VERSION=$tag_ver" >> $GITHUB_ENV # Store version in env for release step + wheel_path=$(ls dist/*.whl | head -n 1) + filename=$(basename "$wheel_path") - # Extract the short CUDA version (e.g., 130) from the full version (e.g., 13.0.2) from the matrix variable - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + # Split wheel filename + IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" + new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + new_filename="${dist_name}-${new_version}-${py_tag}-${plat_tag}" + + # Rename wheel file + mv "$wheel_path" "dist/$new_filename" + echo "Renamed wheel to: $new_filename" + + echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env + echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step - name: Get Current Date # Step to get current date for the release tag id: get-date diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index c6f43a3b7e..5ba2704370 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -95,12 +95,27 @@ jobs: exit 1 } + $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 + + # Split file name: name-ver-py-abi-plat.whl + $parts = $wheelFile.Name.Split('-') + $distName = $parts[0] + $version = $parts[1] + $pyTag = $parts[2] + $abiTag = $parts[3] + $platTag = $parts[4] + + $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" + + $newName = 
"$distName-$newVersion-$pyTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + # write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - - $wheel = (gi '.\dist\*.whl')[0] - $tagVer = $wheel.name.split('-')[1] - Write-Output "TAG_VERSION=$tagVer" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - name: Get Current Date id: get-date @@ -113,7 +128,7 @@ jobs: uses: softprops/action-gh-release@v2 with: files: dist/* - # Set tag_name to -cu--win + # Set tag_name to -cu--win- tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 9b985b75105ea9f5e7f2f5e7988a1149f233af2f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 28 Jan 2026 01:06:48 +0800 Subject: [PATCH 124/518] Bump version to 0.3.23 --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a236b58e2..b5fefa7a03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.23] +- feat: [Implement MiniCPMv45ChatHandler for MiniCPM-V 4.5 with multi-image tracking](https://github.com/JamePeng/llama-cpp-python/commit/83d5839b136a62a2ccac3feabe4eec1dbced961b) + +- feat: Add python 3.14 support and pin numpy version(1.21.4-2.3.2) for compatibility + +- feat: Increased the n_batch parameter in Llama model initialization from 512 to 2048. Slightly improves the speed of some multimodal image processing. + +- fix: catch TemplateSyntaxError when parsing metadata chat templates + + - Some models (e.g., LLaVA 1.5) contain non-standard Jinja2 tags (like {% generation %}) in their metadata. 
+ + - This commit adds a try-except block to prevent initialization crashes, allowing the model to load even if the metadata template is invalid. + +- feat: enhance default system prompt with strong multimodal + same-language capabilities + +- feat: Better Llava1.5 Chat Format + +- ci: Customize wheel filename to improve version identification + - Parse generated wheel filenames in the build step. + - Append CUDA version (cuXXX) and AVX level (basic/avx2) to the version string. + - New format: package-ver+cuXXX.avxver-pyver-plat.whl (e.g., llama_cpp_python-0.3.23+cu130.basic-cp310-win_amd64.whl). + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/68ac3acb435450d5ba1e62748e17671815313dc3](https://github.com/ggml-org/llama.cpp/commit/68ac3acb435450d5ba1e62748e17671815313dc3) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260127 + ## [0.3.22] - perf (TFFT): Optimize longest_token_prefix with Numpy SIMD and fast-fail probe - Vectorization: Replaced standard Python zip loop with Numpy SIMD comparison for high-performance context matching. diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 78292de302..eb37da2093 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.22" +__version__ = "0.3.23" From 60ad6a5b4e4d2ddd27385a1c4fe2a657b21564c7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 28 Jan 2026 01:24:00 +0800 Subject: [PATCH 125/518] Update README.md for python 3.14 and HIP (ROCm) guide --- README.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 69287a53e5..2f57f1682f 100644 --- a/README.md +++ b/README.md @@ -129,8 +129,8 @@ pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python It is also possible to install a pre-built wheel with CUDA support. 
As long as your system meets some requirements: - CUDA Version is 12.4, 12.6, 12.8 or 13.0 -- Python Version is 3.10, 3.11, 3.12 or 3.13 -- Basic version: A version compiled without using AVX instructions (for compatibility with CPU platforms lacking AVX instructions or with AVX instruction compatibility issues). +- Python Version is 3.10, 3.11, 3.12, 3.13 or 3.14 +- Basic version(Default): A version compiled without using AVX instructions (for compatibility with CPU platforms lacking AVX instructions or with AVX instruction compatibility issues). - AVX2 version: A version compiled using AVX2 instructions. Check the releases page: @@ -172,13 +172,20 @@ pip install llama-cpp-python \
-hipBLAS (ROCm) +HIP (ROCm) -To install with hipBLAS / ROCm support for AMD cards, set the `GGML_HIPBLAS=on` environment variable before installing: +This provides GPU acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. + +You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). + +To install with HIP / ROCm support for AMD cards, set the `GGML_HIP=ON` environment variable before installing: ```bash -CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` +Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system. + +More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip
From f3b31ae23793c03b3323dd0f90afb5161e4d51ed Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 29 Jan 2026 12:51:57 +0800 Subject: [PATCH 126/518] fix: Append missing abi_tag --- .github/workflows/build-wheels-cu124-cu126-win.yml | 2 +- .github/workflows/build-wheels-cu124-linux.yml | 2 +- .github/workflows/build-wheels-cu126-linux.yml | 2 +- .github/workflows/build-wheels-cu128-linux.yml | 2 +- .github/workflows/build-wheels-cu128-win.yml | 2 +- .github/workflows/build-wheels-cu130-linux.yml | 2 +- .github/workflows/build-wheels-cu130-win.yml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-cu126-win.yml b/.github/workflows/build-wheels-cu124-cu126-win.yml index 19d07f9dee..9caf8157a5 100644 --- a/.github/workflows/build-wheels-cu124-cu126-win.yml +++ b/.github/workflows/build-wheels-cu124-cu126-win.yml @@ -107,7 +107,7 @@ jobs: $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - $newName = "$distName-$newVersion-$pyTag-$platTag" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 254eda05e5..65f24a19b2 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -103,7 +103,7 @@ jobs: IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" new_version="${version}+cu${cuda_ver_short}.${avx_ver}" - new_filename="${dist_name}-${new_version}-${py_tag}-${plat_tag}" + new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" # Rename wheel file mv "$wheel_path" "dist/$new_filename" diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 8d0b6f25a9..765d557c23 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ 
b/.github/workflows/build-wheels-cu126-linux.yml @@ -103,7 +103,7 @@ jobs: IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" new_version="${version}+cu${cuda_ver_short}.${avx_ver}" - new_filename="${dist_name}-${new_version}-${py_tag}-${plat_tag}" + new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" # Rename wheel file mv "$wheel_path" "dist/$new_filename" diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index 3d3af3b210..218523f423 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -103,7 +103,7 @@ jobs: IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" new_version="${version}+cu${cuda_ver_short}.${avx_ver}" - new_filename="${dist_name}-${new_version}-${py_tag}-${plat_tag}" + new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" # Rename wheel file mv "$wheel_path" "dist/$new_filename" diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index 77b73d710e..d280622dd7 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -107,7 +107,7 @@ jobs: $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - $newName = "$distName-$newVersion-$pyTag-$platTag" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index d50133d14a..f8703fc656 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -103,7 +103,7 @@ jobs: IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" new_version="${version}+cu${cuda_ver_short}.${avx_ver}" - 
new_filename="${dist_name}-${new_version}-${py_tag}-${plat_tag}" + new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" # Rename wheel file mv "$wheel_path" "dist/$new_filename" diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index 5ba2704370..110fd70e05 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -107,7 +107,7 @@ jobs: $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - $newName = "$distName-$newVersion-$pyTag-$platTag" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName From 4b1853e96231a7f03713e3447458934d652d896a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 29 Jan 2026 12:58:41 +0800 Subject: [PATCH 127/518] Update Submodule vendor/llama.cpp 68ac3ac..b33df26 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 68ac3acb43..b33df266d0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 68ac3acb435450d5ba1e62748e17671815313dc3 +Subproject commit b33df266d0a53f800c47513386920cff1019d70e From 5bdff0b75217f14053602e72ffba18b58c9585cd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 29 Jan 2026 12:59:10 +0800 Subject: [PATCH 128/518] Sync llama : disable Direct IO by default --- examples/low_level_api/common.py | 2 +- llama_cpp/llama.py | 2 +- llama_cpp/llama_cpp.py | 4 ++-- llama_cpp/server/settings.py | 2 +- tests/test_llama.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 599c78e13b..9f32f8bb3b 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -62,7 +62,7 @@ class GptParams: penalize_nl: bool = True perplexity: bool = False use_mmap: bool = True - use_direct_io: bool = True + 
use_direct_io: bool = False use_mlock: bool = False mem_test: bool = False verbose_prompt: bool = False diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5d9b3a182d..52bb415e21 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -69,7 +69,7 @@ def __init__( tensor_split: Optional[List[float]] = None, vocab_only: bool = False, use_mmap: bool = True, - use_direct_io: bool = True, + use_direct_io: bool = False, use_mlock: bool = False, check_tensors: bool = False, use_extra_bufts: bool = False, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 294c42ca54..3eef56cc64 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -715,7 +715,7 @@ class llama_model_tensor_buft_override(ctypes.Structure): # // Keep the booleans together to avoid misalignment during copy-by-value. # bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible -# bool use_direct_io; // use direct io, takes precedence over use_mmap +# bool use_direct_io; // use direct io, takes precedence over use_mmap when supported # bool use_mlock; // force system to keep model in RAM # bool check_tensors; // validate model tensor data # bool use_extra_bufts; // use extra buffer types (used for weight repacking) @@ -737,7 +737,7 @@ class llama_model_params(ctypes.Structure): kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data vocab_only (bool): only load the vocabulary, no weights use_mmap (bool): use mmap if possible - use_direct_io(bool): use direct io, takes precedence over use_mmap + use_direct_io(bool): use direct io, takes precedence over use_mmap when supported use_mlock (bool): force system to keep model in RAM check_tensors (bool): validate model tensor data use_extra_bufts (bool): use extra buffer types (used for weight repacking) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index a4b67391e0..bb16527d8b 100644 --- a/llama_cpp/server/settings.py 
+++ b/llama_cpp/server/settings.py @@ -51,7 +51,7 @@ class ModelSettings(BaseSettings): description="Enable mmap to use filesystem cache.", ) use_direct_io: bool = Field( - default=True, + default=False, description="Use direct io, takes precedence over use_mmap.", ) use_mlock: bool = Field( diff --git a/tests/test_llama.py b/tests/test_llama.py index 93719b4b2e..3d51feeaac 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -75,7 +75,7 @@ def test_real_model(llama_cpp_model_path): params = llama_cpp.llama_model_default_params() params.use_mmap = llama_cpp.llama_supports_mmap() - params.use_direct_io = True + params.use_direct_io = False params.use_mlock = llama_cpp.llama_supports_mlock() params.check_tensors = False From b0e1d732576f36a45558198270cc07ce9d92abb4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 29 Jan 2026 13:18:06 +0800 Subject: [PATCH 129/518] Update Changelog --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5fefa7a03..73dc9ec1c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,11 +27,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ci: Customize wheel filename to improve version identification - Parse generated wheel filenames in the build step. - Append CUDA version (cuXXX) and AVX level (basic/avx2) to the version string. - - New format: package-ver+cuXXX.avxver-pyver-plat.whl (e.g., llama_cpp_python-0.3.23+cu130.basic-cp310-win_amd64.whl). + - New format: package-ver+cuXXX.avxver-pyver-abiver-platform.whl (e.g., llama_cpp_python-0.3.23+cu130.basic-cp310-cp310-win_amd64.whl). 
-- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/68ac3acb435450d5ba1e62748e17671815313dc3](https://github.com/ggml-org/llama.cpp/commit/68ac3acb435450d5ba1e62748e17671815313dc3) +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/b33df266d0a53f800c47513386920cff1019d70e](https://github.com/ggml-org/llama.cpp/commit/b33df266d0a53f800c47513386920cff1019d70e) -- feat: Sync llama.cpp llama/mtmd API Binding 20260127 +- feat: Sync llama.cpp llama/mtmd API Binding 20260129 ## [0.3.22] - perf (TFFT): Optimize longest_token_prefix with Numpy SIMD and fast-fail probe From 88757fd63e790cc179b972e38704ab4da51b282c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 1 Feb 2026 12:02:13 +0800 Subject: [PATCH 130/518] Update Submodule vendor/llama.cpp b33df26..41ea261 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b33df266d0..41ea26144e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b33df266d0a53f800c47513386920cff1019d70e +Subproject commit 41ea26144e55d23f37bb765f88c07588d786567f From 79500ec2f5ec6ba4c83de21d61cb5a420271b8e2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 1 Feb 2026 13:20:52 +0800 Subject: [PATCH 131/518] feat: implement generative reranking with chat template support - Now support Qwen3-reranker series models --- llama_cpp/llama_embedding.py | 88 ++++++++++++++++++++++++++---------- 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 69179cb0b5..5da97fa19a 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -30,7 +30,7 @@ class LlamaEmbedding(Llama): Key Features: 1. Auto-configuration: Automatically sets embeddings=True. 2. Streaming Batch: Handles massive datasets without OOM (Out Of Memory). - 3. Native Reranking Support: Specifically handles `LLAMA_POOLING_TYPE_RANK` models (like BGE-Reranker). / + 3. 
Native Reranking Support: Specifically handles `LLAMA_POOLING_TYPE_RANK` models (like BGE-Reranker, Qwen3-Reranker). / It correctly identifies classification heads to output scalar relevance scores instead of high-dimensional vectors. 4. Advanced Normalization: Implements MaxInt16, Taxicab (L1), and Euclidean (L2) normalization strategies / using NumPy for optimal performance and compatibility with various vector databases. @@ -273,8 +273,10 @@ def rank(self, query: str, documents: List[str]) -> List[float]: """ Calculate relevance scores for a list of documents against a query using a Reranking model. - This method constructs a specific prompt structure ([BOS] Query [SEP] Doc [EOS]) - typically used by Cross-Encoders to estimate similarity. + This method follows the implementation logic of the latest llama.cpp embedding example, + supporting both specialized chat templates and manual sequence construction. + + Link: https://github.com/ggml-org/llama.cpp/blob/master/examples/embedding/embedding.cpp Args: query: The search query string. @@ -283,33 +285,69 @@ def rank(self, query: str, documents: List[str]) -> List[float]: Returns: A list of float scores, where higher values indicate greater relevance. """ + # Ensure the model is configured for Reranking (Cross-Encoding) if self.pooling_type() != LLAMA_POOLING_TYPE_RANK: raise ValueError(f"Model pooling_type is {self.pooling_type()}, but LLAMA_POOLING_TYPE_RANK is required.") - # Prepare Special Tokens - sep_id = self.token_sep() - if sep_id == -1: sep_id = self.token_eos() - eos_id = self.token_eos() - - # Pre-process Query - q_tokens = self.tokenize(query.encode("utf-8"), add_bos=True, special=True) - # Remove the automatically added EOS token from the query - # because we need to append the separator and document tokens after it. - if q_tokens and q_tokens[-1] == eos_id: - q_tokens.pop() + # 1. Attempt to retrieve the built-in 'rerank' chat template from model metadata. 
+ # Modern GGUF models often include a template for formatting query/document pairs. + rerank_template = llama_cpp.llama_model_chat_template(self._model.model, b"rerank") + if rerank_template: + rerank_template = rerank_template.decode("utf-8") - # Construct Batch Inputs batch_inputs: List[List[int]] = [] - for doc in documents: - d_tokens = self.tokenize(doc.encode("utf-8"), add_bos=False, special=True) - full_seq = q_tokens + [sep_id] + d_tokens - # Ensure the sequence ends with an EOS token to mark the end of inference. - if not full_seq or full_seq[-1] != eos_id: - full_seq.append(eos_id) - batch_inputs.append(full_seq) - - # We use NORM_MODE_NONE because rerankers output raw logits/scores, not vectors that need normalization. - return self.embed(batch_inputs, normalize=NORM_MODE_NONE) + + # 2. Case A: Using Model-Specific Template + # If a template exists, we perform dynamic string replacement for {query} and {document}. + if rerank_template: + for doc in documents: + final_prompt = rerank_template.replace("{query}", query).replace("{document}", doc) + # Tokenize the full formatted prompt. Template usually dictates BOS/EOS placement. + tokens = self.tokenize(final_prompt.encode("utf-8"), add_bos=False, special=True) + batch_inputs.append(tokens) + + # 3. Case B: Manual Sequence Construction (Fallback) + # If no template is found, construct the standard [BOS] Query [SEP] Doc [EOS] sequence. + else: + # Determine separator and end-of-sequence tokens + sep_id = self.token_sep() if self.token_sep() != -1 else self.token_eos() + eos_id = self.token_eos() + + # Pre-tokenize the query with BOS (Beginning of Sequence) + q_tokens = self.tokenize(query.encode("utf-8"), add_bos=True, special=True) + + # Remove the automatically added EOS token from the query to allow concatenation. 
+ if q_tokens and q_tokens[-1] == eos_id: + q_tokens.pop() + + for doc in documents: + # Tokenize document without an additional BOS token + d_tokens = self.tokenize(doc.encode("utf-8"), add_bos=False, special=True) + + # Combine: [BOS] Query [SEP] Document + full_seq = q_tokens + [sep_id] + d_tokens + + # Ensure the sequence is properly terminated with an EOS token for inference. + if not full_seq or full_seq[-1] != eos_id: + full_seq.append(eos_id) + + batch_inputs.append(full_seq) + + # Execute embedding inference. Rerankers output raw logits/scores, so we skip normalization. + raw_results = self.embed(batch_inputs, normalize=NORM_MODE_NONE) + results_list = [raw_results] if (len(batch_inputs) == 1 and isinstance(raw_results[0], float)) else raw_results + + # 5. Output Post-Processing + # For generative rerankers like Qwen3-Reranker, output dim is 2 ([yes_logit, no_logit]). + final_scores = [] + # Ensure we iterate through results (embed returns List[Any] for batch inputs) + for res in results_list: + if isinstance(res, (list, np.ndarray)) and len(res) == 2: + final_scores.append(float(res[0])) # Standard scalar score in list form yes_logit + else: + final_scores.append(float(res)) # Raw scalar score + + return final_scores def create_embedding( self, From 30402cac925b4543ffe0737852682682b698c912 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 1 Feb 2026 14:45:09 +0800 Subject: [PATCH 132/518] Update README.md for Qwen3-Reranker --- README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 2f57f1682f..da596f0104 100644 --- a/README.md +++ b/README.md @@ -778,6 +778,7 @@ print(res["choices"][0]["message"]["content"]) * **Streaming Batch Processing:** Process massive datasets (e.g., Hundreds of documents) without running out of memory (OOM). * **Native Reranking:** Built-in support for Cross-Encoder models (outputting relevance scores instead of vectors). 
* **Optimized Performance:** Utilizes Unified KV Cache for parallel encoding of multiple documents. +* **Chat Template Support:** Support for rerank templates has been introduced (via `llama_model_chat_template(model, b"rerank")`), which can automatically populate the query and document into a specific format. ### Support Embeddings & Rerank Model: @@ -786,8 +787,9 @@ print(res["choices"][0]["message"]["content"]) |--------------------|-----------|--------------------------------------------------------|--------------| | `bge-m3` | Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | |`bge-reranker-v2-m3`| Rerank |[bge-reranker-v2-m3-GGUF](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF) | Useful ✅ | +|`qwen3-reranker`| Rerank |[Qwen3-Reranker-GGUF](https://huggingface.co/JamePeng2023/Qwen3-Reranker-GGUF) | Useful ✅ | -### TODO(JamePeng): Needs more extensive testing with various embedding and rerank models. :) +#### TODO(JamePeng): Needs more extensive testing with various embedding and rerank models. :) ### 1. Text Embeddings (Vector Search) @@ -840,16 +842,17 @@ Reranking models (like `bge-reranker`) take a **Query** and a list of **Document ```python import llama_cpp -from llama_cpp.llama_embedding import LlamaEmbedding, LLAMA_POOLING_TYPE_RANK +from llama_cpp.llama_embedding import LlamaEmbedding # Initialize a Reranking model ranker = LlamaEmbedding( - model_path="path/to/bge-reranker-v2-m3.gguf", - pooling_type=LLAMA_POOLING_TYPE_RANK, # Crucial for Rerankers! + model_path="path/to/qwen3-reranker-0.6b-q8_0.gguf", + pooling_type=llama_cpp.LLAMA_POOLING_TYPE_RANK, # Crucial for Rerankers! n_gpu_layers=-1, + n_ctx=0 ) -query = "What causes rain?" +query = "What causes Rain?" 
docs = [ "Clouds are made of water droplets...", # Relevant "To bake a cake you need flour...", # Irrelevant @@ -861,8 +864,8 @@ docs = [ scores = ranker.rank(query, docs) # Result: List of floats (higher means more relevant) -print(scores) -# e.g., [-0.15, -8.23, 5.67] -> The 3rd doc is the best match +print(scores) +# e.g., [0.0011407170677557588, 5.614783731289208e-05, 0.7173627614974976] -> The 3rd doc is the best match ``` ### 3. Normalization From 199639a43cf098b4ca73fb591b304f103137f91b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 4 Feb 2026 06:32:24 +0800 Subject: [PATCH 133/518] Update Submodule vendor/llama.cpp 41ea261..3795cc1 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 41ea26144e..3795cc1e89 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 41ea26144e55d23f37bb765f88c07588d786567f +Subproject commit 3795cc1e89e16fbc145f8a5457ea30abd86e0d1d From 2ebd4808c132c6c4ab561c60b25145bee7453999 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 6 Feb 2026 02:01:55 +0800 Subject: [PATCH 134/518] refactor LlamaSamplingParams class - base on llama.cpp/common/common.h Signed-off-by: JamePeng --- llama_cpp/_internals.py | 144 ++++++++++++++++++++++++++++++++-------- 1 file changed, 118 insertions(+), 26 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 0e7d152bf7..33007ff5b8 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1,11 +1,13 @@ from __future__ import annotations -import os import ctypes +import enum +import os from typing import ( Dict, List, + Set, Tuple, Optional, Sequence, @@ -850,36 +852,128 @@ def normalize_embedding(embedding): # Python wrappers over common/sampling structs - +# common/common.h common_params_sampling + +# enum common_sampler_type { +# COMMON_SAMPLER_TYPE_NONE = 0, +# COMMON_SAMPLER_TYPE_DRY = 1, +# COMMON_SAMPLER_TYPE_TOP_K = 2, +# COMMON_SAMPLER_TYPE_TOP_P = 
3, +# COMMON_SAMPLER_TYPE_MIN_P = 4, +# //COMMON_SAMPLER_TYPE_TFS_Z = 5, +# COMMON_SAMPLER_TYPE_TYPICAL_P = 6, +# COMMON_SAMPLER_TYPE_TEMPERATURE = 7, +# COMMON_SAMPLER_TYPE_XTC = 8, +# COMMON_SAMPLER_TYPE_INFILL = 9, +# COMMON_SAMPLER_TYPE_PENALTIES = 10, +# COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11, +# COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12, +# }; + +class CommonSamplerType(enum.IntEnum): + NONE = 0 + DRY = 1 + TOP_K = 2 + TOP_P = 3 + MIN_P = 4 + TYPICAL_P = 6 + TEMPERATURE = 7 + XTC = 8 + INFILL = 9 + PENALTIES = 10 + TOP_N_SIGMA = 11 + ADAPTIVE_P = 12 @dataclass class LlamaSamplingParams: - n_prev: int = 64 - n_probs: int = 0 - top_k: int = 40 - top_n_sigma: float = -1.00 - top_p: float = 0.95 - min_p: float = 0.05 - typical_p: float = 1.00 - temp: float = 0.80 - penalty_last_n: int = 64 - penalty_repeat: float = 1.0 - penalty_freq: float = 0.00 - penalty_present: float = 0.00 - mirostat: int = 0 - mirostat_tau: float = 5.00 - mirostat_eta: float = 0.10 - penalize_nl: bool = True - - xtc_threshold: float = 0.1 - xtc_probability: float = 0.0 + seed: int = llama_cpp.LLAMA_DEFAULT_SEED # the seed used to initialize llama_sampler + + n_prev: int = 64 # number of previous tokens to remember + n_probs: int = 0 # if greater than 0, output the probabilities of top n_probs tokens. 
+ min_keep: int = 0 # 0 = disabled, otherwise samplers should return at least min_keep tokens + top_k: int = 40 # <= 0 to use vocab size + top_p: float = 0.95 # 1.0 = disabled + min_p: float = 0.05 # 0.0 = disabled + xtc_probability: float = 0.0 # 0.0 = disabled + xtc_threshold: float = 0.1 # > 0.5 disables XTC + typical_p: float = 1.00 # typical_p, 1.0 = disabled + temp: float = 0.80 # <= 0.0 to sample greedily, 0.0 to not output probabilities + dynatemp_range: float = 0.00 # 0.0 = disabled + dynatemp_exponent: float = 1.00 # controls how entropy maps to temperature in dynamic temperature sampler + + penalty_last_n: int = 64 # last n tokens to penalize (0 = disable penalty, -1 = context size) + penalty_repeat: float = 1.0 # 1.0 = disabled + penalty_freq: float = 0.00 # 0.0 = disabled + penalty_present: float = 0.00 # 0.0 = disabled + + dry_multiplier: float = 0.0 # 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + dry_base: float = 1.75 # 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + dry_allowed_length: int = 2 # tokens extending repetitions beyond this receive penalty + dry_penalty_last_n: int = -1 # how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + + adaptive_target: float = -1.0 # select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) + adaptive_decay: float = 0.90 # EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99) + mirostat: int = 0 # 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + top_n_sigma: float = -1.00 # -1.0 = disabled + mirostat_tau: float = 5.00 # target entropy + mirostat_eta: float = 0.10 # learning rate + + ignore_eos: bool = False + no_perf: bool = False # disable performance metrics + timing_per_token: bool = False + backend_sampling: bool = False + user_sampling_config: int = 0 # bitfield to track user-specified samplers + + dry_sequence_breakers: List[str] = field( + default_factory=lambda: ["\n", 
":", "\"", "*"] # default sequence breakers for DRY + ) + + samplers: List[CommonSamplerType] = field( + default_factory=lambda: [ + CommonSamplerType.PENALTIES, + CommonSamplerType.DRY, + CommonSamplerType.TOP_N_SIGMA, + CommonSamplerType.TOP_K, + CommonSamplerType.TYPICAL_P, + CommonSamplerType.TOP_P, + CommonSamplerType.MIN_P, + CommonSamplerType.XTC, + CommonSamplerType.TEMPERATURE, + ] + ) grammar: str = "" + grammar_lazy: bool = False + grammar_triggers: List[Any] = field(default_factory=list) + preserved_tokens: Set[int] = field(default_factory=set) + + logit_bias: List[llama_cpp.llama_logit_bias] = field(default_factory=list) + logit_bias_eog: List[llama_cpp.llama_logit_bias] = field(default_factory=list) + + @property + def has_logit_bias(self) -> bool: + return len(self.logit_bias) > 0 - cfg_negative_prompt: str = "" - cfg_scale: float = 1.00 + def print_params(self) -> str: + result = ( + f"\trepeat_last_n = {self.penalty_last_n}, repeat_penalty = {self.penalty_repeat:.3f}, " + f"frequency_penalty = {self.penalty_freq:.3f}, presence_penalty = {self.penalty_present:.3f}\n" + + f"\tdry_multiplier = {self.dry_multiplier:.3f}, dry_base = {self.dry_base:.3f}, " + f"dry_allowed_length = {self.dry_allowed_length}, dry_penalty_last_n = {self.dry_penalty_last_n}\n" + + f"\ttop_k = {self.top_k}, top_p = {self.top_p:.3f}, min_p = {self.min_p:.3f}, " + f"xtc_probability = {self.xtc_probability:.3f}, xtc_threshold = {self.xtc_threshold:.3f}, " + f"typical_p = {self.typ_p:.3f}, top_n_sigma = {self.top_n_sigma:.3f}, temp = {self.temp:.3f}\n" + + f"\tmirostat = {self.mirostat}, mirostat_lr = {self.mirostat_eta:.3f}, " + f"mirostat_ent = {self.mirostat_tau:.3f}, adaptive_target = {self.adaptive_target:.3f}, " + f"adaptive_decay = {self.adaptive_decay:.3f}" + ) + return result - logit_bias: dict[int, float] = field(default_factory=dict) + def __repr__(self) -> str: + return self.print_params() @dataclass @@ -956,8 +1050,6 @@ def sample( self.params.penalty_freq, 
self.params.penalty_present, ) - if not self.params.penalize_nl: - token_data_array.candidates_data.logit[nl_token] = nl_logit if self.grammar is not None: ctx_main.sample_grammar(token_data_array, self.grammar) From 28264ddd6eabbe1dc2f75ca61bc5249b04dcc18e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 7 Feb 2026 10:19:05 +0800 Subject: [PATCH 135/518] Update Submodule vendor/llama.cpp 3795cc1..b831118 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 3795cc1e89..b83111815e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 3795cc1e89e16fbc145f8a5457ea30abd86e0d1d +Subproject commit b83111815e9a79949257e9d4b087206b320a3063 From 672fa43ade7b056ab9d078b27433ba900797dddb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 7 Feb 2026 22:11:12 +0800 Subject: [PATCH 136/518] Optimize the method definition of class llama_sampler_i and CtypesPointer definition Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 138 +++++++++++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 41 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3eef56cc64..90fc5e9e83 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -746,7 +746,7 @@ class llama_model_params(ctypes.Structure): if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused - tensor_buft_overrides: ctypes.POINTER(llama_model_tensor_buft_override) + tensor_buft_overrides: CtypesPointer[llama_model_tensor_buft_override] n_gpu_layers: int split_mode: int main_gpu: int @@ -1392,10 +1392,10 @@ class llama_params_fit_status(enum.IntEnum): ) def llama_params_fit( path_model: ctypes.c_char_p, - mparams: llama_model_params_p, - cparams: llama_context_params_p, - tensor_split: ctypes.pointer(ctypes.c_float), - tensor_buft_overrides: ctypes.pointer(llama_model_tensor_buft_override), + mparams: CtypesPointer[llama_model_params], + cparams: 
CtypesPointer[llama_context_params], + tensor_split: CtypesPointer[ctypes.c_float], + tensor_buft_overrides: CtypesPointer[llama_model_tensor_buft_override], margin: ctypes.c_size_t, n_ctx_min: ctypes.c_uint32, log_level: int, @@ -1989,7 +1989,7 @@ def llama_adapter_get_alora_n_invocation_tokens(adapter: llama_adapter_lora_p, / [llama_adapter_lora_p_ctypes], ctypes.c_uint64, ) -def llama_adapter_get_alora_invocation_tokens(adapter: llama_adapter_lora_p, /) -> llama_token_p: +def llama_adapter_get_alora_invocation_tokens(adapter: llama_adapter_lora_p, /) -> CtypesPointer[llama_token]: ... @@ -2720,7 +2720,7 @@ def llama_state_seq_get_size_ext( ) def llama_state_seq_get_data_ext( ctx: llama_context_p, - dst: ctypes.POINTER(ctypes.c_uint8), + dst: CtypesPointer[ctypes.c_uint8], size: Union[int, ctypes.c_size_t], seq_id: llama_seq_id, flags: llama_state_seq_flags, @@ -2748,7 +2748,7 @@ def llama_state_seq_get_data_ext( ) def llama_state_seq_set_data_ext( ctx: llama_context_p, - src: ctypes.POINTER(ctypes.c_uint8), + src: CtypesPointer[ctypes.c_uint8], size: Union[int, ctypes.c_size_t], dest_seq_id: llama_seq_id, flags: llama_state_seq_flags, @@ -3008,7 +3008,7 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: ) def llama_get_logits_ith( ctx: llama_context_p, i: ctypes.c_int32, / -) -> ctypes.POINTER(ctypes.c_float): +) -> CtypesPointer[ctypes.c_float]: """Logits for the ith token. Equivalent to: llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab""" ... 
@@ -3835,7 +3835,7 @@ def llama_chat_apply_template( ctypes.c_int32, ) def llama_chat_builtin_templates( - output: CtypesArray[bytes], + output: CtypesArray[ctypes.c_char_p], len: Union[ctypes.c_size_t, int], /, ) -> int: @@ -3945,8 +3945,93 @@ class llama_sampler_data(ctypes.Structure): # // called before graph execution to set inputs for the current ubatch # void (*backend_set_input)(struct llama_sampler * smpl); # }; + +# const char * (*name)(const struct llama_sampler * smpl); +llama_sampler_name_fn = ctypes.CFUNCTYPE( + ctypes.c_char_p, # return type + ctypes.c_void_p # smpl +) + +# void (*accept)(struct llama_sampler * smpl, llama_token token); +llama_sampler_accept_fn = ctypes.CFUNCTYPE( + None, # return void + ctypes.c_void_p, # smpl + llama_token # token +) + +# void (*apply)(struct llama_sampler * smpl, llama_token_data_array * cur_p); +llama_sampler_apply_fn = ctypes.CFUNCTYPE( + None, # return void + ctypes.c_void_p, # smpl + ctypes.POINTER(llama_token_data_array) # cur_p +) + +# void (*reset)(struct llama_sampler * smpl); +llama_sampler_reset_fn = ctypes.CFUNCTYPE( + None, # return void + ctypes.c_void_p # smpl +) + +# struct llama_sampler * (*clone)(const struct llama_sampler * smpl); +llama_sampler_clone_fn = ctypes.CFUNCTYPE( + ctypes.c_void_p, # return struct llama_sampler * + ctypes.c_void_p # smpl (const ignored in ctypes) +) + +# void (*free)(struct llama_sampler * smpl); +llama_sampler_free_fn = ctypes.CFUNCTYPE( + None, # return void + ctypes.c_void_p # smpl +) + +# --- EXPERIMENTAL Backend Sampling Interface --- + +# bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft); +llama_sampler_backend_init_fn = ctypes.CFUNCTYPE( + ctypes.c_bool, # return bool + ctypes.c_void_p, # smpl + ctypes.c_void_p # buft +) + +# void (*backend_accept)(struct llama_sampler * smpl, struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_tensor * selected_token); +llama_sampler_backend_accept_fn = ctypes.CFUNCTYPE( + None, # 
return void + ctypes.c_void_p, # smpl + ctypes.c_void_p, # ctx + ctypes.c_void_p, # gf + ctypes.c_void_p # selected_token +) + +# void (*backend_apply)(struct llama_sampler * smpl, struct ggml_context * ctx, struct ggml_cgraph * gf, struct llama_sampler_data * data); +llama_sampler_backend_apply_fn = ctypes.CFUNCTYPE( + None, # return void + ctypes.c_void_p, # smpl + ctypes.c_void_p, # ctx + ctypes.c_void_p, # gf + ctypes.POINTER(llama_sampler_data) # data +) + +# void (*backend_set_input)(struct llama_sampler * smpl); +llama_sampler_backend_set_input_fn = ctypes.CFUNCTYPE( + None, # return void + ctypes.c_void_p # smpl +) + class llama_sampler_i(ctypes.Structure): - ... + _fields_ = [ + ("name", llama_sampler_name_fn), + ("accept", llama_sampler_accept_fn), + ("apply", llama_sampler_apply_fn), + ("reset", llama_sampler_reset_fn), + ("clone", llama_sampler_clone_fn), + ("free", llama_sampler_free_fn), + + # [EXPERIMENTAL] Backend sampling interface + ("backend_init", llama_sampler_backend_init_fn), + ("backend_accept", llama_sampler_backend_accept_fn), + ("backend_apply", llama_sampler_backend_apply_fn), + ("backend_set_input", llama_sampler_backend_set_input_fn), + ] # struct llama_sampler { @@ -3965,35 +4050,6 @@ class llama_sampler(ctypes.Structure): llama_sampler_p_ctypes = ctypes.POINTER(llama_sampler) -llama_sampler_i_name = ctypes.CFUNCTYPE(ctypes.c_char_p, llama_sampler_p_ctypes) -llama_sampler_i_accept = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes, llama_token) -llama_sampler_i_apply = ctypes.CFUNCTYPE( - None, llama_sampler_p_ctypes, llama_token_data_array_p) -llama_sampler_i_reset = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) -llama_sampler_i_clone = ctypes.CFUNCTYPE(llama_sampler_p_ctypes, llama_sampler_p_ctypes) -llama_sampler_i_free = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) - -llama_sampler_i_backend_init = ctypes.CFUNCTYPE( - ctypes.c_bool, llama_sampler_p_ctypes, ctypes.c_void_p) -llama_sampler_i_backend_accept = ctypes.CFUNCTYPE( - 
None, llama_sampler_p_ctypes, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p) -llama_sampler_i_backend_apply = ctypes.CFUNCTYPE( - None, llama_sampler_p_ctypes, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p) -llama_sampler_i_backend_set_input = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) - -llama_sampler_i._fields_ = [ - ("name", llama_sampler_i_name), - ("accept", llama_sampler_i_accept), - ("apply", llama_sampler_i_apply), - ("reset", llama_sampler_i_reset), - ("clone", llama_sampler_i_clone), - ("free", llama_sampler_i_free), - ("backend_init", llama_sampler_i_backend_init), - ("backend_accept", llama_sampler_i_backend_accept), - ("backend_apply", llama_sampler_i_backend_apply), - ("backend_set_input", llama_sampler_i_backend_set_input), -] - # // [EXPERIMENTAL] # // attach a sampler to the context @@ -4024,7 +4080,7 @@ def llama_set_sampler( llama_sampler_p_ctypes, ) def llama_sampler_init( - iface: ctypes.pointer(llama_sampler_i), ctx: llama_sampler_context_t, / + iface: CtypesPointer[llama_sampler_i], ctx: llama_sampler_context_t, / ) -> llama_sampler_p: ... From 1df39b422890db55cb9f6de43cb792a26921752e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 8 Feb 2026 02:23:30 +0800 Subject: [PATCH 137/518] Refactor sampling infrastructure to use llama.cpp sampler chain API - LlamaContext: Remove obsolete manual sampling methods. - LlamaSampler: Wrap C++ sampler chain; add support for DRY, XTC, Adaptive-P, and lazy grammar.`add_grammar` merged the processing branches for regular grammar and lazy grammar. - LlamaSamplingContext: Update to build and manage sampler chains instead of manual logic. - CustomSampler: Rewrite for proper C-struct lifecycle management and ABI compatibility. - Optimizations: Simplified redundant handling and variable usage. 
--- examples/low_level_api/common.py | 7 - .../low_level_api/low_level_api_chat_cpp.py | 3 - llama_cpp/_internals.py | 807 ++++++++---------- llama_cpp/llama.py | 13 +- llama_cpp/llama_cpp.py | 4 +- 5 files changed, 381 insertions(+), 453 deletions(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 9f32f8bb3b..744ebd411b 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -59,7 +59,6 @@ class GptParams: interactive_start: bool = False instruct: bool = False - penalize_nl: bool = True perplexity: bool = False use_mmap: bool = True use_direct_io: bool = False @@ -362,12 +361,6 @@ def gpt_params_parse(argv=None): help="run in instruction mode (use with Alpaca or Vicuna models)", dest="instruct", ) - parser.add_argument( - "--no-penalize-nl", - action="store_false", - help="do not penalize newline token", - dest="penalize_nl", - ) parser.add_argument( "--perplexity", action="store_true", diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 7aa80ccec4..1f71741b20 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -479,9 +479,6 @@ def generate(self): # _arr, # last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) - if not self.params.penalize_nl: - logits[llama_cpp.llama_token_nl()] = nl_logit - if self.params.temp <= 0: # Greedy sampling id = llama_cpp.llama_sampler_init_greedy(self.ctx, candidates_p) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 33007ff5b8..a2cc4fee16 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -5,13 +5,17 @@ import os from typing import ( + Callable, Dict, List, Set, Tuple, Optional, Sequence, + Union, + TYPE_CHECKING ) + from dataclasses import dataclass, field from contextlib import ExitStack @@ -24,6 +28,11 @@ import 
llama_cpp.llama_cpp as llama_cpp +if TYPE_CHECKING: + from llama_cpp._ctypes_extensions import ( + CtypesArray, + CtypesPointer, + ) # Python wrappers over llama.h structs @@ -447,7 +456,7 @@ def load_state_file( path_session: bytes, tokens_out: ctypes.Array[llama_cpp.llama_token], n_token_capacity: ctypes.c_size_t, - n_token_count_out: ctypes.pointer(ctypes.c_size_t) + n_token_count_out: CtypesPointer[ctypes.c_size_t] ) -> bool: return llama_cpp.llama_state_load_file(self.ctx, path_session, tokens_out, n_token_capacity, n_token_count_out) @@ -474,7 +483,7 @@ def load_state_seq_file( dest_seq_id: int, tokens_out: ctypes.Array[llama_cpp.llama_token], n_token_capacity: ctypes.c_size_t, - n_token_count_out: ctypes.pointer(ctypes.c_size_t) + n_token_count_out: CtypesPointer[ctypes.c_size_t] ) -> int: return llama_cpp.llama_state_seq_load_file(self.ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out) @@ -569,126 +578,6 @@ def get_embeddings_ith(self, i: int): def get_embeddings_seq(self, seq_id: int): return llama_cpp.llama_get_embeddings_seq(self.ctx, seq_id) - # Sampling functions - - def set_rng_seed(self, seed: int): - # TODO: Fix - # llama_cpp.llama_set_rng_seed(self.ctx, seed) - raise NotImplementedError("set_rng_seed is not implemented in llama.cpp") - - def sample_repetition_penalties( - self, - candidates: "_LlamaTokenDataArray", - last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]", - penalty_last_n: int, - penalty_repeat: float, - penalty_freq: float, - penalty_present: float, - ): - # llama_cpp.llama_sample_repetition_penalties( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # last_tokens_data, - # penalty_last_n, - # penalty_repeat, - # penalty_freq, - # penalty_present, - # ) - raise NotImplementedError("sample_repetition_penalties is not implemented in llama.cpp") - - def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - # llama_cpp.llama_sample_top_k( - # self.ctx, 
llama_cpp.byref(candidates.candidates), k, min_keep - # ) - raise NotImplementedError("sample_top_k is not implemented in llama.cpp") - - def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - # llama_cpp.llama_sample_top_p( - # self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - # ) - raise NotImplementedError("sample_top_p is not implemented in llama.cpp") - - def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - # llama_cpp.llama_sample_min_p( - # self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - # ) - raise NotImplementedError("sample_min_p is not implemented in llama.cpp") - - def sample_typical( - self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int - ): - # llama_cpp.llama_sample_typical( - # self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - # ) - raise NotImplementedError("sample_typical is not implemented in llama.cpp") - - def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): - # llama_cpp.llama_sample_temp( - # self.ctx, llama_cpp.byref(candidates.candidates), temp - # ) - raise NotImplementedError("sample_temp is not implemented in llama.cpp") - - def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - # llama_cpp.llama_sample_grammar( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # grammar.grammar, - # ) - raise NotImplementedError("sample_grammar is not implemented in llama.cpp") - - def sample_token_mirostat( - self, - candidates: "_LlamaTokenDataArray", - tau: float, - eta: float, - m: int, - mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], - ) -> int: - raise NotImplementedError("sample_token_mirostat is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token_mirostat( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # tau, - # eta, - # m, - # mu, - # ) - - def sample_token_mirostat_v2( - self, - candidates: "_LlamaTokenDataArray", - tau: 
float, - eta: float, - mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], - ) -> int: - raise NotImplementedError("sample_token_mirostat_v2 is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token_mirostat_v2( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # tau, - # eta, - # mu, - # ) - - def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token_greedy is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token_greedy( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # ) - - def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token is not implemented in llama.cpp") - # return llama_cpp.llama_sample_token( - # self.ctx, - # llama_cpp.byref(candidates.candidates), - # ) - - # Grammar - def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - raise NotImplementedError("grammar_accept_token is not implemented in llama.cpp") - # llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token) - def reset_timings(self): llama_cpp.llama_perf_context_reset(self.ctx) @@ -884,6 +773,8 @@ class CommonSamplerType(enum.IntEnum): TOP_N_SIGMA = 11 ADAPTIVE_P = 12 + CUSTOM = 99 + @dataclass class LlamaSamplingParams: seed: int = llama_cpp.LLAMA_DEFAULT_SEED # the seed used to initialize llama_sampler @@ -928,11 +819,14 @@ class LlamaSamplingParams: default_factory=lambda: ["\n", ":", "\"", "*"] # default sequence breakers for DRY ) + custom_samplers: List['CustomSampler'] = field(default_factory=list) + samplers: List[CommonSamplerType] = field( default_factory=lambda: [ CommonSamplerType.PENALTIES, CommonSamplerType.DRY, CommonSamplerType.TOP_N_SIGMA, + CommonSamplerType.CUSTOM, CommonSamplerType.TOP_K, CommonSamplerType.TYPICAL_P, CommonSamplerType.TOP_P, @@ -978,357 +872,435 @@ def __repr__(self) -> str: @dataclass class LlamaSamplingContext: - params: LlamaSamplingParams = 
field(default_factory=LlamaSamplingParams) - mirostat_mu: ctypes.c_float = field(default_factory=ctypes.c_float) - grammar: Optional[LlamaGrammar] = None - # NOTE: Missing parsed_grammar - prev: list[int] = field(default_factory=list) - cur: list[llama_cpp.llama_token_data] = field(default_factory=list) + """ + High-level Python wrapper that manages the lifecycle and configuration + of the llama.cpp sampler chain. + """ + def __init__( + self, + params: LlamaSamplingParams = field(default_factory=LlamaSamplingParams), + model: Optional[LlamaModel] = None, + _existing_sampler: Optional[LlamaSampler] = None, # Internal use for cloning + ): + self.params = params + self.model = model - def reset(self): - self.prev = [] - self.cur = [] - if self.grammar is not None: - self.grammar.reset() - - def cp(self): - return LlamaSamplingContext( - params=self.params, - mirostat_mu=self.mirostat_mu, - grammar=self.grammar, - prev=self.prev.copy(), - cur=self.cur.copy(), - ) + # Keep track of generated tokens for Python-side debugging/decoding + self.prev: List[int] = [] - def last(self) -> Optional[int]: - if len(self.prev) > 0: - return self.prev[-1] + if _existing_sampler: + # Use the provided sampler (already configured/cloned) + self.sampler = _existing_sampler else: - return None + # Build a new chain from scratch + self.sampler = LlamaSampler() + self._build_sampler_chain() - def prev_str(self, ctx_main: LlamaContext, n: int) -> str: - return ctx_main.model.detokenize(self.prev[-n:]).decode("utf-8") + def _build_sampler_chain(self): + """ + Constructs the sampler chain based on the parameters. + The order generally follows common.cpp practices: + Bias -> Grammar -> Penalties -> DRY -> [Configurable Samplers] -> Dist/Greedy + """ + s = self.sampler + p = self.params + m = self.model + + # --- 1. Logit Bias (Always applied first to mask/boost tokens) --- + if p.logit_bias and m: + s.add_logit_bias(m.n_vocab(), p.logit_bias) + + # --- 2. 
Usage-Specific Samplers (Infill) --- + # If Infill is required, it often modifies logits based on prefix/suffix + if CommonSamplerType.INFILL in p.samplers and m: + s.add_infill(m) + + # --- 3. Grammar / Syntax Constraints --- + if p.grammar and m: + # Use "root" as default rule name if not specified + root_rule = "root" + s.add_grammar( + model=m, + grammar_str=p.grammar, + root=root_rule, + lazy=p.grammar_lazy, + triggers=p.grammar_triggers + ) - def sample( - self, - ctx_main: LlamaContext, - idx: int = 0, - logits_array: Optional[npt.NDArray[np.single]] = None, - ): - n_vocab = ctx_main.model.n_vocab() - id: int = 0 - - if logits_array is None: - logits = ctx_main.get_logits_ith(idx) - logits_array = np.array( - ctypes.cast(logits, ctypes.POINTER(ctypes.c_float * n_vocab)).contents, - dtype=np.single, + # --- 4. Penalties (Repetition) --- + # Note: In some implementations, penalties come before other samplers + if CommonSamplerType.PENALTIES in p.samplers: + s.add_penalties( + p.penalty_last_n, + p.penalty_repeat, + p.penalty_freq, + p.penalty_present ) - # apply logit_bias - for token, logit_bias in self.params.logit_bias.items(): - logits_array[token] += logit_bias + # --- 5. DRY (Don't Repeat Yourself) --- + if CommonSamplerType.DRY in p.samplers and m: + s.add_dry( + m, + p.dry_multiplier, + p.dry_base, + p.dry_allowed_length, + p.dry_penalty_last_n, + p.dry_sequence_breakers + ) - token_data_array = LlamaTokenDataArray( - n_vocab=n_vocab - ) # TODO: Only create this once - token_data_array.copy_logits(logits_array) + # --- 6. 
Core Sampling Strategies (The "Filter" Loop) --- + # We iterate through the list to preserve user-defined order for these specific samplers + for stype in p.samplers: + if stype == CommonSamplerType.CUSTOM: + if p.custom_samplers: + for cs in p.custom_samplers: + s.add_custom(cs) - # apply penalties - if len(self.prev) > 0: - nl_token = ctx_main.model.token_nl() - nl_logit = logits_array[nl_token] - last_tokens = self.prev[-self.params.penalty_last_n :] - last_tokens_size = min(len(last_tokens), self.params.penalty_last_n) - if last_tokens_size > 0: - last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens) - ctx_main.sample_repetition_penalties( - token_data_array, - last_tokens_p, - last_tokens_size, - self.params.penalty_repeat, - self.params.penalty_freq, - self.params.penalty_present, - ) + elif stype == CommonSamplerType.TOP_K: + s.add_top_k(p.top_k) + + elif stype == CommonSamplerType.TOP_P: + s.add_top_p(p.top_p, p.min_keep) + + elif stype == CommonSamplerType.MIN_P: + s.add_min_p(p.min_p, p.min_keep) - if self.grammar is not None: - ctx_main.sample_grammar(token_data_array, self.grammar) + elif stype == CommonSamplerType.TYPICAL_P: + s.add_typical(p.typical_p, p.min_keep) - if self.params.temp < 0: - id = token_data_array.candidates_data.id[0] - elif self.params.temp == 0: - id = ctx_main.sample_token_greedy(token_data_array) + elif stype == CommonSamplerType.TEMPERATURE: + s.add_temp(p.temp) + + elif stype == CommonSamplerType.XTC: + s.add_xtc(p.xtc_probability, p.xtc_threshold, p.min_keep, p.seed) + + elif stype == CommonSamplerType.TOP_N_SIGMA: + s.add_top_n_sigma(p.top_n_sigma) + + elif stype == CommonSamplerType.ADAPTIVE_P: + s.add_adaptive_p(p.adaptive_target, p.adaptive_decay, p.seed) + + # --- 7. 
Final Distribution / Selection --- + # Mirostat overrides standard greedy/dist sampling + if p.mirostat == 1 and m: + s.add_mirostat(m.n_vocab(), p.seed, p.mirostat_tau, p.mirostat_eta, 100) + elif p.mirostat == 2: + s.add_mirostat_v2(p.seed, p.mirostat_tau, p.mirostat_eta) else: - if self.params.mirostat == 1: - mirostat_m = 100 - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - mirostat_m, - ctypes.pointer(self.mirostat_mu), - ) - elif self.params.mirostat == 2: - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat_v2( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - ctypes.pointer(self.mirostat_mu), - ) + # If not using Mirostat, use Greedy (if temp=0) or Random Distribution + if p.temp == 0: + s.add_greedy() else: - min_keep = max(1, self.params.n_probs) - ctx_main.sample_top_k( - token_data_array, self.params.top_k, min_keep=min_keep - ) - ctx_main.sample_typical( - token_data_array, self.params.typical_p, min_keep=min_keep - ) - ctx_main.sample_top_p( - token_data_array, self.params.top_p, min_keep=min_keep - ) - ctx_main.sample_min_p( - token_data_array, self.params.min_p, min_keep=min_keep - ) - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token(token_data_array) - return id + s.add_dist(p.seed) - def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): - if apply_grammar and self.grammar is not None: - ctx_main.grammar_accept_token(self.grammar, id) - self.prev.append(id) + def reset(self): + """ + Resets the internal state of all samplers in the chain. + """ + self.sampler.reset() + self.prev = [] + def cp(self) -> 'LlamaSamplingContext': + """ + Creates a deep copy of the sampling context. + This clones the sampler chain state + """ + # 1. 
Clone the sampler chain using llama_sampler_clone + new_sampler = self.sampler.clone() + + # 2. Create new context wrapping the cloned chain + new_ctx = LlamaSamplingContext( + self.params, + self.model, + _existing_sampler=new_sampler + ) -from typing import List, Callable, Optional, Union -import ctypes -import llama_cpp + # 3. Copy Python-side history + new_ctx.prev = self.prev.copy() + + return new_ctx + + def accept(self, token: int): + """ + Accepts a token into the sampler state. + MUST be called after sampling to update repetition penalties, grammar state, etc. + + Args: + token: The token ID selected. + """ + self.sampler.accept(token) + self.prev.append(token) + + def sample( + self, + ctx: LlamaContext, + idx: int = -1, + ) -> int: + """ + Samples a token from the model's current logits. + + Args: + ctx_main: The context containing the logits. + idx: The batch index to sample from (defaults to last token: -1). + """ + return self.sampler.sample(ctx, idx) + + # --- Utilities --- + + def last(self) -> Optional[int]: + """Returns the last sampled token.""" + if len(self.prev) > 0: + return self.prev[-1] + else: + return None + + def prev_str(self, ctx_main: LlamaContext, n: int) -> str: + """ + Decodes the last n tokens into a string. + Useful for debugging what the sampler chain "sees" as context. 
+ """ + if not self.prev: + return "" + # Get the last n tokens + last_tokens = self.prev[-n:] + # Use the model linked to the context to detokenize + return ctx_main.model.detokenize(last_tokens).decode("utf-8", errors="replace") class CustomSampler: def __init__( - self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] + self, + apply_func: Callable[[llama_cpp.llama_token_data_array], None], + name: str = "custom", + **kwargs ): self.apply_func = apply_func + self.name_bytes = name.encode('utf-8') + + def _cb_name(smpl): + return self.name_bytes - def apply_wrapper( - sampler: llama_cpp.llama_sampler_p, - cur_p: llama_cpp.llama_token_data_array_p, - ): - self.apply_func(cur_p) + def _cb_apply(smpl, cur_p): + if cur_p and self.apply_func: + self.apply_func(cur_p.contents) - def free_wrapper(sampler: llama_cpp.llama_sampler_p): - pass + self._cb_accept = kwargs.get('accept_func') or (lambda smpl, token: None) + self._cb_reset = kwargs.get('reset_func') or (lambda smpl: None) + self._cb_free = kwargs.get('free_func') or (lambda smpl: None) + self._cb_clone = kwargs.get('clone_func') or (lambda smpl: None) - sampler_i = llama_cpp.llama_sampler_i() - sampler_i.apply = llama_cpp.llama_sampler_i_apply(apply_wrapper) - self._apply_wrapper_ref = apply_wrapper + self.llama_sampler_i = llama_cpp.llama_sampler_i() - sampler_i.name = llama_cpp.llama_sampler_i_name(0) - sampler_i.accept = llama_cpp.llama_sampler_i_accept(0) - sampler_i.reset = llama_cpp.llama_sampler_i_reset(0) - sampler_i.clone = llama_cpp.llama_sampler_i_clone(0) - sampler_i.free = llama_cpp.llama_sampler_i_free(0) + self.llama_sampler_i.name = llama_cpp.llama_sampler_name_fn(_cb_name) + self.llama_sampler_i.accept = llama_cpp.llama_sampler_accept_fn(lambda s, t: self._cb_accept(s, t)) + self.llama_sampler_i.apply = llama_cpp.llama_sampler_apply_fn(_cb_apply) + self.llama_sampler_i.reset = llama_cpp.llama_sampler_reset_fn(lambda s: self._cb_reset(s)) + self.llama_sampler_i.clone = 
llama_cpp.llama_sampler_clone_fn(lambda s: self._cb_clone(s)) + self.llama_sampler_i.free = llama_cpp.llama_sampler_free_fn(lambda s: self._cb_free(s)) - self.sampler = llama_cpp.llama_sampler_init(ctypes.pointer(sampler_i), None) + self.llama_sampler_i.backend_init = ctypes.cast(0, llama_cpp.llama_sampler_backend_init_fn) + self.llama_sampler_i.backend_accept = ctypes.cast(0, llama_cpp.llama_sampler_backend_accept_fn) + self.llama_sampler_i.backend_apply = ctypes.cast(0, llama_cpp.llama_sampler_backend_apply_fn) + self.llama_sampler_i.backend_set_input = ctypes.cast(0, llama_cpp.llama_sampler_backend_set_input_fn) + + self.sampler_p = llama_cpp.llama_sampler_init(ctypes.pointer(self.llama_sampler_i), None) def get_sampler(self) -> llama_cpp.llama_sampler_p: - return self.sampler + return self.sampler_p class LlamaSampler: - def __init__(self): - params = llama_cpp.llama_sampler_chain_params() - self.sampler = llama_cpp.llama_sampler_chain_init(params) + def __init__(self, existing_sampler_p: Optional[llama_cpp.llama_sampler_p] = None): + if existing_sampler_p: + self.sampler = existing_sampler_p + else: + # Initialize new chain + params = llama_cpp.llama_sampler_chain_params() + params.no_perf = False + self.sampler = llama_cpp.llama_sampler_chain_init(params) + self.samplers: List[llama_cpp.llama_sampler_p] = [] - self.custom_samplers: List[Tuple[int, CustomSampler]] = [] + self.custom_samplers: List["CustomSampler"] = [] + + def _add_sampler(self, sampler: llama_cpp.llama_sampler_p): + if not sampler: + raise RuntimeError("Failed to initialize sampler") + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + self.samplers.append(sampler) + + # --- Core Sampling Methods --- + + def accept(self, token: int): + """ + Updates the sampler state (e.g. repetition penalty history). 
+ """ + assert self.sampler is not None + llama_cpp.llama_sampler_accept(self.sampler, token) + + def clone(self) -> 'LlamaSampler': + """ + Clones the sampler chain and its internal state. + """ + if not self.sampler: + raise RuntimeError("Cannot clone: sampler is closed or not initialized") + + # Call C-level llama.cpp clone + new_sampler_p = llama_cpp.llama_sampler_clone(self.sampler) + if not new_sampler_p: + raise RuntimeError("llama_sampler_clone failed") + + return LlamaSampler(existing_sampler_p=new_sampler_p) + + def sample(self, ctx: LlamaContext, idx: int = -1) -> int: + """ + Sample and accept a token from the idx-th output of the last evaluation + """ + assert self.sampler is not None + assert ctx.ctx is not None + return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx) + + def reset(self): + """ + Resets the sampler state. + """ + assert self.sampler is not None + llama_cpp.llama_sampler_reset(self.sampler) + + def close(self): + if self.sampler: + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + for i, _ in reversed(self.custom_samplers): + llama_cpp.llama_sampler_chain_remove(self.sampler, i) + llama_cpp.llama_sampler_free(self.sampler) + self.sampler = None + self.samplers.clear() + self.custom_samplers.clear() + + def __del__(self): + self.close() + + # --- Specific Samplers (aligning with llama-sampler.cpp) --- def add_greedy(self): - sampler = llama_cpp.llama_sampler_init_greedy() - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_greedy()) def add_dist(self, seed: int): - sampler = llama_cpp.llama_sampler_init_dist(seed) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_dist(seed)) def add_top_k(self, k: int): - sampler = llama_cpp.llama_sampler_init_top_k(k) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_top_k(k)) def add_top_p(self, p: float, min_keep: int): - sampler = llama_cpp.llama_sampler_init_top_p(p, 
min_keep) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_top_p(p, min_keep)) def add_min_p(self, p: float, min_keep: int): - sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_min_p(p, min_keep)) def add_typical(self, p: float, min_keep: int): - sampler = llama_cpp.llama_sampler_init_typical(p, min_keep) - self._add_sampler(sampler) - - def add_xtc(self, p: float, t: float, min_keep: int, seed: int): - sampler = llama_cpp.llama_sampler_init_xtc(p, t, min_keep, seed) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_typical(p, min_keep)) def add_temp(self, temp: float): - sampler = llama_cpp.llama_sampler_init_temp(temp) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_temp(temp)) def add_temp_ext(self, t: float, delta: float, exponent: float): - sampler = llama_cpp.llama_sampler_init_temp_ext(t, delta, exponent) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_temp_ext(t, delta, exponent)) + + def add_xtc(self, p: float, t: float, min_keep: int, seed: int): + self._add_sampler(llama_cpp.llama_sampler_init_xtc(p, t, min_keep, seed)) def add_top_n_sigma(self, n: float): - sampler = llama_cpp.llama_sampler_init_top_n_sigma(n) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_top_n_sigma(n)) def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int): - sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m)) def add_mirostat_v2(self, seed: int, tau: float, eta: float): - sampler = llama_cpp.llama_sampler_init_mirostat_v2(seed, tau, eta) - self._add_sampler(sampler) - - def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): - sampler = llama_cpp.llama_sampler_init_grammar( - 
model.vocab, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") - ) - self._add_sampler(sampler) - - def convert_list_str_to_char_array_ptr(self, str_list: List[str]): - """ - Converts a list of strings to a char** array for C interop, and returns two values: - the char** array and the number of bytes in the list. - - Args: - str_list: List of string objects. + self._add_sampler(llama_cpp.llama_sampler_init_mirostat_v2(seed, tau, eta)) - Returns: - - A ctypes pointer to a char** array. - - The number of strings in the input list. - """ - # Encode strings to bytes - byte_list = [s.encode('utf-8') for s in str_list] - # Calculate the number of breakers - num_byte_list= len(byte_list) - # Define the type of a char pointer - char_ptr_type = ctypes.POINTER(ctypes.c_char) - # Define the type of an array of char pointers - char_ptr_array_type = char_ptr_type * num_byte_list - - # Allocate memory for the array of char pointers - char_ptr_array = char_ptr_array_type() - - # Populate the array with pointers to the byte strings - for i, byte_string in enumerate(byte_list): - # Create a null-terminated C-style string buffer - c_char_array = ctypes.create_string_buffer(byte_string) - # Cast the buffer to a char pointer and assign it to the array - char_ptr_array[i] = ctypes.cast(c_char_array, char_ptr_type) - - char_array_ptr = ctypes.cast(char_ptr_array, ctypes.POINTER(char_ptr_type)) - - # Return the char** pointer and the number of strings - return char_array_ptr, num_byte_list - - def add_grammar_lazy_patterns( - self, - model: LlamaModel, - grammar: LlamaGrammar, - num_trigger_patterns: int, - trigger_tokens:list[llama_cpp.llama_token], - num_trigger_tokens: int, - trigger_patterns: list[str]=[] - ): - trigger_patterns_char_array_ptr, num_trigger_patterns = self.convert_list_str_to_char_array_ptr(trigger_patterns) - sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns( - model.vocab, - grammar._grammar.encode("utf-8"), - grammar._root.encode("utf-8"), 
- trigger_patterns_char_array_ptr, - num_trigger_patterns, - trigger_tokens, - num_trigger_tokens - ) - self._add_sampler(sampler) - - - def add_penalties( - self, - n_vocab: int, - special_eos_id: int, - linefeed_id: int, - penalty_last_n: int, - penalty_repeat: float, - penalty_freq: float, - penalty_present: float, - penalize_nl: bool, - ignore_eos: bool, - ): - sampler = llama_cpp.llama_sampler_init_penalties( - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - ) - self._add_sampler(sampler) - - def add_dry( + def add_grammar( self, model: LlamaModel, - dry_multiplier: float, - dry_base: float, - dry_allowed_length: int, - dry_penalty_last_n: int, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"] + grammar: LlamaGrammar, + lazy: bool = False, + triggers: List[Union[str, int]] = None ): + """ + Adds a grammar sampler. + Args: + grammar_str: The BNF grammar string. + root: The root rule name. + lazy: If True, enables lazy evaluation. + triggers: List of trigger words (str) or tokens (int) for lazy evaluation. 
+ """ + c_grammar_str = grammar._grammar.encode('utf-8') + c_root = grammar._root.encode('utf-8') - dry_seq_breakers_char_array_ptr, num_seq_breakers = self.convert_list_str_to_char_array_ptr(dry_seq_breakers) - - sampler = llama_cpp.llama_sampler_init_dry( + if not lazy: + self._add_sampler(llama_cpp.llama_sampler_init_grammar( + model.vocab, c_grammar_str, c_root + )) + else: + trigger_patterns = [] + trigger_tokens = [] + + if triggers: + for t in triggers: + if isinstance(t, str): + trigger_patterns.append(t) + elif isinstance(t, int): + trigger_tokens.append(t) + + c_trigger_patterns = (ctypes.c_char_p * len(trigger_patterns))() + c_trigger_patterns[:] = [w.encode('utf-8') for w in trigger_patterns] + + c_trigger_tokens = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens) + + self._add_sampler(llama_cpp.llama_sampler_init_grammar_lazy_patterns( + model.vocab, + c_grammar_str, + c_root, + c_trigger_patterns, + len(trigger_patterns), + c_trigger_tokens, + len(trigger_tokens) + )) + + def add_penalties(self, penalty_last_n: int, penalty_repeat: float, penalty_freq: float, penalty_present: float): + self._add_sampler(llama_cpp.llama_sampler_init_penalties(penalty_last_n, penalty_repeat, penalty_freq, penalty_present)) + + def add_dry(self, model: LlamaModel, multiplier: float, base: float, allowed_len: int, last_n: int, breakers: List[str]): + """DRY (Don't Repeat Yourself) sampler.""" + # Convert python string list to C char** + c_breakers = (ctypes.c_char_p * len(breakers))() + c_breakers[:] = [b.encode('utf-8') for b in breakers] + + self._add_sampler(llama_cpp.llama_sampler_init_dry( model.vocab, model.n_ctx_train(), - dry_multiplier, - dry_base, - dry_allowed_length, - dry_penalty_last_n, - dry_seq_breakers_char_array_ptr, - num_seq_breakers - ) - self._add_sampler(sampler) - - def add_adaptive_p( - self, - target: float, - decay: float, - seed: int, - ): - sampler = llama_cpp.llama_sampler_init_adaptive_p( - target, - decay, - seed - ) - 
self._add_sampler(sampler) + multiplier, + base, + allowed_len, + last_n, + c_breakers, + len(breakers) + )) - def add_logit_bias( - self, n_vocab: int, logit_bias: Dict[int, float] - ): - # Construct a C array to store the contents of the logit_bias dictionary - logit_bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))() + def add_logit_bias(self, n_vocab: int, bias_dict: Dict[int, float]): + """Logit bias sampler.""" + if not bias_dict: return - for i, (token, bias) in enumerate(logit_bias.items()): - logit_bias_array[i].token = token - logit_bias_array[i].bias = bias + c_bias = (llama_cpp.llama_logit_bias * len(bias_dict))() + for i, (token, bias) in enumerate(bias_dict.items()): + c_bias[i].token = token + c_bias[i].bias = bias - sampler = llama_cpp.llama_sampler_init_logit_bias(n_vocab, len(logit_bias), logit_bias_array) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_logit_bias(n_vocab, len(bias_dict), c_bias)) def add_infill(self, model: LlamaModel): - sampler = llama_cpp.llama_sampler_init_infill(model.vocab) - self._add_sampler(sampler) + self._add_sampler(llama_cpp.llama_sampler_init_infill(model.vocab)) + + def add_adaptive_p(self, target: float, decay: float, seed: int): + self._add_sampler(llama_cpp.llama_sampler_init_adaptive_p(target, decay, seed)) def add_custom( self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] @@ -1341,29 +1313,6 @@ def add_custom( [llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler] ) - def _add_sampler(self, sampler: llama_cpp.llama_sampler_p): - assert self.sampler is not None - llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - self.samplers.append(sampler) - def get_seed(self) -> int: assert self.sampler is not None return llama_cpp.llama_sampler_get_seed(self.sampler) - - def sample(self, ctx: LlamaContext, idx: ctypes.c_int32) -> ctypes.c_int32: - assert self.sampler is not None - assert ctx.ctx is not None - return 
llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx) - - def close(self): - if self.sampler: - # NOTE: Must remove custom samplers before free or llama.cpp will try to free them - for i, _ in reversed(self.custom_samplers): - llama_cpp.llama_sampler_chain_remove(self.sampler, i) - llama_cpp.llama_sampler_free(self.sampler) - self.sampler = None - self.samplers.clear() - self.custom_samplers.clear() - - def __del__(self): - self.close() diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 52bb415e21..2e1d2b4755 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -775,7 +775,6 @@ def _init_sampler( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - penalize_nl: bool = True, adaptive_target : float = -1.0, adaptive_decay : float = 0.9, use_adaptive_p: bool = False, @@ -828,15 +827,10 @@ def _init_sampler( if use_infill: sampler.add_infill(self._model) sampler.add_penalties( - n_vocab=self._n_vocab, - special_eos_id=self._token_eos, - linefeed_id=self._token_nl, penalty_last_n=self.last_n_tokens_size, penalty_repeat=repeat_penalty, penalty_freq=frequency_penalty, - penalty_present=presence_penalty, - penalize_nl=penalize_nl, - ignore_eos=False, + penalty_present=presence_penalty ) if use_adaptive_p: # only if user explicitly included adaptive-p sampler @@ -867,7 +861,6 @@ def sample( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - penalize_nl: bool = True, adaptive_target : float = -1.0, adaptive_decay : float = 0.9, use_adaptive_p: bool = False, @@ -914,7 +907,6 @@ def sample( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, - penalize_nl=penalize_nl, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, use_adaptive_p=use_adaptive_p, @@ -955,7 +947,6 @@ def generate( dry_allowed_length: int = 2, dry_penalty_last_n:int = 0, dry_seq_breakers: list[str] 
= ["\n", ":", "\"", "*"], - penalize_nl: bool = True, adaptive_target : float = -1.0, adaptive_decay : float = 0.9, use_adaptive_p: bool = False, @@ -1006,7 +997,6 @@ def generate( dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, - penalize_nl=penalize_nl, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, use_adaptive_p=use_adaptive_p, @@ -1068,7 +1058,6 @@ def generate( logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, - penalize_nl=penalize_nl, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, use_adaptive_p=use_adaptive_p, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 90fc5e9e83..0cca6d418e 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -4457,7 +4457,7 @@ def llama_sampler_init_penalties( ctypes.c_float, ctypes.c_int32, ctypes.c_int32, - ctypes.POINTER(ctypes.POINTER(ctypes.c_char)), + ctypes.POINTER(ctypes.c_char_p), ctypes.c_size_t, ], llama_sampler_p_ctypes, @@ -4469,7 +4469,7 @@ def llama_sampler_init_dry( dry_base: float, dry_allowed_length: int, dry_penalty_last_n: int, - seq_breakers: CtypesArray[bytes], + seq_breakers: CtypesArray[ctypes.c_char_p], num_breakers: int, /, ) -> llama_sampler_p: From da6fb3e8bdf1ee338c072a38484ee241570d360f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 8 Feb 2026 02:51:17 +0800 Subject: [PATCH 138/518] Add grammar_lazy param into llama.py --- llama_cpp/llama.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2e1d2b4755..cb4c375dc4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -782,6 +782,7 @@ def _init_sampler( logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, + grammar_lazy: bool = False ): sampler = internals.LlamaSampler() @@ -789,7 +790,7 @@ def _init_sampler( 
sampler.add_logit_bias(self.n_vocab(), logit_bias) if grammar is not None: - sampler.add_grammar(self._model, grammar) + sampler.add_grammar(self._model, grammar, grammar_lazy) if temp < 0.0: sampler.add_dist(self._seed) @@ -868,6 +869,7 @@ def sample( logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, + grammar_lazy: bool = False, idx: Optional[int] = None, ): """Sample a token from the model. @@ -914,6 +916,7 @@ def sample( logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, + grammar_lazy=grammar_lazy ) ridx = idx - self.n_tokens if idx is not None else -1 @@ -955,6 +958,7 @@ def generate( logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, grammar: Optional[LlamaGrammar] = None, + grammar_lazy :bool = False, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. @@ -1004,6 +1008,7 @@ def generate( logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, + grammar_lazy=grammar_lazy ) # Check for kv cache prefix match @@ -1058,6 +1063,7 @@ def generate( logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, + grammar_lazy=grammar_lazy, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, use_adaptive_p=use_adaptive_p, @@ -1302,6 +1308,7 @@ def _create_completion( logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, + grammar_lazy: bool = False ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: @@ -1503,6 +1510,7 @@ def logit_bias_processor( logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, + grammar_lazy=grammar_lazy, ): if llama_cpp.llama_token_is_eog(self._model.vocab, token): text = self.detokenize(completion_tokens, 
prev_tokens=prompt_tokens) @@ -1945,6 +1953,7 @@ def create_completion( logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, + grammar_lazy: bool = False, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1985,6 +1994,7 @@ def create_completion( logit_bias: A logit bias to use. logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. + grammar_lazy: If True, enables lazy evaluation. Raises: ValueError: If the requested tokens exceed the context window. @@ -2030,6 +2040,7 @@ def create_completion( logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, + grammar_lazy=grammar_lazy, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -2075,6 +2086,7 @@ def __call__( logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, + grammar_lazy: bool = False, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -2115,6 +2127,7 @@ def __call__( logit_bias: A logit bias to use. logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. + grammar_lazy: If True, enables lazy evaluation. Raises: ValueError: If the requested tokens exceed the context window. 
@@ -2160,6 +2173,7 @@ def __call__( logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, + grammar_lazy=grammar_lazy, ) def create_chat_completion( @@ -2201,6 +2215,7 @@ def create_chat_completion( logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, + grammar_lazy: bool = False, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, ) -> Union[ @@ -2246,6 +2261,7 @@ def create_chat_completion( logit_bias: A logit bias to use. logits_processor: A list of logits processors to use. grammar: A grammar to use. + grammar_lazy: If True, enables lazy evaluation. Returns: Generated chat completion or a stream of chat completion chunks. @@ -2296,6 +2312,7 @@ def create_chat_completion( logit_bias=logit_bias, logits_processor=logits_processor, grammar=grammar, + grammar_lazy=grammar_lazy, ) def create_chat_completion_openai_v1( From acb7efa09293cffa5912242c372b715555e2a884 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 8 Feb 2026 12:06:35 +0800 Subject: [PATCH 139/518] build: expand non-AVX flags for Basic build compatibility Updated the 'Basic' profile to include missing SIMD flags. Specifically: - Added -DGGML_NATIVE=OFF to prevent host-leaked optimizations. - Added AVX-VNNI and AVX512 subsets (VBMI, VNNI, BF16). This guarantees maximum compatibility for targets without AVX support. 
--- .github/workflows/build-wheels-cu124-cu126-win.yml | 4 ++-- .github/workflows/build-wheels-cu124-linux.yml | 2 +- .github/workflows/build-wheels-cu126-linux.yml | 2 +- .github/workflows/build-wheels-cu128-linux.yml | 2 +- .github/workflows/build-wheels-cu128-win.yml | 4 ++-- .github/workflows/build-wheels-cu130-linux.yml | 2 +- .github/workflows/build-wheels-cu130-win.yml | 6 +++--- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-cu126-win.yml b/.github/workflows/build-wheels-cu124-cu126-win.yml index 9caf8157a5..a36c5b874a 100644 --- a/.github/workflows/build-wheels-cu124-cu126-win.yml +++ b/.github/workflows/build-wheels-cu124-cu126-win.yml @@ -78,14 +78,14 @@ jobs: $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' # } # Basic options for compiling without AVX instructions if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' } python -m build --wheel diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 65f24a19b2..668a26fde3 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -80,7 +80,7 @@ jobs: # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} 
-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 765d557c23..46ea5a50ee 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -80,7 +80,7 @@ jobs: # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index 218523f423..e4e4782fd4 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -80,7 +80,7 @@ jobs: # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git 
a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index d280622dd7..9fa5e27358 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -78,14 +78,14 @@ jobs: $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' # } # Basic options for compiling without AVX instructions if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' } python -m build --wheel diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index f8703fc656..6907c518ad 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -80,7 +80,7 @@ jobs: # fi # Basic options for compiling without AVX instructions if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" + CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" fi # Export CMAKE_ARGS environment variable so the python -m build command can use it diff --git a/.github/workflows/build-wheels-cu130-win.yml 
b/.github/workflows/build-wheels-cu130-win.yml index 110fd70e05..e7e70b5989 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -78,14 +78,14 @@ jobs: $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' } if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' } # if ($env:AVXVER -eq 'AVX512') { # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' # } # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + if ($env:AVXVER -eq 'Basic') {' + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' } python -m build --wheel From b69cbc253e0453b8aa1d1807d8339aa04b772559 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 8 Feb 2026 12:37:11 +0800 Subject: [PATCH 140/518] Bump version to 0.3.24 --- CHANGELOG.md | 31 +++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73dc9ec1c9..8217ea64ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,37 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.24] +- feat: [Refactor sampling infrastructure to use llama.cpp sampler chain API](https://github.com/JamePeng/llama-cpp-python/commit/1df39b422890db55cb9f6de43cb792a26921752e) + - LlamaContext: Remove obsolete manual sampling methods. 
+ + - LlamaSampler: Wrap C++ sampler chain; add support for DRY, XTC, Adaptive-P, and lazy grammar.`add_grammar` merged the processing branches for regular grammar and lazy grammar. + + - LlamaSamplingContext: Update to build and manage sampler chains instead of manual logic. + + - CustomSampler: Rewrite for proper C-struct lifecycle management and ABI compatibility. + + - Optimizations: Simplified redundant handling and variable usage. + +- feat: Optimize the method definition of class llama_sampler_i and CtypesPointer definition + +- feat: [refactor LlamaSamplingParams class](https://github.com/JamePeng/llama-cpp-python/commit/2ebd4808c132c6c4ab561c60b25145bee7453999) + - base on llama.cpp/common/common.h + - support more sampler params + +- feat: [implement generative reranking with chat template support](https://github.com/JamePeng/llama-cpp-python/commit/79500ec2f5ec6ba4c83de21d61cb5a420271b8e2) + - Chat Template Support: Support for rerank templates has been introduced (via `llama_model_chat_template(model, b"rerank")`), which can automatically populate the query and document into a specific format. 
+ - Now support Qwen3-Reranker series model(Non-Vision) + - Update README.md for Qwen3-Reranker + +- feat: Update README.md for python 3.14 and HIP (ROCm) guide + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/b83111815e9a79949257e9d4b087206b320a3063](https://github.com/ggml-org/llama.cpp/commit/b83111815e9a79949257e9d4b087206b320a3063) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260208 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/9b985b75105ea9f5e7f2f5e7988a1149f233af2f...acb7efa09293cffa5912242c372b715555e2a884 + ## [0.3.23] - feat: [Implement MiniCPMv45ChatHandler for MiniCPM-V 4.5 with multi-image tracking](https://github.com/JamePeng/llama-cpp-python/commit/83d5839b136a62a2ccac3feabe4eec1dbced961b) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index eb37da2093..c48e6fcec0 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.23" +__version__ = "0.3.24" From 4ab182382b87bbbba4fb05ff184b557414740103 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 8 Feb 2026 23:45:46 +0800 Subject: [PATCH 141/518] fix cu130 workflow typo --- .github/workflows/build-wheels-cu130-win.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index e7e70b5989..d572e65efc 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -84,7 +84,7 @@ jobs: # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' # } # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') {' + if ($env:AVXVER -eq 'Basic') { $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' } python -m build --wheel 
From 1e6094a327f0fb9dc35d52f84d8ebabc1faa1e95 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 9 Feb 2026 16:00:26 +0800 Subject: [PATCH 142/518] Refactor Llama class to use new LlamaSampler chain API from _internals This commit refactors the high-level Llama class to fully utilize the new C++ `llama_sampler` chain architecture via `LlamaSamplingContext`. Key changes: - Replaced manual sampling logic and obsolete `_init_sampler` with `LlamaSamplingContext`. - Updated `sample()` and `generate()` to support the full suite of modern sampling strategies (DRY, XTC, Adaptive-P, Infill, etc.). - Added new sampling parameters to all generation methods (`create_completion`, `create_chat_completion`, `__call__`): - `dynatemp_range`, `dynatemp_exponent` (Dynamic Temperature) - `min_keep` - Refactored `logits_processor` handling to use `CustomSampler` adapter for better performance and C++ interop. - Improved sampling state management (e.g., repetition penalties) by persisting `_sampling_ctx` during generation. - Removed manual `logit_bias` processing in Python; now delegated to the underlying sampler chain. 
--- examples/low_level_api/common.py | 8 +- .../low_level_api/low_level_api_chat_cpp.py | 8 +- .../low_level_api/low_level_api_llama_cpp.py | 4 +- llama_cpp/_internals.py | 13 +- llama_cpp/llama.py | 472 +++++++++--------- llama_cpp/llama_chat_format.py | 54 +- llama_cpp/server/types.py | 6 +- tests/test_llama.py | 69 ++- 8 files changed, 312 insertions(+), 322 deletions(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 744ebd411b..8adb2923cc 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -29,7 +29,7 @@ class GptParams: repeat_penalty: float = 1.10 repeat_last_n: int = 64 frequency_penalty: float = 0.0 - presence_penalty: float = 0.0 + present_penalty: float = 0.0 mirostat: int = 0 mirostat_tau: float = 5.0 mirostat_eta: float = 0.1 @@ -184,11 +184,11 @@ def gpt_params_parse(argv=None): dest="frequency_penalty", ) parser.add_argument( - "--presence_penalty", + "--present_penalty", type=float, default=0.0, - help="repeat alpha presence penalty (0.0 = disabled)", - dest="presence_penalty", + help="repeat alpha present penalty (0.0 = disabled)", + dest="present_penalty", ) parser.add_argument( "--mirostat", diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 1f71741b20..1f4f5b3e79 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -273,7 +273,7 @@ def __init__(self, params: GptParams) -> None: print( f"""sampling: repeat_last_n = {self.params.repeat_last_n},\ repeat_penalty = {self.params.repeat_penalty},\ -presence_penalty = {self.params.presence_penalty},\ +present_penalty = {self.params.present_penalty},\ frequency_penalty = {self.params.frequency_penalty},\ top_k = {self.params.top_k},\ top_n_sigma = {self.params.top_n_sigma},\ @@ -471,13 +471,13 @@ def generate(self): penalty_last_n=last_n_repeat, 
penalty_repeat=llama_cpp.c_float(self.params.repeat_penalty), penalty_freq=llama_cpp.c_float(self.params.frequency_penalty), - penalty_present=llama_cpp.c_float(self.params.presence_penalty), + penalty_present=llama_cpp.c_float(self.params.present_penalty), ) # NOT PRESENT IN CURRENT VERSION ? - # llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p, + # llama_cpp.llama_sample_frequency_and_present_penalty(self.ctx, candidates_p, # _arr, - # last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) + # last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.present_penalty)) if self.params.temp <= 0: # Greedy sampling diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index ba3545771d..f34510fa39 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -55,7 +55,7 @@ last_n_repeat = 64 repeat_penalty = 1 frequency_penalty = 0.0 -presence_penalty = 0.0 +present_penalty = 0.0 while remaining_tokens > 0: if len(embd) > 0: @@ -90,7 +90,7 @@ penalty_last_n=last_n_repeat, penalty_repeat=repeat_penalty, penalty_freq=frequency_penalty, - penalty_present=presence_penalty, + penalty_present=present_penalty, ) llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index a2cc4fee16..c133bdebe6 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -826,7 +826,7 @@ class LlamaSamplingParams: CommonSamplerType.PENALTIES, CommonSamplerType.DRY, CommonSamplerType.TOP_N_SIGMA, - CommonSamplerType.CUSTOM, + # CommonSamplerType.CUSTOM, # When logits_processor is used, CommonSamplerType.CUSTOM is automatically injected into the samplers. 
CommonSamplerType.TOP_K, CommonSamplerType.TYPICAL_P, CommonSamplerType.TOP_P, @@ -851,7 +851,7 @@ def has_logit_bias(self) -> bool: def print_params(self) -> str: result = ( f"\trepeat_last_n = {self.penalty_last_n}, repeat_penalty = {self.penalty_repeat:.3f}, " - f"frequency_penalty = {self.penalty_freq:.3f}, presence_penalty = {self.penalty_present:.3f}\n" + f"frequency_penalty = {self.penalty_freq:.3f}, present_penalty = {self.penalty_present:.3f}\n" f"\tdry_multiplier = {self.dry_multiplier:.3f}, dry_base = {self.dry_base:.3f}, " f"dry_allowed_length = {self.dry_allowed_length}, dry_penalty_last_n = {self.dry_penalty_last_n}\n" @@ -917,12 +917,9 @@ def _build_sampler_chain(self): # --- 3. Grammar / Syntax Constraints --- if p.grammar and m: - # Use "root" as default rule name if not specified - root_rule = "root" s.add_grammar( model=m, grammar_str=p.grammar, - root=root_rule, lazy=p.grammar_lazy, triggers=p.grammar_triggers ) @@ -1220,7 +1217,7 @@ def add_mirostat_v2(self, seed: int, tau: float, eta: float): def add_grammar( self, model: LlamaModel, - grammar: LlamaGrammar, + grammar_str: str, lazy: bool = False, triggers: List[Union[str, int]] = None ): @@ -1232,8 +1229,8 @@ def add_grammar( lazy: If True, enables lazy evaluation. triggers: List of trigger words (str) or tokens (int) for lazy evaluation. 
""" - c_grammar_str = grammar._grammar.encode('utf-8') - c_root = grammar._root.encode('utf-8') + c_grammar_str = grammar_str.encode('utf-8') + c_root = "root".encode('utf-8') if not lazy: self._add_sampler(llama_cpp.llama_sampler_init_grammar( diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index cb4c375dc4..749e008fac 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -49,6 +49,12 @@ import numpy.typing as npt import llama_cpp._internals as internals +from ._internals import ( + LlamaSamplingContext, + LlamaSamplingParams, + CommonSamplerType, + CustomSampler, +) from ._logger import set_verbose from ._utils import suppress_stdout_stderr @@ -616,7 +622,7 @@ def free_lora_adapter(): f"Using fallback chat format: {self.chat_format}", file=sys.stderr ) - self._sampler = None + self._sampling_ctx: Optional[LlamaSamplingContext] = None @property def ctx(self) -> llama_cpp.llama_context_p: @@ -754,192 +760,160 @@ def eval(self, tokens: Sequence[int]): current_pos += n_batch_tokens self.n_tokens = current_pos - def _init_sampler( - self, - top_k: int = 40, - top_n_sigma: float = -1.00, - top_p: float = 0.95, - min_p: float = 0.05, - typical_p: float = 1.0, - temp: float = 0.80, - repeat_penalty: float = 1.0, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, - use_infill: bool = False, - logit_bias: Optional[Dict[int, float]] = None, - logits_processor: Optional[LogitsProcessorList] = None, - grammar: Optional[LlamaGrammar] = None, - grammar_lazy: bool = False - ): - sampler = internals.LlamaSampler() - - if logit_bias is not 
None: - sampler.add_logit_bias(self.n_vocab(), logit_bias) - - if grammar is not None: - sampler.add_grammar(self._model, grammar, grammar_lazy) - - if temp < 0.0: - sampler.add_dist(self._seed) - elif temp == 0.0: - sampler.add_greedy() - else: - if mirostat_mode == 1: - sampler.add_temp(temp) - mirostat_m = 100 - sampler.add_mirostat( - self._n_vocab, - self._seed, - mirostat_tau, - mirostat_eta, - mirostat_m, - ) - elif mirostat_mode == 2: - sampler.add_temp(temp) - sampler.add_mirostat_v2( - self._seed, - mirostat_tau, - mirostat_eta, - ) - else: - n_probs = 0 - min_keep = max(1, n_probs) - sampler.add_dry(self._model, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, dry_seq_breakers) - sampler.add_top_k(top_k) - sampler.add_top_p(top_p, min_keep) - sampler.add_top_n_sigma(top_n_sigma) - sampler.add_min_p(min_p, min_keep) - sampler.add_xtc(xtc_probability, xtc_threshold, min_keep, self._seed) - sampler.add_typical(typical_p, min_keep) - sampler.add_temp(temp) - if use_infill: - sampler.add_infill(self._model) - sampler.add_penalties( - penalty_last_n=self.last_n_tokens_size, - penalty_repeat=repeat_penalty, - penalty_freq=frequency_penalty, - penalty_present=presence_penalty - ) - if use_adaptive_p: - # only if user explicitly included adaptive-p sampler - sampler.add_adaptive_p(adaptive_target,adaptive_decay, self._seed) - else: - # default: sample from distribution - sampler.add_dist(self._seed) - return sampler + # Helper method: Convert dict logit_bias to List[llama_logit_bias] + def _convert_logit_bias(self, logit_bias: Optional[Dict[int, float]]) -> List[llama_cpp.llama_logit_bias]: + if not logit_bias: + return [] + bias_list = [] + for token, bias in logit_bias.items(): + lb = llama_cpp.llama_logit_bias() + lb.token = token + lb.bias = bias + bias_list.append(lb) + return bias_list def sample( self, - top_k: int = 40, - top_n_sigma: float = -1.00, - top_p: float = 0.95, - min_p: float = 0.05, - typical_p: float = 1.0, - temp: float = 
0.80, - repeat_penalty: float = 1.0, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, - use_infill: bool = False, - logit_bias: Optional[Dict[int, float]] = None, + # Core + top_k: int = 40, # <= 0 to use vocab size + top_p: float = 0.95, # 1.0 = disabled + min_p: float = 0.05, # 0.0 = disabled + typical_p: float = 1.0, # typical_p, 1.0 = disabled + temp: float = 0.80, # <= 0.0 to sample greedily, 0.0 to not output probabilities + # Dynamic Temp + dynatemp_range: float = 0.0, # 0.0 = disabled + dynatemp_exponent: float = 1.0, # controls how entropy maps to temperature in dynamic temperature sampler + # Common + top_n_sigma: float = -1.00, # -1.0 = disabled + min_keep: int = 0, # 0 = disabled, otherwise samplers should return at least min_keep tokens + # Penalties + penalty_last_n: int = 64, # last n tokens to penalize (0 = disable penalty, -1 = context size) + repeat_penalty: float = 1.0, # 1.0 = disabled + frequency_penalty: float = 0.0, # 0.0 = disabled + present_penalty: float = 0.0, # 0.0 = disabled + # Mirostat + mirostat_mode: int = 0, # 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + mirostat_eta: float = 0.1, # learning rate + mirostat_tau: float = 5.0, # target entropy + # XTC + xtc_probability: float = 0.0, # 0.0 = disabled + xtc_threshold: float = 0.1, # > 0.5 disables XTC + # DRY + dry_multiplier: float = 0.0, # 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + dry_base: float = 1.75, # 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + dry_allowed_length: int = 2, 
# tokens extending repetitions beyond this receive penalty + dry_penalty_last_n:int = -1, # how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], # default sequence breakers for DRY + # Adaptive + adaptive_target : float = -1.0, # select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) + adaptive_decay : float = 0.9, # EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99) + # Config + ignore_eos: bool = False, + # Extra + logit_bias: Optional[Dict[int, float]] = None, # logit biases to apply logits_processor: Optional[LogitsProcessorList] = None, - grammar: Optional[LlamaGrammar] = None, + grammar: Optional[LlamaGrammar] = None, # optional BNF-like grammar to constrain sampling grammar_lazy: bool = False, idx: Optional[int] = None, ): """Sample a token from the model. - - Args: - top_k: The top-k sampling parameter. - top_p: The top-p sampling parameter. - temp: The temperature parameter. - repeat_penalty: The repeat penalty parameter. - Returns: The sampled token. 
""" assert self.n_tokens > 0 - tmp_sampler = False + s_ctx = self._sampling_ctx - if self._sampler is None: - tmp_sampler = True - self._sampler = self._init_sampler( + if s_ctx is None: + params = LlamaSamplingParams( + # Core top_k=top_k, - top_n_sigma=top_n_sigma, top_p=top_p, min_p=min_p, typical_p=typical_p, temp=temp, - repeat_penalty=repeat_penalty, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - mirostat_mode=mirostat_mode, + top_n_sigma=top_n_sigma, + min_keep=min_keep, + + # Dynamic Temp + dynatemp_range=dynatemp_range, + dynatemp_exponent=dynatemp_exponent, + + # Penalties + penalty_last_n=penalty_last_n if penalty_last_n != 0 else self.last_n_tokens_size, + penalty_repeat=repeat_penalty, + penalty_freq=frequency_penalty, + penalty_present=present_penalty, + + # Mirostat + mirostat=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, + + # XTC xtc_probability=xtc_probability, + xtc_threshold=xtc_threshold, + + # DRY dry_multiplier=dry_multiplier, dry_base=dry_base, dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, + dry_sequence_breakers=dry_seq_breakers, + + # Adaptive adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, - use_infill=use_infill, - logit_bias=logit_bias, - logits_processor=logits_processor, - grammar=grammar, - grammar_lazy=grammar_lazy + + # Misc + ignore_eos=ignore_eos, + logit_bias=self._convert_logit_bias(logit_bias), + grammar=grammar.grammar if grammar else "", + grammar_lazy=grammar_lazy, ) + # LogitsProcessor Adapter + if logits_processor: + def adapter(token_data_array: llama_cpp.llama_token_data_array): + current_scores = self._scores[self.n_tokens - 1, :] + new_scores = logits_processor(self._input_ids, current_scores) + size = token_data_array.size + data_ptr = token_data_array.data + for i in range(size): + tid = data_ptr[i].id + if tid < 
len(new_scores): + data_ptr[i].logit = new_scores[tid] + + params.custom_samplers.append(CustomSampler(adapter)) + # When logits_processor is used, CommonSamplerType.CUSTOM is automatically injected into the samplers. + if CommonSamplerType.CUSTOM not in params.samplers: + params.samplers.insert(3, CommonSamplerType.CUSTOM) + + s_ctx = LlamaSamplingContext(params, self._model) + ridx = idx - self.n_tokens if idx is not None else -1 + assert s_ctx is not None - assert self.ctx is not None - token = self._sampler.sample(self._ctx, ridx) - if tmp_sampler: - self._sampler = None + token = s_ctx.sample(self._ctx, ridx) return token def generate( self, tokens: Sequence[int], top_k: int = 40, - top_n_sigma: float = -1.00, top_p: float = 0.95, min_p: float = 0.05, typical_p: float = 1.0, temp: float = 0.80, + dynatemp_range: float = 0.0, + dynatemp_exponent: float = 1.0, + top_n_sigma: float = -1.00, + min_keep: int = 0, + penalty_last_n: int = 64, repeat_penalty: float = 1.0, - reset: bool = True, frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, + reset: bool = True, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -948,12 +922,12 @@ def generate( dry_multiplier: float = 0.0, dry_base: float = 1.75, dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, + dry_penalty_last_n:int = -1, dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, + ignore_eos: bool = False, logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, @@ -980,37 +954,73 @@ def generate( The generated tokens. 
""" # Reset mirostat sampling - self._mirostat_mu = ctypes.c_float(2.0 * mirostat_tau) - self._sampler = self._init_sampler( + params = LlamaSamplingParams( + # Core Sampling top_k=top_k, - top_n_sigma=top_n_sigma, top_p=top_p, min_p=min_p, typical_p=typical_p, temp=temp, - repeat_penalty=repeat_penalty, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - mirostat_mode=mirostat_mode, + top_n_sigma=top_n_sigma, + min_keep=min_keep, + + # Dynamic Temperature + dynatemp_range=dynatemp_range, + dynatemp_exponent=dynatemp_exponent, + + # Penalties + penalty_last_n=penalty_last_n, + penalty_repeat=repeat_penalty, + penalty_freq=frequency_penalty, + penalty_present=present_penalty, + + # Mirostat + mirostat=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, + + # XTC xtc_probability=xtc_probability, + xtc_threshold=xtc_threshold, + + # DRY (Don't Repeat Yourself) dry_multiplier=dry_multiplier, dry_base=dry_base, dry_allowed_length=dry_allowed_length, dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, + dry_sequence_breakers=dry_seq_breakers, + + # Adaptive P adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, - use_infill=use_infill, - logit_bias=logit_bias, - logits_processor=logits_processor, - grammar=grammar, - grammar_lazy=grammar_lazy + + # Misc + ignore_eos=ignore_eos, + logit_bias=self._convert_logit_bias(logit_bias), + grammar=grammar._grammar if grammar else "", + grammar_lazy=grammar_lazy, ) + if logits_processor: + def adapter(token_data_array: llama_cpp.llama_token_data_array): + current_scores = self._scores[self.n_tokens - 1, :] + new_scores = logits_processor(self._input_ids, current_scores) + + size = token_data_array.size + data_ptr = token_data_array.data + for i in range(size): + tid = data_ptr[i].id + if tid < len(new_scores): + data_ptr[i].logit = new_scores[tid] + + custom_sampler = CustomSampler(adapter) + 
params.custom_samplers.append(custom_sampler) + + if CommonSamplerType.CUSTOM not in params.samplers: + params.samplers.insert(3, CommonSamplerType.CUSTOM) + + self._sampling_ctx = LlamaSamplingContext(params, self._model) + # Check for kv cache prefix match if reset and self.n_tokens > 0: longest_prefix = self.longest_token_prefix(self._input_ids.tolist(), tokens[:-1]) @@ -1029,10 +1039,6 @@ def generate( if reset: self.reset() - # # Reset the grammar - # if grammar is not None: - # grammar.reset() - sample_idx = self.n_tokens + len(tokens) - 1 tokens = list(tokens) @@ -1040,36 +1046,8 @@ def generate( while True: self.eval(tokens) while sample_idx < self.n_tokens: - token = self.sample( - top_k=top_k, - top_n_sigma=top_n_sigma, - top_p=top_p, - min_p=min_p, - typical_p=typical_p, - temp=temp, - repeat_penalty=repeat_penalty, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - logit_bias=logit_bias, - logits_processor=logits_processor, - grammar=grammar, - grammar_lazy=grammar_lazy, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, - use_infill=use_infill, - idx=sample_idx, - ) + token = self._sampling_ctx.sample(self._ctx, idx=-1) + self._sampling_ctx.accept(token) sample_idx += 1 if stopping_criteria is not None and stopping_criteria( @@ -1079,6 +1057,7 @@ def generate( tokens_or_none = yield token tokens.clear() tokens.append(token) + if tokens_or_none is not None: tokens.extend(tokens_or_none) @@ -1274,7 +1253,7 @@ def _create_completion( self, prompt: Union[str, List[int]], suffix: Optional[str] = None, - max_tokens: Optional[int] = 16, + max_tokens: 
Optional[int] = 128, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -1283,10 +1262,14 @@ def _create_completion( echo: bool = False, stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, repeat_penalty: float = 1.0, + penalty_last_n: int = 64, top_k: int = 40, top_n_sigma: float = -1.00, + dynatemp_range: float = 0.0, + dynatemp_exponent: float = 1.0, + min_keep: int = 0, stream: bool = False, seed: Optional[int] = None, mirostat_mode: int = 0, @@ -1301,7 +1284,6 @@ def _create_completion( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, model: Optional[str] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, @@ -1406,28 +1388,6 @@ def _create_completion( RuntimeWarning, ) - # NOTE: This likely doesn't work correctly for the first token in the prompt - # because of the extra space added to the start of the prompt_tokens - if logit_bias is not None: - logit_bias_map = {int(k): float(v) for k, v in logit_bias.items()} - - def logit_bias_processor( - input_ids: npt.NDArray[np.intc], - scores: npt.NDArray[np.single], - ) -> npt.NDArray[np.single]: - new_scores = np.copy( - scores - ) # Does it make sense to copy the whole array or can we just overwrite the original one? 
- for input_id, score in logit_bias_map.items(): - new_scores[input_id] = score + scores[input_id] - return new_scores - - _logit_bias_processor = LogitsProcessorList([logit_bias_processor]) - if logits_processor is None: - logits_processor = _logit_bias_processor - else: - logits_processor = logits_processor.extend(_logit_bias_processor) - if self.verbose: self._ctx.reset_timings() @@ -1489,6 +1449,9 @@ def logit_bias_processor( min_p=min_p, typical_p=typical_p, temp=temperature, + dynatemp_range=dynatemp_range, + dynatemp_exponent=dynatemp_exponent, + min_keep=min_keep, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -1500,12 +1463,12 @@ def logit_bias_processor( dry_penalty_last_n=dry_penalty_last_n, dry_seq_breakers=dry_seq_breakers, frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, + present_penalty=present_penalty, repeat_penalty=repeat_penalty, + penalty_last_n=penalty_last_n, stopping_criteria=stopping_criteria, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, logit_bias=logit_bias, logits_processor=logits_processor, @@ -1928,8 +1891,9 @@ def create_completion( echo: bool = False, stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, repeat_penalty: float = 1.0, + penalty_last_n: int = 64, top_k: int = 40, top_n_sigma: float = -1.00, stream: bool = False, @@ -1937,6 +1901,9 @@ def create_completion( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + dynatemp_range: float = 0.0, + dynatemp_exponent: float = 1.0, + min_keep: int = 0, xtc_threshold: float = 0.1, xtc_probability: float = 0.0, dry_multiplier: float = 0.0, @@ -1946,7 +1913,6 @@ def create_completion( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, 
use_infill: bool = False, model: Optional[str] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, @@ -1958,7 +1924,7 @@ def create_completion( """Generate text from a prompt. Args: - prompt: The prompt to generate text from. +prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. temperature: The temperature to use for sampling. @@ -1969,8 +1935,9 @@ def create_completion( echo: Whether to echo the prompt. stop: A list of strings to stop generation when encountered. frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt. - presence_penalty: The penalty to apply to tokens based on their presence in the prompt. + present_penalty: The penalty to controls whether to apply a penalty to tokens that are already present in the current context, helping to reduce repetition and encourage more diverse generation. repeat_penalty: The penalty to apply to repeated tokens. + penalty_last_n: last n tokens to penalize (0 = disable penalty, -1 = context size). top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 top_n_sigma: Limit the next token selection to a subset of tokens with pre-softmax logits that are within n * σ less than the max logit (default: -1.00, -1.00 = disabled). stream: Whether to stream the results. @@ -1978,6 +1945,9 @@ def create_completion( mirostat_mode: The mirostat sampling mode. mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 
mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + dynatemp_range: Range of dynamic temperature. + dynatemp_exponent: Exponent of dynamic temperature. + min_keep: Minimum tokens to keep for sampling. xtc-probability: Sets the chance for token removal (checked once on sampler start) (default: 0.0). XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 xtc-threshold: Sets a minimum probability threshold for tokens to be removed (default: 0.1). XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 dry_multiplier: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled. @@ -1987,7 +1957,6 @@ def create_completion( dry_seq_breakers: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']` adaptive-target: Adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: %.2f) [(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) adaptive-decay: Adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable. (valid range 0.0 to 0.99) (default: %.2f) - use_adaptive_p: The adaptive_p sampler is only checked when use_adaptive_p is true; the default is to use dist. use_infill: Determines whether to activate the specialized fill-in-the-middle sampler that consolidates probabilities of tokens sharing common prefixes to ensure the generated text coherently bridges the gap between the prefix and suffix. model: The name to use for the model in the completion object. stopping_criteria: A list of stopping criteria to use. 
@@ -2015,8 +1984,9 @@ def create_completion( echo=echo, stop=stop, frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, + present_penalty=present_penalty, repeat_penalty=repeat_penalty, + penalty_last_n=penalty_last_n, top_k=top_k, top_n_sigma=top_n_sigma, stream=stream, @@ -2024,6 +1994,9 @@ def create_completion( mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + dynatemp_range=dynatemp_range, + dynatemp_exponent=dynatemp_exponent, + min_keep=min_keep, xtc_threshold=xtc_threshold, xtc_probability=xtc_probability, dry_multiplier=dry_multiplier, @@ -2033,7 +2006,6 @@ def create_completion( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, stopping_criteria=stopping_criteria, @@ -2052,7 +2024,7 @@ def __call__( self, prompt: str, suffix: Optional[str] = None, - max_tokens: Optional[int] = 16, + max_tokens: Optional[int] = 128, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, @@ -2061,8 +2033,9 @@ def __call__( echo: bool = False, stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, repeat_penalty: float = 1.0, + penalty_last_n: int = 64, top_k: int = 40, top_n_sigma: float = -1.00, stream: bool = False, @@ -2070,6 +2043,9 @@ def __call__( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + dynatemp_range: float = 0.0, + dynatemp_exponent: float = 1.0, + min_keep: int = 0, xtc_threshold: float = 0.1, xtc_probability: float = 0.0, dry_multiplier: float = 0.0, @@ -2079,7 +2055,6 @@ def __call__( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, model: Optional[str] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, @@ -2102,8 
+2077,9 @@ def __call__( echo: Whether to echo the prompt. stop: A list of strings to stop generation when encountered. frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt. - presence_penalty: The penalty to apply to tokens based on their presence in the prompt. + present_penalty: The penalty to controls whether to apply a penalty to tokens that are already present in the current context, helping to reduce repetition and encourage more diverse generation. repeat_penalty: The penalty to apply to repeated tokens. + penalty_last_n: last n tokens to penalize (0 = disable penalty, -1 = context size). top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 top_n_sigma: Limit the next token selection to a subset of tokens with pre-softmax logits that are within n * σ less than the max logit (default: -1.00, -1.00 = disabled). stream: Whether to stream the results. @@ -2111,6 +2087,9 @@ def __call__( mirostat_mode: The mirostat sampling mode. mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + dynatemp_range: Range of dynamic temperature. + dynatemp_exponent: Exponent of dynamic temperature. + min_keep: Minimum tokens to keep for sampling. xtc-probability: Sets the chance for token removal (checked once on sampler start) (default: 0.0). 
XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 xtc-threshold: Sets a minimum probability threshold for tokens to be removed (default: 0.1). XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 dry_multiplier: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled. @@ -2120,7 +2099,6 @@ def __call__( dry_seq_breakers: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']` adaptive-target: Adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: %.2f) [(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) adaptive-decay: Adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable. (valid range 0.0 to 0.99) (default: %.2f) - use_adaptive_p: The adaptive_p sampler is only checked when use_adaptive_p is true; the default is to use dist. use_infill: Determines whether to activate the specialized fill-in-the-middle sampler that consolidates probabilities of tokens sharing common prefixes to ensure the generated text coherently bridges the gap between the prefix and suffix. model: The name to use for the model in the completion object. stopping_criteria: A list of stopping criteria to use. 
@@ -2148,8 +2126,9 @@ def __call__( echo=echo, stop=stop, frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, + present_penalty=present_penalty, repeat_penalty=repeat_penalty, + penalty_last_n=penalty_last_n, top_k=top_k, top_n_sigma=top_n_sigma, stream=stream, @@ -2157,6 +2136,9 @@ def __call__( mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + dynatemp_range=dynatemp_range, + dynatemp_exponent=dynatemp_exponent, + min_keep=min_keep, xtc_threshold=xtc_threshold, xtc_probability=xtc_probability, dry_multiplier=dry_multiplier, @@ -2166,7 +2148,6 @@ def __call__( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, stopping_criteria=stopping_criteria, @@ -2194,12 +2175,16 @@ def create_chat_completion( seed: Optional[int] = None, response_format: Optional[ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.0, + penalty_last_n: int = 64, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + dynatemp_range: float = 0.0, + dynatemp_exponent: float = 1.0, + min_keep: int = 0, xtc_threshold: float = 0.1, xtc_probability: float = 0.0, dry_multiplier: float = 0.0, @@ -2209,7 +2194,6 @@ def create_chat_completion( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, model: Optional[str] = None, logit_bias: Optional[Dict[int, float]] = None, @@ -2240,12 +2224,16 @@ def create_chat_completion( seed: The seed to use for sampling. response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json. 
max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx. - presence_penalty: The penalty to apply to tokens based on their presence in the prompt. frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt. + present_penalty: The penalty to controls whether to apply a penalty to tokens that are already present in the current context, helping to reduce repetition and encourage more diverse generation. repeat_penalty: The penalty to apply to repeated tokens. + penalty_last_n: last n tokens to penalize (0 = disable penalty, -1 = context size). mirostat_mode: The mirostat sampling mode. mirostat_tau: The mirostat sampling tau parameter. mirostat_eta: The mirostat sampling eta parameter. + dynatemp_range: Range of dynamic temperature. + dynatemp_exponent: Exponent of dynamic temperature. + min_keep: Minimum tokens to keep for sampling. xtc-probability: Sets the chance for token removal (checked once on sampler start) (default: 0.0). XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 xtc-threshold: Sets a minimum probability threshold for tokens to be removed (default: 0.1).XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 dry_multiplier: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled. @@ -2255,7 +2243,6 @@ def create_chat_completion( dry_seq_breakers: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']` adaptive-target: Adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: %.2f) [(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) adaptive-decay: Adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable. 
(valid range 0.0 to 0.99) (default: %.2f) - use_adaptive_p: The adaptive_p sampler is only checked when use_adaptive_p is true; the default is to use dist. use_infill: Determines whether to activate the specialized fill-in-the-middle sampler that consolidates probabilities of tokens sharing common prefixes to ensure the generated text coherently bridges the gap between the prefix and suffix. model: The name to use for the model in the completion object. logit_bias: A logit bias to use. @@ -2291,12 +2278,16 @@ def create_chat_completion( seed=seed, response_format=response_format, max_tokens=max_tokens, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, + penalty_last_n=penalty_last_n, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + dynatemp_range=dynatemp_range, + dynatemp_exponent=dynatemp_exponent, + min_keep=min_keep, xtc_threshold=xtc_threshold, xtc_probability=xtc_probability, dry_multiplier=dry_multiplier, @@ -2306,7 +2297,6 @@ def create_chat_completion( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logit_bias=logit_bias, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 27c0437626..99c4bfb460 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -99,7 +99,7 @@ def __call__( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, model: Optional[str] = None, @@ -119,7 +119,6 @@ def __call__( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, logits_processor: 
Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, @@ -600,7 +599,7 @@ def chat_completion_handler( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, top_n_sigma: float = -1.00, @@ -616,7 +615,6 @@ def chat_completion_handler( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, @@ -714,7 +712,7 @@ def chat_completion_handler( stop=stop, seed=seed, max_tokens=max_tokens, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -730,7 +728,6 @@ def chat_completion_handler( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -1481,7 +1478,7 @@ def functionary_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, top_n_sigma: float = -1.00, @@ -1497,7 +1494,6 @@ def functionary_chat_handler( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, @@ -1698,7 +1694,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): stream=stream, 
stop=["user:", ""], max_tokens=max_tokens, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -1714,7 +1710,6 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -1790,7 +1785,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): top_k=top_k, min_p=min_p, typical_p=typical_p, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -1806,7 +1801,6 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -1873,7 +1867,7 @@ def functionary_v1_v2_chat_handler( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, top_n_sigma: float = -1.00, @@ -1889,7 +1883,6 @@ def functionary_v1_v2_chat_handler( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, @@ -2100,7 +2093,7 @@ def prepare_messages_for_inference( stream=stream, stop=stop, max_tokens=max_tokens, - presence_penalty=presence_penalty, + present_penalty=present_penalty, 
frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -2116,7 +2109,6 @@ def prepare_messages_for_inference( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -2174,7 +2166,7 @@ def create_completion(prompt, stop, grammar): stream=stream, stop=stop, max_tokens=max_tokens, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -2190,7 +2182,6 @@ def create_completion(prompt, stop, grammar): dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -2936,7 +2927,7 @@ def __call__( llama_types.ChatCompletionRequestResponseFormat ] = None, max_tokens: Optional[int] = None, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, top_n_sigma: float = -1.00, @@ -2952,7 +2943,6 @@ def __call__( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, @@ -3173,7 +3163,7 @@ def __call__( stop=stop, seed=seed, max_tokens=max_tokens, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -3189,7 +3179,6 @@ def __call__( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -4450,7 +4439,7 @@ 
def chatml_function_calling( stop: Optional[Union[str, List[str]]] = [], response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, max_tokens: Optional[int] = None, - presence_penalty: float = 0.0, + present_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, top_n_sigma: float = -1.00, @@ -4466,7 +4455,6 @@ def chatml_function_calling( dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], adaptive_target : float = -1.0, adaptive_decay : float = 0.9, - use_adaptive_p: bool = False, use_infill: bool = False, model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, @@ -4592,7 +4580,7 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -4608,7 +4596,6 @@ def chatml_function_calling( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -4656,7 +4643,7 @@ def chatml_function_calling( stream=stream, stop=stop, max_tokens=max_tokens, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -4672,7 +4659,6 @@ def chatml_function_calling( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -4711,7 +4697,7 @@ def chatml_function_calling( stream=False, stop=[":"], max_tokens=None, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -4727,7 +4713,6 @@ def 
chatml_function_calling( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -4750,7 +4735,7 @@ def chatml_function_calling( stop=["<|im_end|>"], logprobs=top_logprobs if logprobs else None, max_tokens=None, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -4766,7 +4751,6 @@ def chatml_function_calling( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -4808,7 +4792,7 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -4824,7 +4808,6 @@ def chatml_function_calling( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, @@ -4848,7 +4831,7 @@ def chatml_function_calling( stream=False, stop=stop, max_tokens=None, - presence_penalty=presence_penalty, + present_penalty=present_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, top_n_sigma=top_n_sigma, @@ -4864,7 +4847,6 @@ def chatml_function_calling( dry_seq_breakers=dry_seq_breakers, adaptive_target=adaptive_target, adaptive_decay=adaptive_decay, - use_adaptive_p=use_adaptive_p, use_infill=use_infill, model=model, logits_processor=logits_processor, diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index fdd1644568..ada0c5e377 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -68,7 +68,7 @@ + "Repeat 
penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", ) -presence_penalty_field = Field( +present_penalty_field = Field( default=0.0, ge=-2.0, le=2.0, @@ -132,7 +132,7 @@ class CreateCompletionRequest(BaseModel): ge=0, description="The number of logprobs to generate. If None, no logprobs are generated.", ) - presence_penalty: Optional[float] = presence_penalty_field + present_penalty: Optional[float] = present_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) seed: Optional[int] = Field(None) @@ -228,7 +228,7 @@ class CreateChatCompletionRequest(BaseModel): min_p: float = min_p_field stop: Optional[Union[str, List[str]]] = stop_field stream: bool = stream_field - presence_penalty: Optional[float] = presence_penalty_field + present_penalty: Optional[float] = present_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) seed: Optional[int] = Field(None) diff --git a/tests/test_llama.py b/tests/test_llama.py index 3d51feeaac..43ec9c959d 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -1,15 +1,14 @@ import ctypes import multiprocessing - +import os +import pytest import numpy as np from scipy.special import log_softmax - from huggingface_hub import hf_hub_download -import pytest - import llama_cpp import llama_cpp._internals as internals +from llama_cpp.llama_embedding import LlamaEmbedding, LLAMA_POOLING_TYPE_NONE from typing import ( List, @@ -25,6 +24,10 @@ def test_llama_cpp_version(): def test_llama_cpp_tokenization(): + """ + Test the tokenizer API (Llama.tokenize and Llama.detokenize). 
+ Verifies handling of BOS (Begin of Sentence), EOS (End of Sentence), and special tokens. + """ llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False) assert llama @@ -63,6 +66,7 @@ def test_llama_cpp_tokenization(): @pytest.fixture def llama_cpp_model_path(): + """Fixture to download a real GGUF model for integration tests.""" repo_id = "Qwen/Qwen2-0.5B-Instruct-GGUF" filename = "qwen2-0_5b-instruct-q8_0.gguf" model_path = hf_hub_download(repo_id, filename) @@ -70,17 +74,23 @@ def llama_cpp_model_path(): def test_real_model(llama_cpp_model_path): - import os + """ + Test the Low-Level API (internals.*). + This manually constructs the Model, Context, Batch, and Sampler Chain. + """ assert os.path.exists(llama_cpp_model_path) + # 1. Setup Model Parameters params = llama_cpp.llama_model_default_params() params.use_mmap = llama_cpp.llama_supports_mmap() params.use_direct_io = False params.use_mlock = llama_cpp.llama_supports_mlock() params.check_tensors = False + # 2. Load the Model model = internals.LlamaModel(path_model=llama_cpp_model_path, params=params) + # 3. Setup Context Parameters cparams = llama_cpp.llama_context_default_params() cparams.n_ctx = 16 cparams.n_batch = 16 @@ -90,11 +100,13 @@ def test_real_model(llama_cpp_model_path): cparams.swa_full = True cparams.kv_unified = True + # 4. 
Create the Context context = internals.LlamaContext(model=model, params=cparams) tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True) assert tokens == [9707, 11, 1879, 0] + # New prompt for generation test tokens = model.tokenize(b"The quick brown fox jumps", add_bos=True, special=True) batch = internals.LlamaBatch(n_tokens=len(tokens), embd=0, n_seq_max=1) @@ -106,18 +118,31 @@ def test_real_model(llama_cpp_model_path): sampler.add_temp(0.8) sampler.add_dist(seed) - result = tokens + result = list(tokens) n_eval = 0 + curr_tokens = tokens + for _ in range(4): - batch.set_batch(tokens, n_past=n_eval, logits_all=False) + # Prepare batch with current tokens + batch.set_batch(curr_tokens, n_past=n_eval, logits_all=False) + + # Decode (run inference) context.decode(batch) - n_eval += len(tokens) + n_eval += len(curr_tokens) + + # Sample the next token (index -1 means the last token in the batch) token_id = sampler.sample(context, -1) - tokens = [token_id] - result += tokens - output = result[5:] + # Accept the token to update internal sampler state + sampler.accept(token_id) + + # Update loop variables + curr_tokens = [token_id] + result.append(token_id) + + output = result[len(tokens):] output_text = model.detokenize(output, special=True) + print(output_text) assert output_text == b" over the lazy dog" def test_real_llama(llama_cpp_model_path): @@ -225,17 +250,13 @@ def test_real_llama(llama_cpp_model_path): def test_real_llama_embeddings(llama_cpp_model_path): - model = llama_cpp.Llama( - llama_cpp_model_path, - n_ctx=32, - n_batch=32, - n_ubatch=32, - n_threads=multiprocessing.cpu_count(), - n_threads_batch=multiprocessing.cpu_count(), - logits_all=False, - embeddings=True, - kv_unified=True, - swa_full=True, - ) + model = LlamaEmbedding( + model_path=llama_cpp_model_path, + n_ctx=32, + n_batch=32, + n_ubatch=32, + pooling_type=LLAMA_POOLING_TYPE_NONE) # Smoke test for now - model.embed("Hello World") + embeddings = model.embed("Hello, world!") + 
assert isinstance(embeddings, list) + assert len(embeddings) > 0 From 556976d54789fd25d1b4f291f3f1cfe6b08b8922 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 9 Feb 2026 22:56:18 +0800 Subject: [PATCH 143/518] Separate the grammar sampler, improve the code stability of Sampler Chain processing, and fix some bugs. --- llama_cpp/_internals.py | 371 ++++++++++++++++++++++++++++++---------- llama_cpp/llama.py | 2 +- llama_cpp/llama_cpp.py | 2 +- 3 files changed, 284 insertions(+), 91 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index c133bdebe6..03df3811b2 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -704,6 +704,14 @@ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): self.batch.logits[current_count + n_tokens - 1] = True +# Embedding functions +def normalize_embedding(embedding): + norm = float(np.linalg.norm(embedding)) + if norm == 0.0: + return embedding + return [v / norm for v in embedding] + + class LlamaTokenDataArray: def __init__(self, *, n_vocab: int): self.n_vocab = n_vocab @@ -730,16 +738,6 @@ def copy_logits(self, logits: npt.NDArray[np.single]): self.candidates.size = self.n_vocab -# Embedding functions - - -def normalize_embedding(embedding): - norm = float(np.linalg.norm(embedding)) - if norm == 0.0: - return embedding - return [v / norm for v in embedding] - - # Python wrappers over common/sampling structs # common/common.h common_params_sampling @@ -869,6 +867,30 @@ def print_params(self) -> str: def __repr__(self) -> str: return self.print_params() +class GrammarSampler: + + def __init__(self, model, grammar_str, lazy=False, triggers=None): + + self.model = model + self.vocab = model.vocab + + self.grammar = llama_cpp.llama_sampler_init_grammar( + self.vocab, + grammar_str.encode(), + b"root" + ) + + def apply(self, token_data): + llama_cpp.llama_sampler_apply(self.grammar, token_data) + + def accept(self, token): + llama_cpp.llama_sampler_accept(self.grammar, 
token) + + def reset(self): + llama_cpp.llama_sampler_reset(self.grammar) + + def free(self): + llama_cpp.llama_sampler_free(self.grammar) @dataclass class LlamaSamplingContext: @@ -884,28 +906,48 @@ def __init__( ): self.params = params self.model = model + self.vocab = llama_cpp.llama_model_get_vocab(model.model) + self.n_vocab = model.n_vocab() + + lparams = llama_cpp.llama_sampler_chain_default_params() + lparams.no_perf = params.no_perf # Keep track of generated tokens for Python-side debugging/decoding self.prev: List[int] = [] + self._cur_p = LlamaTokenDataArray(n_vocab=self.n_vocab) if _existing_sampler: # Use the provided sampler (already configured/cloned) - self.sampler = _existing_sampler + self.sampler_chain = _existing_sampler else: # Build a new chain from scratch - self.sampler = LlamaSampler() + self.grammar_sampler = None + self.sampler_chain = LlamaSampler() + + if params.grammar is not None: + self.grammar_sampler = GrammarSampler( + model, + params.grammar, + params.grammar_lazy, + params.grammar_triggers + ) self._build_sampler_chain() def _build_sampler_chain(self): """ - Constructs the sampler chain based on the parameters. - The order generally follows common.cpp practices: - Bias -> Grammar -> Penalties -> DRY -> [Configurable Samplers] -> Dist/Greedy + Build sampler chain aligned with llama.cpp common_sampler_init + Grammar is intentionally NOT part of the chain. """ - s = self.sampler + + s = self.sampler_chain p = self.params m = self.model + if m is None: + raise RuntimeError("Model required to build sampler chain firstly") + + use_adaptive_p = False + # --- 1. Logit Bias (Always applied first to mask/boost tokens) --- if p.logit_bias and m: s.add_logit_bias(m.n_vocab(), p.logit_bias) @@ -915,16 +957,7 @@ def _build_sampler_chain(self): if CommonSamplerType.INFILL in p.samplers and m: s.add_infill(m) - # --- 3. 
Grammar / Syntax Constraints --- - if p.grammar and m: - s.add_grammar( - model=m, - grammar_str=p.grammar, - lazy=p.grammar_lazy, - triggers=p.grammar_triggers - ) - - # --- 4. Penalties (Repetition) --- + # --- 3. Penalties (Repetition) --- # Note: In some implementations, penalties come before other samplers if CommonSamplerType.PENALTIES in p.samplers: s.add_penalties( @@ -934,7 +967,7 @@ def _build_sampler_chain(self): p.penalty_present ) - # --- 5. DRY (Don't Repeat Yourself) --- + # --- 4. DRY (Don't Repeat Yourself) --- if CommonSamplerType.DRY in p.samplers and m: s.add_dry( m, @@ -945,7 +978,7 @@ def _build_sampler_chain(self): p.dry_sequence_breakers ) - # --- 6. Core Sampling Strategies (The "Filter" Loop) --- + # --- 5. Core Sampling Strategies (The "Filter" Loop) --- # We iterate through the list to preserve user-defined order for these specific samplers for stype in p.samplers: if stype == CommonSamplerType.CUSTOM: @@ -975,26 +1008,29 @@ def _build_sampler_chain(self): s.add_top_n_sigma(p.top_n_sigma) elif stype == CommonSamplerType.ADAPTIVE_P: - s.add_adaptive_p(p.adaptive_target, p.adaptive_decay, p.seed) + use_adaptive_p = True - # --- 7. Final Distribution / Selection --- + # --- 6. Final Distribution / Selection --- # Mirostat overrides standard greedy/dist sampling if p.mirostat == 1 and m: s.add_mirostat(m.n_vocab(), p.seed, p.mirostat_tau, p.mirostat_eta, 100) elif p.mirostat == 2: s.add_mirostat_v2(p.seed, p.mirostat_tau, p.mirostat_eta) else: - # If not using Mirostat, use Greedy (if temp=0) or Random Distribution - if p.temp == 0: - s.add_greedy() + if use_adaptive_p: + s.add_adaptive_p(p.adaptive_target, p.adaptive_decay, p.seed) else: - s.add_dist(p.seed) + if p.temp == 0: + s.add_greedy() + else: + s.add_dist(p.seed) def reset(self): """ Resets the internal state of all samplers in the chain. 
""" - self.sampler.reset() + self.grammar_sampler.reset() + self.sampler_chain.reset() self.prev = [] def cp(self) -> 'LlamaSamplingContext': @@ -1003,13 +1039,13 @@ def cp(self) -> 'LlamaSamplingContext': This clones the sampler chain state """ # 1. Clone the sampler chain using llama_sampler_clone - new_sampler = self.sampler.clone() + new_sampler_chain = self.sampler_chain.clone() # 2. Create new context wrapping the cloned chain new_ctx = LlamaSamplingContext( self.params, self.model, - _existing_sampler=new_sampler + _existing_sampler=new_sampler_chain ) # 3. Copy Python-side history @@ -1017,7 +1053,7 @@ def cp(self) -> 'LlamaSamplingContext': return new_ctx - def accept(self, token: int): + def accept(self, token: int, accept_grammar: bool): """ Accepts a token into the sampler state. MUST be called after sampling to update repetition penalties, grammar state, etc. @@ -1025,22 +1061,116 @@ def accept(self, token: int): Args: token: The token ID selected. """ - self.sampler.accept(token) + if self.grammar_sampler and accept_grammar: + self.grammar_sampler.accept(token) + self.sampler_chain.accept(token) self.prev.append(token) def sample( self, ctx: LlamaContext, idx: int = -1, + grammar_first: bool = True, ) -> int: - """ - Samples a token from the model's current logits. - Args: - ctx_main: The context containing the logits. - idx: The batch index to sample from (defaults to last token: -1). - """ - return self.sampler.sample(ctx, idx) + # 1. Synchronize + llama_cpp.llama_synchronize(ctx.ctx) + + # 2. Backend sampler shortcut + sampled = llama_cpp.llama_get_sampled_token_ith(ctx.ctx, idx) + if sampled != llama_cpp.LLAMA_TOKEN_NULL: + if self.grammar_sampler: + raise RuntimeError("Backend sampling + grammar unsupported") + return int(sampled) + + # 3. 
build cur_p + logits = llama_cpp.llama_get_logits_ith(ctx.ctx, idx) + + logits_array = np.ctypeslib.as_array( + logits, + shape=(self.n_vocab,) + ) + + cur_p = self._cur_p + cur_p.copy_logits(logits_array) + + # logit bias + if self.params.logit_bias: + for item in self.params.logit_bias: + cur_p.candidates_data.logit[item.token] += item.bias + + + # 4. grammar first + if self.grammar_sampler and grammar_first: + llama_cpp.llama_sampler_apply( + self.grammar_sampler.grammar, + ctypes.byref(cur_p.candidates) + ) + + # 5. sampling chain + llama_cpp.llama_sampler_apply( + self.sampler_chain.sampler, + ctypes.byref(cur_p.candidates) + ) + + selected = cur_p.candidates.selected + token = int(cur_p.candidates_data.id[selected]) + + # 6. grammar-first return directly + if self.grammar_sampler and grammar_first: + return token + + # 7. grammar rejection sampling + if self.grammar_sampler: + + single = llama_cpp.llama_token_data( + id=token, + logit=1.0, + p=0.0 + ) + + single_arr = llama_cpp.llama_token_data_array( + data=ctypes.pointer(single), + size=1, + selected=-1, + sorted=False + ) + + llama_cpp.llama_sampler_apply( + self.grammar_sampler.grammar, + ctypes.byref(single_arr) + ) + + valid = not np.isneginf(single.logit) + + if valid: + return token + + + # 8. resample + logits = llama_cpp.llama_get_logits_ith(ctx.ctx, idx) + + logits_array = np.ctypeslib.as_array( + logits, + shape=(self.n_vocab,) + ) + + cur_p.copy_logits(logits_array) + + llama_cpp.llama_sampler_apply( + self.grammar_sampler.grammar, + ctypes.byref(cur_p.candidates) + ) + + llama_cpp.llama_sampler_apply( + self.sampler_chain.sampler, + ctypes.byref(cur_p.candidates) + ) + + selected = cur_p.candidates.selected + token = int(cur_p.candidates_data.id[selected]) + + return token # --- Utilities --- @@ -1065,42 +1195,94 @@ def prev_str(self, ctx_main: LlamaContext, n: int) -> str: class CustomSampler: + """ + Python wrapper for llama.cpp custom sampler. 
+ + apply_func: + Callable receiving llama_token_data_array + and modifying logits in-place. + """ + def __init__( self, apply_func: Callable[[llama_cpp.llama_token_data_array], None], name: str = "custom", - **kwargs + accept_func: Optional[Callable] = None, + reset_func: Optional[Callable] = None, + free_func: Optional[Callable] = None, + clone_func: Optional[Callable] = None, ): + if not callable(apply_func): + raise TypeError("apply_func must be callable") + self.apply_func = apply_func - self.name_bytes = name.encode('utf-8') + self.name_bytes = name.encode("utf-8") - def _cb_name(smpl): + def _cb_name(_): return self.name_bytes - def _cb_apply(smpl, cur_p): - if cur_p and self.apply_func: + def _cb_apply(_, cur_p): + if cur_p: self.apply_func(cur_p.contents) - self._cb_accept = kwargs.get('accept_func') or (lambda smpl, token: None) - self._cb_reset = kwargs.get('reset_func') or (lambda smpl: None) - self._cb_free = kwargs.get('free_func') or (lambda smpl: None) - self._cb_clone = kwargs.get('clone_func') or (lambda smpl: None) + def _cb_accept(_, token): + if accept_func: + accept_func(token) + + def _cb_reset(_): + if reset_func: + reset_func() + + def _cb_free(_): + if free_func: + free_func() + + def _cb_clone(_): + if clone_func: + return clone_func() + return None + + self._cb_name_ref = llama_cpp.llama_sampler_name_fn(_cb_name) + self._cb_apply_ref = llama_cpp.llama_sampler_apply_fn(_cb_apply) + self._cb_accept_ref = llama_cpp.llama_sampler_accept_fn(_cb_accept) + self._cb_reset_ref = llama_cpp.llama_sampler_reset_fn(_cb_reset) + self._cb_free_ref = llama_cpp.llama_sampler_free_fn(_cb_free) + self._cb_clone_ref = llama_cpp.llama_sampler_clone_fn(_cb_clone) + + # ----------------------------- + # Build llama_sampler_i + # ----------------------------- self.llama_sampler_i = llama_cpp.llama_sampler_i() - self.llama_sampler_i.name = llama_cpp.llama_sampler_name_fn(_cb_name) - self.llama_sampler_i.accept = llama_cpp.llama_sampler_accept_fn(lambda s, t: 
self._cb_accept(s, t)) - self.llama_sampler_i.apply = llama_cpp.llama_sampler_apply_fn(_cb_apply) - self.llama_sampler_i.reset = llama_cpp.llama_sampler_reset_fn(lambda s: self._cb_reset(s)) - self.llama_sampler_i.clone = llama_cpp.llama_sampler_clone_fn(lambda s: self._cb_clone(s)) - self.llama_sampler_i.free = llama_cpp.llama_sampler_free_fn(lambda s: self._cb_free(s)) + self.llama_sampler_i.name = self._cb_name_ref + self.llama_sampler_i.apply = self._cb_apply_ref + self.llama_sampler_i.accept = self._cb_accept_ref + self.llama_sampler_i.reset = self._cb_reset_ref + self.llama_sampler_i.free = self._cb_free_ref + self.llama_sampler_i.clone = self._cb_clone_ref - self.llama_sampler_i.backend_init = ctypes.cast(0, llama_cpp.llama_sampler_backend_init_fn) - self.llama_sampler_i.backend_accept = ctypes.cast(0, llama_cpp.llama_sampler_backend_accept_fn) - self.llama_sampler_i.backend_apply = ctypes.cast(0, llama_cpp.llama_sampler_backend_apply_fn) - self.llama_sampler_i.backend_set_input = ctypes.cast(0, llama_cpp.llama_sampler_backend_set_input_fn) + # Disable backend hooks + self.llama_sampler_i.backend_init = ctypes.cast( + 0, llama_cpp.llama_sampler_backend_init_fn + ) + self.llama_sampler_i.backend_accept = ctypes.cast( + 0, llama_cpp.llama_sampler_backend_accept_fn + ) + self.llama_sampler_i.backend_apply = ctypes.cast( + 0, llama_cpp.llama_sampler_backend_apply_fn + ) + self.llama_sampler_i.backend_set_input = ctypes.cast( + 0, llama_cpp.llama_sampler_backend_set_input_fn + ) - self.sampler_p = llama_cpp.llama_sampler_init(ctypes.pointer(self.llama_sampler_i), None) + self.sampler_p = llama_cpp.llama_sampler_init( + ctypes.pointer(self.llama_sampler_i), + None + ) + + if not self.sampler_p: + raise RuntimeError("Failed to initialize custom sampler") def get_sampler(self) -> llama_cpp.llama_sampler_p: return self.sampler_p @@ -1112,12 +1294,13 @@ def __init__(self, existing_sampler_p: Optional[llama_cpp.llama_sampler_p] = Non self.sampler = existing_sampler_p 
else: # Initialize new chain - params = llama_cpp.llama_sampler_chain_params() + params = llama_cpp.llama_sampler_chain_default_params() params.no_perf = False self.sampler = llama_cpp.llama_sampler_chain_init(params) self.samplers: List[llama_cpp.llama_sampler_p] = [] self.custom_samplers: List["CustomSampler"] = [] + self._keep_alive: List[Any] = [] def _add_sampler(self, sampler: llama_cpp.llama_sampler_p): if not sampler: @@ -1132,7 +1315,17 @@ def accept(self, token: int): Updates the sampler state (e.g. repetition penalty history). """ assert self.sampler is not None - llama_cpp.llama_sampler_accept(self.sampler, token) + + if token is None: raise RuntimeError("Sampler returned None token") + + if token < 0: raise RuntimeError(f"Invalid token sampled: {token}") + + try: + llama_cpp.llama_sampler_accept(self.sampler, token) + except Exception as e: + raise RuntimeError( + f"Sampler accept crashed. token={token}" + ) from e def clone(self) -> 'LlamaSampler': """ @@ -1165,13 +1358,13 @@ def reset(self): def close(self): if self.sampler: - # NOTE: Must remove custom samplers before free or llama.cpp will try to free them - for i, _ in reversed(self.custom_samplers): - llama_cpp.llama_sampler_chain_remove(self.sampler, i) + for index, _ in reversed(self.custom_samplers): + llama_cpp.llama_sampler_chain_remove(self.sampler, index) llama_cpp.llama_sampler_free(self.sampler) self.sampler = None self.samplers.clear() self.custom_samplers.clear() + self._keep_alive.clear() def __del__(self): self.close() @@ -1232,6 +1425,9 @@ def add_grammar( c_grammar_str = grammar_str.encode('utf-8') c_root = "root".encode('utf-8') + self._keep_alive.append(c_grammar_str) + self._keep_alive.append(c_root) + if not lazy: self._add_sampler(llama_cpp.llama_sampler_init_grammar( model.vocab, c_grammar_str, c_root @@ -1239,7 +1435,6 @@ def add_grammar( else: trigger_patterns = [] trigger_tokens = [] - if triggers: for t in triggers: if isinstance(t, str): @@ -1249,17 +1444,15 @@ def 
add_grammar( c_trigger_patterns = (ctypes.c_char_p * len(trigger_patterns))() c_trigger_patterns[:] = [w.encode('utf-8') for w in trigger_patterns] - c_trigger_tokens = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens) + self._keep_alive.append(c_trigger_patterns) + self._keep_alive.append(c_trigger_tokens) + self._add_sampler(llama_cpp.llama_sampler_init_grammar_lazy_patterns( - model.vocab, - c_grammar_str, - c_root, - c_trigger_patterns, - len(trigger_patterns), - c_trigger_tokens, - len(trigger_tokens) + model.vocab, c_grammar_str, c_root, + c_trigger_patterns, len(trigger_patterns), + c_trigger_tokens, len(trigger_tokens) )) def add_penalties(self, penalty_last_n: int, penalty_repeat: float, penalty_freq: float, penalty_present: float): @@ -1282,14 +1475,14 @@ def add_dry(self, model: LlamaModel, multiplier: float, base: float, allowed_len len(breakers) )) - def add_logit_bias(self, n_vocab: int, bias_dict: Dict[int, float]): + def add_logit_bias(self, n_vocab: int, bias_dict: List[llama_cpp.llama_logit_bias]): """Logit bias sampler.""" if not bias_dict: return c_bias = (llama_cpp.llama_logit_bias * len(bias_dict))() - for i, (token, bias) in enumerate(bias_dict.items()): - c_bias[i].token = token - c_bias[i].bias = bias + for i, bias in enumerate(bias_dict): + c_bias[i].token = bias.token + c_bias[i].bias = bias.bias self._add_sampler(llama_cpp.llama_sampler_init_logit_bias(n_vocab, len(bias_dict), c_bias)) @@ -1299,13 +1492,13 @@ def add_infill(self, model: LlamaModel): def add_adaptive_p(self, target: float, decay: float, seed: int): self._add_sampler(llama_cpp.llama_sampler_init_adaptive_p(target, decay, seed)) - def add_custom( - self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] - ): - custom_sampler = CustomSampler(apply_func) + def add_custom(self, custom_sampler: CustomSampler): + if not isinstance(custom_sampler, CustomSampler): + raise TypeError("add_custom expects a CustomSampler instance") + sampler = 
custom_sampler.get_sampler() self._add_sampler(sampler) - # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + self.custom_samplers.append( [llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler] ) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 749e008fac..df2ad74dc2 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1047,7 +1047,7 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): self.eval(tokens) while sample_idx < self.n_tokens: token = self._sampling_ctx.sample(self._ctx, idx=-1) - self._sampling_ctx.accept(token) + self._sampling_ctx.accept(token, False if grammar is None else True) sample_idx += 1 if stopping_criteria is not None and stopping_criteria( diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0cca6d418e..6094fb1a06 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -4112,7 +4112,7 @@ def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], None, ) def llama_sampler_apply( - smpl: llama_sampler_p, cur_p: CtypesArray[llama_token_data_array], / + smpl: llama_sampler_p, cur_p: CtypesPointer[llama_token_data_array], / ): ... 
From 31a9d73d99390bb47897baa8cff2dcb7d997fcf4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 9 Feb 2026 23:35:27 +0800 Subject: [PATCH 144/518] test: update integration tests for new sampler architecture - Switch test model to Qwen2.5-0.5B-Instruct - Add `test_grammar_sampling_safety` to verify GC safety - Add `test_logit_bias` and `test_custom_logits_processor` - Update generation assertions for flexibility - Remove unstable legacy testing methods - Use the modern LlamaEmbedding class for Embedding Test --- .github/workflows/test.yaml | 4 +- tests/test_llama.py | 181 ++++++++++++++++++++++++++++-------- 2 files changed, 143 insertions(+), 42 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 82bf930cf5..335b0f0ac3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -12,8 +12,8 @@ concurrency: cancel-in-progress: true env: - REPO_ID: Qwen/Qwen2-0.5B-Instruct-GGUF - MODEL_FILE: qwen2-0_5b-instruct-q8_0.gguf + REPO_ID: Qwen/Qwen2.5-0.5B-Instruct-GGUF + MODEL_FILE: qwen2.5-0.5b-instruct-q4_k_m.gguf HF_HOME: ${{ github.workspace }}/hf_cache jobs: diff --git a/tests/test_llama.py b/tests/test_llama.py index 43ec9c959d..8f15a16466 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -67,8 +67,8 @@ def test_llama_cpp_tokenization(): @pytest.fixture def llama_cpp_model_path(): """Fixture to download a real GGUF model for integration tests.""" - repo_id = "Qwen/Qwen2-0.5B-Instruct-GGUF" - filename = "qwen2-0_5b-instruct-q8_0.gguf" + repo_id = "Qwen/Qwen2.5-0.5B-Instruct-GGUF" + filename = "qwen2.5-0.5b-instruct-q4_k_m.gguf" model_path = hf_hub_download(repo_id, filename) return model_path @@ -143,7 +143,7 @@ def test_real_model(llama_cpp_model_path): output = result[len(tokens):] output_text = model.detokenize(output, special=True) print(output_text) - assert output_text == b" over the lazy dog" + assert b"over" in output_text or b"lazy dog" in output_text def 
test_real_llama(llama_cpp_model_path): model = llama_cpp.Llama( @@ -166,21 +166,22 @@ def test_real_llama(llama_cpp_model_path): temperature=0.8, seed=1337 ) - assert output["choices"][0]["text"] == " over the lazy dog" + text = output["choices"][0]["text"] + assert "over" in text or "lazy dog" in text output = model.create_completion( - "The capital of france is paris, 'true' or 'false'?:\n", + "Is the Louvre Museum in Paris, France?, 'yes' or 'no'?:\n", max_tokens=4, top_k=50, top_p=0.9, temperature=0.8, seed=1337, grammar=llama_cpp.LlamaGrammar.from_string(""" -root ::= "true" | "false" +root ::= "yes" | "no" """) ) - assert output["choices"][0]["text"] == "true" + assert output["choices"][0]["text"] == "yes" suffix = b"rot" @@ -203,53 +204,153 @@ def test_real_llama(llama_cpp_model_path): assert output["choices"][0]["text"].lower().startswith("rot") - model.set_seed(1337) +def test_grammar_sampling_safety(llama_cpp_model_path): + """ + Test 2: Grammar-constrained sampling (safety / stability check) + This test forces very strict JSON-like output using a minimal grammar. + """ + # Very restrictive grammar — only allows simple { "key": number } + # (intentionally limited to trigger potential accept-stage bugs) + model = llama_cpp.Llama( + llama_cpp_model_path, + n_ctx=32, + n_batch=32, + n_ubatch=32, + n_threads=multiprocessing.cpu_count(), + n_threads_batch=multiprocessing.cpu_count(), + logits_all=False, + swa_full=True, + kv_unified=True, + ) + grammar_text = r''' + root ::= object + object ::= "{" space pair "}" + pair ::= string ":" space value + string ::= "\"" [a-z]+ "\"" + value ::= number + number ::= [0-9]+ + space ::= [ ]? 
+ ''' + + # Create grammar object from string definition + grammar = llama_cpp.LlamaGrammar.from_string(grammar_text) + + # Prompt that naturally wants to produce something JSON-like + prompt = "Generate a JSON with age:" + + # Generate with grammar constraint + near-greedy sampling + output = model.create_completion( + prompt, + max_tokens=20, + grammar=grammar, + temperature=0.1 + ) - state = model.save_state() + generated_text = output["choices"][0]["text"] + print(f"\n[Grammar] Output: {generated_text}") - output = model.create_completion( - "Pick a number from 1 to 10?:\n", - max_tokens=4, - top_k=50, - top_p=0.9, - temperature=0.8, - grammar=llama_cpp.LlamaGrammar.from_string(""" -root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" -""") + # Basic structural validation (we don't parse full JSON here — just checking survival + minimal shape) + assert "{" in generated_text and "}" in generated_text, \ + "Generated text is missing JSON object braces" + assert ":" in generated_text, \ + "Generated text is missing key-value separator (:)" + +def test_logit_bias(llama_cpp_model_path): + """ + Test 3: Logit Bias + Verifies that specific tokens can be forced using logit bias. 
+ """ + # Load model with minimal context to save memory (just for tokenization & small generation) + model = llama_cpp.Llama( + llama_cpp_model_path, + n_ctx=32, + n_batch=32, + n_ubatch=32, + n_threads=multiprocessing.cpu_count(), + n_threads_batch=multiprocessing.cpu_count(), + logits_all=False, + swa_full=True, + kv_unified=True, ) - number_1 = output["choices"][0]["text"] + # Target token we want to force the model to generate + target_word = " banana" # Note the leading space — important for most tokenizers + # Get the token ID corresponding to " banana" (Qwen-style tokenizer expected) + target_token = model.tokenize(target_word.encode("utf-8"), add_bos=False)[0] + + # Apply very strong positive bias to make this token extremely likely + bias = {target_token: 100.0} + + # Generate a very short continuation with temperature=0 (greedy) + strong bias output = model.create_completion( - "Pick a number from 1 to 10?:\n", - max_tokens=4, - top_k=50, - top_p=0.9, - temperature=0.8, - grammar=llama_cpp.LlamaGrammar.from_string(""" -root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" -""") + "I like to eat", + max_tokens=3, + logit_bias=bias, + temperature=0.0 ) - number_2 = output["choices"][0]["text"] - model.load_state(state) + # Extract generated text + generated_text = output["choices"][0]["text"] + print(f"\n[Bias] Output: {generated_text}") + + # Verify that our forced token actually appeared in the output + assert "banana" in generated_text, f"Expected 'banana' in output, got: '{generated_text}'" + +def test_custom_logits_processor(llama_cpp_model_path): + """ + Test 4: Custom Logits Processor (Pure Python Implementation). + + Verifies that we can manipulate logits in Python before sampling. + In this test, we suppress any token containing the letter 'e'. 
+ """ + # Load model with minimal context to save memory (just for tokenization & small generation) + model = llama_cpp.Llama( + llama_cpp_model_path, + n_ctx=64, + n_batch=32, + n_ubatch=32, + n_threads=multiprocessing.cpu_count(), + n_threads_batch=multiprocessing.cpu_count(), + logits_all=False, + swa_full=True, + kv_unified=True, + ) + + def no_e_processor(input_ids, scores): + """ + Filters out tokens containing 'e'. + """ + for token_id in range(len(scores)): + # Decode single token → get its string representation + token_str = model.detokenize([token_id]).decode("utf-8", errors="ignore") + + # Ban tokens that contain 'e' anywhere in their decoded form + if "e" in token_str: + scores[token_id] = -float("inf") + + return scores + + # Generate with greedy sampling (temperature=0) + our custom processor output = model.create_completion( - "Pick a number from 1 to 10?:\n", - max_tokens=4, - top_k=50, - top_p=0.9, - temperature=0.8, - grammar=llama_cpp.LlamaGrammar.from_string(""" -root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" -""") + "The alphabet starts with", + max_tokens=10, + logits_processor=llama_cpp.LogitsProcessorList([no_e_processor]), + temperature=0.0 ) - number_3 = output["choices"][0]["text"] - assert number_1 != number_2 - assert number_1 == number_3 + generated_text = output["choices"][0]["text"] + print(f"\n[Custom] Output (No 'e'): {generated_text}") + # Basic validation: make sure no 'e' appears in the generated text + assert "e" not in generated_text, \ + f"Expected no letter 'e' in output, but found one:\n Output was: '{generated_text}'" def test_real_llama_embeddings(llama_cpp_model_path): + """ + Test Embedding Generation. + Verifies that the model can produce vector embeddings. 
+ """ model = LlamaEmbedding( model_path=llama_cpp_model_path, n_ctx=32, From d69a1b5d6da4e74155c003a4f7aeb22272209c1d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 10 Feb 2026 00:57:32 +0800 Subject: [PATCH 145/518] Update Submodule vendor/llama.cpp b831118..262364e --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b83111815e..262364e31d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b83111815e9a79949257e9d4b087206b320a3063 +Subproject commit 262364e31d1da43596fe84244fba44e94a0de64e From e8a27644d7ccb178bb3424b22e6248b031caa0a8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 10 Feb 2026 22:40:49 +0800 Subject: [PATCH 146/518] Fix: Optimize .gitignore and add macOS system files --- .gitignore | 247 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 139 insertions(+), 108 deletions(-) diff --git a/.gitignore b/.gitignore index 9d68dbcd9d..fad7f43313 100644 --- a/.gitignore +++ b/.gitignore @@ -1,30 +1,106 @@ +# ========================================== +# Operating System & System Files +# ========================================== +# macOS +.DS_Store +.AppleDouble +.LSOverride +._* +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Windows (Common) +Thumbs.db +ehthumbs.db +Desktop.ini +*.exe + +# General User Overrides *.local -.python-version - +# ========================================== +# IDE & Editors +# ========================================== +# VS Code .vscode/ +.vs/ -_skbuild/ +# JetBrains / PyCharm +.idea/ + +# Spyder +.spyderproject +.spyproject + +# Rope +.ropeproject + +# Jupyter / IPython +.ipynb_checkpoints +profile_default/ +ipython_config.py + +# Other +/nppBackup +# ========================================== +# Environment & Secrets (Critical) +# ========================================== +# Virtual Environments 
+.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +.python-version + +# Environment Variables managers .envrc .direnv -models/ +# Secrets (Caution: check your specific needs) +local_settings.py -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class +# ========================================== +# Project Specific: Llama & Models +# ========================================== +# Large Model Files +models/ +docker/open_llama/*.bin -# C extensions +# C extensions (llama_cpp bindings) llama_cpp/*.so llama_cpp/*.dylib llama_cpp/*.metal llama_cpp/*.dll llama_cpp/*.lib -# Distribution / packaging -.Python +# Scikit-build +_skbuild/ + +# ========================================== +# Python Core & Compilation +# ========================================== +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions (General) +*.dll +*.o +*.so + +# ========================================== +# Build, Distribution & Packaging +# ========================================== build/ develop-eggs/ dist/ @@ -44,15 +120,32 @@ share/python-wheels/ MANIFEST # PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec -# Installer logs -pip-log.txt -pip-delete-this-directory.txt +# PyBuilder +.pybuilder/ +target/ + +# ========================================== +# Dependency Managers +# ========================================== +# Pipenv +# Pipfile.lock + +# Poetry +# poetry.lock +# PDM +.pdm.toml +# pdm.lock + +# PEP 582 +__pypackages__/ + +# ========================================== +# Testing, Coverage & Type Checking +# ========================================== # Unit test / coverage reports htmlcov/ .tox/ @@ -68,113 +161,51 @@ coverage.xml .pytest_cache/ cover/ -# Translations -*.mo -*.pot +# Type Checkers (mypy, pyre, pytype) +.mypy_cache/ +.dmypy.json +dmypy.json +.pyre/ +.pytype/ -# Django stuff: +# Cython debug symbols +cython_debug/ + +# ========================================== +# Web Frameworks & Databases +# ========================================== +# Django *.log -local_settings.py db.sqlite3 db.sqlite3-journal -# Flask stuff: +# Flask instance/ .webassets-cache -# Scrapy stuff: +# Scrapy .scrapy -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. 
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff +# Celery celerybeat-schedule celerybeat.pid -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject +# ========================================== +# Documentation & Translations +# ========================================== +# Sphinx +docs/_build/ -# mkdocs documentation +# MkDocs /site -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ +# Translations +*.mo +*.pot -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-.idea/ +# SageMath +*.sage.py -# downloaded model .bin files -docker/open_llama/*.bin +# Installer logs +pip-log.txt +pip-delete-this-directory.txt \ No newline at end of file From 974505e445e75400b0bc76d8e2821a6692b97fdd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 11 Feb 2026 14:20:35 +0800 Subject: [PATCH 147/518] Update Submodule vendor/llama.cpp 262364e..e463bbd --- llama_cpp/llama_cpp.py | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 6094fb1a06..3fc463df78 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1354,7 +1354,7 @@ def llama_free(ctx: llama_context_p, /): # enum llama_params_fit_status { # LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit # LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit -# LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occured, e.g. because no model could be found at the specified path +# LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. 
because no model could be found at the specified path # }; class llama_params_fit_status(enum.IntEnum): LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0 diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 262364e31d..e463bbdf65 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 262364e31d1da43596fe84244fba44e94a0de64e +Subproject commit e463bbdf659fed30b0fba822257ec7d8b373d1c2 From 2b6d47083b354ea4d1996eb1627b8a3cb9882300 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Feb 2026 00:44:28 +0800 Subject: [PATCH 148/518] Refactor the build-wheels-metal.yaml --- .github/workflows/build-wheels-metal.yaml | 60 ++++++++++++++++------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 53d33fe7a7..87afdca31d 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -11,20 +11,21 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [macos-14, macos-15] + os: [macos-latest] steps: - - uses: actions/checkout@v4 + - name: Checkout repository + uses: actions/checkout@v6 with: submodules: "recursive" - # Used to host cibuildwheel - - uses: actions/setup-python@v5 + - name: Setup Python + uses: actions/setup-python@v6 with: - python-version: "3.12" + python-version: "3.13" cache: 'pip' - - name: Install dependencies (Linux/MacOS) + - name: Install dependencies (MacOS) run: | python -m pip install --upgrade pip python -m pip install uv @@ -32,37 +33,62 @@ jobs: shell: bash - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v3.3.1 env: - # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_ARCHS: "arm64" - CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on -DCMAKE_CROSSCOMPILING=ON" - CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" + CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*" + CIBW_ENVIRONMENT: > + 
CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 + -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 + -DCMAKE_CROSSCOMPILING=on + -DGGML_METAL=on + -DGGML_METAL_USE_BF16=on + -DGGML_METAL_EMBED_LIBRARY=off + -DGGML_METAL_SHADER_DEBUG=on + -DLLAMA_HTTPLIB=on" with: package-dir: . output-dir: wheelhouse2 - - uses: actions/upload-artifact@v4 + - name: Upload artifacts + uses: actions/upload-artifact@v4 with: - name: wheels-mac_${{ matrix.os }} + name: wheels-metal_${{ matrix.os }} path: ./wheelhouse2/*.whl + - name: Get Wheel Filename + run: | + WHEEL_FILE=$(ls wheelhouse2/*.whl | head -n 1) + FILENAME=$(basename "$WHEEL_FILE") + echo VERSION=$(echo "$FILENAME" | cut -d'-' -f2 | tr -d '[:space:]') >> $GITHUB_ENV + + release: name: Release needs: [build_wheels] runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v4 + - name: Download artifacts + uses: actions/download-artifact@v4 with: merge-multiple: true path: dist2 - - - uses: softprops/action-gh-release@v2 + + - name: Get Current Date # Step to get current date for the release tag + id: get-date + run: | + # Get date in YYYYMMDD format using bash date command + currentDate=$(date +%Y%m%d) + # Store the date in environment variable for the release step + echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + + - name: Publish Release + uses: softprops/action-gh-release@v2.2.2 with: files: dist2/* - # set release name to -metal - tag_name: ${{ github.ref_name }}-metal + # set release name to v${VERSION}-metal-macos-${BUILD_DATE} + tag_name: v${{env.VERSION}}-metal-macos-${{env.BUILD_DATE}} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From a7f96a59a04a3fb0048985f525ab2581a21d1eed Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Feb 2026 13:59:19 +0800 Subject: [PATCH 149/518] ci: use llama_cpp.__version__ for release tagging and fix cross-job version propagation --- .github/workflows/build-wheels-metal.yaml | 31 ++++++++++++----------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git 
a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 87afdca31d..541f4701e8 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -1,17 +1,18 @@ name: Build Wheels (Metal) -on: workflow_dispatch +on: + workflow_dispatch permissions: contents: write jobs: build_wheels: - name: Build wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [macos-latest] + name: Build wheels (Metal macos) + runs-on: macos-latest + + outputs: + version: ${{steps.get_version.outputs.version}} steps: - name: Checkout repository @@ -32,6 +33,14 @@ jobs: RUST_LOG=trace python -m uv pip install -e .[all] --verbose shell: bash + - name: Get Package Version + id: get_version + shell: bash + run: | + VERSION=$(python -c "import llama_cpp; print(llama_cpp.__version__)") + echo "Detected version: $VERSION" + echo "version=$VERSION" >> $GITHUB_OUTPUT + - name: Build wheels uses: pypa/cibuildwheel@v3.3.1 env: @@ -57,13 +66,6 @@ jobs: name: wheels-metal_${{ matrix.os }} path: ./wheelhouse2/*.whl - - name: Get Wheel Filename - run: | - WHEEL_FILE=$(ls wheelhouse2/*.whl | head -n 1) - FILENAME=$(basename "$WHEEL_FILE") - echo VERSION=$(echo "$FILENAME" | cut -d'-' -f2 | tr -d '[:space:]') >> $GITHUB_ENV - - release: name: Release needs: [build_wheels] @@ -88,7 +90,6 @@ jobs: uses: softprops/action-gh-release@v2.2.2 with: files: dist2/* - # set release name to v${VERSION}-metal-macos-${BUILD_DATE} - tag_name: v${{env.VERSION}}-metal-macos-${{env.BUILD_DATE}} + tag_name: v${{ needs.build_wheels.outputs.version }}-Metal-macos-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From a3cf5c3c93c5e2b9661e739bd0f4f79185722f8f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Feb 2026 14:51:21 +0800 Subject: [PATCH 150/518] Remove outdated workflow --- .../generate-index-from-release.yaml | 57 ------------------- 1 file changed, 57 deletions(-) delete mode 100644 
.github/workflows/generate-index-from-release.yaml diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml deleted file mode 100644 index 255ee67d6f..0000000000 --- a/.github/workflows/generate-index-from-release.yaml +++ /dev/null @@ -1,57 +0,0 @@ -name: Wheels Index - -on: - # Trigger on new release - workflow_run: - workflows: ["Release", "Build Wheels (CUDA)", "Build Wheels (Metal)"] - types: - - completed - - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages -permissions: - contents: read - pages: write - id-token: write - -# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. -# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. -concurrency: - group: "pages" - cancel-in-progress: false - -jobs: - # Single deploy job since we're just deploying - deploy: - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Setup Pages - uses: actions/configure-pages@v5 - - name: Build - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - ./scripts/get-releases.sh - ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' - ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' - ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' - ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$' - ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' - # ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' - # ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' - 
./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$' - - name: Upload artifact - uses: actions/upload-pages-artifact@v3 - with: - # Upload entire repository - path: 'index' - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 From 3dc327110f1a0a8665ffd4583c9c5879b24c1d10 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Feb 2026 15:23:40 +0800 Subject: [PATCH 151/518] Update README.md --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index da596f0104..db42c5c029 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install "llama-cpp-p To install with Metal (MPS), set the `GGML_METAL=on` environment variable before installing: ```bash -CMAKE_ARGS="-DGGML_METAL=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +CMAKE_ARGS="-DGGML_METAL=on -DGGML_METAL_USE_BF16=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` **Pre-built Wheel (New)** @@ -162,12 +162,10 @@ CMAKE_ARGS="-DGGML_METAL=on" pip install "llama-cpp-python @ git+https://github. It is also possible to install a pre-built wheel with Metal support. 
As long as your system meets some requirements: - MacOS Version is 11.0 or later -- Python Version is 3.10, 3.11 or 3.12 +- Python Version is 3.10, 3.11, 3.12, 3.13 or 3.14 -```bash -pip install llama-cpp-python \ - --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal -``` +Check the releases page: +https://github.com/JamePeng/llama-cpp-python/releases From f68fd9f20316bae8ebf89951f64067aa02e99f84 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Feb 2026 23:48:35 +0800 Subject: [PATCH 152/518] Update Submodule vendor/llama.cpp e463bbd..33a56f9 --- llama_cpp/llama_cpp.py | 4 ++-- vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3fc463df78..d350e6e7e6 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -3771,9 +3771,9 @@ def llama_detokenize( # /// Apply chat template. Inspired by hf apply_chat_template() on python. -# /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" +# /// # /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template -# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. +# /// @param tmpl A Jinja template to use for this chat. # /// @param chat Pointer to a list of multiple llama_chat_message # /// @param n_msg Number of llama_chat_message in this chat # /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e463bbdf65..33a56f90a6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e463bbdf659fed30b0fba822257ec7d8b373d1c2 +Subproject commit 33a56f90a6a793a3c7b1f6ca39ff43a1cecd0b61 From a0e3ab43b46159fcd2422e4fb091d71834accae6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 13 Feb 2026 22:15:41 +0800 Subject: [PATCH 153/518] test: replace unstable grammar test with deterministic mechanism check - Replace knowledge-based grammar test (Paris/Louvre) with a "coin flip" test. - The new test enforces "heads" or "tails" via grammar, verifying the sampling mechanism rather than the model's knowledge. - Improves stability for smaller models (e.g., Qwen-0.5B) that might hallucinate answers. Signed-off-by: JamePeng --- tests/test_llama.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/test_llama.py b/tests/test_llama.py index 8f15a16466..6007e1e101 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -158,6 +158,7 @@ def test_real_llama(llama_cpp_model_path): kv_unified=True, ) + # 1. Basic Completion Test output = model.create_completion( "The quick brown fox jumps", max_tokens=4, @@ -169,24 +170,31 @@ def test_real_llama(llama_cpp_model_path): text = output["choices"][0]["text"] assert "over" in text or "lazy dog" in text - + # 2. Grammar Constraint Test (Updated: Coin Flip) + # We verify that the model ONLY outputs "heads" or "tails". + # This tests the sampler mechanism, not the model's intelligence. output = model.create_completion( - "Is the Louvre Museum in Paris, France?, 'yes' or 'no'?:\n", + "Flip a coin: heads or tails? 
Result:", max_tokens=4, top_k=50, top_p=0.9, temperature=0.8, seed=1337, grammar=llama_cpp.LlamaGrammar.from_string(""" -root ::= "yes" | "no" -""") + root ::= "heads" | "tails" + """) ) - assert output["choices"][0]["text"] == "yes" - suffix = b"rot" + generated_text = output["choices"][0]["text"] + print(f"\n[Grammar Coin Flip] Output: {generated_text}") - tokens = model.tokenize(suffix, add_bos=True, special=True) + # Assert that the output is strictly one of the allowed grammar options + assert generated_text in ["heads", "tails"], \ + f"Grammar failed! Expected 'heads' or 'tails', got: '{generated_text}'" + # 3. Logit Bias Test + suffix = b"rot" + tokens = model.tokenize(suffix, add_bos=True, special=True) logit_bias: Dict[int, float] = {} for token_id in tokens: From 452668a035ff9ca95558e52278f505a211da711a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 14 Feb 2026 20:26:15 +0800 Subject: [PATCH 154/518] Update Submodule vendor/llama.cpp 33a56f9..1725e31 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 33a56f90a6..1725e316c1 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 33a56f90a6a793a3c7b1f6ca39ff43a1cecd0b61 +Subproject commit 1725e316c1a780759ec134ca5a2999f4d53ce273 From af9d925bf26de1404aba6c8b64e5058afcb0b087 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 14 Feb 2026 20:26:38 +0800 Subject: [PATCH 155/518] Update llama.cpp API 20260214 --- llama_cpp/llama_cpp.py | 74 +++++++++++++----------------------------- 1 file changed, 23 insertions(+), 51 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d350e6e7e6..6bfe8db350 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1996,52 +1996,22 @@ def llama_adapter_get_alora_invocation_tokens(adapter: llama_adapter_lora_p, /) # // The following functions operate on a llama_context, hence the naming: llama_verb_... 
-# // Add a loaded LoRA adapter to given context -# // This will not modify model's weight -# LLAMA_API int32_t llama_set_adapter_lora( +# // Set LoRa adapters on the context. Will only modify if the adapters currently in context are different. +# LLAMA_API int32_t llama_set_adapters_lora( # struct llama_context * ctx, -# struct llama_adapter_lora * adapter, -# float scale); +# struct llama_adapter_lora ** adapters, +# size_t n_adapters, +# float * scales); @ctypes_function( - "llama_set_adapter_lora", - [llama_context_p_ctypes, llama_adapter_lora_p_ctypes, ctypes.c_float], + "llama_set_adapters_lora", + [llama_context_p_ctypes, ctypes.POINTER(llama_adapter_lora_p_ctypes), ctypes.c_size_t, ctypes.c_float], ctypes.c_int32, ) -def llama_set_adapter_lora( - ctx: llama_context_p, adapter: llama_adapter_lora_p, scale: float, / +def llama_set_adapters_lora( + ctx: llama_context_p, adapters: CtypesArray[llama_adapter_lora_p], n_adapters: ctypes.c_size_t, scale: float, / ) -> int: - """Add a loaded LoRA adapter to given context - This will not modify model's weight""" - ... - - -# // Remove a specific LoRA adapter from given context -# // Return -1 if the adapter is not present in the context -# LLAMA_API int32_t llama_rm_adapter_lora( -# struct llama_context * ctx, -# struct llama_adapter_lora * adapter); -@ctypes_function( - "llama_rm_adapter_lora", - [llama_context_p_ctypes, llama_adapter_lora_p_ctypes], - ctypes.c_int32, -) -def llama_rm_adapter_lora( - ctx: llama_context_p, adapter: llama_adapter_lora_p, / -) -> int: - """Remove a specific LoRA adapter from given context - Return -1 if the adapter is not present in the context""" - ... 
- - -# // Remove all LoRA adapters from given context -# LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx); -@ctypes_function( - "llama_clear_adapter_lora", - [llama_context_p_ctypes], - None, -) -def llama_clear_adapter_lora(ctx: llama_context_p, /): - """Remove all LoRA adapters from given context""" + """Set LoRa adapters on the context. + Will only modify if the adapters currently in context are different.""" ... @@ -2051,15 +2021,15 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /): # // to an n_embd x n_layers buffer starting from layer 1. # // il_start and il_end are the layer range the vector should apply to (both inclusive) # // See llama_control_vector_load in common to load a control vector. -# LLAMA_API int32_t llama_apply_adapter_cvec( +# LLAMA_API int32_t llama_set_adapter_cvec( # struct llama_context * ctx, -# const float * data, -# size_t len, -# int32_t n_embd, -# int32_t il_start, -# int32_t il_end); +# const float * data, +# size_t len, +# int32_t n_embd, +# int32_t il_start, +# int32_t il_end); @ctypes_function( - "llama_apply_adapter_cvec", + "llama_set_adapter_cvec", [ llama_context_p_ctypes, ctypes.POINTER(ctypes.c_float), @@ -2070,7 +2040,7 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /): ], ctypes.c_int32, ) -def llama_apply_adapter_cvec( +def llama_set_adapter_cvec( ctx: llama_context_p, data: CtypesPointerOrRef[ctypes.c_float], len: int, @@ -2079,12 +2049,14 @@ def llama_apply_adapter_cvec( il_end: int, /, ) -> int: - """Apply a loaded control vector to a llama_context, or if data is NULL, clear + """ + Apply a loaded control vector to a llama_context, or if data is NULL, clear the currently loaded vector. n_embd should be the size of a single layer's control, and data should point to an n_embd x n_layers buffer starting from layer 1. 
il_start and il_end are the layer range the vector should apply to (both inclusive) - See llama_control_vector_load in common to load a control vector.""" + See llama_control_vector_load in common to load a control vector. + """ ... From 5ef874cf7e5b08533c7782286eda777e44be9744 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 14 Feb 2026 23:52:58 +0800 Subject: [PATCH 156/518] Improve sampling and grammar lifecycle management, fix memory growth issues - Validate grammar sampler initialization and inputs - Replace unbounded prev token list with bounded deque by LlamaSamplingParams n_prev param - Reuse logits NumPy view to avoid repeated allocations - Reuse single-token buffers for grammar rejection sampling - Minor cleanups and consistency improvements in sampling flow --- llama_cpp/_internals.py | 171 ++++++++++++++++++++++++++-------------- 1 file changed, 110 insertions(+), 61 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 03df3811b2..3b6af632fb 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -17,6 +17,7 @@ ) from dataclasses import dataclass, field +from collections import deque from contextlib import ExitStack import numpy as np @@ -871,15 +872,24 @@ class GrammarSampler: def __init__(self, model, grammar_str, lazy=False, triggers=None): + if model is None: + raise ValueError("model must not be None") + self.model = model self.vocab = model.vocab + if not grammar_str: + raise ValueError("grammar_str must not be empty") + self.grammar = llama_cpp.llama_sampler_init_grammar( self.vocab, - grammar_str.encode(), + grammar_str.encode("utf-8"), b"root" ) + if not self.grammar: + raise RuntimeError("Failed to initialize grammar sampler") + def apply(self, token_data): llama_cpp.llama_sampler_apply(self.grammar, token_data) @@ -889,8 +899,22 @@ def accept(self, token): def reset(self): llama_cpp.llama_sampler_reset(self.grammar) - def free(self): - llama_cpp.llama_sampler_free(self.grammar) + def close(self): + 
if self.grammar: + try: + llama_cpp.llama_sampler_free(self.grammar) + except Exception: + pass + + self.model = None + self.vocab = None + self.grammar = None + + def __del__(self): + try: + self.close() + except Exception: + pass @dataclass class LlamaSamplingContext: @@ -904,35 +928,50 @@ def __init__( model: Optional[LlamaModel] = None, _existing_sampler: Optional[LlamaSampler] = None, # Internal use for cloning ): - self.params = params + if model is None: + raise RuntimeError("model must not be None") self.model = model + + self.params = params self.vocab = llama_cpp.llama_model_get_vocab(model.model) self.n_vocab = model.n_vocab() lparams = llama_cpp.llama_sampler_chain_default_params() lparams.no_perf = params.no_perf - # Keep track of generated tokens for Python-side debugging/decoding - self.prev: List[int] = [] + # history (bounded) + # params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); + self.prev = deque(maxlen=max(params.n_prev, params.penalty_last_n)) + # reusable token data array self._cur_p = LlamaTokenDataArray(n_vocab=self.n_vocab) + # reusable numpy logits view + self._logits_view = None + self._single_token = llama_cpp.llama_token_data() + self._single_array = llama_cpp.llama_token_data_array( + data=ctypes.pointer(self._single_token), + size=1, + selected=-1, + sorted=False, + ) + + # sampler chain if _existing_sampler: - # Use the provided sampler (already configured/cloned) self.sampler_chain = _existing_sampler else: - # Build a new chain from scratch - self.grammar_sampler = None self.sampler_chain = LlamaSampler() - - if params.grammar is not None: - self.grammar_sampler = GrammarSampler( - model, - params.grammar, - params.grammar_lazy, - params.grammar_triggers - ) self._build_sampler_chain() + # grammar sampler + self.grammar_sampler = None + if params.grammar: + self.grammar_sampler = GrammarSampler( + model, + params.grammar, + params.grammar_lazy, + params.grammar_triggers, + ) + def 
_build_sampler_chain(self): """ Build sampler chain aligned with llama.cpp common_sampler_init @@ -1029,9 +1068,13 @@ def reset(self): """ Resets the internal state of all samplers in the chain. """ - self.grammar_sampler.reset() - self.sampler_chain.reset() - self.prev = [] + self.prev.clear() + + if self.grammar_sampler: + self.grammar_sampler.reset() + + if self.sampler_chain: + self.sampler_chain.reset() def cp(self) -> 'LlamaSamplingContext': """ @@ -1084,14 +1127,17 @@ def sample( return int(sampled) # 3. build cur_p - logits = llama_cpp.llama_get_logits_ith(ctx.ctx, idx) + logits_ptr = llama_cpp.llama_get_logits_ith(ctx.ctx, idx) - logits_array = np.ctypeslib.as_array( - logits, - shape=(self.n_vocab,) - ) + if self._logits_view is None: + self._logits_view = np.ctypeslib.as_array( + logits_ptr, + shape=(self.n_vocab,), + ) + logits_array = self._logits_view cur_p = self._cur_p + cur_p.copy_logits(logits_array) # logit bias @@ -1107,6 +1153,15 @@ def sample( ctypes.byref(cur_p.candidates) ) + llama_cpp.llama_sampler_apply( + self.sampler_chain.sampler, + ctypes.byref(cur_p.candidates) + ) + # grammar-first return directly + selected = cur_p.candidates.selected + return int(cur_p.candidates_data.id[selected]) + + # 5. sampling chain llama_cpp.llama_sampler_apply( self.sampler_chain.sampler, @@ -1116,45 +1171,24 @@ def sample( selected = cur_p.candidates.selected token = int(cur_p.candidates_data.id[selected]) - # 6. grammar-first return directly - if self.grammar_sampler and grammar_first: - return token - - # 7. grammar rejection sampling + # 6. 
grammar rejection sampling if self.grammar_sampler: - single = llama_cpp.llama_token_data( - id=token, - logit=1.0, - p=0.0 - ) - - single_arr = llama_cpp.llama_token_data_array( - data=ctypes.pointer(single), - size=1, - selected=-1, - sorted=False - ) + self._single_token.id = token + self._single_token.logit = 1.0 + self._single_token.p = 0.0 + self._single_array.selected = -1 + self._single_array.sorted = False llama_cpp.llama_sampler_apply( self.grammar_sampler.grammar, - ctypes.byref(single_arr) + ctypes.byref(self._single_array) ) - valid = not np.isneginf(single.logit) - - if valid: + if not np.isneginf(self._single_token.logit): return token - - # 8. resample - logits = llama_cpp.llama_get_logits_ith(ctx.ctx, idx) - - logits_array = np.ctypeslib.as_array( - logits, - shape=(self.n_vocab,) - ) - + # 7. resample cur_p.copy_logits(logits_array) llama_cpp.llama_sampler_apply( @@ -1172,14 +1206,29 @@ def sample( return token + def close(self): + """ + Clear samplers cache + """ + if self.grammar_sampler: + self.grammar_sampler.close() + self.grammar_sampler = None + + if self.sampler_chain: + self.sampler_chain.close() + self.sampler_chain = None + + def __del__(self): + try: + self.close() + except Exception: + pass + # --- Utilities --- def last(self) -> Optional[int]: """Returns the last sampled token.""" - if len(self.prev) > 0: - return self.prev[-1] - else: - return None + return self.prev[-1] if self.prev else None def prev_str(self, ctx_main: LlamaContext, n: int) -> str: """ @@ -1189,9 +1238,9 @@ def prev_str(self, ctx_main: LlamaContext, n: int) -> str: if not self.prev: return "" # Get the last n tokens - last_tokens = self.prev[-n:] + last_n_tokens = self.prev[-n:] # Use the model linked to the context to detokenize - return ctx_main.model.detokenize(last_tokens).decode("utf-8", errors="replace") + return ctx_main.model.detokenize(last_n_tokens).decode("utf-8", errors="replace") class CustomSampler: From 9f79b78cb89cef44397f8727adc55e288c74946c Mon 
Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 15 Feb 2026 00:21:53 +0800 Subject: [PATCH 157/518] Fix sampling history alignment with llama.cpp Ensure n_prev >= penalty_last_n and support penalty_last_n == -1 (ctx_size). Signed-off-by: JamePeng --- llama_cpp/_internals.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 3b6af632fb..3bad043154 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -940,10 +940,22 @@ def __init__( lparams.no_perf = params.no_perf # history (bounded) + # last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size) + if self.params.penalty_last_n == -1: + # full context + self.params.penalty_last_n = self.model.n_ctx_train() + # params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); - self.prev = deque(maxlen=max(params.n_prev, params.penalty_last_n)) + if self.params.penalty_last_n > 0: + self.params.n_prev = max( + self.params.n_prev, + self.params.penalty_last_n + ) + self.prev = deque(maxlen=max(self.params.n_prev, 32)) + # reusable token data array self._cur_p = LlamaTokenDataArray(n_vocab=self.n_vocab) + # reusable numpy logits view self._logits_view = None From dc5f7e5564dd68af9d62f7d450cda45313f80b5d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 15 Feb 2026 11:10:35 +0800 Subject: [PATCH 158/518] Update Submodule vendor/llama.cpp 1725e31..079feab --- llama_cpp/llama_cpp.py | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 6bfe8db350..9511bd3dc8 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -188,6 +188,7 @@ # LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43, # LLAMA_VOCAB_PRE_TYPE_YOUTU = 44, # LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45, +#. 
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -235,6 +236,7 @@ LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43 LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 +LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 # // note: these values should be synchronized with ggml_rope diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1725e316c1..079feab9e3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1725e316c1a780759ec134ca5a2999f4d53ce273 +Subproject commit 079feab9e3efee1d6d4ca370eac50f156e2dc6e8 From b03224b2dde3c8cbdd8bf529794e3a41ac7f5751 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 15 Feb 2026 11:24:12 +0800 Subject: [PATCH 159/518] Bump version to 0.3.25 --- CHANGELOG.md | 37 +++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8217ea64ab..6221d8ac5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.25] +- feat: [Refactor Llama class to use new LlamaSampler chain API from _internals](https://github.com/JamePeng/llama-cpp-python/commit/1e6094a327f0fb9dc35d52f84d8ebabc1faa1e95) +This commit refactors the high-level Llama class to fully utilize the new C++ `llama_sampler` chain architecture via `LlamaSamplingContext`. + - Replaced manual sampling logic and obsolete `_init_sampler` with `LlamaSamplingContext`. + - Updated `sample()` and `generate()` to support the full suite of modern sampling strategies (DRY, XTC, Adaptive-P, Infill, etc.). + - Added new sampling parameters to all generation methods (`create_completion`, `create_chat_completion`, `__call__`): + - `dynatemp_range`, `dynatemp_exponent` (Dynamic Temperature) + - `min_keep` + - Refactored `logits_processor` handling to use `CustomSampler` adapter for better performance and C++ interop. 
+ - Improved sampling state management (e.g., repetition penalties) by persisting `_sampling_ctx` during generation. + - Removed manual `logit_bias` processing in Python; now delegated to the underlying sampler chain. + +- feat: Separate the grammar sampler, improve the code stability of Sampler Chain processing, and fix some bugs. + +- [Improve sampling and grammar lifecycle management, fix memory growth issues](https://github.com/JamePeng/llama-cpp-python/commit/5ef874cf7e5b08533c7782286eda777e44be9744) + - Validate grammar sampler initialization and inputs + - Replace unbounded prev token list with bounded deque by LlamaSamplingParams n_prev param + - Reuse logits NumPy view to avoid repeated allocations + - Reuse single-token buffers for grammar rejection sampling + - Minor cleanups and consistency improvements in sampling flow + +- feat: [Fix sampling history alignment with llama.cpp](https://github.com/JamePeng/llama-cpp-python/commit/9f79b78cb89cef44397f8727adc55e288c74946c) + +- test: update integration tests for new sampler architecture + +- test: replace unstable grammar test with deterministic mechanism check + +- fix: Optimize .gitignore and add macOS system files + +- feat: Refactor the build-wheels-metal.yaml + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/079feab9e3efee1d6d4ca370eac50f156e2dc6e8](https://github.com/ggml-org/llama.cpp/commit/079feab9e3efee1d6d4ca370eac50f156e2dc6e8) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260214 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/4ab182382b87bbbba4fb05ff184b557414740103...dc5f7e5564dd68af9d62f7d450cda45313f80b5d + ## [0.3.24] - feat: [Refactor sampling infrastructure to use llama.cpp sampler chain API](https://github.com/JamePeng/llama-cpp-python/commit/1df39b422890db55cb9f6de43cb792a26921752e) - LlamaContext: Remove obsolete manual sampling methods. 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c48e6fcec0..52101c9b7e 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.24" +__version__ = "0.3.25" From 2516555b9eaa455cf58346bd3191b199c303b820 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 17 Feb 2026 00:14:49 +0800 Subject: [PATCH 160/518] Update Submodule vendor/llama.cpp 079feab..cceb1b4 --- llama_cpp/llama_cpp.py | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 9511bd3dc8..b1f85edeec 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -189,6 +189,7 @@ # LLAMA_VOCAB_PRE_TYPE_YOUTU = 44, # LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45, #. LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46, +# LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -237,6 +238,7 @@ LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 +LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 # // note: these values should be synchronized with ggml_rope diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 079feab9e3..cceb1b4e33 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 079feab9e3efee1d6d4ca370eac50f156e2dc6e8 +Subproject commit cceb1b4e33cfd9595b4ac1949f2c0857e43af427 From 4b13964a720110fe38bf132abf7da5ffae3d1bb0 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 17 Feb 2026 00:22:04 +0800 Subject: [PATCH 161/518] Sync build : remove LLAMA_HTTPLIB option --- .github/workflows/build-wheels-cu124-cu126-win.yml | 2 +- .github/workflows/build-wheels-cu124-linux.yml | 2 +- .github/workflows/build-wheels-cu126-linux.yml | 2 +- .github/workflows/build-wheels-cu128-linux.yml | 2 +- .github/workflows/build-wheels-cu128-win.yml | 2 +- .github/workflows/build-wheels-cu130-linux.yml | 2 +- 
.github/workflows/build-wheels-cu130-win.yml | 2 +- .github/workflows/build-wheels-metal.yaml | 1 - 8 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-cu126-win.yml b/.github/workflows/build-wheels-cu124-cu126-win.yml index a36c5b874a..6513526cde 100644 --- a/.github/workflows/build-wheels-cu124-cu126-win.yml +++ b/.github/workflows/build-wheels-cu124-cu126-win.yml @@ -69,7 +69,7 @@ jobs: $env:VERBOSE = '1' $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" if ($env:AVXVER -eq 'AVX') { $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 668a26fde3..9a55248124 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -64,7 +64,7 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" if [ "${AVXVER}" = "AVX" ]; then CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 46ea5a50ee..bca09d2f66 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ 
b/.github/workflows/build-wheels-cu126-linux.yml @@ -64,7 +64,7 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" if [ "${AVXVER}" = "AVX" ]; then CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index e4e4782fd4..ad13b30706 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -64,7 +64,7 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" if [ "${AVXVER}" = "AVX" ]; then CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index 9fa5e27358..e9d36602bd 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -69,7 +69,7 @@ jobs: $env:VERBOSE = '1' $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on 
-DLLAMA_CURL=off $env:CMAKE_ARGS" if ($env:AVXVER -eq 'AVX') { $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index 6907c518ad..574690cdf2 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -64,7 +64,7 @@ jobs: # Add project-specific and feature flags CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on -DLLAMA_HTTPLIB=on" + CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" if [ "${AVXVER}" = "AVX" ]; then CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index d572e65efc..d055db43af 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -69,7 +69,7 @@ jobs: $env:VERBOSE = '1' $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off -DLLAMA_HTTPLIB=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" if ($env:AVXVER -eq 'AVX') { $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 541f4701e8..fa205d8929 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml 
@@ -55,7 +55,6 @@ jobs: -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=off -DGGML_METAL_SHADER_DEBUG=on - -DLLAMA_HTTPLIB=on" with: package-dir: . output-dir: wheelhouse2 From 6e52096687d80a298c288f5dae1f5523f9352745 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 17 Feb 2026 00:24:33 +0800 Subject: [PATCH 162/518] build: set GGML_METAL_USE_BF16 option ON for metal build by default --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 555fb3c522..aef1099165 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,6 +101,7 @@ if (LLAMA_BUILD) # Metal settings (enable for both architectures) set(GGML_METAL "ON" CACHE BOOL "ggml: enable Metal" FORCE) + set(GGML_METAL_USE_BF16 "ON" CACHE BOOL "ggml: use bfloat if available" FORCE) set(GGML_METAL_EMBED_LIBRARY "ON" CACHE BOOL "ggml: embed metal library" FORCE) endif() From 16175e55b8bfa3753abb51db760cf42a4a766d79 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 17 Feb 2026 22:09:19 +0800 Subject: [PATCH 163/518] Update Submodule vendor/llama.cpp cceb1b4..afa6bfe --- llama_cpp/llama_cpp.py | 4 +++- vendor/llama.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index b1f85edeec..d32dffcaac 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -188,8 +188,9 @@ # LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43, # LLAMA_VOCAB_PRE_TYPE_YOUTU = 44, # LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45, -#. 
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46, +# LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46, # LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47, +# LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -239,6 +240,7 @@ LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 +LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48 # // note: these values should be synchronized with ggml_rope diff --git a/vendor/llama.cpp b/vendor/llama.cpp index cceb1b4e33..afa6bfe4f7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit cceb1b4e33cfd9595b4ac1949f2c0857e43af427 +Subproject commit afa6bfe4f7530c0a6df527a6cd74fa551c36abdf From c6c85b12810972d7e8c48e4a9bb290128dc26d8a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 18 Feb 2026 13:34:48 +0800 Subject: [PATCH 164/518] perf(llama-cpp): optimize LlamaTokenDataArray memory operations - Cache NumPy field views for 'id', 'logit', and 'p' to bypass expensive property lookups. - Refactor copy_logits to use pre-generated ID sequences and cached views. - Ensure logical consistency by resetting token IDs every sampling step to counter C++ reordering. - Minimize redundant memory allocations during the inference loop. --- llama_cpp/_internals.py | 60 ++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 3bad043154..02df13ed88 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -714,29 +714,63 @@ def normalize_embedding(embedding): class LlamaTokenDataArray: + """ + Performance-optimized wrapper for llama_token_data_array. + This class minimizes Python overhead by caching memory views and avoiding + redundant memory allocations during the inference loop. 
+ """ def __init__(self, *, n_vocab: int): self.n_vocab = n_vocab - self.candidates_data = np.recarray( - (self.n_vocab,), + + # Define the structure of llama_token_data to match the C++ memory layout. + # id: token identifier (int32) + # logit: raw prediction score (float32) + # p: probability score (float32) + self.candidates_data = np.empty( + self.n_vocab, dtype=np.dtype( - [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True + [("id", np.intc), ("logit", np.single), ("p", np.single)], + align=True ), ) + + # Optimization: Cache field views to bypass NumPy's expensive field lookup overhead. + # Using these cached views allows for direct memory access in the inference loop. + self._id_view = self.candidates_data["id"] + self._logit_view = self.candidates_data["logit"] + self._p_view = self.candidates_data["p"] + + # Initialization: Pre-generate a standard token ID sequence (0 to n_vocab - 1). + # This acts as the 'golden' reference to reset the buffer after sorting operations. + self._default_ids = np.arange(self.n_vocab, dtype=np.intc) + self._id_view[:] = self._default_ids + + # Construct the llama_cpp C structure. + # 'data' is assigned a direct pointer to the underlying NumPy memory buffer. self.candidates = llama_cpp.llama_token_data_array( data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), size=self.n_vocab, selected=-1, sorted=False, ) - self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) # type: ignore - self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) def copy_logits(self, logits: npt.NDArray[np.single]): - self.candidates_data.id[:] = self.default_candidates_data_id - self.candidates_data.logit[:] = logits - self.candidates_data.p[:] = self.default_candidates_data_p - self.candidates.sorted = False + """ + Synchronizes the memory buffer with new logit data from the model. + """ + # Step 1: Transfer new logits from the model output to our working buffer. 
+ self._logit_view[:] = logits + + # Step 2: Critical Reset. + # Samplers (like top-k or top-p) reorder elements in memory during processing. + # We must reset token IDs every step to ensure logical consistency for the next run. + self._id_view[:] = self._default_ids + + # Step 3: Metadata update. + # Inform the llama.cpp backend that the buffer is full and currently unsorted. self.candidates.size = self.n_vocab + self.candidates.sorted = False + self.candidates.selected = -1 # Python wrappers over common/sampling structs @@ -1155,7 +1189,7 @@ def sample( # logit bias if self.params.logit_bias: for item in self.params.logit_bias: - cur_p.candidates_data.logit[item.token] += item.bias + cur_p._logit_view[item.token] += item.bias # 4. grammar first @@ -1171,7 +1205,7 @@ def sample( ) # grammar-first return directly selected = cur_p.candidates.selected - return int(cur_p.candidates_data.id[selected]) + return int(cur_p._id_view[selected]) # 5. sampling chain @@ -1181,7 +1215,7 @@ def sample( ) selected = cur_p.candidates.selected - token = int(cur_p.candidates_data.id[selected]) + token = int(cur_p._id_view[selected]) # 6. grammar rejection sampling if self.grammar_sampler: @@ -1214,7 +1248,7 @@ def sample( ) selected = cur_p.candidates.selected - token = int(cur_p.candidates_data.id[selected]) + token = int(cur_p._id_view[selected]) return token From 8f5c9b610e587bd4ba431cb133d01b1b00f5b5e8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 18 Feb 2026 17:11:24 +0800 Subject: [PATCH 165/518] feat: Add explicit memory cleanup for sampling contexts Implements `close()` and `__del__` for LlamaTokenDataArray and expands LlamaSamplingContext cleanup. Ensures NumPy views and internal C-references are properly released to allow Python GC to reclaim memory. 
--- llama_cpp/_internals.py | 56 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 02df13ed88..7b10fe034e 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -772,6 +772,31 @@ def copy_logits(self, logits: npt.NDArray[np.single]): self.candidates.sorted = False self.candidates.selected = -1 + def close(self): + """ + Release internal NumPy buffers and C-structure references. + """ + # Main structured NumPy buffer holding token data (id, logit, prob) + self.candidates_data = None + + # Cached NumPy field views (avoid dangling references) + self._id_view = None + self._logit_view = None + self._p_view = None + + # Precomputed default token id array + self._default_ids = None + + # Setting to None ensures no stale pointer references remain. + self.candidates = None + + def __del__(self): + # Ensures memory cleanup in case close() was not called explicitly. + try: + self.close() + except Exception: + pass + # Python wrappers over common/sampling structs # common/common.h common_params_sampling @@ -1254,16 +1279,45 @@ def sample( def close(self): """ - Clear samplers cache + Release all sampling-related resources and break references + to large buffers to allow Python GC to reclaim memory. + + This method must be called when the sampling context is no longer needed, + especially in long-running services, to prevent memory retention. """ + + # Free grammar sampler if it was initialized. + # This releases underlying llama.cpp sampler memory. if self.grammar_sampler: self.grammar_sampler.close() self.grammar_sampler = None + # Free the sampler chain and all attached C samplers. if self.sampler_chain: self.sampler_chain.close() self.sampler_chain = None + # Release large token data buffer used during sampling. + # Important for high-vocab models to avoid memory retention. 
+ if hasattr(self, "_cur_p"): + try: + self._cur_p.close() + except Exception: + pass + self._cur_p = None + + # Clear token history deque to drop references. + if hasattr(self, "prev"): + self.prev.clear() + self.prev = None + + # Remove NumPy view pointing to llama logits buffer. + self._logits_view = None + + # Break references to small C structs used in grammar rejection sampling. + self._single_token = None + self._single_array = None + def __del__(self): try: self.close() From 645106ef97a7d4784e4f78a07dc357a583fa3272 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 18 Feb 2026 17:39:04 +0800 Subject: [PATCH 166/518] Copy missing _keep_alive and custom_samplers list to new sampler --- llama_cpp/_internals.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 7b10fe034e..424e7d95ca 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1488,7 +1488,16 @@ def clone(self) -> 'LlamaSampler': if not new_sampler_p: raise RuntimeError("llama_sampler_clone failed") - return LlamaSampler(existing_sampler_p=new_sampler_p) + new_sampler = LlamaSampler(existing_sampler_p=new_sampler_p) + + # copy _keep_alive and custom_samplers list to new sampler + if self._keep_alive: + new_sampler._keep_alive = self._keep_alive.copy() + + if self.custom_samplers: + new_sampler.custom_samplers = self.custom_samplers.copy() + + return new_sampler def sample(self, ctx: LlamaContext, idx: int = -1) -> int: """ From bb8437a10dda7ae053211650e99c8bf36bd97192 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 19 Feb 2026 01:35:52 +0800 Subject: [PATCH 167/518] Update Submodule vendor/llama.cpp afa6bfe..eeef3cf --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index afa6bfe4f7..eeef3cfced 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit afa6bfe4f7530c0a6df527a6cd74fa551c36abdf +Subproject 
commit eeef3cfcedf853a938e3a7ea7f537ff3b8499474 From f9f866994c9b2ca5c0536a67c595e6424b8508a4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 19 Feb 2026 02:23:44 +0800 Subject: [PATCH 168/518] optimize(memory): reduce scores buffer size and optimize state saving - Update save_state and load_state API use. - Refactored self.scores to allocate only a single row (1, n_vocab) when logits_all=False, significantly reducing memory usage for large vocabulary models. - Optimized save_state to eliminate redundant memory allocations and copies by using ctypes.string_at. - Updated load_state, eval, and sampler adapters to correctly handle the dynamic shape of self.scores. --- llama_cpp/llama.py | 107 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 81 insertions(+), 26 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index df2ad74dc2..1664c63bdf 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -525,9 +525,8 @@ def free_lora_adapter(): self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) - self.scores: npt.NDArray[np.single] = np.ndarray( - (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single - ) + self.scores: npt.NDArray[np.single] = np.ndarray((n_ctx if self._logits_all else 1, self._n_vocab), dtype=np.single) + self._mirostat_mu = ctypes.c_float( 2.0 * 5.0 @@ -638,7 +637,10 @@ def _input_ids(self) -> npt.NDArray[np.intc]: @property def _scores(self) -> npt.NDArray[np.single]: - return self.scores[: self.n_tokens, :] + if self._logits_all: + return self.scores[: self.n_tokens, :] + else: + return self.scores @property def eval_tokens(self) -> Deque[int]: @@ -747,14 +749,17 @@ def eval(self, tokens: Sequence[int]): ) from e # Save tokens self.input_ids[n_past : n_past + n_batch_tokens] = batch + # Save logits + logits_ptr = self._ctx.get_logits() if self._logits_all: rows = n_batch_tokens cols = self._n_vocab - logits = np.ctypeslib.as_array( - self._ctx.get_logits(), 
shape=(rows * cols,) - ) - self.scores[n_past : n_past + n_batch_tokens, :].reshape(-1)[::] = logits + logits_view = np.ctypeslib.as_array(logits_ptr, shape=(rows * cols,)) + self.scores[n_past : n_past + n_batch_tokens, :].reshape(-1)[:] = logits_view + else: + logits_view = np.ctypeslib.as_array(logits_ptr, shape=(self._n_vocab,)) + self.scores[0, :] = logits_view # Update n_tokens current_pos += n_batch_tokens @@ -875,7 +880,10 @@ def sample( # LogitsProcessor Adapter if logits_processor: def adapter(token_data_array: llama_cpp.llama_token_data_array): - current_scores = self._scores[self.n_tokens - 1, :] + if self._logits_all: + current_scores = self._scores[self.n_tokens - 1, :] + else: + current_scores = self._scores[0, :] new_scores = logits_processor(self._input_ids, current_scores) size = token_data_array.size data_ptr = token_data_array.data @@ -1003,7 +1011,10 @@ def generate( if logits_processor: def adapter(token_data_array: llama_cpp.llama_token_data_array): - current_scores = self._scores[self.n_tokens - 1, :] + if self._logits_all: + current_scores = self._scores[self.n_tokens - 1, :] + else: + current_scores = self._scores[0, :] new_scores = logits_processor(self._input_ids, current_scores) size = token_data_array.size @@ -1050,10 +1061,22 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): self._sampling_ctx.accept(token, False if grammar is None else True) sample_idx += 1 - if stopping_criteria is not None and stopping_criteria( - self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :] - ): - return + if stopping_criteria is not None: + if self._logits_all: + logits_idx = sample_idx - self.n_tokens + check_stopping = True + else: + if sample_idx == self.n_tokens: + logits_idx = 0 + check_stopping = True + else: + check_stopping = False + + if check_stopping and stopping_criteria( + self._input_ids[: sample_idx], + self._scores[logits_idx, :] + ): + return tokens_or_none = yield token tokens.clear() 
tokens.append(token) @@ -1556,7 +1579,10 @@ def _create_completion( ).decode("utf-8", errors="ignore") ) token_offset = len(prompt_tokens) + returned_tokens - logits = self._scores[token_offset - 1, :] + if self._logits_all: + logits = self._scores[token_offset - 1, :] + else: + logits = self._scores[0, :] current_logprobs = Llama.logits_to_logprobs(logits).tolist() sorted_logprobs = list( sorted( @@ -1695,7 +1721,10 @@ def _create_completion( ) ) token_offset = len(prompt_tokens) + returned_tokens - 1 - logits = self._scores[token_offset, :] + if self._logits_all: + logits = self._scores[token_offset, :] + else: + logits = self._scores[0, :] current_logprobs = Llama.logits_to_logprobs(logits).tolist() sorted_logprobs = list( sorted( @@ -2406,46 +2435,72 @@ def __setstate__(self, state): def save_state(self) -> LlamaState: if self.verbose: print("Llama.save_state: saving llama state", file=sys.stderr) - state_size = llama_cpp.llama_get_state_size(self._ctx.ctx) + + # Query the backend for the required buffer size to store the current state. + state_size = llama_cpp.llama_state_get_size(self._ctx.ctx) if self.verbose: print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) + + # Allocate a ctypes uint8 array (buffer) of the required size. llama_state = (ctypes.c_uint8 * int(state_size))() if self.verbose: print("Llama.save_state: allocated state", file=sys.stderr) - n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state) + + # Copy the raw state data from the internal C context into our Python-managed buffer. + # Returns the actual number of bytes written (n_bytes). + n_bytes = llama_cpp.llama_state_get_data(self._ctx.ctx, llama_state, state_size) if self.verbose: print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) + + # Safety check to prevent buffer overflow issues. 
if int(n_bytes) > int(state_size): raise RuntimeError("Failed to copy llama state data") - llama_state_compact = (ctypes.c_uint8 * int(n_bytes))() - llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) + + # Directly read 'n_bytes' from the buffer's memory address to create the Python bytes object. + # Significantly reducing memory overhead by avoiding an intermediate array allocation. + llama_state_bytes = ctypes.string_at(ctypes.addressof(llama_state), int(n_bytes)) if self.verbose: print( f"Llama.save_state: saving {n_bytes} bytes of llama state", file=sys.stderr, ) + + # Create and return the snapshot object. return LlamaState( scores=self._scores.copy(), input_ids=self.input_ids.copy(), n_tokens=self.n_tokens, - llama_state=bytes(llama_state_compact), + llama_state=llama_state_bytes, llama_state_size=n_bytes, seed=self._seed, ) def load_state(self, state: LlamaState) -> None: - # Only filling in up to `n_tokens` and then zero-ing out the rest - self.scores[: state.n_tokens, :] = state.scores.copy() - rest = self.scores[state.n_tokens :, :] - rest[rest > 0] = 0.0 + # Restore metadata: input tokens, token count, and RNG seed. self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens self._seed = state.seed + # Restore Logits (Scores) handling different memory configurations. + if self._logits_all: + # Case A: Full history mode. Restore as many rows as possible. + available_rows = state.scores.shape[0] + # Prevent index out of bounds by taking the minimum valid length. + limit = min(self.n_tokens, available_rows) + # Restore valid history and clear any remaining "future" slots. + self.scores[:limit, :] = state.scores[:limit, :] + self.scores[limit:, :] = 0.0 + else: + # Case B: Optimized mode (1-row buffer). + # Only restore the last token's logits if available. 
+ if state.scores.shape[0] > 0: + self.scores[0, :] = state.scores[-1, :] + state_size = state.llama_state_size LLamaStateArrayType = ctypes.c_uint8 * state_size + # Copy the raw bytes from the Python object into a C-compatible buffer. llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) - if llama_cpp.llama_set_state_data(self._ctx.ctx, llama_state) != state_size: + if llama_cpp.llama_state_set_data(self._ctx.ctx, llama_state, state_size) != state_size: raise RuntimeError("Failed to set llama state data") def n_ctx(self) -> int: From 33fd3cbd24bb98211333da596749523118d9ac4f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 19 Feb 2026 18:36:22 +0800 Subject: [PATCH 169/518] Fix CMake install layout to avoid top-level bin directory in site-packages --- CMakeLists.txt | 54 +++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aef1099165..131351c56d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,39 +5,37 @@ project(llama_cpp) option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON) option(MTMD_BUILD "Build mtmd shared library and install alongside python package" ON) + +set(CMAKE_INSTALL_BINDIR llama_cpp/bin CACHE PATH "" FORCE) +set(CMAKE_INSTALL_LIBDIR llama_cpp/lib CACHE PATH "" FORCE) +set(CMAKE_INSTALL_INCLUDEDIR llama_cpp/include CACHE PATH "" FORCE) + + # Helper function to install targets to Python package directories function(llama_cpp_python_install_target target) if(NOT TARGET ${target}) return() endif() - # Define install destinations to avoid code duplication - set(INSTALL_DIRS - "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" - "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" + install( + TARGETS ${target} + LIBRARY DESTINATION llama_cpp/lib + RUNTIME DESTINATION llama_cpp/lib + ARCHIVE DESTINATION llama_cpp/lib + FRAMEWORK DESTINATION llama_cpp/lib + RESOURCE DESTINATION llama_cpp/lib ) - foreach(DIR 
${INSTALL_DIRS}) + # Windows DLL dependencies + if (WIN32) install( - TARGETS ${target} - LIBRARY DESTINATION ${DIR} - RUNTIME DESTINATION ${DIR} - ARCHIVE DESTINATION ${DIR} - FRAMEWORK DESTINATION ${DIR} - RESOURCE DESTINATION ${DIR} + FILES $ + DESTINATION llama_cpp/lib + OPTIONAL ) + endif() - # Automatically handle Windows DLL installation for each target - if (WIN32) - install( - FILES $ - DESTINATION ${DIR} - OPTIONAL # Prevent errors if the target has no DLLs - ) - endif() - endforeach() - - # Configure RPATH + # Proper RPATH handling if(UNIX) set(INSTALL_RPATH_VAL "$ORIGIN") if(APPLE) @@ -68,6 +66,9 @@ if (LLAMA_BUILD) # Enable building of the common library set(LLAMA_BUILD_COMMON ON CACHE BOOL "llama.cpp: build common utils library" FORCE) + # Enable build and link OpenSSL + set(LLAMA_OPENSSL ON CACHE BOOL "llama.cpp: build and link OpenSSL" FORCE) + # Disable building of examples set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama.cpp: build examples" FORCE) @@ -77,9 +78,6 @@ if (LLAMA_BUILD) # Disable building curl support set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: use libcurl to download model from an URL" FORCE) - # Enable build and link OpenSSL - set(LLAMA_OPENSSL ON CACHE BOOL "llama.cpp: build and link OpenSSL" FORCE) - # Architecture detection and settings for Apple platforms if (APPLE) # If CMAKE_OSX_ARCHITECTURES is not set, use the host architecture @@ -107,10 +105,8 @@ if (LLAMA_BUILD) add_subdirectory(vendor/llama.cpp) - if (WIN32) - if (TARGET llama) - set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) - endif() + if (WIN32 AND TARGET llama) + set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() # Define list of GGML targets to install From 0a9b2e4ecc72cb16b0468520ead94ba1d1a281ee Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 19 Feb 2026 20:19:08 +0800 Subject: [PATCH 170/518] Update Submodule vendor/llama.cpp eeef3cf..c78e682 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp index eeef3cfced..c78e682245 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit eeef3cfcedf853a938e3a7ea7f537ff3b8499474 +Subproject commit c78e682245f856ab5cfc2ffc0f8c20e8e12f163f From 68eacaed63b993a3aac8688ff6c30052711aeb8b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 19 Feb 2026 20:24:50 +0800 Subject: [PATCH 171/518] ggml: Load ggml library from detected paths - Auto-select lib/ or bin/ directories - Add backend loading functions --- CMakeLists.txt | 34 +++++++++++-------- llama_cpp/_ctypes_extensions.py | 59 ++++++++++++++++++--------------- llama_cpp/_ggml.py | 34 +++++++++++++++++-- 3 files changed, 84 insertions(+), 43 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 131351c56d..9d3dfa00d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,23 +17,31 @@ function(llama_cpp_python_install_target target) return() endif() - install( - TARGETS ${target} - LIBRARY DESTINATION llama_cpp/lib - RUNTIME DESTINATION llama_cpp/lib - ARCHIVE DESTINATION llama_cpp/lib - FRAMEWORK DESTINATION llama_cpp/lib - RESOURCE DESTINATION llama_cpp/lib + # Define install destinations to avoid code duplication + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" ) - # Windows DLL dependencies - if (WIN32) + foreach(DIR ${INSTALL_DIRS}) install( - FILES $ - DESTINATION llama_cpp/lib - OPTIONAL + TARGETS ${target} + LIBRARY DESTINATION ${DIR} + RUNTIME DESTINATION ${DIR} + ARCHIVE DESTINATION ${DIR} + FRAMEWORK DESTINATION ${DIR} + RESOURCE DESTINATION ${DIR} ) - endif() + + # Automatically handle Windows DLL installation for each target + if (WIN32) + install( + FILES $ + DESTINATION ${DIR} + OPTIONAL # Prevent errors if the target has no DLLs + ) + endif() + endforeach() # Proper RPATH handling if(UNIX) diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index 0ba7f416d9..f978be2245 100644 --- 
a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -20,25 +20,23 @@ # Load the library -def load_shared_library(lib_base_name: str, base_path: pathlib.Path): - """Platform independent shared library loader""" - # Searching for the library in the current directory under the name "libllama" (default name - # for llamacpp) and "llama" (default name for this repo) - lib_paths: List[pathlib.Path] = [] - # Determine the file extension based on the platform +def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list[pathlib.Path]]): + if isinstance(base_paths, pathlib.Path): + base_paths = [base_paths] + + lib_names = [] + if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): - lib_paths += [ - base_path / f"lib{lib_base_name}.so", - ] + lib_names = [f"lib{lib_base_name}.so"] elif sys.platform == "darwin": - lib_paths += [ - base_path / f"lib{lib_base_name}.so", - base_path / f"lib{lib_base_name}.dylib", + lib_names = [ + f"lib{lib_base_name}.dylib", + f"lib{lib_base_name}.so", ] elif sys.platform == "win32": - lib_paths += [ - base_path / f"{lib_base_name}.dll", - base_path / f"lib{lib_base_name}.dll", + lib_names = [ + f"{lib_base_name}.dll", + f"lib{lib_base_name}.dll", ] else: raise RuntimeError("Unsupported platform") @@ -47,11 +45,13 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path): # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32": - os.add_dll_directory(str(base_path)) + for base_path in base_paths: + os.add_dll_directory(str(base_path)) os.environ["PATH"] = str(base_path) + os.pathsep + os.environ["PATH"] - if sys.platform == "win32" and sys.version_info >= (3, 8): - os.add_dll_directory(str(base_path)) + if sys.platform == "win32" and sys.version_info >= (3, 9): + for base_path in base_paths: + os.add_dll_directory(str(base_path)) if "CUDA_PATH" in os.environ: cuda_path = os.environ["CUDA_PATH"] sub_dirs_to_add = [ @@ 
-75,16 +75,21 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path): cdll_args["winmode"] = ctypes.RTLD_GLOBAL + errors = [] + # Try to load the shared library, handling potential errors - for lib_path in lib_paths: - if lib_path.exists(): - try: - return ctypes.CDLL(str(lib_path), **cdll_args) # type: ignore - except Exception as e: - raise RuntimeError(f"Failed to load shared library '{lib_path}': {e}") - - raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" + for base_path in base_paths: + for lib_name in lib_names: + lib_path = base_path / lib_name + if lib_path.exists(): + try: + return ctypes.CDLL(str(lib_path), **cdll_args) + except Exception as e: + errors.append(f"{lib_path}: {e}") + + raise RuntimeError( + f"Failed to load '{lib_base_name}' from {base_paths}\n" + + "\n".join(errors) ) diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index 3160b92fa9..61792a17ba 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -7,7 +7,11 @@ import pathlib import ctypes -import llama_cpp._ctypes_extensions as ctypes_ext +from llama_cpp._ctypes_extensions import ( + load_shared_library, + byref, + ctypes_function_for_shared_library, +) from typing import ( Callable, @@ -17,8 +21,15 @@ TYPE_CHECKING, ) -libggml_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" -libggml = ctypes_ext.load_shared_library("ggml", libggml_base_path) +libggml_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) +libggml_base_paths = [ + libggml_base_path / "lib", + libggml_base_path / "bin", +] + +libggml = load_shared_library("ggml", libggml_base_paths) + +ggml_function = ctypes_function_for_shared_library(libggml) # // ====== ggml.h ====== @@ -361,3 +372,20 @@ class ggml_opt_optimizer_params(ctypes.Structure): ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p ) + +# // +# // Backend registry +# // + +# // Load all known 
backends from dynamic libraries +# GGML_API void ggml_backend_load_all(void); +@ggml_function("ggml_backend_load_all", [], None) +def ggml_backend_load_all(): + """Load all known backends from dynamic libraries""" + ... + +# GGML_API void ggml_backend_load_all_from_path(const char * dir_path); +@ggml_function("ggml_backend_load_all_from_path", [ctypes.c_char_p], None) +def ggml_backend_load_all_from_path(dir_path: ctypes.c_char_p): + """Load all known backends from path""" + ... From 520e548512711d9a0c4290e2ca13a88cf511d9a5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 19 Feb 2026 22:25:28 +0800 Subject: [PATCH 172/518] fix: Enhance the handling logic for non-existent file paths. --- llama_cpp/_ctypes_extensions.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index f978be2245..3fc8b7516c 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -46,12 +46,16 @@ def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32": for base_path in base_paths: - os.add_dll_directory(str(base_path)) - os.environ["PATH"] = str(base_path) + os.pathsep + os.environ["PATH"] + p = pathlib.Path(base_path) + if p.exists() and p.is_dir(): + os.add_dll_directory(str(p)) + os.environ["PATH"] = str(p) + os.pathsep + os.environ["PATH"] if sys.platform == "win32" and sys.version_info >= (3, 9): for base_path in base_paths: - os.add_dll_directory(str(base_path)) + p = pathlib.Path(base_path) + if p.exists() and p.is_dir(): + os.add_dll_directory(str(p)) if "CUDA_PATH" in os.environ: cuda_path = os.environ["CUDA_PATH"] sub_dirs_to_add = [ From 2ebe0ebff79d911b974ad02fe3ac82744ee4090b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 19 Feb 2026 22:34:34 +0800 Subject: [PATCH 173/518] feat(loader): extend default library search paths on Linux 
and macOS - Improves reliability of shared library discovery for system-wide installations. --- llama_cpp/_ctypes_extensions.py | 14 ++++++++++++++ llama_cpp/_ggml.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index 3fc8b7516c..619a6a555b 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -28,11 +28,25 @@ def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): lib_names = [f"lib{lib_base_name}.so"] + + base_paths.extend([ + "/usr/local/lib", + "/usr/lib", + "/usr/lib64", + ]) + elif sys.platform == "darwin": lib_names = [ f"lib{lib_base_name}.dylib", f"lib{lib_base_name}.so", ] + + base_paths.extend([ + "/usr/local/lib", + "/opt/homebrew/lib", + "/usr/lib", + ]) + elif sys.platform == "win32": lib_names = [ f"{lib_base_name}.dll", diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index 61792a17ba..35fc077d4e 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -2,10 +2,10 @@ This module provides a minimal interface for working with ggml tensors from llama-cpp-python """ +import ctypes import enum import os import pathlib -import ctypes from llama_cpp._ctypes_extensions import ( load_shared_library, From 6b38182a17effd0cedbf8736bee2464dd6c7b4da Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 19 Feb 2026 23:56:20 +0800 Subject: [PATCH 174/518] Update Submodule vendor/llama.cpp c78e682..abb9f3c --- llama_cpp/llama_cpp.py | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d32dffcaac..5d9dc3a7e8 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -191,6 +191,7 @@ # LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46, # LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47, # LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, +#. 
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -241,6 +242,7 @@ LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48 +LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49 # // note: these values should be synchronized with ggml_rope diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c78e682245..abb9f3c42b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c78e682245f856ab5cfc2ffc0f8c20e8e12f163f +Subproject commit abb9f3c42b5e6acee9e8e37836ef691d1a41bdb8 From 32f2380ec8ebfa0d5f01c22e3ba86d8d5e762882 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 20 Feb 2026 00:07:56 +0800 Subject: [PATCH 175/518] Bump version to 0.3.26 --- CHANGELOG.md | 34 ++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6221d8ac5c..5fec6cf1df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.26] +- perf(llama-cpp): optimize LlamaTokenDataArray memory operations + - Cache NumPy field views for 'id', 'logit', and 'p' to bypass expensive property lookups. + - Refactor copy_logits to use pre-generated ID sequences and cached views. + - Ensure logical consistency by resetting token IDs every sampling step to counter C++ reordering. + - Minimize redundant memory allocations during the inference loop. + +- feat: Add explicit memory cleanup for sampling contexts + - Implements `close()` and `__del__` for LlamaTokenDataArray and expands LlamaSamplingContext cleanup. + - Ensures NumPy views and internal C-references are properly released to allow Python GC to reclaim memory. + +- optimize(memory): reduce scores buffer size and optimize state saving + - Update save_state and load_state API use. 
+ - Refactored self.scores to allocate only a single row (1, n_vocab) when logits_all=False, significantly reducing memory usage for large vocabulary models. + - Optimized save_state to eliminate redundant memory allocations and copies by using ctypes.string_at. + - Updated load_state, eval, and sampler adapters to correctly handle the dynamic shape of self.scores. + +- Fix CMake install layout to avoid top-level bin directory in site-packages + +- ggml: Load ggml library from candidate path list + - Auto-select lib/ or bin/ directories + - Add backend loading functions + +- feat(loader): extend default library search paths on Linux and macOS + - `load_shared_library` to include a path list feature (allowing you to add custom paths in addition to the default ones). You can later add your own paths to the `libggml_base_paths` candidate list in `_ggml.py`, such as those not commonly used Python paths. + - fix: Enhance the handling logic for non-existent file paths. + - Improves reliability of shared library discovery for system-wide installations. + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/abb9f3c42b5e6acee9e8e37836ef691d1a41bdb8](https://github.com/ggml-org/llama.cpp/commit/abb9f3c42b5e6acee9e8e37836ef691d1a41bdb8) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260219 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/b03224b2dde3c8cbdd8bf529794e3a41ac7f5751...6b38182a17effd0cedbf8736bee2464dd6c7b4da + ## [0.3.25] - feat: [Refactor Llama class to use new LlamaSampler chain API from _internals](https://github.com/JamePeng/llama-cpp-python/commit/1e6094a327f0fb9dc35d52f84d8ebabc1faa1e95) This commit refactors the high-level Llama class to fully utilize the new C++ `llama_sampler` chain architecture via `LlamaSamplingContext`. 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 52101c9b7e..bbfb73de3f 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.25" +__version__ = "0.3.26" From 3d0fd1b75ee564361a4babf21f88855225ba1fe0 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 20 Feb 2026 08:47:05 +0800 Subject: [PATCH 176/518] fix typo in metal workflow --- .github/workflows/build-wheels-metal.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index fa205d8929..abb8969247 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -54,7 +54,7 @@ jobs: -DGGML_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=off - -DGGML_METAL_SHADER_DEBUG=on + -DGGML_METAL_SHADER_DEBUG=on" with: package-dir: . output-dir: wheelhouse2 From e7dda03e4ab8b50637e5cb0f32ce951a8eaa4f04 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 20 Feb 2026 19:29:20 +0800 Subject: [PATCH 177/518] Fix custom sampler memory cleanup and improve lifecycle management - Add explicit close() and __del__() to CustomSampler to safely free C resources and break Python reference cycles - Ensure custom samplers are properly detached and freed in LlamaSampler.close() - Add minor documentation comments for clarity --- llama_cpp/_internals.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 424e7d95ca..26ed40833e 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1365,8 +1365,10 @@ def __init__( raise TypeError("apply_func must be callable") self.apply_func = apply_func + # Convert the name to bytes for C compatibility self.name_bytes = name.encode("utf-8") + # Define internal Python callbacks def _cb_name(_): return self.name_bytes @@ 
-1398,10 +1400,7 @@ def _cb_clone(_): self._cb_free_ref = llama_cpp.llama_sampler_free_fn(_cb_free) self._cb_clone_ref = llama_cpp.llama_sampler_clone_fn(_cb_clone) - # ----------------------------- # Build llama_sampler_i - # ----------------------------- - self.llama_sampler_i = llama_cpp.llama_sampler_i() self.llama_sampler_i.name = self._cb_name_ref @@ -1434,8 +1433,31 @@ def _cb_clone(_): raise RuntimeError("Failed to initialize custom sampler") def get_sampler(self) -> llama_cpp.llama_sampler_p: + """Returns the underlying C pointer to the initialized sampler.""" return self.sampler_p + def close(self): + """Safely releases C memory and breaks Python reference cycles.""" + if hasattr(self, 'sampler_p') and self.sampler_p: + try: + llama_cpp.llama_sampler_free(self.sampler_p) + except Exception: + pass + self.sampler_p = None + + self.llama_sampler_i = None + self._cb_name_ref = None + self._cb_apply_ref = None + self._cb_accept_ref = None + self._cb_reset_ref = None + self._cb_free_ref = None + self._cb_clone_ref = None + self.apply_func = None + + def __del__(self): + """Fallback cleanup if the object is GC before close() is called.""" + self.close() + class LlamaSampler: def __init__(self, existing_sampler_p: Optional[llama_cpp.llama_sampler_p] = None): @@ -1516,10 +1538,20 @@ def reset(self): def close(self): if self.sampler: - for index, _ in reversed(self.custom_samplers): + # Iterate backwards to safely remove samplers without shifting indices + for index, custom_sampler in reversed(self.custom_samplers): + # Detach the custom sampler from the C-level chain llama_cpp.llama_sampler_chain_remove(self.sampler, index) + + # Explicitly free the custom sampler's C memory and Python callbacks + if custom_sampler: + custom_sampler.close() + + # Free the main official sampler chain llama_cpp.llama_sampler_free(self.sampler) self.sampler = None + + # Clear cache lists self.samplers.clear() self.custom_samplers.clear() self._keep_alive.clear() From 
d696ddd64b23e2e4327479f9c437cee91a6bb35c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 20 Feb 2026 19:55:49 +0800 Subject: [PATCH 178/518] fix: resolve memory leaks in sampling context lifecycle - Safely close temporary `LlamaSamplingContext` in `sample()` using a try-finally block. - Explicitly release the previous `_sampling_ctx` in `generate()` before re-assignment to prevent orphaned pointers. - Ensure `_sampling_ctx` is properly freed in `Llama.close()`. --- llama_cpp/llama.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1664c63bdf..1ea950934b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -828,8 +828,10 @@ def sample( assert self.n_tokens > 0 s_ctx = self._sampling_ctx + is_temp_ctx = False if s_ctx is None: + is_temp_ctx = True params = LlamaSamplingParams( # Core top_k=top_k, @@ -902,7 +904,12 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): ridx = idx - self.n_tokens if idx is not None else -1 assert s_ctx is not None - token = s_ctx.sample(self._ctx, ridx) + try: + token = s_ctx.sample(self._ctx, ridx) + finally: + if is_temp_ctx: + s_ctx.close() + return token def generate( @@ -1030,6 +1037,10 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): if CommonSamplerType.CUSTOM not in params.samplers: params.samplers.insert(3, CommonSamplerType.CUSTOM) + if getattr(self, "_sampling_ctx", None) is not None: + self._sampling_ctx.close() + self._sampling_ctx = None + self._sampling_ctx = LlamaSamplingContext(params, self._model) # Check for kv cache prefix match @@ -2585,6 +2596,9 @@ def pooling_type(self) -> str: def close(self) -> None: """Explicitly free the model from memory.""" + if getattr(self, "_sampling_ctx", None) is not None: + self._sampling_ctx.close() + self._sampling_ctx = None self._stack.close() def __del__(self) -> None: From f47a9926e0a5b9eb56b96892629affa3834474f3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: 
Fri, 20 Feb 2026 20:05:23 +0800 Subject: [PATCH 179/518] Update Submodule vendor/llama.cpp abb9f3c..b908baf --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index abb9f3c42b..b908baf182 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit abb9f3c42b5e6acee9e8e37836ef691d1a41bdb8 +Subproject commit b908baf1825b1a89afef87b09e22c32af2ca6548 From eedfa528144e06eb2cd41aedc99190ad1ae0b6d1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 20 Feb 2026 20:05:48 +0800 Subject: [PATCH 180/518] Update llama_model_quantize params --- llama_cpp/llama_cpp.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 5d9dc3a7e8..9e05d52baa 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -990,19 +990,20 @@ class llama_context_params(ctypes.Structure): # // model quantization parameters # typedef struct llama_model_quantize_params { -# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this llama_ftype -# enum ggml_type output_tensor_type; // output tensor type -# enum ggml_type token_embedding_type; // token embeddings tensor type -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored -# bool pure; // quantize all tensors to the default type -# bool keep_split; // quantize to the same number of shards -# void * imatrix; // pointer to importance matrix data -# void * kv_overrides; // pointer to vector containing overrides -# void * tensor_types; // pointer to vector containing tensor types -# void * prune_layers; // pointer to vector containing layer indices to prune +# int32_t nthread; // 
number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum llama_ftype ftype; // quantize to this llama_ftype +# enum ggml_type output_tensor_type; // output tensor type +# enum ggml_type token_embedding_type; // token embeddings tensor type +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored +# bool pure; // quantize all tensors to the default type +# bool keep_split; // quantize to the same number of shards +# bool dry_run; // calculate and show the final quantization size without performing quantization +# void * imatrix; // pointer to importance matrix data +# void * kv_overrides; // pointer to vector containing overrides +# void * tensor_types; // pointer to vector containing tensor types +# void * prune_layers; // pointer to vector containing layer indices to prune # } llama_model_quantize_params; class llama_model_quantize_params(ctypes.Structure): """Parameters for llama_model_quantize @@ -1017,6 +1018,7 @@ class llama_model_quantize_params(ctypes.Structure): only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored pure (bool): quantize all tensors to the default type keep_split (bool): quantize to the same number of shards + dry_run (bool): calculate and show the final quantization size without performing quantization imatrix (ctypes.c_void_p): pointer to importance matrix data kv_overrides (ctypes.c_void_p): pointer to vector containing overrides tensor_types (ctypes.c_void_p): pointer to vector containing tensor types @@ -1033,6 +1035,7 @@ class llama_model_quantize_params(ctypes.Structure): only_copy: bool pure: bool keep_split: bool + dry_run: bool imatrix: ctypes.c_void_p kv_overrides: ctypes.c_void_p tensor_types: ctypes.c_void_p @@ -1048,6 +1051,7 @@ class 
llama_model_quantize_params(ctypes.Structure): ("only_copy", ctypes.c_bool), ("pure", ctypes.c_bool), ("keep_split", ctypes.c_bool), + ("dry_run", ctypes.c_bool), ("imatrix", ctypes.c_void_p), ("kv_overrides", ctypes.c_void_p), ("tensor_types", ctypes.c_void_p), From 7b360e2e813d30107caca3d8809a51216e2b0254 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 20 Feb 2026 20:19:04 +0800 Subject: [PATCH 181/518] Free _candidates and large numpy arrays during explicit close() --- llama_cpp/llama.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1ea950934b..399259a6af 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2599,6 +2599,14 @@ def close(self) -> None: if getattr(self, "_sampling_ctx", None) is not None: self._sampling_ctx.close() self._sampling_ctx = None + + if getattr(self, "_candidates", None) is not None: + self._candidates.close() + self._candidates = None + + self.scores = None + self.input_ids = None + self._stack.close() def __del__(self) -> None: From 5304cba3daccd4cbd43747f929cb61985aba16c6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 20 Feb 2026 22:56:10 +0800 Subject: [PATCH 182/518] Optimize longest_token_prefix to use zero-copy NumPy arrays and drop .tolist() overhead --- llama_cpp/llama.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 399259a6af..ba3d164f9f 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1045,7 +1045,7 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): # Check for kv cache prefix match if reset and self.n_tokens > 0: - longest_prefix = self.longest_token_prefix(self._input_ids.tolist(), tokens[:-1]) + longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1]) if longest_prefix > 0: reset = False tokens = tokens[longest_prefix:] @@ -1455,10 +1455,10 @@ def _create_completion( try: cache_item = self.cache[prompt_tokens] 
cache_prefix_len = Llama.longest_token_prefix( - cache_item.input_ids.tolist(), prompt_tokens + cache_item.input_ids, prompt_tokens ) eval_prefix_len = Llama.longest_token_prefix( - self._input_ids.tolist(), prompt_tokens + self._input_ids, prompt_tokens ) if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) @@ -2631,7 +2631,10 @@ def logits_to_logprobs( return subtract_maxs - out @staticmethod - def longest_token_prefix(current_ids: Sequence[int], new_tokens: Sequence[int]) -> int: + def longest_token_prefix( + current_ids: Union[Sequence[int], npt.NDArray[np.intc]], + new_tokens: Union[Sequence[int], npt.NDArray[np.intc]] + ) -> int: """ Calculates the length of the longest common prefix between two token sequences. @@ -2647,13 +2650,11 @@ def longest_token_prefix(current_ids: Sequence[int], new_tokens: Sequence[int]) int: The number of matching tokens from the start. """ # Fast exit for empty sequences to avoid unnecessary processing - if not current_ids or not new_tokens: + if len(current_ids) == 0 or len(new_tokens) == 0: return 0 # Determine the comparison range (limited by the shorter sequence) min_len = min(len(current_ids), len(new_tokens)) - if min_len == 0: - return 0 # Probe inspection: Use Python to quickly compare the first token # If the tokens are different from the beginning, return immediately to avoid any NumPy overhead. 
@@ -2663,8 +2664,8 @@ def longest_token_prefix(current_ids: Sequence[int], new_tokens: Sequence[int]) # Accelerating SIMD for Large Data Volumes # Only transform necessary slices, avoid processing irrelevant data # Use asarray to ensure zero-copy (if the input is already an array) - current_ids_array = np.asarray(current_ids[:min_len], dtype=np.int32) - new_tokens_array = np.asarray(new_tokens[:min_len], dtype=np.int32) + current_ids_array = np.asarray(current_ids[:min_len], dtype=np.intc) + new_tokens_array = np.asarray(new_tokens[:min_len], dtype=np.intc) # Perform vectorized element-wise comparison (SIMD instruction set usage) # Creates a boolean array where True indicates a match (e.g., [True, True, False, ...]) From 92f1ccf7ab49ab874593d71cb06e4416d114b4e3 Mon Sep 17 00:00:00 2001 From: bgmt Date: Fri, 20 Feb 2026 21:18:37 -0500 Subject: [PATCH 183/518] search system paths for shared libraries --- llama_cpp/_ctypes_extensions.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index 619a6a555b..cd8a182cae 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -95,10 +95,19 @@ def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list errors = [] - # Try to load the shared library, handling potential errors + # First, try to find an available library through the system + lib_path = ctypes.util.find_library(lib_base_name) + if lib_path: + try: + return ctypes.CDLL(lib_path, **cdll_args) + except Exception as e: + pass + + # Then fallback to manually checking the list of paths. 
for base_path in base_paths: for lib_name in lib_names: - lib_path = base_path / lib_name + lib_path = pathlib.Path(base_path) / lib_name + if lib_path.exists(): try: return ctypes.CDLL(str(lib_path), **cdll_args) From 9a33454260f1f177b7f0694d3fc7ed8ee672b351 Mon Sep 17 00:00:00 2001 From: bgmt Date: Sat, 21 Feb 2026 06:57:23 -0500 Subject: [PATCH 184/518] fix import --- llama_cpp/_ctypes_extensions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index cd8a182cae..0e5153e3aa 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -5,7 +5,7 @@ import ctypes import functools import pathlib - +from ctypes.util import find_library from typing import ( Any, Callable, @@ -96,7 +96,7 @@ def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list errors = [] # First, try to find an available library through the system - lib_path = ctypes.util.find_library(lib_base_name) + lib_path = find_library(lib_base_name) if lib_path: try: return ctypes.CDLL(lib_path, **cdll_args) From 1619c16cd542731825352824c0374fd40fc74e44 Mon Sep 17 00:00:00 2001 From: bgmt Date: Sat, 21 Feb 2026 07:26:31 -0500 Subject: [PATCH 185/518] append errors --- llama_cpp/_ctypes_extensions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index 0e5153e3aa..a8936fa2bf 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -101,7 +101,7 @@ def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list try: return ctypes.CDLL(lib_path, **cdll_args) except Exception as e: - pass + errors.append(f"{lib_path}: {e}") # Then fallback to manually checking the list of paths. 
for base_path in base_paths: From 845895dae1c5a57164852eb8e5cce4e5b79da807 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 22 Feb 2026 03:54:19 +0800 Subject: [PATCH 186/518] Update Submodule vendor/llama.cpp b908baf..f75c4e8 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b908baf182..f75c4e8bf5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b908baf1825b1a89afef87b09e22c32af2ca6548 +Subproject commit f75c4e8bf52ea480ece07fd3d9a292f1d7f04bc5 From dc9066695973d1ea4031bb77f471e4ddd07a3bad Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 22 Feb 2026 03:55:50 +0800 Subject: [PATCH 187/518] feat: add memory breakdown and sampler performance timings APIs - Expose `llama_memory_breakdown_print` in `LlamaContext` to debug detailed VRAM/RAM allocations (KV cache, compute buffers, etc.). - Add `print_timings` and `reset_timings` to `LlamaSampler` wrapping `llama_perf_sampler_*` to profile individual sampler latency. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 26ed40833e..bd6bd042e2 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -585,6 +585,9 @@ def reset_timings(self): def print_timings(self): llama_cpp.llama_perf_context_print(self.ctx) + def print_memory_breakdown(self): + llama_cpp.llama_memory_breakdown_print(self.ctx) + # Utility functions @staticmethod def default_params(): @@ -1536,6 +1539,20 @@ def reset(self): assert self.sampler is not None llama_cpp.llama_sampler_reset(self.sampler) + def reset_timings(self): + """ + Reset the performance timings for the sampler chain. + """ + assert self.sampler is not None + llama_cpp.llama_perf_sampler_reset(self.sampler) + + def print_timings(self): + """ + Print the performance timings for each sampler in the chain. 
+ """ + assert self.sampler is not None + llama_cpp.llama_perf_sampler_print(self.sampler) + def close(self): if self.sampler: # Iterate backwards to safely remove samplers without shifting indices From f802785bb30e96e20cea011d3034f35a33d60b04 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 23 Feb 2026 00:27:35 +0800 Subject: [PATCH 188/518] Update Submodule vendor/llama.cpp f75c4e8..cacc371 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f75c4e8bf5..cacc371f99 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f75c4e8bf52ea480ece07fd3d9a292f1d7f04bc5 +Subproject commit cacc371f99fb3b5b431d3fa89ac0c752bbd62a3b From 17e274c48893512d12870f12463f047fb465d3f3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 23 Feb 2026 00:32:19 +0800 Subject: [PATCH 189/518] fix: resolve reload memory leaks by breaking circular references in cleanup - Remove closure-based `ExitStack` callbacks in `LlamaModel`, `LlamaContext`, and `LlamaBatch` to eliminate circular references. - Move explicit C-level memory freeing (`llama_*_free`) directly into `close()` methods. - Nullify additional attributes (`tokenizer_`, `model_params`, etc.) in `Llama.close()` to guarantee immediate garbage collection during unload. - Add todo comment to LoRA process logic, the current LoRa loading logic is outdated and needs to be refactored. 
Signed-off-by: JamePeng --- llama_cpp/_internals.py | 44 ++++++++++++++++++++-------------------- llama_cpp/llama.py | 45 ++++++++++++++++++++++++----------------- 2 files changed, 48 insertions(+), 41 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index bd6bd042e2..506de27d50 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -75,15 +75,16 @@ def __init__( self.model = model self.vocab = vocab - def free_model(): - if self.model is None: - return - llama_cpp.llama_model_free(self.model) + def close(self): + """Manually free LlamaModel and Vocab resources.""" + if getattr(self, "model", None) is not None: + try: + llama_cpp.llama_model_free(self.model) + except Exception: + pass self.model = None + self.vocab = None - self._exit_stack.callback(free_model) - - def close(self): self._exit_stack.close() def __del__(self): @@ -377,15 +378,15 @@ def __init__( self.ctx = ctx - def free_ctx(): - if self.ctx is None: - return - llama_cpp.llama_free(self.ctx) + def close(self): + """Manually free LlamaContext resources.""" + if getattr(self, "ctx", None) is not None: + try: + llama_cpp.llama_free(self.ctx) + except Exception: + pass self.ctx = None - self._exit_stack.callback(free_ctx) - - def close(self): self._exit_stack.close() def __del__(self): @@ -625,16 +626,15 @@ def __init__( self.batch = batch - def free_batch(): - if self.batch is None: - return - llama_cpp.llama_batch_free(self.batch) + def close(self): + """Manually free LlamaBatch resources.""" + if getattr(self, "batch", None) is not None: + try: + llama_cpp.llama_batch_free(self.batch) + except Exception: + pass self.batch = None - self._exit_stack.callback(free_batch) - - def close(self): - """Manually free resources.""" self._exit_stack.close() def __del__(self): diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ba3d164f9f..cf543a85f8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -497,7 +497,8 @@ def free_lora_adapter(): 
self._stack.callback(free_lora_adapter) - if llama_cpp.llama_set_adapter_lora( + # Todo(JamePeng): The current LoRa loading logic is outdated and needs to be refactored. + if llama_cpp.llama_set_adapters_lora( self._ctx.ctx, self._lora_adapter, self.lora_scale ): raise RuntimeError( @@ -623,6 +624,30 @@ def free_lora_adapter(): self._sampling_ctx: Optional[LlamaSamplingContext] = None + def close(self) -> None: + """Explicitly free the model from memory.""" + if getattr(self, "_sampling_ctx", None) is not None: + self._sampling_ctx.close() + self._sampling_ctx = None + + if getattr(self, "_candidates", None) is not None: + self._candidates.close() + self._candidates = None + + self.model_params =None + self.context_params = None + self.input_ids = None + self.scores = None + self.tokenizer_ = None + + self._c_tensor_split = None + self._kv_overrides_array = None + + self._stack.close() + + def __del__(self) -> None: + self.close() + @property def ctx(self) -> llama_cpp.llama_context_p: return self._ctx.ctx @@ -2594,24 +2619,6 @@ def pooling_type(self) -> str: """Return the pooling type.""" return self._ctx.pooling_type() - def close(self) -> None: - """Explicitly free the model from memory.""" - if getattr(self, "_sampling_ctx", None) is not None: - self._sampling_ctx.close() - self._sampling_ctx = None - - if getattr(self, "_candidates", None) is not None: - self._candidates.close() - self._candidates = None - - self.scores = None - self.input_ids = None - - self._stack.close() - - def __del__(self) -> None: - self.close() - @staticmethod def logits_to_logprobs( logits: Union[npt.NDArray[np.single], List], axis: int = -1 From 5456d4e55406426db559dd6a75c0b70a8edfb857 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 23 Feb 2026 02:06:57 +0800 Subject: [PATCH 190/518] fix: resolve VRAM leak in multimodal models by explicitly closing mtmd context - Remove the `ExitStack` closure in `Llava15ChatHandler` to break circular references preventing garbage collection of 
the vision context. - Implement explicit `close()` and `__del__()` methods in the chat handler to safely free `mtmd_ctx`. - Integrate `chat_handler.close()` into the main `Llama.close()` lifecycle and nullify related attributes for immediate memory reclamation. --- llama_cpp/llama.py | 4 ++++ llama_cpp/llama_chat_format.py | 38 ++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index cf543a85f8..ee8b1eb874 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -634,8 +634,12 @@ def close(self) -> None: self._candidates.close() self._candidates = None + if hasattr(self, "chat_handler") and hasattr(self.chat_handler, "close"): + self.chat_handler.close() + self.model_params =None self.context_params = None + self.chat_handler = None self.input_ids = None self.scores = None self.tokenizer_ = None diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 99c4bfb460..1eccdf413b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2850,16 +2850,16 @@ def _init_mtmd_context(self, llama_model: llama.Llama): self._mtmd_cpp.mtmd_helper_log_set(llama_log_callback, ctypes.c_void_p(0)) # Get default parameters - mctx_params = self._mtmd_cpp.mtmd_context_params_default() - mctx_params.use_gpu = self.use_gpu - mctx_params.print_timings = self.verbose - mctx_params.n_threads = llama_model.n_threads - mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO - mctx_params.warmup = True + self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() + self.mctx_params.use_gpu = self.use_gpu + self.mctx_params.print_timings = self.verbose + self.mctx_params.n_threads = llama_model.n_threads + self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + self.mctx_params.warmup = True if self.image_min_tokens > 0: - mctx_params.image_min_tokens = 
self.image_min_tokens + self.mctx_params.image_min_tokens = self.image_min_tokens if self.image_max_tokens > 0: - mctx_params.image_max_tokens = self.image_max_tokens + self.mctx_params.image_max_tokens = self.image_max_tokens if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: raise ValueError(f"image_max_pixels {self.image_max_tokens} is less than image_min_pixels {self.image_min_tokens}") @@ -2867,7 +2867,7 @@ def _init_mtmd_context(self, llama_model: llama.Llama): self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( self.clip_model_path.encode(), llama_model.model, - mctx_params + self.mctx_params ) if self.mtmd_ctx is None: @@ -2877,13 +2877,21 @@ def _init_mtmd_context(self, llama_model: llama.Llama): if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): raise ValueError("Vision is not supported by this model") - def mtmd_free(): - with suppress_stdout_stderr(disable=self.verbose): - if self.mtmd_ctx is not None: - self._mtmd_cpp.mtmd_free(self.mtmd_ctx) - self.mtmd_ctx = None + def close(self) -> None: + """Explicitly free the mtmd context and vision model resources.""" + if getattr(self, "mtmd_ctx", None) is not None: + try: + with suppress_stdout_stderr(disable=getattr(self, "verbose", True)): + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + except Exception: + pass + self.mtmd_ctx = None + self.mctx_params = None + + self._exit_stack.close() - self._exit_stack.callback(mtmd_free) + def __del__(self) -> None: + self.close() def load_image(self, image_url: str) -> bytes: return self._load_image(image_url) From 1f8341ee74a2fea15a1008487cbecaccf918c755 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 23 Feb 2026 02:24:49 +0800 Subject: [PATCH 191/518] feat: add `PaddleOCR-VL-1.5` multimodal chat handler `PaddleOCRChatHandler` --- README.md | 1 + llama_cpp/llama_chat_format.py | 120 +++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/README.md b/README.md index db42c5c029..885ae0134d 100644 --- 
a/README.md +++ b/README.md @@ -519,6 +519,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [glm4.6v](https://huggingface.co/unsloth/GLM-4.6V-Flash-GGUF) | `GLM46VChatHandler` | `glm4.6v` | | [granite-docling](https://huggingface.co/ibm-granite/granite-docling-258M-GGUF) | `GraniteDoclingChatHandler` | `granite-docling` | | [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | +| [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 1eccdf413b..13d62bbaf0 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4227,6 +4227,126 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class PaddleOCRChatHandler(Llava15ChatHandler): + """ + Handler for PaddleOCR 1.5 multimodal models. 
+ """ + + PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" + PADDLEOCR_BOS_TOKEN = "" + PADDLEOCR_EOS_TOKEN = "" + PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>" + PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>" + PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>" + + CHAT_FORMAT = ( + "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}" + "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}" + "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}" + + "{{- cls_token -}}" + "{%- for message in messages -%}" + "{%- if message['role'] == 'user' -%}" + "{{- 'User: ' -}}" + + # Robust parsing: Check if content is string or list + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + # Pass 1: Render all images first + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'image_url' and 'image_url' in content -%}" + "{{- '<|IMAGE_START|>' -}}" + "{%- if content.image_url is string -%}" + "{{- content.image_url -}}" + "{%- else -%}" + "{{- content.image_url.url -}}" + "{%- endif -%}" + "{{- '<|IMAGE_END|>' -}}" + "{%- endif -%}" + "{%- endfor -%}" + + # Pass 2: Render all text second + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- '\\n' -}}" + + "{%- elif message['role'] == 'assistant' -%}" + "{{- 'Assistant:\\n' -}}" + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- eos_token -}}" + + "{%- elif message['role'] == 'system' -%}" + "{%- if message['content'] is string -%}" + "{{- message['content'] + '\\n' -}}" + "{%- else -%}" + "{%- for 
content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] + '\\n' -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + + "{%- if add_generation_prompt -%}" + "{{- 'Assistant:\\n' -}}" + "{%- endif -%}" + ) + + def __init__( + self, + image_min_tokens: int = -1, + image_max_tokens: int = -1, + **kwargs + ): + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + super().__init__( + image_min_tokens=self.image_min_tokens, + image_max_tokens=self.image_max_tokens, + **kwargs + ) + + def __call__(self, **kwargs): + # Set the specific stop token defined in the PaddleOCR template + kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN] + + llama = kwargs['llama'] + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"PaddleOCRChatHandler - Cleared state, Processing {image_count} images", file=sys.stderr) + except Exception: + print(f"PaddleOCRChatHandler - Cleared state", file=sys.stderr) + + return super().__call__(**kwargs) + + class Qwen25VLChatHandler(Llava15ChatHandler): CHAT_FORMAT = ( "{% set image_count = namespace(value=0) %}" From e4861df5fd44bb83ec2b9063ca3375759416aead Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 23 Feb 2026 02:47:33 +0800 Subject: [PATCH 192/518] Bump version to 0.3.27 --- CHANGELOG.md | 39 +++++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fec6cf1df..84e63ad2b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,45 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 
[Unreleased] +## [0.3.27] + +- feat: add `PaddleOCR-VL-1.5` multimodal chat handler `PaddleOCRChatHandler` + +- fix: resolve VRAM leak in multimodal models by explicitly closing mtmd context + - Remove the `ExitStack` closure in `Llava15ChatHandler` to break circular references preventing garbage collection of the vision context. + - Implement explicit `close()` and `__del__()` methods in the chat handler to safely free `mtmd_ctx`. + - Integrate `chat_handler.close()` into the main `Llama.close()` lifecycle and nullify related attributes for immediate memory reclamation. + +- fix: resolve reload memory leaks by breaking circular references in cleanup + - Remove closure-based `ExitStack` callbacks in `LlamaModel`, `LlamaContext`, and `LlamaBatch` to eliminate circular references. + - Move explicit C-level memory freeing (`llama_*_free`) directly into `close()` methods. + - Nullify additional attributes (`tokenizer_`, `model_params`, etc.) in `Llama.close()` to guarantee immediate garbage collection during unload. + - Add todo comment to LoRA process logic, the current LoRa loading logic is outdated and needs to be refactored. + +- feat: add memory breakdown and sampler performance timings APIs + +- feat: Search system paths for shared libraries(`by @benniekiss`) + +- Optimize `longest_token_prefix` to use zero-copy NumPy arrays and drop .tolist() overhead + +- Free _candidates and large numpy arrays during explicit close() + +- fix: resolve memory leaks in sampling context lifecycle + - Safely close temporary `LlamaSamplingContext` in `sample()` using a try-finally block. + - Explicitly release the previous `_sampling_ctx` in `generate()` before re-assignment to prevent orphaned pointers. + - Ensure `_sampling_ctx` is properly freed in `Llama.close()`. 
+ +- Fix custom sampler memory cleanup and improve lifecycle management + - Add explicit `close()` and `__del__()` to CustomSampler to safely free C resources and break Python reference cycles + - Ensure custom samplers are properly detached and freed in `LlamaSampler.close()` + - Add minor documentation comments for clarity + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/cacc371f99fb3b5b431d3fa89ac0c752bbd62a3b](https://github.com/ggml-org/llama.cpp/commit/cacc371f99fb3b5b431d3fa89ac0c752bbd62a3b) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260223 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/3d0fd1b75ee564361a4babf21f88855225ba1fe0...1f8341ee74a2fea15a1008487cbecaccf918c755 + ## [0.3.26] - perf(llama-cpp): optimize LlamaTokenDataArray memory operations - Cache NumPy field views for 'id', 'logit', and 'p' to bypass expensive property lookups. diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index bbfb73de3f..369f24ca88 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.26" +__version__ = "0.3.27" From 01ced971c0d13b8910df69dd2b834f6f6c5cb47a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 23 Feb 2026 15:03:38 +0800 Subject: [PATCH 193/518] LLama: Optimize KV cache management for multi-round conversations - Implements prefix-matching logic to truncate stale "ghost" tokens in C++ KV cache - Prevents attention misalignment and context poisoning during multi-turn interactions - Reduces memory overhead by reusing matched prefixes efficiently Signed-off-by: JamePeng --- llama_cpp/llama.py | 14 ++++++++++++++ llama_cpp/llama_chat_format.py | 30 ------------------------------ 2 files changed, 14 insertions(+), 30 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ee8b1eb874..f9de500fe6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1077,14 +1077,28 @@ def adapter(token_data_array: 
llama_cpp.llama_token_data_array): longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1]) if longest_prefix > 0: reset = False + + # Physically erase trailing "ghost" tokens from the C++ KV cache + # to prevent attention misalignment in multi-round chats. + if longest_prefix < self.n_tokens: + if self.verbose: + print(f"Llama.generate: Truncating KV cache size from {self.n_tokens} to {longest_prefix}", file=sys.stderr) + self._ctx.memory_seq_rm(0, longest_prefix, -1) + + # Adjust the tokens array and cursor to reuse the matched cache tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix + if self.verbose: print( f"Llama.generate: {longest_prefix} prefix-match hit, " f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr, ) + else: + # No prefix matched. Completely clear the KV cache to prevent context poisoning. + self.n_tokens = 0 + self._ctx.memory_clear(True) # Reset the model state if reset: diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 13d62bbaf0..f4760838d8 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3817,9 +3817,6 @@ def __call__(self, **kwargs): kwargs['stop'] = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN] llama = kwargs['llama'] - llama.reset() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) @@ -3950,11 +3947,6 @@ def __call__(self, **kwargs): llama = kwargs['llama'] - # Clear state for multiple runs - llama.reset() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) @@ -4053,9 +4045,6 @@ def __call__(self, **kwargs): kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch llama = kwargs['llama'] - llama.reset() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) @@ -4140,9 +4129,6 @@ def __call__(self, 
**kwargs): kwargs['stop'] = [self.GRANITE_EOS_TOKEN] llama = kwargs['llama'] - llama.reset() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) @@ -4205,9 +4191,6 @@ def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwa def __call__(self, **kwargs): llama = kwargs['llama'] - llama.reset() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) @@ -4325,9 +4308,6 @@ def __call__(self, **kwargs): kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN] llama = kwargs['llama'] - llama.reset() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) @@ -4381,11 +4361,6 @@ class Qwen25VLChatHandler(Llava15ChatHandler): def __call__(self, **kwargs): llama = kwargs['llama'] - # Clear state for multiple runs - llama.reset() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) @@ -4525,11 +4500,6 @@ def __call__(self, **kwargs): llama = kwargs['llama'] - # Clear state for multiple runs - llama.reset() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) From a8a443db59245b6c4b99a723df783175265c4878 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 25 Feb 2026 23:44:10 +0800 Subject: [PATCH 194/518] Update Submodule vendor/llama.cpp cacc371..b68d751 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index cacc371f99..b68d75165a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit cacc371f99fb3b5b431d3fa89ac0c752bbd62a3b +Subproject commit b68d75165ad37ba1256cc45a43ec4f51cf813c3e From e230ebb842909469dadfd5f0733cd27618a23406 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 26 Feb 2026 22:51:22 +0800 Subject: [PATCH 195/518] Remove the hack code in llama_chat_format.py --- 
llama_cpp/llama_chat_format.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f4760838d8..ab60fa0c69 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3091,12 +3091,6 @@ def __call__( n_past = new_n_past.value llama.n_tokens = n_past - n_past = llama.n_tokens - if n_past > 0: - llama._ctx.memory_seq_rm(0, n_past - 1, -1) - if llama._ctx.memory_seq_pos_min(0) == llama._ctx.memory_seq_pos_max(0): - n_past += 1 - llama.n_tokens = n_past # Get prompt tokens to avoid a cache miss prompt = llama.input_ids[: llama.n_tokens].tolist() From f6dda3878e91476ca6d9fc54c94d0457c3b9213c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 27 Feb 2026 00:25:33 +0800 Subject: [PATCH 196/518] feat(cache): implement HybridCheckpointCache for hybrid/recurrent models Introduces a dedicated caching mechanism to support state rollback for models that cannot physically truncate their KV cache (e.g., Qwen3-Next, Qwen3.5, etc.). Key additions and changes: - Add `HybridCheckpoint` dataclass to store RNN state snapshots along with their binary data and metadata. - Implement `HybridCheckpointCache` to manage sequence-specific states using the `llama_state_seq_*_ext` C++ APIs. - Introduce `_hash_prefix` using SHA-256 to guarantee cryptographic certainty when matching prompt histories, preventing state corruption. - Add `save_checkpoint` with a FIFO eviction policy to strictly bound memory usage based on `max_checkpoints`. - Add `restore_checkpoint` to securely inject valid RNN states back into the C++ backend. - Explicitly disable incompatible dictionary interfaces (`__getitem__`, `__setitem__`, `__contains__`) inherited from `BaseLlamaCache`. - Refactor module imports (alphabetical sorting) and relocate `LlamaDiskCache` for better structural consistency. 
--- llama_cpp/llama_cache.py | 267 ++++++++++++++++++++++++++++++++------- 1 file changed, 219 insertions(+), 48 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index 6db1f05292..ce76c12556 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -1,15 +1,21 @@ -import sys from abc import ABC, abstractmethod +from collections import OrderedDict +import ctypes +from dataclasses import dataclass +import diskcache +import hashlib +import sys from typing import ( + Any, + List, Optional, Sequence, Tuple, ) -from collections import OrderedDict - -import diskcache import llama_cpp.llama +import llama_cpp._internals as _internals +import llama_cpp.llama_cpp as llama_cpp from .llama_types import * @@ -46,6 +52,60 @@ def __setitem__( raise NotImplementedError +class LlamaDiskCache(BaseLlamaCache): + """Cache for a llama.cpp model using disk.""" + + def __init__( + self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) + ): + super().__init__(capacity_bytes) + self.cache = diskcache.Cache(cache_dir) + + @property + def cache_size(self): + return int(self.cache.volume()) # type: ignore + + def _find_longest_prefix_key( + self, + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: + min_len = 0 + min_key: Optional[Tuple[int, ...]] = None + for k in self.cache.iterkeys(): # type: ignore + prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) + if prefix_len > min_len: + min_len = prefix_len + min_key = k # type: ignore + return min_key + + def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + key = tuple(key) + _key = self._find_longest_prefix_key(key) + if _key is None: + raise KeyError("Key not found") + value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore + # NOTE: This puts an integer as key in cache, which breaks, + # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens + # self.cache.push(_key, side="front") # type: ignore 
+ return value + + def __contains__(self, key: Sequence[int]) -> bool: + return self._find_longest_prefix_key(tuple(key)) is not None + + def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): + print("LlamaDiskCache.__setitem__: called", file=sys.stderr) + key = tuple(key) + if key in self.cache: + print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) + del self.cache[key] + self.cache[key] = value + print("LlamaDiskCache.__setitem__: set", file=sys.stderr) + while self.cache_size > self.capacity_bytes and len(self.cache) > 0: + key_to_remove = next(iter(self.cache)) + del self.cache[key_to_remove] + print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) + + class LlamaRAMCache(BaseLlamaCache): """Cache for a llama.cpp model using RAM.""" @@ -259,55 +319,166 @@ def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): LlamaCache = LlamaRAMCache -class LlamaDiskCache(BaseLlamaCache): - """Cache for a llama.cpp model using disk.""" +@dataclass +class HybridCheckpoint: + """Represents a single snapshot of the RNN/Hybrid model's hidden state.""" + pos: int # The token position (cursor) where this snapshot was taken + data: bytes # The raw binary RNN state data + hash_val: str # SHA-256 hash of the token prefix to ensure exact sequence matching + size: int # Size of the state data in bytes + seq_id: int # Sequence ID this checkpoint belongs to - def __init__( - self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) - ): - super().__init__(capacity_bytes) - self.cache = diskcache.Cache(cache_dir) +class HybridCheckpointCache(BaseLlamaCache): + """ + Manager for RNN state snapshots (Checkpoints) tailored for Hybrid/Recurrent models. + Provides rollback capabilities for models that cannot physically truncate KV cache. 
+ """ + def __init__(self, ctx: llama_cpp.llama_context_p, seq_id: int = 0, max_checkpoints: int = 16, verbose: bool = False): + if ctx is None: + raise ValueError("HybridCheckpointCache: Failed to create HybridCheckpointCache with model context") + self._ctx = ctx + self.seq_id = seq_id + self.max_checkpoints = max_checkpoints + self.checkpoints: list[HybridCheckpoint] = [] + self._current_size = 0 + + # Cache C-type API function pointers for performance + self._get_size_ext = llama_cpp.llama_state_seq_get_size_ext + self._get_data_ext = llama_cpp.llama_state_seq_get_data_ext + self._set_data_ext = llama_cpp.llama_state_seq_set_data_ext + self._flag_partial = llama_cpp.LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY + + self.verbose = verbose @property - def cache_size(self): - return int(self.cache.volume()) # type: ignore + def cache_size(self) -> int: + """Returns the total memory used by all stored checkpoints in bytes.""" + return self._current_size - def _find_longest_prefix_key( + def clear(self): + """Clears all stored checkpoints and resets memory tracking.""" + self.checkpoints.clear() + self._current_size = 0 + if self.verbose: + print("HybridCheckpointCache: cleared") + + # Helper tools + + def _hash_prefix(self, tokens: List[int], length: int) -> str: + """ + Computes a SHA-256 hash for a sequence of tokens up to the specified length. + This ensures that checkpoints are only restored for the EXACT same conversation history. + """ + if length <= 0: + return "empty" + tokens_size = len(tokens) + if length > tokens_size: + length = tokens_size + data = bytes(tokens[:length]) + return hashlib.sha256(data).hexdigest()[:32] + + def find_best_checkpoint(self, tokens: List[int], seq_id: int = 0) -> Optional[HybridCheckpoint]: + """ + Finds the longest valid checkpoint that perfectly matches the provided token prefix. + Returns None if no matching checkpoint is found. 
+ """ + best_cp = None + best_pos = -1 + for cp in self.checkpoints: + if cp.seq_id != seq_id or cp.pos > len(tokens): + # Skip if sequence ID mismatches or checkpoint is longer than the current prompt + continue + + # Verify cryptographic integrity of the prompt history + if self._hash_prefix(tokens, cp.pos) == cp.hash_val: + if cp.pos > best_pos: + # Keep the checkpoint with the longest matching prefix (highest pos) + best_pos = cp.pos + best_cp = cp + return best_cp + + def save_checkpoint( self, - key: Tuple[int, ...], - ) -> Optional[Tuple[int, ...]]: - min_len = 0 - min_key: Optional[Tuple[int, ...]] = None - for k in self.cache.iterkeys(): # type: ignore - prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) - if prefix_len > min_len: - min_len = prefix_len - min_key = k # type: ignore - return min_key + current_pos: int, + tokens: List[int], + seq_id: int = 0 + ) -> bool: + """ + Extracts the RNN hidden state from the C++ backend and saves it as a checkpoint. + Manages eviction (FIFO) if the maximum number of checkpoints is exceeded. + """ + flags = self._flag_partial + + # 1. Query the required buffer size + size = llama_cpp.llama_state_seq_get_size_ext(self._ctx, seq_id, flags) + if size == 0: + if self.verbose: + print("HybridCheckpointCache: size=0, skip") + return False + + # 2. Allocate buffer and extract data + buffer = (ctypes.c_uint8 * size)() + n_written = llama_cpp.llama_state_seq_get_data_ext(self._ctx, buffer, size, seq_id, flags) + if n_written != size: + if self.verbose: + print(f"HybridCheckpointCache: get failed {n_written}/{size}") + return False + + data_bytes = bytes(buffer[:n_written]) + hash_val = self._hash_prefix(tokens, current_pos) + + # 3. 
Store the checkpoint + self.checkpoints.append(HybridCheckpoint( + pos=current_pos, + data=data_bytes, + hash_val=hash_val, + size=n_written, + seq_id=seq_id) + ) + self._current_size += n_written - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": - key = tuple(key) - _key = self._find_longest_prefix_key(key) - if _key is None: - raise KeyError("Key not found") - value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore - # NOTE: This puts an integer as key in cache, which breaks, - # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens - # self.cache.push(_key, side="front") # type: ignore - return value + # 4. Enforce capacity limits (FIFO eviction) + while len(self.checkpoints) > self.max_checkpoints: + if not self.checkpoints: + break + old_cp = self.checkpoints.pop(0) + self._current_size -= old_cp.size + if self.verbose: + print(f"HybridCheckpointCache: evicted pos={old_cp.pos}") - def __contains__(self, key: Sequence[int]) -> bool: - return self._find_longest_prefix_key(tuple(key)) is not None + if self.verbose: + print(f"HybridCheckpointCache: Saved checkpoint at pos {current_pos} ({size / 1024 / 1024:.2f} MiB) " + f"total={len(self.checkpoints)} used={self._current_size / 1024 / 1024:.2f} MiB", + file=sys.stderr) - def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): - print("LlamaDiskCache.__setitem__: called", file=sys.stderr) - key = tuple(key) - if key in self.cache: - print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) - del self.cache[key] - self.cache[key] = value - print("LlamaDiskCache.__setitem__: set", file=sys.stderr) - while self.cache_size > self.capacity_bytes and len(self.cache) > 0: - key_to_remove = next(iter(self.cache)) - del self.cache[key_to_remove] - print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) + return True + + def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: + """ + Injects a previously 
saved RNN state checkpoint back into the C++ backend memory. + """ + if cp.seq_id != seq_id: + return False + flags = self._flag_partial + + # Copy data back to a ctypes buffer and push to backend + buffer = (ctypes.c_uint8 * cp.size).from_buffer_copy(cp.data) + ret = llama_cpp.llama_state_seq_set_data_ext( + self._ctx, buffer, cp.size, seq_id, flags + ) + success = (ret == cp.size) + + if self.verbose: + print(f"HybridCheckpointCache: restore {'OK' if success else 'FAIL'} pos={cp.pos}") + return success + + # Disable BaseLlamaCache Dictionary Interfaces + + def __getitem__(self, key): + raise NotImplementedError("HybridCheckpointCache: pls use save_checkpoint or restore_checkpoint method") + + def __setitem__(self, key, value): + raise NotImplementedError("HybridCheckpointCache: pls use save_checkpoint or restore_checkpoint method") + + def __contains__(self, key): + raise NotImplementedError("HybridCheckpointCache: pls use save_checkpoint or restore_checkpoint method") \ No newline at end of file From 5776f1592de5faa380f1626419ffc9d69021ad58 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 27 Feb 2026 00:53:25 +0800 Subject: [PATCH 197/518] Remove redundant seq_id and add resource cleanup - Removed `seq_id` from `HybridCheckpointCache` initialization to make it a stateless, global multi-sequence manager. - Added `close()` and `__del__()` methods to safely release C++ context references and prevent memory leaks. --- llama_cpp/llama_cache.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index ce76c12556..97c117162b 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -333,11 +333,10 @@ class HybridCheckpointCache(BaseLlamaCache): Manager for RNN state snapshots (Checkpoints) tailored for Hybrid/Recurrent models. Provides rollback capabilities for models that cannot physically truncate KV cache. 
""" - def __init__(self, ctx: llama_cpp.llama_context_p, seq_id: int = 0, max_checkpoints: int = 16, verbose: bool = False): + def __init__(self, ctx: llama_cpp.llama_context_p, max_checkpoints: int = 16, verbose: bool = False): if ctx is None: raise ValueError("HybridCheckpointCache: Failed to create HybridCheckpointCache with model context") self._ctx = ctx - self.seq_id = seq_id self.max_checkpoints = max_checkpoints self.checkpoints: list[HybridCheckpoint] = [] self._current_size = 0 @@ -362,6 +361,16 @@ def clear(self): if self.verbose: print("HybridCheckpointCache: cleared") + def close(self): + self.checkpoints = None + self._ctx = None + self._get_size_ext = None + self._get_data_ext = None + self._set_data_ext = None + + def __del__(self) -> None: + self.close() + # Helper tools def _hash_prefix(self, tokens: List[int], length: int) -> str: From 77b4cd52319618ad85a0fb0e9da6411503bdc1d0 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 27 Feb 2026 01:24:46 +0800 Subject: [PATCH 198/518] Update Submodule vendor/llama.cpp b68d751..37964f4 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b68d75165a..37964f44f9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b68d75165ad37ba1256cc45a43ec4f51cf813c3e +Subproject commit 37964f44f9fab37571b27cccd9f45d4a066e0817 From 29b95223897045900cc3f88733b8de0a6fb0e7ef Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 27 Feb 2026 01:37:10 +0800 Subject: [PATCH 199/518] fix(cache): add safety guards to checkpoint restore and optimize API calls - Replaced direct `llama_cpp` API calls with cached function pointers (`self._get_size_ext`, etc.) for better performance and consistency. - Added sequence ID validation with verbose error logging to prevent cross-sequence contamination. - Added strict state size validation before restoration to prevent buffer overflows and backend segmentation faults. 
--- llama_cpp/llama_cache.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index 97c117162b..1534e1c846 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -419,7 +419,7 @@ def save_checkpoint( flags = self._flag_partial # 1. Query the required buffer size - size = llama_cpp.llama_state_seq_get_size_ext(self._ctx, seq_id, flags) + size = self._get_size_ext(self._ctx, seq_id, flags) if size == 0: if self.verbose: print("HybridCheckpointCache: size=0, skip") @@ -427,7 +427,7 @@ def save_checkpoint( # 2. Allocate buffer and extract data buffer = (ctypes.c_uint8 * size)() - n_written = llama_cpp.llama_state_seq_get_data_ext(self._ctx, buffer, size, seq_id, flags) + n_written = self._get_data_ext(self._ctx, buffer, size, seq_id, flags) if n_written != size: if self.verbose: print(f"HybridCheckpointCache: get failed {n_written}/{size}") @@ -466,13 +466,24 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: """ Injects a previously saved RNN state checkpoint back into the C++ backend memory. """ + # 1. Verify sequence ID matches to prevent cross-sequence contamination if cp.seq_id != seq_id: + if self.verbose: + print(f"HybridCheckpointCache: [Error] Sequence ID mismatch: checkpoint has {cp.seq_id}, requested {seq_id}", file=sys.stderr) return False flags = self._flag_partial - # Copy data back to a ctypes buffer and push to backend + # 2. Verify the underlying C++ context still expects the exact same state size. + # This prevents buffer overflows if the backend context was unexpectedly altered or reallocated. + current_size = self._get_size_ext(self._ctx, seq_id, flags) + if current_size != cp.size: + if self.verbose: + print(f"HybridCheckpointCache: [Warning] State size mismatch before restore: expected {cp.size}, got {current_size} → possible invalidation") + return False + + # 3. 
Copy data back to a ctypes buffer and push to the C++ backend buffer = (ctypes.c_uint8 * cp.size).from_buffer_copy(cp.data) - ret = llama_cpp.llama_state_seq_set_data_ext( + ret = self._set_data_ext( self._ctx, buffer, cp.size, seq_id, flags ) success = (ret == cp.size) From 0acb224173ac2ebe95ac5d1887e1c3ccbdfe4103 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 27 Feb 2026 02:52:54 +0800 Subject: [PATCH 200/518] feat: add HybridCheckpointCache detect support for recurrent/hybrid/SWA models - Introduce ctx_checkpoints parameter (default 16) - Detect recurrent / hybrid / n_swa > 0 models in __init__ - Automatically use HybridCheckpointCache when hybrid architecture is detected - Properly close and clear HybridCheckpointCache in __del__ Signed-off-by: JamePeng --- llama_cpp/llama.py | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f9de500fe6..85005aa4e4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -34,10 +34,11 @@ from .llama_grammar import LlamaGrammar from .llama_cache import ( BaseLlamaCache, - LlamaCache, # type: ignore - LlamaDiskCache, # type: ignore - LlamaRAMCache, # type: ignore - LlamaTrieCache, # type: ignore + LlamaCache, # type: ignore + LlamaDiskCache, # type: ignore + LlamaRAMCache, # type: ignore + LlamaTrieCache, # type: ignore + HybridCheckpointCache, # type: ignore ) from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import llama_cpp.llama_cpp as llama_cpp @@ -109,6 +110,8 @@ def __init__( op_offload: Optional[bool] = None, swa_full: Optional[bool] = None, kv_unified: Optional[bool] = None, + # HybridCheckpointCache Params + ctx_checkpoints: int = 16, # Sampling Params last_n_tokens_size: int = 64, # LoRA Params @@ -197,6 +200,7 @@ def __init__( op_offload: whether to offload host tensor operations to device swa_full: whether to use full-size SWA cache kv_unified: use single unified KV buffer for the KV cache of all 
sequences + ctx_checkpoints: max number of context checkpoints to create per slot (default: 16)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293) last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. @@ -466,6 +470,26 @@ def __init__( ) ) + # Hybrid architecture detection + _is_recurrent = self._model.is_recurrent() + _is_hybrid = self._model.is_hybrid() + _n_swa = self._model.n_swa() + # checkpoints are created only if: + # - the model uses SWA and we are not using `swa_full` + # - the model architecture is marked as recurrent or hybrid + self.is_hybrid = _is_recurrent or _is_hybrid or (_n_swa > 0 and not swa_full) + + if self.is_hybrid: + if self.verbose: + print(f"Llama.__init__: Hybrid/Recurrent model detected." + f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, n_swa: {_n_swa}), swa_full: {swa_full}. 
" + f" Enabling HybridCheckpointCache(ctx_checkpoints={ctx_checkpoints}).", + file=sys.stderr) + self.ctx_checkpoints = ctx_checkpoints + self._hybrid_cache_mgr = HybridCheckpointCache(self._ctx.ctx, max_checkpoints=self.ctx_checkpoints, verbose=self.verbose) + else: + self._hybrid_cache_mgr = None + self._batch = self._stack.enter_context( contextlib.closing( internals.LlamaBatch( @@ -634,6 +658,10 @@ def close(self) -> None: self._candidates.close() self._candidates = None + if getattr(self, "_hybrid_cache_mgr", None) is not None and hasattr(self._hybrid_cache_mgr, "close"): + self._hybrid_cache_mgr.close() + self._hybrid_cache_mgr = None + if hasattr(self, "chat_handler") and hasattr(self.chat_handler, "close"): self.chat_handler.close() @@ -641,6 +669,7 @@ def close(self) -> None: self.context_params = None self.chat_handler = None self.input_ids = None + self.metadata = None self.scores = None self.tokenizer_ = None @@ -1099,6 +1128,8 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): # No prefix matched. Completely clear the KV cache to prevent context poisoning. 
self.n_tokens = 0 self._ctx.memory_clear(True) + if self.is_hybrid and self._hybrid_cache_mgr is not None: + self._hybrid_cache_mgr.clear() # Reset the model state if reset: From 781790fbf9a83894771b96c1d1555b30cea7aae5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 27 Feb 2026 03:38:57 +0800 Subject: [PATCH 201/518] fix(HybridCheckpointCache): ValueError: bytes must be in range(0, 256) --- llama_cpp/llama_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index 1534e1c846..c211ce6888 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +import array from collections import OrderedDict import ctypes from dataclasses import dataclass @@ -383,7 +384,7 @@ def _hash_prefix(self, tokens: List[int], length: int) -> str: tokens_size = len(tokens) if length > tokens_size: length = tokens_size - data = bytes(tokens[:length]) + data = array.array('i', tokens[:length]).tobytes() return hashlib.sha256(data).hexdigest()[:32] def find_best_checkpoint(self, tokens: List[int], seq_id: int = 0) -> Optional[HybridCheckpoint]: From af3d8bdbc37b07af915ac68bcb469b7b63144d03 Mon Sep 17 00:00:00 2001 From: AlcoftTAO Date: Thu, 26 Feb 2026 21:25:42 +0100 Subject: [PATCH 202/518] updated template --- llama_cpp/llama_chat_format.py | 219 +++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ab60fa0c69..8c556c9dd8 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4513,6 +4513,225 @@ def __call__(self, **kwargs): # Use parent implementation return super().__call__(**kwargs) +class Qwen35ChatHandler(Llava15ChatHandler): + CHAT_FORMAT = ( + "{%- set image_count = namespace(value=0) -%}" + "{%- set video_count = namespace(value=0) -%}" + "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}" 
+ " {%- if content is string -%}" + " {{- content -}}" + " {%- elif content is iterable and content is not mapping -%}" + " {%- for item in content -%}" + " {%- if 'image' in item or 'image_url' in item -%}" + " {%- if is_system_content -%}" + " {{- raise_exception('System message cannot contain images.') -}}" + " {%- endif -%}" + " {%- if do_vision_count -%}" + " {%- set image_count.value = image_count.value + 1 -%}" + " {%- endif -%}" + " {%- if add_vision_id -%}" + " {{- 'Picture ' ~ image_count.value ~ ': ' -}}" + " {%- endif -%}" + " {{- '<|vision_start|>' -}}" + " {%- if 'image' in item -%}" + " {%- if item.image is string -%}" + " {{- item.image -}}" + " {%- else -%}" + " {{- item.image.url -}}" + " {%- endif -%}" + " {%- elif 'image_url' in item -%}" + " {%- if item.image_url is string -%}" + " {{- item.image_url -}}" + " {%- else -%}" + " {{- item.image_url.url -}}" + " {%- endif -%}" + " {%- endif -%}" + " {{- '<|vision_end|>' -}}" + " {%- elif 'video' in item -%}" + " {%- if is_system_content -%}" + " {{- raise_exception('System message cannot contain videos.') -}}" + " {%- endif -%}" + " {%- if do_vision_count -%}" + " {%- set video_count.value = video_count.value + 1 -%}" + " {%- endif -%}" + " {%- if add_vision_id -%}" + " {{- 'Video ' ~ video_count.value ~ ': ' -}}" + " {%- endif -%}" + " {{- '<|vision_start|>' -}}" + " {{- item.video -}}" + " {{- '<|vision_end|>' -}}" + " {%- elif 'text' in item -%}" + " {{- item.text -}}" + " {%- else -%}" + " {{- raise_exception('Unexpected item type in content.') -}}" + " {%- endif -%}" + " {%- endfor -%}" + " {%- elif content is none or content is undefined -%}" + " {{- '' -}}" + " {%- else -%}" + " {{- raise_exception('Unexpected content type.') -}}" + " {%- endif -%}" + "{%- endmacro -%}" + "{%- if not messages -%}" + " {{- raise_exception('No messages provided.') -}}" + "{%- endif -%}" + "{%- if tools and tools is iterable and tools is not mapping -%}" + " {{- '<|im_start|>system\n' -}}" + " {{- '# 
Tools\n\nYou have access to the following functions:\n\n' -}}" + " {%- for tool in tools -%}" + " {{- '\n' -}}" + " {{- tool | tojson -}}" + " {%- endfor -%}" + " {{- '\n' -}}" + " {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' -}}" + " {%- if messages[0].role == 'system' -%}" + " {%- set content = render_content(messages[0].content, false, true) | trim -%}" + " {%- if content -%}" + " {{- '\n\n' + content -}}" + " {%- endif -%}" + " {%- endif -%}" + " {{- '<|im_end|>\n' -}}" + "{%- elif messages[0].role == 'system' -%}" + " {%- set content = render_content(messages[0].content, false, true) -%}" + " {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}" + "{%- endif -%}" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}" + "{%- for message in messages[::-1] -%}" + " {%- set index = messages | length - 1 - loop.index0 -%}" + " {%- if ns.multi_step_tool and message.role == 'user' -%}" + " {%- set content = render_content(message.content, false) | trim -%}" + " {%- if not (content.startswith('') and content.endswith('')) -%}" + " {%- set ns.multi_step_tool = false -%}" + " {%- set ns.last_query_index = index -%}" + " {%- endif -%}" + " {%- endif -%}" + "{%- endfor -%}" + "{%- if ns.multi_step_tool -%}" + " {{- raise_exception('No user query found in messages.') -}}" + "{%- endif -%}" + "{%- for message in messages -%}" + " {%- set content = 
render_content(message.content, true) | trim -%}" + " {%- if message.role == 'system' -%}" + " {%- if not loop.first -%}" + " {{- raise_exception('System message must be at the beginning.') -}}" + " {%- endif -%}" + " {%- elif message.role == 'user' -%}" + " {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}" + " {%- elif message.role == 'assistant' -%}" + " {%- set reasoning_content = '' -%}" + " {%- if message.reasoning_content is string -%}" + " {%- set reasoning_content = message.reasoning_content -%}" + " {%- elif '
' in content -%}" + " {%- set reasoning_content = content.split('
')[0].rstrip('\n').split('')[-1].lstrip('\n') -%}" + " {%- set content = content.split('')[-1].lstrip('\n') -%}" + " {%- endif -%}" + " {%- set reasoning_content = reasoning_content | trim -%}" + " {%- if loop.index0 > ns.last_query_index -%}" + " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" + " {%- else -%}" + " {{- '<|im_start|>' + message.role + '\n' + content -}}" + " {%- endif -%}" + " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" + " {%- for tool_call in message.tool_call -%}" + " {%- if tool_call.function is defined -%}" + " {%- set tool_call = tool_call.function -%}" + " {%- endif -%}" + " {%- if loop.first -%}" + " {%- if content | trim -%}" + " {{- '\n\n\n\n' -}}" + " {%- else -%}" + " {{- '\n\n' -}}" + " {%- endif -%}" + " {%- else -%}" + " {{- '\n\n\n' -}}" + " {%- endif -%}" + " {%- if tool_call.arguments is defined -%}" + " {%- for (args_name, args_value) in tool_calls.arguments | items -%}" + " {{- '\n' -}}" + " {%- set args_value = args_value | tojson | safe if args_value is mapping or args_value is sequence and args_value is not string else args_value | string -%}" + " {{- args_value -}}" + " {{- '\n' -}}" + " {%- endfor -%}" + " {%- endif -%}" + " {{- '\n' -}}" + " {%- endfor -%}" + " {%- endif -%}" + " {{- '<|im_end|>\n' -}}" + " {%- elif message.role == 'tool' -%}" + " {%- if loop.previtem and loop.previtem.role != 'tool' -%}" + " {{- '<|im_start|>user' -}}" + " {%- endif -%}" + " {{- '\n\n' -}}" + " {{- content -}}" + " {{- '\n' -}}" + " {%- if not loop.last and loop.nextitem.role != 'tool' -%}" + " {{- '<|im_end|>\n' -}}" + " {%- elif loop.last -%}" + " {{- '<|im_end|>\n' -}}" + " {%- endif -%}" + " {%- else -%}" + " {{- raise_exception('Unexpected message role.') -}}" + " {%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + " {{- '<|im_start|>assistant\n' -}}" + " {%- if enable_thinking is false -%}" + " {{- '\n\n\n\n' 
-}}" + " {%- else -%}" + " {{- '\n' -}}" + " {%- endif -%}" + "{%- endif -%}" + ) + + def __init__( + self, + reasoning: bool = True, + add_vision_id: bool = True, + **kwargs, + ): + """ + Parameters: + - reasoning (bool): + - True (default): Enables reasoning for better results. + - False: Disables reasoning for faster results. + - add_vision_id (bool): + - True (default): Count all the images. Recommended for multi-image. + - False: Doesn't count the images. Can save tokens with single-image. + """ + self.reasoning = reasoning + self.add_vision_id = add_vision_id + + super().__init__(**kwargs) + + def __call__(self, **kwargs): + self.extra_template_arguments["enable_thinking"] = self.reasoning + self.extra_template_arguments["add_vision_id"] = self.add_vision_id + + llama = kwargs['llama'] + + # Clear state for multiple runs + llama.reset() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + # Clear any handler state + if hasattr(self, '_last_image_embed'): + self._last_image_embed = None + self._last_image_hash = None + + if self.verbose: + messages = kwargs.get('messages', []) + try: + image_count = len(self.get_image_urls(messages)) + print(f"Qwen35ChatHandler(reasoning={self.reasoning}) - Cleared state, processing {image_count} images", file=sys.stderr) + except Exception: + print(f"Qwen35ChatHandler(reasoning={self.reasoning}) - Cleared state", file=sys.stderr) + + # Use parent implementation + return super().__call__(**kwargs) @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( From 9262fc04e90cefb03fbc2d7d4342c17a752a417f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 27 Feb 2026 05:18:18 +0800 Subject: [PATCH 203/518] refactor(LlamaBatch): replace set_batch with granular add_token + vectorized add_sequence - Introduce high-performance add_token() for single-token append in generation loop - Add flexible add_sequence() with per-token 
pos/seq_ids/logits arrays - Remove old set_batch() that assumed single-seq + forced last logit - Better support for multi-sequence and precise logit control --- llama_cpp/_internals.py | 91 +++++++++++++++++++++++++++--------- llama_cpp/llama_embedding.py | 14 +++++- tests/test_llama.py | 2 +- 3 files changed, 82 insertions(+), 25 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 506de27d50..cd1af546e0 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -675,37 +675,82 @@ def reset(self): if self.batch is not None: self.batch.n_tokens = 0 - def set_batch(self, batch: Sequence[int], n_past: llama_cpp.llama_pos, logits_all: bool): - if len(batch) > self.n_tokens_capacity: - raise IndexError(f"Input batch size {len(batch)} exceeds capacity {self.n_tokens_capacity}") + def add_token(self, token: int, pos: int, seq_ids: Sequence[int], logits: bool): + """ + Adds a single token to the batch. + This is a high-performance method for appending a single token during the generation loop, + avoiding the overhead of creating temporary lists required by add_sequence. - n_tokens = len(batch) - self.batch.n_tokens = n_tokens - for i in range(n_tokens): - self.batch.token[i] = batch[i] - self.batch.pos[i] = n_past + i - self.batch.seq_id[i][0] = 0 - self.batch.n_seq_id[i] = 1 - self.batch.logits[i] = logits_all - self.batch.logits[n_tokens - 1] = True - - def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): - n_tokens = len(batch) + Args: + token: The integer ID of the token to add. + pos: The logical sequence position (n_past) of this token. + seq_ids: A sequence of sequence IDs this token belongs to (e.g., [0] for a standard single chat). + A single token can be part of multiple sequences simultaneously. + logits: A boolean flag indicating whether the backend should compute logits for this token. 
+ """ + idx = self.batch.n_tokens + if idx >= self.n_tokens_capacity: + raise IndexError(f"LlamaBatch overflow[add_token]: Cannot add token. Capacity {self.n_tokens_capacity} reached.") + + self.batch.token[idx] = token + self.batch.pos[idx] = pos + + n_seq_id = len(seq_ids) + if n_seq_id > self.n_seq_max: + raise ValueError(f"LlamaBatch Error[add_token]: Token belongs to {n_seq_id} sequences, " + f"but n_seq_max was initialized to {self.n_seq_max}.") + self.batch.n_seq_id[idx] = n_seq_id + + for i, seq_id in enumerate(seq_ids): + self.batch.seq_id[idx][i] = seq_id + self.batch.logits[idx] = logits + + self.batch.n_tokens += 1 + + def add_sequence( + self, + token_array: Sequence[int], + pos_array: Sequence[int], + seq_ids: Sequence[Sequence[int]], + logits_array: Sequence[bool] + ): + """ + Adds a sequence of tokens to the batch in a vectorized manner. + Strictly maps the provided arrays to the underlying C++ batch structure without subjective overriding. + + Args: + token_array: A sequence of token IDs to be evaluated. + pos_array: A sequence of logical positions corresponding to each token. + seq_id_array: A sequence of lists, where each list contains the sequence IDs for the respective token. + (e.g., [[0], [0], [0]] for 3 tokens belonging to sequence 0). + logits_array: A sequence of boolean flags indicating whether to compute logits for each token. + """ + n_tokens = len(token_array) current_count = self.batch.n_tokens + if current_count + n_tokens > self.n_tokens_capacity: raise IndexError( - f"LlamaBatch overflow: Cannot add {n_tokens} tokens. " + f"LlamaBatch overflow[add_sequence]: Cannot add {n_tokens} tokens. 
" f"Space left: {self.n_tokens_capacity - current_count}" ) - self.batch.n_tokens += n_tokens + + n_seq_id = len(seq_ids) + if n_seq_id > self.n_seq_max: + raise ValueError(f"LlamaBatch Error[add_sequence]: Token belongs to {n_seq_id} sequences, " + f"but n_seq_max was initialized to {self.n_seq_max}.") + for i in range(n_tokens): j = current_count + i - self.batch.token[j] = batch[i] - self.batch.pos[j] = i - self.batch.seq_id[j][0] = seq_id - self.batch.n_seq_id[j] = 1 - self.batch.logits[j] = logits_all - self.batch.logits[current_count + n_tokens - 1] = True + self.batch.token[j] = token_array[i] + self.batch.pos[j] = pos_array[i] + + self.batch.n_seq_id[j] = n_seq_id + for k, seq_id in enumerate(seq_ids): + self.batch.seq_id[j][k] = seq_id + + self.batch.logits[j] = logits_array[i] + + self.batch.n_tokens += n_tokens # Embedding functions diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 5da97fa19a..7c8ad1e90f 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -251,8 +251,20 @@ def _decode_batch(): _decode_batch() idx_in_batch = 0 + pos_array = list(range(n_tokens)) + + if is_none: + logits_array = [True] * n_tokens + else: + logits_array = [False] * (n_tokens - 1) + [True] + # Add to Batch - self._batch.add_sequence(tokens, idx_in_batch, logits_all=logits_all) + self._batch.add_sequence( + token_array=tokens, + pos_array=pos_array, + seq_ids=[idx_in_batch], + logits_array=logits_array + ) batch_seq_lens.append(n_tokens) idx_in_batch += 1 diff --git a/tests/test_llama.py b/tests/test_llama.py index 6007e1e101..a2dc1cf305 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -124,7 +124,7 @@ def test_real_model(llama_cpp_model_path): for _ in range(4): # Prepare batch with current tokens - batch.set_batch(curr_tokens, n_past=n_eval, logits_all=False) + batch.add_token(curr_tokens, pos=n_eval, seq_ids=[0], logits=False) # Decode (run inference) context.decode(batch) From 
978ddf7c10af7c300e810221df239b4466c0f42e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 27 Feb 2026 05:48:37 +0800 Subject: [PATCH 204/518] =?UTF-8?q?feat(core):=20overhaul=20generate=20and?= =?UTF-8?q?=20eval=20for=20hybrid=20model=20support(Qwen3-next=E3=80=81Qwe?= =?UTF-8?q?n3.5=20etc.)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Integrated `HybridCheckpointCache` into the generation loop to support state rollback for recurrent/hybrid architectures. - Implemented Context Shift (sliding window) in `eval` to gracefully prevent OOM when exceeding `n_ctx`. - Adapted `eval` to use the newly vectorized `LlamaBatch.add_sequence` API with dynamic `logits_array` configuration. - Fixed the full prefix match bug by forcing a 1-token re-evaluation to refresh logits. - Disabled speculative decoding for hybrid models to prevent irreversible state pollution. - Wrapped the generation loop in a `try...finally` block to guarantee safe checkpoint saving. Signed-off-by: JamePeng --- llama_cpp/llama.py | 274 +++++++++++++++++++++++++++++--------------- tests/test_llama.py | 41 ++++--- 2 files changed, 207 insertions(+), 108 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 85005aa4e4..1aeea79974 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -85,6 +85,7 @@ def __init__( # Context Params seed: int = llama_cpp.LLAMA_DEFAULT_SEED, n_ctx: int = 512, + n_keep: int = 256, n_batch: int = 2048, n_ubatch: int = 512, n_seq_max: int = 1, @@ -177,6 +178,7 @@ def __init__( kv_overrides: Key-value overrides for the model. seed: RNG seed, -1 for random n_ctx: Text context, 0 = from model + n_keep: Number of tokens to keep from initial prompt n_batch: Prompt processing maximum batch size n_ubatch: Physical batch size n_seq_max: max number of sequences (i.e. 
distinct states for recurrent models) @@ -328,6 +330,7 @@ def __init__( self.model_params.kv_overrides = self._kv_overrides_array self.n_batch = min(n_ctx, n_batch) # ??? + self.n_keep = n_keep if n_keep > 0 else 256 self.n_seq_max = n_seq_max self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() @@ -778,35 +781,69 @@ def eval(self, tokens: Sequence[int]): if len(tokens) == 0: return n_eval = len(tokens) - current_pos = self.n_tokens + if n_eval == 0: + return + + # Context Shift + if self.n_tokens + n_eval > self._n_ctx: + if self.is_hybrid: + raise RuntimeError( + f"Context length exceeded for Hybrid/SWA model! " + f"(n_tokens: {self.n_tokens}, new: {n_eval}, max: {self._n_ctx})" + ) + else: + _n_keep = min(self.n_keep, self.n_tokens) + # number of tokens after n_keep that may be discarded when shifting context + # defaults to half + _n_discard = (self.n_tokens - _n_keep) // 2 - if self._ctx: - # Standard cleanup by current_pos - is_success = self._ctx.memory_seq_rm(0, current_pos, -1) - # Fallback: Broad cleanup - if not is_success: if self.verbose: - print(f"WARN: memory_seq_rm(0, {current_pos}, -1) failed. Executing fallback: memory_seq_rm(0, 0, -1)") - is_success = self._ctx.memory_seq_rm(0, 0, -1) + print(f"Llama.eval: Context limit reached. 
Shifting context: " + f"discarding {_n_discard} tokens...", file=sys.stderr) + + self._ctx.memory_seq_rm(0, _n_keep, _n_keep + _n_discard) + self._ctx.memory_seq_add(0, _n_keep + _n_discard, self.n_tokens, -_n_discard) + + remaining_len = self.n_tokens - (_n_keep + _n_discard) + if remaining_len > 0: + self.input_ids[_n_keep : _n_keep + remaining_len] = self.input_ids[_n_keep + _n_discard : self.n_tokens] + + self.n_tokens -= _n_discard for i in range(0, n_eval, self.n_batch): - batch = tokens[i : min(n_eval, i + self.n_batch)] + batch_tokens = tokens[i : min(n_eval, i + self.n_batch)] + n_batch_tokens = len(batch_tokens) n_past = self.n_tokens - n_batch_tokens = len(batch) - self._batch.set_batch( - batch=batch, n_past=n_past, logits_all=self._logits_all + + self._batch.reset() + + pos_array = [self.n_tokens + j for j in range(n_batch_tokens)] + + if self._logits_all: + logits_array = [True] * n_batch_tokens + else: + logits_array = [False] * n_batch_tokens + if i + n_batch_tokens == n_eval: + logits_array[-1] = True + + self._batch.add_sequence( + token_array=batch_tokens, + pos_array=pos_array, + seq_ids=[0], + logits_array=logits_array ) + current_batch_size = n_batch_tokens try: self._ctx.decode(self._batch) except Exception as e: raise RuntimeError( - f"Decode Failed at Pos {current_pos}. " + f"Decode Failed at " f"Batch size: {n_batch_tokens}. " - f"Result of memory_seq_rm: {is_success}. " f"Error: {str(e)}." 
) from e + # Save tokens - self.input_ids[n_past : n_past + n_batch_tokens] = batch + self.input_ids[n_past : n_past + n_batch_tokens] = batch_tokens # Save logits logits_ptr = self._ctx.get_logits() @@ -820,8 +857,8 @@ def eval(self, tokens: Sequence[int]): self.scores[0, :] = logits_view # Update n_tokens - current_pos += n_batch_tokens - self.n_tokens = current_pos + self.n_tokens += current_batch_size + i += current_batch_size # Helper method: Convert dict logit_bias to List[llama_logit_bias] def _convert_logit_bias(self, logit_bias: Optional[Dict[int, float]]) -> List[llama_cpp.llama_logit_bias]: @@ -1026,6 +1063,63 @@ def generate( Yields: The generated tokens. """ + original_tokens = list(tokens) + # Check for kv cache prefix match + if reset and self.n_tokens > 0: + longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1]) + if longest_prefix > 0: + reset = False + + if longest_prefix == len(tokens): + if self.verbose: + print(f"Llama.generate: Full match. Forcing prefix-- to evaluate 1 token.", file=sys.stderr) + longest_prefix -= 1 + + # Physically erase trailing "ghost" tokens from the C++ KV cache + # to prevent attention misalignment in multi-round chats. 
+ if longest_prefix < self.n_tokens: + if self.is_hybrid and self._hybrid_cache_mgr is not None: + if self.verbose: + print(f"Llama.generate: Hybrid model rollback triggered.", file=sys.stderr) + + best_ckpt = self._hybrid_cache_mgr.find_best_checkpoint(original_tokens, 0) + if best_ckpt is not None and self._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): + actual_prefix = best_ckpt.pos + else: + actual_prefix = 0 + self._hybrid_cache_mgr.clear() + self._ctx.memory_clear(True) + + self.n_tokens = actual_prefix + tokens = original_tokens[actual_prefix:] + if self.verbose: + print( + f"Llama.generate: {actual_prefix} prefix-match hit, " + f"remaining {len(tokens)} prompt tokens to eval", + file=sys.stderr, + ) + else: + if self.verbose: + print(f"Llama.generate: Truncating KV cache size from {self.n_tokens} to {longest_prefix}", file=sys.stderr) + self._ctx.memory_seq_rm(0, longest_prefix, -1) + + # Adjust the tokens array and cursor to reuse the matched cache + self.n_tokens = longest_prefix + tokens = tokens[longest_prefix:] + + if self.verbose: + print( + f"Llama.generate: {longest_prefix} prefix-match hit, " + f"remaining {len(tokens)} prompt tokens to eval", + file=sys.stderr, + ) + else: + # No prefix matched. Completely clear the KV cache to prevent context poisoning. + self.n_tokens = 0 + self._ctx.memory_clear(True) + if self.is_hybrid and self._hybrid_cache_mgr is not None: + self._hybrid_cache_mgr.clear() + # Reset mirostat sampling params = LlamaSamplingParams( # Core Sampling @@ -1101,88 +1195,84 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): self._sampling_ctx = LlamaSamplingContext(params, self._model) - # Check for kv cache prefix match - if reset and self.n_tokens > 0: - longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1]) - if longest_prefix > 0: - reset = False - - # Physically erase trailing "ghost" tokens from the C++ KV cache - # to prevent attention misalignment in multi-round chats. 
- if longest_prefix < self.n_tokens: - if self.verbose: - print(f"Llama.generate: Truncating KV cache size from {self.n_tokens} to {longest_prefix}", file=sys.stderr) - self._ctx.memory_seq_rm(0, longest_prefix, -1) - - # Adjust the tokens array and cursor to reuse the matched cache - tokens = tokens[longest_prefix:] - self.n_tokens = longest_prefix - - if self.verbose: - print( - f"Llama.generate: {longest_prefix} prefix-match hit, " - f"remaining {len(tokens)} prompt tokens to eval", - file=sys.stderr, - ) - else: - # No prefix matched. Completely clear the KV cache to prevent context poisoning. - self.n_tokens = 0 - self._ctx.memory_clear(True) - if self.is_hybrid and self._hybrid_cache_mgr is not None: - self._hybrid_cache_mgr.clear() - - # Reset the model state - if reset: - self.reset() - sample_idx = self.n_tokens + len(tokens) - 1 tokens = list(tokens) # Eval and sample - while True: - self.eval(tokens) - while sample_idx < self.n_tokens: - token = self._sampling_ctx.sample(self._ctx, idx=-1) - self._sampling_ctx.accept(token, False if grammar is None else True) - - sample_idx += 1 - if stopping_criteria is not None: - if self._logits_all: - logits_idx = sample_idx - self.n_tokens - check_stopping = True - else: - if sample_idx == self.n_tokens: - logits_idx = 0 + try: + while True: + if len(tokens) > 0: + self.eval(tokens) + while sample_idx < self.n_tokens: + token = self._sampling_ctx.sample(self._ctx, idx=-1) + self._sampling_ctx.accept(token, False if grammar is None else True) + + sample_idx += 1 + + if stopping_criteria is not None: + if self._logits_all: + logits_idx = sample_idx - self.n_tokens check_stopping = True else: - check_stopping = False - - if check_stopping and stopping_criteria( - self._input_ids[: sample_idx], - self._scores[logits_idx, :] - ): - return - tokens_or_none = yield token - tokens.clear() - tokens.append(token) - - if tokens_or_none is not None: - tokens.extend(tokens_or_none) - - if sample_idx < self.n_tokens and token != 
self._input_ids[sample_idx]: - self.n_tokens = sample_idx - self._ctx.memory_seq_rm(0, self.n_tokens, -1) - break + if sample_idx == self.n_tokens: + logits_idx = 0 + check_stopping = True + else: + check_stopping = False + + if check_stopping and stopping_criteria( + self._input_ids[: sample_idx], + self._scores[logits_idx, :] + ): + return + + tokens_or_none = yield token + tokens.clear() + tokens.append(token) + + if tokens_or_none is not None: + tokens.extend(tokens_or_none) + + if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]: + self.n_tokens = sample_idx + if self.is_hybrid: + if self.verbose: + print("Llama.generate: Draft token rejected for Hybrid model. Rolling back via Checkpoint.", file=sys.stderr) + if self._hybrid_cache_mgr: + best_ckpt = self._hybrid_cache_mgr.find_best_checkpoint(self._input_ids[:self.n_tokens].tolist(), 0) + if best_ckpt and self._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): + self.n_tokens = best_ckpt.pos + else: + self._hybrid_cache_mgr.clear() + self._ctx.memory_clear(True) + self.n_tokens = 0 + else: + self._ctx.memory_seq_rm(0, self.n_tokens, -1) - if self.draft_model is not None: - self.input_ids[self.n_tokens : self.n_tokens + len(tokens)] = tokens - draft_tokens = self.draft_model( - self.input_ids[: self.n_tokens + len(tokens)] - ) - tokens.extend( - draft_tokens.astype(int)[ - : self._n_ctx - self.n_tokens - len(tokens) - ] + break + + if self.draft_model is not None: + if self.is_hybrid: + if self.verbose: + print("Llama.generate: Speculative decoding is skipped for Hybrid models.", file=sys.stderr) + else: + self.input_ids[self.n_tokens : self.n_tokens + len(tokens)] = tokens + draft_tokens = self.draft_model( + self.input_ids[: self.n_tokens + len(tokens)] + ) + tokens.extend( + draft_tokens.astype(int)[ + : self._n_ctx - self.n_tokens - len(tokens) + ] + ) + finally: + if self.is_hybrid and self._hybrid_cache_mgr is not None: + current_history = 
self._input_ids[:self.n_tokens].tolist() + + self._hybrid_cache_mgr.save_checkpoint( + current_pos=self.n_tokens, + tokens=current_history, + seq_id=0 ) def create_embedding( diff --git a/tests/test_llama.py b/tests/test_llama.py index a2dc1cf305..bf075845e0 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -92,7 +92,7 @@ def test_real_model(llama_cpp_model_path): # 3. Setup Context Parameters cparams = llama_cpp.llama_context_default_params() - cparams.n_ctx = 16 + cparams.n_ctx = 32 cparams.n_batch = 16 cparams.n_ubatch = 16 cparams.n_threads = multiprocessing.cpu_count() @@ -119,27 +119,36 @@ def test_real_model(llama_cpp_model_path): sampler.add_dist(seed) result = list(tokens) - n_eval = 0 - curr_tokens = tokens + n_eval = len(tokens) + batch.reset() + pos_array = list(range(n_eval)) + logits_array = [False] * (n_eval - 1) + [True] + + batch.add_sequence( + token_array=tokens, + pos_array=pos_array, + seq_ids=[0], + logits_array=logits_array + ) + context.decode(batch) for _ in range(4): - # Prepare batch with current tokens - batch.add_token(curr_tokens, pos=n_eval, seq_ids=[0], logits=False) - - # Decode (run inference) - context.decode(batch) - n_eval += len(curr_tokens) - - # Sample the next token (index -1 means the last token in the batch) token_id = sampler.sample(context, -1) - - # Accept the token to update internal sampler state sampler.accept(token_id) - - # Update loop variables - curr_tokens = [token_id] result.append(token_id) + batch.reset() + + batch.add_token( + token=token_id, + pos=n_eval, + seq_ids=[0], + logits=True + ) + + context.decode(batch) + n_eval += 1 + output = result[len(tokens):] output_text = model.detokenize(output, special=True) print(output_text) From 60e101584dd5a525f63734b5b480e0d50fdbf574 Mon Sep 17 00:00:00 2001 From: AlcoftTAO Date: Fri, 27 Feb 2026 04:50:32 +0100 Subject: [PATCH 205/518] tested chat template, changed some parameters --- llama_cpp/llama_chat_format.py | 23 ++++++----------------- 1 file 
changed, 6 insertions(+), 17 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 8c556c9dd8..b9f3c8651d 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4467,7 +4467,6 @@ def __init__( self, force_reasoning: bool = False, add_vision_id: bool = True, - image_min_tokens: int = -1, **kwargs, ): """ @@ -4478,20 +4477,13 @@ def __init__( - add_vision_id (bool): - True (default): Count all the images. Recommended for multi-image. - False: Doesn't count the images. Can save tokens with single-image. - - image_min_tokens (int): - It only takes effect when the value is greater than zero. the default value is -1 (i.e., using the default parameters in the model's preprocessor_config.json). - Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks """ - self.force_reasoning = force_reasoning - self.add_vision_id = add_vision_id - self.image_min_tokens = image_min_tokens + super().__init__(**kwargs) - super().__init__(image_min_tokens=self.image_min_tokens, **kwargs) + self.extra_template_arguments["force_reasoning"] = force_reasoning + self.extra_template_arguments["add_vision_id"] = add_vision_id def __call__(self, **kwargs): - self.extra_template_arguments["force_reasoning"] = self.force_reasoning - self.extra_template_arguments["add_vision_id"] = self.add_vision_id - llama = kwargs['llama'] if hasattr(llama, 'input_ids'): @@ -4698,15 +4690,12 @@ def __init__( - True (default): Count all the images. Recommended for multi-image. - False: Doesn't count the images. Can save tokens with single-image. 
""" - self.reasoning = reasoning - self.add_vision_id = add_vision_id - super().__init__(**kwargs) - def __call__(self, **kwargs): - self.extra_template_arguments["enable_thinking"] = self.reasoning - self.extra_template_arguments["add_vision_id"] = self.add_vision_id + self.extra_template_arguments["enable_thinking"] = reasoning + self.extra_template_arguments["add_vision_id"] = add_vision_id + def __call__(self, **kwargs): llama = kwargs['llama'] # Clear state for multiple runs From 1a5b3d6467908a3012a5f278812b87ccf8dcb1ff Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 28 Feb 2026 00:22:37 +0800 Subject: [PATCH 206/518] Update Submodule vendor/llama.cpp 37964f4..8d3b962 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 37964f44f9..8d3b962f47 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 37964f44f9fab37571b27cccd9f45d4a066e0817 +Subproject commit 8d3b962f47cdb05df8ab8801da38cb3c55d50337 From ff3a7c8a3477b8655579853382770bfd34d1ece6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 28 Feb 2026 00:51:55 +0800 Subject: [PATCH 207/518] Optimization (decode): treat KV slot exhaustion (code 1) as a recoverable return value - Updated the `decode` wrapper to explicitly return `1` instead of raising a `RuntimeError` when `llama_decode` indicates no KV slots are available. - Aligned Python API behavior with the underlying C++ contract, treating code 1 as a recoverable signal rather than a fatal crash. - Enabled upper-level caller loops (like `eval`) to gracefully handle VRAM fragmentation via dynamic batch halving without relying on clumsy try-except block string parsing. - Retained strict `RuntimeError` exceptions for truly fatal backend failures (e.g., codes -1, -2, -3). - Added comprehensive docstrings detailing return codes and exception scenarios. 
Signed-off-by: JamePeng --- llama_cpp/_internals.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index cd1af546e0..bd2a8691d7 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -529,19 +529,43 @@ def encode(self, batch: LlamaBatch): if return_code != 0: raise RuntimeError(f"llama_encode returned {return_code}") - def decode(self, batch: LlamaBatch): + def decode(self, batch: 'LlamaBatch') -> int: + """ + Evaluate the batch of tokens using the transformer model. + + This method executes the forward pass. If the KV cache is heavily fragmented + or out of space, it may return 1, indicating the caller should try to reduce + the batch size or evict idle sequences. + + Returns: + 0: Success. + 1: No KV slot available (Recoverable). The caller should implement a + fallback strategy, such as reducing the batch size and retrying. + + Raises: + RuntimeError: If a fatal, non-recoverable error occurs during decoding + (e.g., negative error codes or invalid batch structures). + """ return_code = llama_cpp.llama_decode(self.ctx, batch.batch) if return_code == 0: - return + return 0 + + # 1 means "No KV slot available". + # We explicitly return 1 instead of raising an exception so that the caller + # can gracefully handle it via dynamic batch sizing (batch_size //= 2). + elif return_code == 1: + return 1 + # Any other code indicates a fatal failure. error_map = { - 1: "No KV slot available: try reducing batch size or increasing context window", - 2: "Decoding aborted", - -1: "Invalid input batch", + 2: "Decoding aborted by user callback", + -1: "Invalid input batch (e.g. 
n_tokens == 0 or exceeding capacity)", + -2: "Could not allocate space for the compute graph (VRAM exhausted)", + -3: "Graph computation failed internally", } - msg = error_map.get(return_code, "Fatal internal error") + msg = error_map.get(return_code, "Unknown fatal internal error") raise RuntimeError(f"llama_decode failed (code {return_code}): {msg}") def set_n_threads(self, n_threads: int, n_threads_batch: int): From 39afb9b26e464f8f333c52614faf1e1fac26ac1e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 28 Feb 2026 03:58:58 +0800 Subject: [PATCH 208/518] feat(hybrid): add periodic checkpointing and adaptive batch handling - Increase default `ctx_checkpoints` from 16 to 32 - Add new parameter `checkpoint_interval` (default: 4096) for hybrid model state snapshots - Implement robust dynamic batch downgrade on KV cache exhaustion (status=1) - Introduce periodic checkpoint saves during eval in hybrid mode - Improve error handling and logging around context shifts and decoding failures Signed-off-by: JamePeng --- llama_cpp/llama.py | 119 +++++++++++++++++++++++++++++++++------------ 1 file changed, 87 insertions(+), 32 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1aeea79974..381366ea84 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -112,7 +112,8 @@ def __init__( swa_full: Optional[bool] = None, kv_unified: Optional[bool] = None, # HybridCheckpointCache Params - ctx_checkpoints: int = 16, + ctx_checkpoints: int = 32, + checkpoint_interval: int = 4096, # Sampling Params last_n_tokens_size: int = 64, # LoRA Params @@ -203,6 +204,7 @@ def __init__( swa_full: whether to use full-size SWA cache kv_unified: use single unified KV buffer for the KV cache of all sequences ctx_checkpoints: max number of context checkpoints to create per slot (default: 16)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293) + checkpoint_interval: Hybrid model checkpoint token intervals, and archiving of text with interval sizes along the way. 
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. @@ -485,10 +487,11 @@ def __init__( if self.is_hybrid: if self.verbose: print(f"Llama.__init__: Hybrid/Recurrent model detected." - f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, n_swa: {_n_swa}), swa_full: {swa_full}. " - f" Enabling HybridCheckpointCache(ctx_checkpoints={ctx_checkpoints}).", + f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, n_swa: {_n_swa}, swa_full: {swa_full}). " + f" Enabling HybridCheckpointCache(ctx_checkpoints={ctx_checkpoints}, checkpoint_interval={checkpoint_interval}).", file=sys.stderr) self.ctx_checkpoints = ctx_checkpoints + self.checkpoint_interval = checkpoint_interval self._hybrid_cache_mgr = HybridCheckpointCache(self._ctx.ctx, max_checkpoints=self.ctx_checkpoints, verbose=self.verbose) else: self._hybrid_cache_mgr = None @@ -784,7 +787,7 @@ def eval(self, tokens: Sequence[int]): if n_eval == 0: return - # Context Shift + # Context Shift: Prevent OOM by discarding older tokens when context limit is reached. 
if self.n_tokens + n_eval > self._n_ctx: if self.is_hybrid: raise RuntimeError( @@ -793,7 +796,7 @@ def eval(self, tokens: Sequence[int]): ) else: _n_keep = min(self.n_keep, self.n_tokens) - # number of tokens after n_keep that may be discarded when shifting context + # Number of tokens after n_keep that may be discarded when shifting context # defaults to half _n_discard = (self.n_tokens - _n_keep) // 2 @@ -810,56 +813,108 @@ def eval(self, tokens: Sequence[int]): self.n_tokens -= _n_discard - for i in range(0, n_eval, self.n_batch): - batch_tokens = tokens[i : min(n_eval, i + self.n_batch)] - n_batch_tokens = len(batch_tokens) + # Adaptive batch downgrade limit initialization + current_max_batch = self.n_batch + last_ckpt_pos = self.n_tokens + + # If KV slots are full, `current_batch_size` will be halved. + # A `while` loop allows us to correctly resume from the exact cut-off point. + i = 0 + while i < n_eval: + # Chunk the tokens using the adaptive current_max_batch + n_chunk = min(n_eval - i, current_max_batch) + chunk = tokens[i : i + n_chunk] n_past = self.n_tokens self._batch.reset() - pos_array = [self.n_tokens + j for j in range(n_batch_tokens)] + pos_array = [self.n_tokens + j for j in range(n_chunk)] + # Configure logits extraction: + # If _logits_all is True, calculate for every token. + # Otherwise, only calculate for the very last token in the entire evaluation sequence. if self._logits_all: - logits_array = [True] * n_batch_tokens + logits_array = [True] * n_chunk else: - logits_array = [False] * n_batch_tokens - if i + n_batch_tokens == n_eval: + logits_array = [False] * n_chunk + if i + n_chunk == n_eval: logits_array[-1] = True self._batch.add_sequence( - token_array=batch_tokens, + token_array=chunk, pos_array=pos_array, seq_ids=[0], logits_array=logits_array ) - current_batch_size = n_batch_tokens - try: - self._ctx.decode(self._batch) - except Exception as e: - raise RuntimeError( - f"Decode Failed at " - f"Batch size: {n_batch_tokens}. 
" - f"Error: {str(e)}." - ) from e - # Save tokens - self.input_ids[n_past : n_past + n_batch_tokens] = batch_tokens + # Dynamic Batch Downgrade: Attempt to decode, reduce batch size if KV cache is fragmented + current_batch_size = n_chunk + success = False - # Save logits - logits_ptr = self._ctx.get_logits() + while current_batch_size > 0: + # Tell the C++ backend to only process up to `current_batch_size` tokens + self._batch.batch.n_tokens = current_batch_size + + try: + status = self._ctx.decode(self._batch) + + # 0: Success + if status == 0: + success = True + # If we successfully decoded after a downgrade, + # update current_max_batch to prevent repeated failures in next iterations. + if current_batch_size < current_max_batch: + current_max_batch = current_batch_size + break + + # 1: No KV slot available (Recoverable) + elif status == 1: + if self.verbose: + print(f"Llama.eval: KV slots full (Code 1). Halving batch size " + f"from {current_batch_size} to {current_batch_size // 2}...", file=sys.stderr) + current_batch_size //= 2 + + except Exception as e: + # Catch fatal backend failures (e.g., Code -2, -3) + raise RuntimeError(f"Llama.eval(decode): Fatal Decode Error at Pos {self.n_tokens}, " + f"Batch size {current_batch_size}: {str(e)}") from e + + if not success: + raise RuntimeError("Llama.eval(decode): Failed completely even with batch size 1.") + + # Save successfully processed tokens into the Python-side ledger + self.input_ids[n_past : n_past + current_batch_size] = chunk[:current_batch_size] + + # Extract and save all logits if requested, ensuring we only copy the successfully processed rows if self._logits_all: - rows = n_batch_tokens + logits_ptr = self._ctx.get_logits() + rows = current_batch_size cols = self._n_vocab logits_view = np.ctypeslib.as_array(logits_ptr, shape=(rows * cols,)) - self.scores[n_past : n_past + n_batch_tokens, :].reshape(-1)[:] = logits_view - else: - logits_view = np.ctypeslib.as_array(logits_ptr, shape=(self._n_vocab,)) 
- self.scores[0, :] = logits_view + self.scores[n_past : n_past + current_batch_size, :].reshape(-1)[:] = logits_view - # Update n_tokens + # Update indices based on actual processed batch size self.n_tokens += current_batch_size i += current_batch_size + # Periodic Checkpoint: Save states for hybrid models to avoid massive rollbacks + if self.is_hybrid and self._hybrid_cache_mgr is not None: + if (self.n_tokens - last_ckpt_pos >= self.checkpoint_interval) and (i < n_eval): + if self.verbose: + print(f"Llama.eval: [Periodic Checkpoint] Saving hybrid state at pos {self.n_tokens}.", file=sys.stderr) + self._hybrid_cache_mgr.save_checkpoint( + current_pos=self.n_tokens, + tokens=self.input_ids[:self.n_tokens].tolist(), + seq_id=0 + ) + last_ckpt_pos = self.n_tokens + + # Save the final logit if not in _logits_all mode + if not self._logits_all: + logits_ptr = self._ctx.get_logits() + logits_view = np.ctypeslib.as_array(logits_ptr, shape=(self._n_vocab,)) + self.scores[0, :] = logits_view + # Helper method: Convert dict logit_bias to List[llama_logit_bias] def _convert_logit_bias(self, logit_bias: Optional[Dict[int, float]]) -> List[llama_cpp.llama_logit_bias]: if not logit_bias: From 6286f7bd51c458399aea17090e5e44b3c419c494 Mon Sep 17 00:00:00 2001 From: AlcoftTAO Date: Sat, 28 Feb 2026 04:51:19 +0100 Subject: [PATCH 209/518] updated chat template. 
--- llama_cpp/llama_chat_format.py | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index b9f3c8651d..4169a8a9bb 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4514,7 +4514,7 @@ class Qwen35ChatHandler(Llava15ChatHandler): " {{- content -}}" " {%- elif content is iterable and content is not mapping -%}" " {%- for item in content -%}" - " {%- if 'image' in item or 'image_url' in item -%}" + " {%- if 'image_url' in item or item.type == 'image_url' -%}" " {%- if is_system_content -%}" " {{- raise_exception('System message cannot contain images.') -}}" " {%- endif -%}" @@ -4522,24 +4522,19 @@ class Qwen35ChatHandler(Llava15ChatHandler): " {%- set image_count.value = image_count.value + 1 -%}" " {%- endif -%}" " {%- if add_vision_id -%}" - " {{- 'Picture ' ~ image_count.value ~ ': ' -}}" + " {{- 'Picture ' -}}" + " {{- image_count.value | string -}}" + " {{- ': ' -}}" " {%- endif -%}" " {{- '<|vision_start|>' -}}" - " {%- if 'image' in item -%}" - " {%- if item.image is string -%}" - " {{- item.image -}}" - " {%- else -%}" - " {{- item.image.url -}}" - " {%- endif -%}" - " {%- elif 'image_url' in item -%}" - " {%- if item.image_url is string -%}" - " {{- item.image_url -}}" - " {%- else -%}" - " {{- item.image_url.url -}}" - " {%- endif -%}" + " {%- if item.image_url is string -%}" + " {{- item.image_url -}}" + " {%- else -%}" + " {{- item.image_url.url -}}" " {%- endif -%}" " {{- '<|vision_end|>' -}}" " {%- elif 'video' in item -%}" + " {{- raise_exception('llama.cpp does not currently support video.') -}}" # Video not supported, raise exception " {%- if is_system_content -%}" " {{- raise_exception('System message cannot contain videos.') -}}" " {%- endif -%}" @@ -4698,11 +4693,6 @@ def __init__( def __call__(self, **kwargs): llama = kwargs['llama'] - # Clear state for multiple runs - llama.reset() - 
llama._ctx.memory_clear(True) - llama.n_tokens = 0 - if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) @@ -4715,9 +4705,9 @@ def __call__(self, **kwargs): messages = kwargs.get('messages', []) try: image_count = len(self.get_image_urls(messages)) - print(f"Qwen35ChatHandler(reasoning={self.reasoning}) - Cleared state, processing {image_count} images", file=sys.stderr) + print(f"Qwen35ChatHandler - Cleared state, processing {image_count} images", file=sys.stderr) except Exception: - print(f"Qwen35ChatHandler(reasoning={self.reasoning}) - Cleared state", file=sys.stderr) + print(f"Qwen35ChatHandler - Cleared state", file=sys.stderr) # Use parent implementation return super().__call__(**kwargs) From 0872695bf4272b941d05268682db57e9bb272f24 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 28 Feb 2026 20:25:55 +0800 Subject: [PATCH 210/518] Update Submodule vendor/llama.cpp 8d3b962..d979f2b --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8d3b962f47..d979f2b176 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8d3b962f47cdb05df8ab8801da38cb3c55d50337 +Subproject commit d979f2b176217a062c57c24355db8c1dc3dfc7d5 From 2c754763a35cf6d25f057bbe88e6f578eb644838 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 28 Feb 2026 20:31:44 +0800 Subject: [PATCH 211/518] fix(eval): prevent batch size from halving below 1 during KV slot exhaustion - Added an explicit guard to break the dynamic batch downgrade loop when `current_batch_size` is exactly 1 and a Code 1 (No KV slot) is returned. - Prevents the engine from executing an invalid `1 // 2` operation and generating the confusing "Halving batch size from 1 to 0" verbose log. - Ensures the evaluation process fails fast and aborts gracefully when physical VRAM is completely depleted and no further fallback is mathematically possible. 
--- llama_cpp/llama.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 381366ea84..1985ba4775 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -869,6 +869,11 @@ def eval(self, tokens: Sequence[int]): # 1: No KV slot available (Recoverable) elif status == 1: + if current_batch_size == 1: + if self.verbose: + print("Llama.eval: KV slots completely full. " + "Cannot reduce batch size below 1. Aborting...", file=sys.stderr) + break if self.verbose: print(f"Llama.eval: KV slots full (Code 1). Halving batch size " f"from {current_batch_size} to {current_batch_size // 2}...", file=sys.stderr) From 4bfe5ed99530357197e6e32d2c1188ef78f53423 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 28 Feb 2026 22:19:00 +0800 Subject: [PATCH 212/518] feat(eval): enable native context shift for hybrid/recurrent models - Removed the `RuntimeError` that previously blocked context shifting for hybrid and SWA architectures. - Delegated the shift logic to the underlying C++ backend, which automatically handles Attention KV removal and RNN `pos` shifting. - Added dynamic verbose logging to clearly identify the model type (Transformer vs. Hybrid/Recurrent/SWA) during a context shift event. --- llama_cpp/llama.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1985ba4775..d073b07061 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -789,29 +789,25 @@ def eval(self, tokens: Sequence[int]): # Context Shift: Prevent OOM by discarding older tokens when context limit is reached. if self.n_tokens + n_eval > self._n_ctx: - if self.is_hybrid: - raise RuntimeError( - f"Context length exceeded for Hybrid/SWA model! 
" - f"(n_tokens: {self.n_tokens}, new: {n_eval}, max: {self._n_ctx})" - ) - else: - _n_keep = min(self.n_keep, self.n_tokens) - # Number of tokens after n_keep that may be discarded when shifting context - # defaults to half - _n_discard = (self.n_tokens - _n_keep) // 2 + _n_keep = min(self.n_keep, self.n_tokens) + # Number of tokens after n_keep that may be discarded when shifting context + # defaults to half + _n_discard = (self.n_tokens - _n_keep) // 2 - if self.verbose: - print(f"Llama.eval: Context limit reached. Shifting context: " - f"discarding {_n_discard} tokens...", file=sys.stderr) + if self.verbose: + model_type = "Hybrid/Recurrent/SWA" if self.is_hybrid else "Transformer" + print(f"Llama.eval: {model_type} context limit reached. Shifting context: " + f"discarding {_n_discard} tokens...", file=sys.stderr) - self._ctx.memory_seq_rm(0, _n_keep, _n_keep + _n_discard) - self._ctx.memory_seq_add(0, _n_keep + _n_discard, self.n_tokens, -_n_discard) + # Use context memory methods for handles both Attention KV removal and RNN pos shifting automatically + self._ctx.memory_seq_rm(0, _n_keep, _n_keep + _n_discard) + self._ctx.memory_seq_add(0, _n_keep + _n_discard, self.n_tokens, -_n_discard) - remaining_len = self.n_tokens - (_n_keep + _n_discard) - if remaining_len > 0: - self.input_ids[_n_keep : _n_keep + remaining_len] = self.input_ids[_n_keep + _n_discard : self.n_tokens] + remaining_len = self.n_tokens - (_n_keep + _n_discard) + if remaining_len > 0: + self.input_ids[_n_keep : _n_keep + remaining_len] = self.input_ids[_n_keep + _n_discard : self.n_tokens] - self.n_tokens -= _n_discard + self.n_tokens -= _n_discard # Adaptive batch downgrade limit initialization current_max_batch = self.n_batch From 667f43366496a5387bbd329410af1ebdfb8ed8dd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 1 Mar 2026 10:18:35 +0800 Subject: [PATCH 213/518] Add the memory_can_shift API to class LlamaContext --- llama_cpp/_internals.py | 3 +++ 1 file changed, 3 insertions(+) 
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index bd2a8691d7..b4519e555e 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -442,6 +442,9 @@ def memory_seq_pos_max(self, seq_id: int) -> int: def memory_seq_pos_min(self, seq_id: int) -> int: return llama_cpp.llama_memory_seq_pos_min(self.get_memory(), seq_id) + def memory_can_shift(self) -> bool: + return llama_cpp.llama_memory_can_shift(self.get_memory()) + # // State / sessions API def get_state_size(self) -> int: From c27513d204efa72292f31a8993e6c7e01e425d5d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 1 Mar 2026 10:28:29 +0800 Subject: [PATCH 214/518] fix(eval): make context shift mathematically robust and architecture-safe - Added a `memory_can_shift()` pre-flight check to proactively intercept and abort gracefully on architectures that physically forbid shifting (e.g., multimodal mmproj where `n_pos_per_embd > 1` or incompatible M-RoPE), preventing fatal `GGML_ASSERT` C++ crashes. - Implemented dynamic mathematical bounds for `n_keep` and `n_discard` to guarantee that enough space is always freed, completely eliminating the edge-case where `n_discard` evaluates to 0 (causing a dead-loop when `n_ctx` is extremely small). - Wrapped underlying C++ memory shift operations in a try-except block for defense-in-depth against unexpected backend failures. - Expanded in-code documentation to clarify the arithmetic constraints and architectural limitations of the KV shift mechanism. --- llama_cpp/llama.py | 62 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d073b07061..af4f0b0cfa 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -789,25 +789,57 @@ def eval(self, tokens: Sequence[int]): # Context Shift: Prevent OOM by discarding older tokens when context limit is reached. 
if self.n_tokens + n_eval > self._n_ctx: - _n_keep = min(self.n_keep, self.n_tokens) - # Number of tokens after n_keep that may be discarded when shifting context - # defaults to half - _n_discard = (self.n_tokens - _n_keep) // 2 + # 0. Check if the memory supports shifting + if not self._ctx.memory_can_shift(): + raise RuntimeError( + f"Llama.eval: Context Shift is explicitly disabled by the C++ backend " + f"(n_pos_per_embd > 1 or incompatible M-RoPE). " + f"You MUST increase n_ctx (currently {self._n_ctx}) to fit the dialogue." + ) + # 1. Calculate the absolute minimum number of tokens we must discard to fit the new chunk. + required_discard = (self.n_tokens + n_eval) - self._n_ctx - if self.verbose: - model_type = "Hybrid/Recurrent/SWA" if self.is_hybrid else "Transformer" - print(f"Llama.eval: {model_type} context limit reached. Shifting context: " - f"discarding {_n_discard} tokens...", file=sys.stderr) + # 2. Sanity check: If the incoming chunk itself is larger than the entire context window, + # shifting is physically impossible. + if required_discard > self.n_tokens: + raise RuntimeError(f"Llama.eval: Context shift failed. The incoming chunk ({n_eval} tokens) " + f"is larger than the entire context window ({self._n_ctx}).") + + # 3. Determine how many tokens to keep at the beginning (usually the System Prompt). + _n_keep_desired = min(self.n_keep, self.n_tokens) + + # Ensure that keeping these tokens doesn't prevent us from discarding the required amount. + max_keep_allowed = max(0, self.n_tokens - required_discard) + _n_keep = min(_n_keep_desired, max_keep_allowed) + + # 4. Calculate the final discard count. Default strategy is to discard half of the available + # past tokens to minimize frequent shifting, but it must be at least `required_discard`. 
+ _n_discard = max(required_discard, (self.n_tokens - _n_keep) // 2) - # Use context memory methods for handles both Attention KV removal and RNN pos shifting automatically - self._ctx.memory_seq_rm(0, _n_keep, _n_keep + _n_discard) - self._ctx.memory_seq_add(0, _n_keep + _n_discard, self.n_tokens, -_n_discard) + # 5. Execute the shift only if there are tokens to discard. + if _n_discard > 0: + if self.verbose: + model_type = "Hybrid/Recurrent/SWA" if getattr(self, 'is_hybrid', False) else "Transformer" + print(f"Llama.eval: {model_type} context limit reached. Shifting context: " + f"keeping {_n_keep}, discarding {_n_discard} tokens...", file=sys.stderr) + + try: + # Remove the specified block of tokens from the physical KV cache + self._ctx.memory_seq_rm(0, _n_keep, _n_keep + _n_discard) + + # Shift the positional IDs of all subsequent tokens to the left to close the gap + self._ctx.memory_seq_add(0, _n_keep + _n_discard, self.n_tokens, -_n_discard) + except Exception as e: + # Defense-in-depth: Catch any other recoverable backend errors + raise RuntimeError(f"Llama.eval: Context Shift failed at the C++ level. Error: {str(e)}") from e - remaining_len = self.n_tokens - (_n_keep + _n_discard) - if remaining_len > 0: - self.input_ids[_n_keep : _n_keep + remaining_len] = self.input_ids[_n_keep + _n_discard : self.n_tokens] + # 6. Synchronize the Python-side token tracking array (ledger) + remaining_len = self.n_tokens - (_n_keep + _n_discard) + if remaining_len > 0: + self.input_ids[_n_keep : _n_keep + remaining_len] = self.input_ids[_n_keep + _n_discard : self.n_tokens] - self.n_tokens -= _n_discard + # 7. 
Update the global token counter + self.n_tokens -= _n_discard # Adaptive batch downgrade limit initialization current_max_batch = self.n_batch From 00da43699c3c1c36a0886751401b1058f31d8783 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 1 Mar 2026 12:06:12 +0800 Subject: [PATCH 215/518] perf(eval): implement adaptive checkpoint intervals for hybrid models - Dynamically scale checkpoint frequency during large prompt pre-filling (max 3 triggers per eval) to minimize I/O bottlenecks and stuttering. - Add success validation to `save_checkpoint`, ensuring the `last_ckpt_pos` tracker is only updated when the state is successfully saved to disk/memory. - Enhance verbose logging to track dynamic interval calculations and save failures. --- llama_cpp/llama.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index af4f0b0cfa..d06af85386 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -845,6 +845,12 @@ def eval(self, tokens: Sequence[int]): current_max_batch = self.n_batch last_ckpt_pos = self.n_tokens + # Adaptive Periodic Checkpointing for Hybrid Models + # Following the "no more than three times" principle :) + # when pre-filling very large blocks, dilute the save frequency to minimize I/O blocking. + if self.is_hybrid and self._hybrid_cache_mgr is not None: + dynamic_interval = max(self.checkpoint_interval, n_eval // 3) # Maximum of 3 triggers + # If KV slots are full, `current_batch_size` will be halved. # A `while` loop allows us to correctly resume from the exact cut-off point. 
i = 0 @@ -932,15 +938,23 @@ def eval(self, tokens: Sequence[int]): # Periodic Checkpoint: Save states for hybrid models to avoid massive rollbacks if self.is_hybrid and self._hybrid_cache_mgr is not None: - if (self.n_tokens - last_ckpt_pos >= self.checkpoint_interval) and (i < n_eval): + current_pos = self.n_tokens + if (current_pos - last_ckpt_pos >= dynamic_interval) and (i < n_eval): + if self.verbose: - print(f"Llama.eval: [Periodic Checkpoint] Saving hybrid state at pos {self.n_tokens}.", file=sys.stderr) - self._hybrid_cache_mgr.save_checkpoint( - current_pos=self.n_tokens, - tokens=self.input_ids[:self.n_tokens].tolist(), + print(f"Llama.eval: [Periodic Checkpoint] Saving hybrid state at pos {current_pos} " + f"(checkpoint_interval({dynamic_interval}) reached, last={last_ckpt_pos}).", file=sys.stderr) + + success = self._hybrid_cache_mgr.save_checkpoint( + current_pos=current_pos, + tokens=self.input_ids[:current_pos].tolist(), seq_id=0 ) - last_ckpt_pos = self.n_tokens + if success: + last_ckpt_pos = current_pos + else: + if self.verbose: + print(f"Llama.eval: [Periodic Checkpoint] HybridCheckpoint save failed at pos {current_pos}, skipping update", file=sys.stderr) # Save the final logit if not in _logits_all mode if not self._logits_all: From 8b29b88de02b02a743dd7f8e6ecdf6a5bcf53613 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 1 Mar 2026 15:20:27 +0800 Subject: [PATCH 216/518] refactor(chat_handler): extract MTMDChatHandler base class and Simplify the complexity of subsequent multimodal adaptation - Extracted the core multimodal processing pipeline from `Llava15ChatHandler` into a generic `MTMDChatHandler` base class, separating pipeline logic from model-specific prompt formats. - Updated all multimodal subclass handlers (e.g., Qwen2.5vl, Qwen3-vl, MiniCPM, GLM4, LFM2-VL) to inherit from the new `MTMDChatHandler`. 
- Implemented strict `**kwargs` validation in the baseconstructor to gracefully intercept and report unsupported parameters, significantly improving Developer Experience (DX). - Introduced dynamic `self.log_prefix` (`self.__class__.__name__`) for accurate and consistent logging across all subclasses. - Cleaned up redundant state-clearing, image-count logic and hardcoded print statements across subclass `__call__` implementations. - To avoid exceptions occurring when the close method is called due to initialization failure and the call to exit_stack. --- llama_cpp/_internals.py | 13 +- llama_cpp/llama.py | 4 +- llama_cpp/llama_chat_format.py | 233 ++++++++++++++++----------------- 3 files changed, 129 insertions(+), 121 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b4519e555e..9e89893efa 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -85,7 +85,9 @@ def close(self): self.model = None self.vocab = None - self._exit_stack.close() + if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): + self._exit_stack.close() + self._exit_stack = None def __del__(self): self.close() @@ -386,8 +388,11 @@ def close(self): except Exception: pass self.ctx = None + self.params = None - self._exit_stack.close() + if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): + self._exit_stack.close() + self._exit_stack = None def __del__(self): self.close() @@ -662,7 +667,9 @@ def close(self): pass self.batch = None - self._exit_stack.close() + if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): + self._exit_stack.close() + self._exit_stack = None def __del__(self): self.close() diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d06af85386..1d67f12af0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -682,7 +682,9 @@ def close(self) -> None: self._c_tensor_split = None self._kv_overrides_array = None - 
self._stack.close() + if getattr(self, "_stack", None) is not None and hasattr(self._stack, "close"): + self._stack.close() + self._stack = None def __del__(self) -> None: self.close() diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ab60fa0c69..2e652c46d5 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2779,7 +2779,7 @@ def generate_streaming(tools, functions, function_call, prompt): ) -class Llava15ChatHandler: +class MTMDChatHandler: DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( """You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful.""" @@ -2819,21 +2819,38 @@ class Llava15ChatHandler: "{% endif %}" ) - def __init__(self, clip_model_path: str, verbose: bool = True, use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1): - import llama_cpp.mtmd_cpp as mtmd_cpp + def __init__( + self, + clip_model_path: str, + verbose: bool = True, + use_gpu: bool = True, + image_min_tokens: int = -1, + image_max_tokens: int = -1, + **kwargs + ): + + self.log_prefix = self.__class__.__name__ + if kwargs: + unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) + raise TypeError( + f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" + f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." 
+ ) self.clip_model_path = clip_model_path self.image_min_tokens = image_min_tokens self.image_max_tokens = image_max_tokens self.use_gpu = use_gpu self.verbose = verbose + + import llama_cpp.mtmd_cpp as mtmd_cpp self._mtmd_cpp = mtmd_cpp self._exit_stack = ExitStack() self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None self.extra_template_arguments: dict[str, Any] = {} if not os.path.exists(clip_model_path): - raise ValueError(f"Clip model path does not exist: {clip_model_path}") + raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") # Pre-compile Jinja template self.chat_template = ImmutableSandboxedEnvironment( @@ -2861,7 +2878,8 @@ def _init_mtmd_context(self, llama_model: llama.Llama): if self.image_max_tokens > 0: self.mctx_params.image_max_tokens = self.image_max_tokens if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: - raise ValueError(f"image_max_pixels {self.image_max_tokens} is less than image_min_pixels {self.image_min_tokens}") + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! 
image_max_tokens ({self.image_max_tokens}) " + f"cannot be less than image_min_tokens ({self.image_min_tokens}).") # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( @@ -2871,11 +2889,16 @@ def _init_mtmd_context(self, llama_model: llama.Llama): ) if self.mtmd_ctx is None: - raise ValueError(f"Failed to load mtmd context from: {self.clip_model_path}") + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") # Check if vision is supported - if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): + if self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) and self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) + else: raise ValueError("Vision is not supported by this model") + # Check if audio is supported + if self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) and self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) def close(self) -> None: """Explicitly free the mtmd context and vision model resources.""" @@ -2887,15 +2910,15 @@ def close(self) -> None: pass self.mtmd_ctx = None self.mctx_params = None + self.chat_template = None - self._exit_stack.close() + if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): + self._exit_stack.close() + self._exit_stack = None def __del__(self) -> None: self.close() - def load_image(self, image_url: str) -> bytes: - return self._load_image(image_url) - def _create_bitmap_from_bytes(self, image_bytes: bytes): """Create mtmd_bitmap from image bytes.""" if self.mtmd_ctx is None: @@ -2914,6 +2937,15 @@ def _create_bitmap_from_bytes(self, image_bytes: bytes): return bitmap + # Todo(JamePeng): Separate the workflow for building the prompt in __call__ + def _process_mtmd_prompt( + self, + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + ) -> Tuple[List[int], List[tuple], Any, 
List[Any]]: + pass + + def __call__( self, *, @@ -2975,7 +3007,13 @@ def __call__( ) ] + messages - image_urls = self.get_image_urls(messages) + try: + image_urls = self.get_image_urls(messages) + image_count = len(image_urls) + if self.verbose: + print(f"{self.log_prefix} - processing {image_count} images", file=sys.stderr) + except Exception: + print(f"{self.log_prefix} - get_image_urls() failed from the messages", file=sys.stderr) # Get the default media marker media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') @@ -3032,10 +3070,6 @@ def __call__( if result != 0: raise ValueError(f"Failed to tokenize input: error code {result}") - # Reset llama context - llama.reset() - llama._ctx.memory_clear(True) - # Process each chunk n_past = 0 n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) @@ -3101,6 +3135,7 @@ def __call__( # Cleanup bitmaps for bitmap in bitmap_cleanup: self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_array = None # Handle response format and tools (same as before) if response_format is not None and response_format["type"] == "json_object": @@ -3195,6 +3230,9 @@ def __call__( ) return _convert_completion_to_chat(completion_or_chunks, stream=stream) + def load_image(self, image_url: str) -> bytes: + return self._load_image(image_url) + @staticmethod def _load_image(image_url: str) -> bytes: """ @@ -3328,7 +3366,7 @@ def from_pretrained( local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", cache_dir: Optional[Union[str, os.PathLike[str]]] = None, **kwargs: Any, - ) -> "Llava15ChatHandler": + ) -> "MTMDChatHandler": import fnmatch from pathlib import Path @@ -3404,7 +3442,43 @@ def from_pretrained( ) -class ObsidianChatHandler(Llava15ChatHandler): +class Llava15ChatHandler(MTMDChatHandler): + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% endif %}" + + "{% if message.role == 'user' %}" + "{% if message.content is string %}" + "\nUSER: {{ 
message.content }}" + "{% elif message.content is iterable %}" + "\nUSER: " + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url if content.image_url is string else content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "{% endif %}" + + "{% if message.role == 'assistant' and message.content is not none %}" + "\nASSISTANT: {{ message.content }}" + "{% endif %}" + "{% endfor %}" + + "{% if add_generation_prompt %}" + "\nASSISTANT: " + "{% endif %}" + ) + + +class ObsidianChatHandler(MTMDChatHandler): # Prompt Format # The model followed ChatML format. However, with ### as the seperator @@ -3460,7 +3534,7 @@ class ObsidianChatHandler(Llava15ChatHandler): ) -class MoondreamChatHandler(Llava15ChatHandler): +class MoondreamChatHandler(MTMDChatHandler): # Chat Format: # f"\n\n{chat_history}Question: {question}\n\nAnswer:" CHAT_FORMAT = ( @@ -3502,7 +3576,7 @@ class MoondreamChatHandler(Llava15ChatHandler): ) -class Llava16ChatHandler(Llava15ChatHandler): +class Llava16ChatHandler(MTMDChatHandler): # Example prompt # "DEFAULT_SYSTEM_MESSAGE + USER: \nWhat is shown in this image? 
ASSISTANT:" @@ -3548,7 +3622,7 @@ class Llava16ChatHandler(Llava15ChatHandler): ) -class NanoLlavaChatHandler(Llava15ChatHandler): +class NanoLlavaChatHandler(MTMDChatHandler): # Prompt Format # The model follow the ChatML standard, however, without \n at the end of <|im_end|>: @@ -3603,7 +3677,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler): ) -class Llama3VisionAlphaChatHandler(Llava15ChatHandler): +class Llama3VisionAlphaChatHandler(MTMDChatHandler): # question = "" + q # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" @@ -3655,7 +3729,7 @@ class Llama3VisionAlphaChatHandler(Llava15ChatHandler): Llama3VisionAlpha = Llama3VisionAlphaChatHandler -class MiniCPMv26ChatHandler(Llava15ChatHandler): +class MiniCPMv26ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( "{% set image_count = namespace(value=0) %}" @@ -3695,7 +3769,7 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): ) -class MiniCPMv45ChatHandler(Llava15ChatHandler): +class MiniCPMv45ChatHandler(MTMDChatHandler): """ Handler for MiniCPM-V 4.5 models. @@ -3798,7 +3872,7 @@ def __init__(self, enable_thinking: bool = True, **kwargs): Args: enable_thinking (bool): If True, model generates reasoning before the final answer. - **kwargs: Additional arguments for the base Llava15ChatHandler. + **kwargs: Additional arguments for the base MTMDChatHandler. 
""" self.enable_thinking = enable_thinking super().__init__(**kwargs) @@ -3815,22 +3889,12 @@ def __call__(self, **kwargs): if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) - if hasattr(self, '_last_image_embed'): - self._last_image_embed = None - self._last_image_hash = None - if self.verbose: - messages = kwargs.get('messages', []) - try: - image_count = len(self.get_image_urls(messages)) - print(f"MiniCPMV45ChatHandler(enable_thinking={self.enable_thinking}) - Processing {image_count} images", file=sys.stderr) - except Exception: - print(f"MiniCPMV45ChatHandler - Cleared state", file=sys.stderr) - + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") return super().__call__(**kwargs) -class Gemma3ChatHandler(Llava15ChatHandler): +class Gemma3ChatHandler(MTMDChatHandler): GEMMA3_BOI_TOKEN = "" GEMMA3_EOI_TOKEN = "" @@ -3888,7 +3952,7 @@ class Gemma3ChatHandler(Llava15ChatHandler): ) -class GLM41VChatHandler(Llava15ChatHandler): +class GLM41VChatHandler(MTMDChatHandler): # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. 
GLM41V_EOS_TOKEN = "<|endoftext|>" @@ -3944,24 +4008,14 @@ def __call__(self, **kwargs): if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) - # Clear any handler state - if hasattr(self, '_last_image_embed'): - self._last_image_embed = None - self._last_image_hash = None - if self.verbose: - messages = kwargs.get('messages', []) - try: - image_count = len(self.get_image_urls(messages)) - print(f"GLM4VChatHandler - Cleared state, processing {image_count} images", file=sys.stderr) - except Exception: - print(f"GLM4VChatHandler - Cleared state", file=sys.stderr) + print(f"{self.log_prefix} - Start processing") # Use parent implementation return super().__call__(**kwargs) -class GLM46VChatHandler(Llava15ChatHandler): +class GLM46VChatHandler(MTMDChatHandler): GLM46V_EOS_TOKEN = "<|endoftext|>" GLM46V_PAD_TOKEN = "<|endoftext|>" GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" @@ -4043,22 +4097,13 @@ def __call__(self, **kwargs): if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) - if hasattr(self, '_last_image_embed'): - self._last_image_embed = None - self._last_image_hash = None - if self.verbose: - messages = kwargs.get('messages', []) - try: - image_count = len(self.get_image_urls(messages)) - print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Processing {image_count} images", file=sys.stderr) - except Exception: - print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Cleared state", file=sys.stderr) + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") return super().__call__(**kwargs) -class GraniteDoclingChatHandler(Llava15ChatHandler): +class GraniteDoclingChatHandler(MTMDChatHandler): """ Handler for Granite-Docling models. 
@@ -4127,22 +4172,14 @@ def __call__(self, **kwargs): if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) - if hasattr(self, '_last_image_embed'): - self._last_image_embed = None - self._last_image_hash = None - if self.verbose: - messages = kwargs.get('messages', []) - try: - image_count = len(self.get_image_urls(messages)) - print(f"GraniteDoclingChatHandler - Cleared state, processing {image_count} images", file=sys.stderr) - except Exception: - print(f"GraniteDoclingChatHandler - Cleared state", file=sys.stderr) + print(f"{self.log_prefix} - Start processing") + return super().__call__(**kwargs) -class LFM2VLChatHandler(Llava15ChatHandler): +class LFM2VLChatHandler(MTMDChatHandler): LFM2VL_BOS_TOKEN = "<|startoftext|>" LFM2VL_EOS_TOKEN = "<|im_end|>" LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" @@ -4189,22 +4226,13 @@ def __call__(self, **kwargs): if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) - if hasattr(self, '_last_image_embed'): - self._last_image_embed = None - self._last_image_hash = None - if self.verbose: - messages = kwargs.get('messages', []) - try: - image_count = len(self.get_image_urls(messages)) - print(f"LFM2VLChatHandler - Cleared state, Processing {image_count} images", file=sys.stderr) - except Exception: - print(f"LFM2VLChatHandler - Cleared state", file=sys.stderr) + print(f"{self.log_prefix} - Start processing") return super().__call__(**kwargs) -class PaddleOCRChatHandler(Llava15ChatHandler): +class PaddleOCRChatHandler(MTMDChatHandler): """ Handler for PaddleOCR 1.5 multimodal models. 
""" @@ -4306,22 +4334,13 @@ def __call__(self, **kwargs): if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) - if hasattr(self, '_last_image_embed'): - self._last_image_embed = None - self._last_image_hash = None - if self.verbose: - messages = kwargs.get('messages', []) - try: - image_count = len(self.get_image_urls(messages)) - print(f"PaddleOCRChatHandler - Cleared state, Processing {image_count} images", file=sys.stderr) - except Exception: - print(f"PaddleOCRChatHandler - Cleared state", file=sys.stderr) + print(f"{self.log_prefix} - Start processing") return super().__call__(**kwargs) -class Qwen25VLChatHandler(Llava15ChatHandler): +class Qwen25VLChatHandler(MTMDChatHandler): CHAT_FORMAT = ( "{% set image_count = namespace(value=0) %}" "{% for message in messages %}" @@ -4358,24 +4377,14 @@ def __call__(self, **kwargs): if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) - # Clear any handler state - if hasattr(self, '_last_image_embed'): - self._last_image_embed = None - self._last_image_hash = None - if self.verbose: - messages = kwargs.get('messages', []) - try: - image_count = len(self.get_image_urls(messages)) - print(f"Qwen25VLChatHandler - Cleared state, processing {image_count} images", file=sys.stderr) - except Exception: - print(f"Qwen25VLChatHandler - Cleared state", file=sys.stderr) + print(f"{self.log_prefix} - Start processing") # Use parent implementation return super().__call__(**kwargs) -class Qwen3VLChatHandler(Llava15ChatHandler): +class Qwen3VLChatHandler(MTMDChatHandler): CHAT_FORMAT = ( "{{- '<|im_start|>system\n' -}}" "{%- if messages[0].content is string and messages[0].role == 'system' -%}" @@ -4497,18 +4506,8 @@ def __call__(self, **kwargs): if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) - # Clear any handler state - if hasattr(self, '_last_image_embed'): - self._last_image_embed = None - self._last_image_hash = None - if self.verbose: - messages = kwargs.get('messages', []) - try: - image_count = 
len(self.get_image_urls(messages)) - print(f"Qwen3VLHandler(force_reasoning={self.force_reasoning}) - Cleared state, processing {image_count} images", file=sys.stderr) - except Exception: - print(f"Qwen3VLHandler(force_reasoning={self.force_reasoning}) - Cleared state", file=sys.stderr) + print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing") # Use parent implementation return super().__call__(**kwargs) From 22df7474200b1373f5b115d7c01aa57b2635ed2c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 1 Mar 2026 16:53:38 +0800 Subject: [PATCH 217/518] fix: Correct the mtmd vision check condition bug --- llama_cpp/llama_chat_format.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 2e652c46d5..8feb465805 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2892,13 +2892,16 @@ def _init_mtmd_context(self, llama_model: llama.Llama): raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") # Check if vision is supported - if self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) and self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) + if self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) else: - raise ValueError("Vision is not supported by this model") + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Vision is not supported by this model") + # Check if audio is supported - if self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) and self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) + if self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx): + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", 
file=sys.stderr) def close(self) -> None: """Explicitly free the mtmd context and vision model resources.""" From 5bf6b6aca35d95a5c158e69c535b5c80f6cdaec4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 1 Mar 2026 21:36:48 +0800 Subject: [PATCH 218/518] refactor(mtmd): redesign multimodal pipeline for concurrent I/O and hybrid state management This commit fundamentally restructures the `MTMDChatHandler` pipeline, decoupling the prefill and evaluation stages to resolve previous I/O bottlenecks and state-sync issues. The new architecture fully supports hybrid/recurrent multimodal models (e.g., Qwen3.5s, LFM2-VL) with robust context management. Key structural advantages and changes: - Concurrent Media Decoding: Implemented `ThreadPoolExecutor` in `_process_mtmd_prompt` with pre-allocated arrays, allowing thread-safe parallel image/audio decoding while strictly preserving the chronological order of user inputs, and can be used in the future to process large numbers of video frames. - O(1) Prefix Matching ("Negative Reverse Vocabulary"): Replaced slow dictionary lookups with a deterministic hash-to-negative-integer mapping for media IDs. This isolates media tokens from the LLM's positive vocabulary space, enabling native, ultra-fast `longest_token_prefix` array comparisons in Python. - Hybrid Model State Management: Replaced aggressive mid-turn saving with highly efficient "End-of-Turn" checkpointing. This ensures multi-image prompts consume only a single LRU slot while allowing precise rollback to the nearest valid state upon cache misses. - Robust Context Shift (OOM Defense): The `__call__` loop now preemptively calculates token boundaries for upcoming multimodal chunks, safely discarding the oldest unpinned tokens from both the physical KV cache and the Python virtual ledger to prevent backend crashes. 
- Qwen3.5 Support CONFIRMED, waiting for Qwen35ChatHandler PR merge Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 436 +++++++++++++++++++++++---------- 1 file changed, 313 insertions(+), 123 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 8feb465805..cc1193df6c 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2845,7 +2845,6 @@ def __init__( import llama_cpp.mtmd_cpp as mtmd_cpp self._mtmd_cpp = mtmd_cpp - self._exit_stack = ExitStack() self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None self.extra_template_arguments: dict[str, Any] = {} @@ -2858,6 +2857,8 @@ def __init__( lstrip_blocks=True, ).from_string(self.CHAT_FORMAT) + self._exit_stack = ExitStack() + def _init_mtmd_context(self, llama_model: llama.Llama): """Initialize mtmd context with the llama model.""" if self.mtmd_ctx is not None: @@ -2881,6 +2882,13 @@ def _init_mtmd_context(self, llama_model: llama.Llama): raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) " f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + # Cache the model's eos token and bos token + self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') + self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') + + # Cache the mtmd_default_marker + self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( self.clip_model_path.encode(), @@ -2922,32 +2930,206 @@ def close(self) -> None: def __del__(self) -> None: self.close() - def _create_bitmap_from_bytes(self, image_bytes: bytes): - """Create mtmd_bitmap from image bytes.""" + def _create_bitmap_from_bytes(self, media_bytes: bytes): + """ + Constructs an mtmd_bitmap structure from a raw byte buffer containing media data.
+ + Supported formats: + - Images (via stb_image): jpg, png, bmp, etc. + - Audio (via miniaudio): wav, mp3, flac. + + Note: + - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. + - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. + + Args: + media_bytes (bytes): The raw byte content of the media file. + + Returns: + mtmd_bitmap: A pointer to the allocated bitmap structure containing decoded media features. + """ if self.mtmd_ctx is None: - raise ValueError("mtmd context not initialized") + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") with suppress_stdout_stderr(disable=self.verbose): # Create bitmap from buffer using helper function bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( self.mtmd_ctx, - (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)), - len(image_bytes) + (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), + len(media_bytes) ) if bitmap is None: - raise ValueError("Failed to create bitmap from image bytes") + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): " + "Failed to load image or audio file from media bytes " + "(unsupported media format or corrupted data).") return bitmap - # Todo(JamePeng): Separate the workflow for building the prompt in __call__ + def _process_mtmd_prompt( self, llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], ) -> Tuple[List[int], List[tuple], Any, List[Any]]: - pass + """ + Core multimodal preprocessing pipeline. + Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger. + + Features: + - Thread-safe concurrent media decoding to eliminate I/O bottlenecks. + - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens. + - Strict RAII-style C++ memory management to prevent leaks on failure. 
+ + Returns: + full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching. + chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id). + chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller). + bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation. + """ + # 1. Inject default system prompt if omitted by the user + system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "") + if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: + messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages + + image_urls = self.get_image_urls(messages) + media_marker = self.media_marker + + # 2. Render the chat template and replace actual URLs with C++ media markers + text = self.chat_template.render( + messages=messages, + add_generation_prompt=True, + eos_token=self.mtmd_eos_token, + bos_token=self.mtmd_bos_token, + **getattr(self, 'extra_template_arguments', {}) + ) + # Replace image_url by media_marker in text + for url in image_urls: + text = text.replace(url, media_marker) + + if self.verbose: + print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Image count: {len(image_urls)}.", file=sys.stderr) + print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) + + # 3. Pre-allocate bitmap array to guarantee chronological order during concurrent decoding + bitmaps = [None] * len(image_urls) + bitmap_cleanup = [] + chunks = None + + try: + # Concurrent Media Decoding + import concurrent.futures + if image_urls: + def _create_bitmap_func(idx: int, url: str): + media_bytes = self.load_image(url) + bitmap = self._create_bitmap_from_bytes(media_bytes) + return idx, bitmap + # This method uses multi-threaded parallel processing to convert images to bitmaps, + # which can be used in the future to process large numbers of video frames. 
+ max_workers = min(llama.n_threads, len(image_urls)) + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [executor.submit(_create_bitmap_func, i, url) for i, url in enumerate(image_urls)] + + for future in concurrent.futures.as_completed(futures): + idx, bitmap = future.result() + bitmaps[idx] = bitmap + bitmap_cleanup.append(bitmap) + + # Strict validation: Abort if any thread failed to decode its assigned media + if any(b is None for b in bitmaps): + raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") + else: + if self.verbose: + print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(image_urls)} bitmaps were successfully created.") + else: + # If there are no images, set the bitmaps to empty. + bitmaps = [] + + # 4. Initialize mtmd_input_chunks + input_text = self._mtmd_cpp.mtmd_input_text() + input_text.text = text.encode('utf-8') + input_text.add_special = (llama.n_tokens == 0) + input_text.parse_special = True + + chunks = self._mtmd_cpp.mtmd_input_chunks_init() + if chunks is None: + raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.") + + # 5. Hybrid Tokenization (Text + Media binding) + if len(bitmaps) > 0: + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps) + ) + else: + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0 + ) + + if result != 0: + raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") + + # 6. 
Virtual Token Ledger Construction + full_prompt_ids = [] + chunk_token_spans = [] + current_idx = 0 + n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + + for i in range(n_chunks): + chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) + if chunk is None: continue + chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) + + if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: + # Extract standard text token IDs + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) + if tokens_ptr and n_tokens_out.value > 0: + tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) + full_prompt_ids.extend(tokens) + current_idx += len(tokens) + elif chunk_type in [ + self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, + self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + ]: + # Extract media properties + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) + chunk_id_bytes = self._mtmd_cpp.mtmd_input_chunk_get_id(chunk) + + if chunk_id_bytes: + # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) + # Create Negative Reverse Vocabulary ID: -100 to -16,777,316 + # Improved longest_token_prefix search matching performance + media_id = - (abs(hash(chunk_id_bytes.decode('utf-8', errors='ignore'))) % (2**24)) - 100 + else: + # Magic Negative Number as fallback :) + media_id = -314159 + + chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) + + # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache + full_prompt_ids.extend([media_id] * chunk_n_tokens) + current_idx += chunk_n_tokens + else: + raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.") + return full_prompt_ids, chunk_token_spans, 
chunks, bitmap_cleanup + + except Exception as e: + # Ensure no useless pointers remain upon any failure + # Free chunks + if chunks is not None: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + chunks = None + # Free bitmaps + if len(bitmap_cleanup) > 0: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_cleanup = None + bitmaps = None + + raise e def __call__( self, @@ -2998,146 +3180,154 @@ def __call__( llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], ]: - # Initialize mtmd context + # 1. Initialize mtmd context self._init_mtmd_context(llama) assert self.mtmd_ctx is not None - system_prompt = _get_system_message(messages) - if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: - messages = [ - llama_types.ChatCompletionRequestSystemMessage( - role="system", content=self.DEFAULT_SYSTEM_MESSAGE - ) - ] + messages - - try: - image_urls = self.get_image_urls(messages) - image_count = len(image_urls) - if self.verbose: - print(f"{self.log_prefix} - processing {image_count} images", file=sys.stderr) - except Exception: - print(f"{self.log_prefix} - get_image_urls() failed from the messages", file=sys.stderr) - - # Get the default media marker - media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') - - # Replace image URLs with media markers in the template - text = self.chat_template.render( - messages=messages, - tools=tools, - add_generation_prompt=True, - eos_token=llama.detokenize([llama.token_eos()]), - bos_token=llama.detokenize([llama.token_bos()]), - **self.extra_template_arguments - ) - - # Replace image URLs in text with media markers - for image_url in image_urls: - text = text.replace(image_url, media_marker) + # 2. 
Concurrent Preprocessing & Ledger Construction + full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt(llama, messages) if self.verbose: - print(text, file=sys.stderr) + print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) - # Create bitmaps from images - bitmaps = [] - bitmap_cleanup = [] try: - for image_url in image_urls: - image_bytes = self.load_image(image_url) - bitmap = self._create_bitmap_from_bytes(image_bytes) - bitmaps.append(bitmap) - bitmap_cleanup.append(bitmap) - - # Create input text structure - input_text = self._mtmd_cpp.mtmd_input_text() - input_text.text = text.encode('utf-8') - input_text.add_special = (llama.n_tokens == 0) - input_text.parse_special = True + # 3. KV Cache Synchronization & State Rollback + # Compares the virtual ledger with physical history to prevent Cache Poisoning. + current_history = llama.input_ids[:llama.n_tokens].tolist() + longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids) + + if longest_prefix < llama.n_tokens: + if llama.is_hybrid and llama._hybrid_cache_mgr is not None: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " + f"Searching for nearest checkpoint...", file=sys.stderr) + + best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) + if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): + llama.n_tokens = best_ckpt.pos + if self.verbose: + print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) + else: + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Prefix mismatch. 
Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) + llama._ctx.memory_seq_rm(0, longest_prefix, -1) + llama.n_tokens = longest_prefix - # Create input chunks - chunks = self._mtmd_cpp.mtmd_input_chunks_init() - if chunks is None: - raise ValueError("Failed to create input chunks") + n_past = llama.n_tokens - try: - # Tokenize text and images together - bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, - chunks, - ctypes.byref(input_text), - bitmap_array, - len(bitmaps) - ) + for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: + # Skip previously matched chunks + if end_idx <= n_past: + continue - if result != 0: - raise ValueError(f"Failed to tokenize input: error code {result}") + if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: + unprocessed_start = max(start_idx, n_past) - start_idx + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) - # Process each chunk - n_past = 0 - n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + if tokens_ptr and n_tokens_out.value > 0: + all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + tokens_to_eval = all_tokens[unprocessed_start:] - for i in range(n_chunks): - chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) - if chunk is None: continue + if tokens_to_eval: + if self.verbose: + print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + # Text evaluation delegates shift and chunking to native llama.eval + llama.eval(tokens_to_eval) + n_past = llama.n_tokens - chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) + elif chunk_type in [ + self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, + 
self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + ]: + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) + + if self.verbose: + media_str = "IMAGE" if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE else "AUDIO" + print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + + # Stage 5: Multimodal Physical OOM Defense + if n_past + chunk_n_tokens > llama.n_ctx(): + if llama._ctx.memory_can_shift(): + raise RuntimeError( + f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " + f"(n_pos_per_embd > 1 or incompatible M-RoPE). " + f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " + f"You MUST increase n_ctx to fit the dialogue." + ) + else: + # Safely discard oldest tokens while preserving system prompts + n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch + n_keep = min(llama.n_keep, n_past) + n_discard = min(n_discard, n_past - n_keep) - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: - # Handle text chunk - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) + if n_discard <= 0: + raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") - if tokens_ptr and n_tokens_out.value > 0: - # Convert ctypes array to Python list - tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + if self.verbose: + print(f"{self.log_prefix}(__call__): OOM risk detected. 
Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) - if llama.n_tokens + len(tokens) > llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" - ) - llama.n_tokens = n_past - llama.eval(tokens) - n_past = llama.n_tokens + # Execute physical memory shift + llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) + llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) - elif chunk_type in [self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO]: - # Handle image/audio chunk using helper - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) + # Shift python virtual array to match + remaining_len = n_past - (n_keep + n_discard) + if remaining_len > 0: + llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] - if n_past + chunk_n_tokens > llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {n_past + chunk_n_tokens} > {llama.n_ctx()}" - ) + n_past -= n_discard + llama.n_tokens = n_past - new_n_past = llama_cpp.llama_pos(0) - result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( - self.mtmd_ctx, - llama._ctx.ctx, - chunk, - llama_cpp.llama_pos(n_past), - llama_cpp.llama_seq_id(0), - llama.n_batch, - False, # logits_last - ctypes.byref(new_n_past) - ) + # Execute C++ Multimodal Black-box Extraction + new_n_past = llama_cpp.llama_pos(0) + result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( + self.mtmd_ctx, + llama._ctx.ctx, + chunk_ptr, + llama_cpp.llama_pos(n_past), + llama_cpp.llama_seq_id(0), + llama.n_batch, + True, # logits_last = True, drastically saves computational overhead + ctypes.byref(new_n_past) + ) - if result != 0: - raise ValueError(f"Failed to evaluate chunk: error code {result}") + if result != 0: + raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error 
code {result}.") - # Update llama's token count - n_past = new_n_past.value - llama.n_tokens = n_past + # Update Ledger with "Negative Reverse Vocabulary" IDs + llama.input_ids[n_past : new_n_past.value] = media_id + n_past = new_n_past.value + llama.n_tokens = n_past - # Get prompt tokens to avoid a cache miss - prompt = llama.input_ids[: llama.n_tokens].tolist() + # Extract the final, perfectly synchronized prompt sequence + prompt = llama.input_ids[: llama.n_tokens].tolist() - finally: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) + # End-of-Turn Checkpoint + # Anchors the state ONLY after the entire multi-modal turn is processed + if llama.is_hybrid and llama._hybrid_cache_mgr is not None: + if self.verbose: + print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) + llama._hybrid_cache_mgr.save_checkpoint( + current_pos=llama.n_tokens, + tokens=prompt, + seq_id=0 + ) finally: + # Cleanup chunks + if chunks is not None: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + chunks = None # Cleanup bitmaps - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) + if bitmap_cleanup: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_cleanup.clear() bitmap_array = None # Handle response format and tools (same as before) From 2258973465a79a2c5526dd2b9bbe0cd7fac6708e Mon Sep 17 00:00:00 2001 From: AlcoftTAO Date: Sun, 1 Mar 2026 19:22:17 +0100 Subject: [PATCH 219/518] update chat handler --- llama_cpp/llama_chat_format.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 4d06966bcf..cff03918d6 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4681,7 +4681,7 @@ def __init__( - False: Doesn't count the images. Can save tokens with single-image. 
""" super().__init__(**kwargs) - + self.force_reasoning = force_reasoning self.extra_template_arguments["force_reasoning"] = force_reasoning self.extra_template_arguments["add_vision_id"] = add_vision_id @@ -4697,7 +4697,7 @@ def __call__(self, **kwargs): # Use parent implementation return super().__call__(**kwargs) -class Qwen35ChatHandler(Llava15ChatHandler): +class Qwen35ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( "{%- set image_count = namespace(value=0) -%}" "{%- set video_count = namespace(value=0) -%}" @@ -4864,13 +4864,13 @@ class Qwen35ChatHandler(Llava15ChatHandler): def __init__( self, - reasoning: bool = True, + enable_thinking: bool = True, add_vision_id: bool = True, **kwargs, ): """ Parameters: - - reasoning (bool): + - enable_thinking (bool): - True (default): Enables reasoning for better results. - False: Disables reasoning for faster results. - add_vision_id (bool): @@ -4878,8 +4878,8 @@ def __init__( - False: Doesn't count the images. Can save tokens with single-image. 
""" super().__init__(**kwargs) - - self.extra_template_arguments["enable_thinking"] = reasoning + self.enable_thinking = enable_thinking + self.extra_template_arguments["enable_thinking"] = enable_thinking self.extra_template_arguments["add_vision_id"] = add_vision_id def __call__(self, **kwargs): @@ -4888,18 +4888,8 @@ def __call__(self, **kwargs): if hasattr(llama, 'input_ids'): llama.input_ids.fill(0) - # Clear any handler state - if hasattr(self, '_last_image_embed'): - self._last_image_embed = None - self._last_image_hash = None - if self.verbose: - messages = kwargs.get('messages', []) - try: - image_count = len(self.get_image_urls(messages)) - print(f"Qwen35ChatHandler - Cleared state, processing {image_count} images", file=sys.stderr) - except Exception: - print(f"Qwen35ChatHandler - Cleared state", file=sys.stderr) + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") # Use parent implementation return super().__call__(**kwargs) From 8e8c5208074e74275bc68308275d84265cecaffc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 2 Mar 2026 19:52:20 +0800 Subject: [PATCH 220/518] Update Submodule vendor/llama.cpp d979f2b..2afcdb9 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d979f2b176..2afcdb9777 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d979f2b176217a062c57c24355db8c1dc3dfc7d5 +Subproject commit 2afcdb9777b1bac79fa4bfe284b9bf23085b0469 From 3f8f0f89a2b72ec2f9494fa5f14206591a5cde49 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 2 Mar 2026 20:00:34 +0800 Subject: [PATCH 221/518] Update README.md --- README.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 885ae0134d..0d651dd4d9 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,28 @@ This package provides: Documentation is available at 
[https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). + +## Discussions + +Starting March 2026, I am excited to announce that we have officially enabled the **Discussions** tab for `llama-cpp-python`! + +You can access it right here: [GitHub Discussions](https://github.com/JamePeng/llama-cpp-python/discussions). + +**Why Discussions? & Updates on Documentation** +As the project has evolved, our existing documentation (`docs`) has unfortunately become a bit bloated and outdated. To provide you with more timely and clear information: + +* **New Feature Releases:** Moving forward, whenever a new feature is rolled out, I will publish a dedicated standalone article in the Discussions section. These posts will include detailed explanations, usage guides, and important caveats. +* This approach will serve as a more agile and interactive "live documentation" while we figure out the best way to refactor the old docs. + +**Join the Community** +I warmly welcome all of you to use this new space. Let's build together: + +* 💬 **Discuss & Share:** Have a question, an idea, or a cool use case? Share it with the community! +* 🛠️ **Maintain & Test:** Help us test new features, troubleshoot issues, and collaboratively maintain the repository. +* 📚 **Learn & Grow:** I hope everyone can benefit from this project, learn from each other, and gain valuable insights. + +Thank you for your continuous support! 
+ ## Installation Requirements: @@ -522,6 +544,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | +| [qwen3.5](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) | `Qwen35ChatHandler` | `qwen3.5` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. @@ -1039,7 +1062,7 @@ This package is under active development and I welcome any contributions. To get started, clone the repository and install the package in editable / development mode: ```bash -git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git +git clone https://github.com/JamePeng/llama-cpp-python --recursive cd llama-cpp-python # Upgrade pip (required for editable mode) From 41959f5bc3fdbdbc33f9d5578ac4516e08a8d8c8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 2 Mar 2026 20:22:07 +0800 Subject: [PATCH 222/518] Bump version to milestone version 0.3.30. --- CHANGELOG.md | 127 ++++++++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 128 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84e63ad2b5..7904b39e4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,133 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.30] Milestone Release + +I will update the release notes for version 0.3.30 in the [discussion](https://github.com/JamePeng/llama-cpp-python/discussions). 
+ +- refactor(mtmd): redesign multimodal pipeline for concurrent I/O and hybrid state management +This commit fundamentally restructures the `MTMDChatHandler` pipeline, decoupling the prefill and evaluation stages to resolve previous I/O bottlenecks and state-sync issues. The new architecture fully supports hybrid/recurrent multimodal models (e.g., Qwen3.5s, LFM2-VL) with robust context management. + + Key structural advantages and changes: + - Concurrent Media Decoding: Implemented `ThreadPoolExecutor` in `_process_mtmd_prompt` with pre-allocated arrays, allowing thread-safe parallel image/audio decoding while strictly preserving the chronological order of user inputs, and can be used in the future to process large numbers of video frames. + - O(1) Prefix Matching ("Negative Reverse Vocabulary"): Replaced slow dictionary lookups with a deterministic hash-to-negative-integer mapping for media IDs. This isolates media tokens from the LLM's positive vocabulary space, enabling native, ultra-fast `longest_token_prefix` array comparisons in Python. + - Hybrid Model State Management: Replaced aggressive mid-turn saving with highly efficient "End-of-Turn" checkpointing. This ensures multi-image prompts consume only a single LRU slot while allowing precise rollback to the nearest valid state upon cache misses. + - Robust Context Shift (OOM Defense): The `__call__` loop now preemptively calculates token boundaries for upcoming multimodal chunks, safely discarding the oldest unpinned tokens from both the physical KV cache and the Python virtual ledger to prevent backend crashes. 
+ - Qwen3.5 Support CONFIRMED, waiting for the Qwen35ChatHandler PR merge + +- merge: Implemented Qwen35ChatHandler for Qwen3.5 (by **@alcoftTAO**) + +- fix: Correct the mtmd vision check condition bug + +- refactor(chat_handler): extract MTMDChatHandler base class and simplify the complexity of subsequent multimodal adaptation + - Extracted the core multimodal processing pipeline from `Llava15ChatHandler` into a generic `MTMDChatHandler` base class, separating pipeline logic from model-specific prompt formats. + - Updated all multimodal subclass handlers (e.g., Gemma3, Granite-Docling, PaddleOCR, Qwen2.5vl, Qwen3-vl, MiniCPM, GLM4.xV, LFM2-VL) to inherit from the new base class `MTMDChatHandler`. + - Implemented strict `**kwargs` validation in the base constructor to gracefully intercept and report unsupported parameters, significantly improving Developer Experience (DX). + - Introduced dynamic `self.log_prefix` (`self.__class__.__name__`) for accurate and consistent logging across all subclasses. + - Cleaned up redundant state-clearing, image-count logic and hardcoded print statements across subclass `__call__` implementations. + - To avoid exceptions occurring when the close method is called due to initialization failure and the call to exit_stack. + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/2afcdb9777b1bac79fa4bfe284b9bf23085b0469](https://github.com/ggml-org/llama.cpp/commit/2afcdb9777b1bac79fa4bfe284b9bf23085b0469) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260301 + +Many thanks to **@yamikumo-DSD** and **@roj234** for providing detailed testing and valuable suggestions. 
+ +More information see: https://github.com/JamePeng/llama-cpp-python/compare/e4861df5fd44bb83ec2b9063ca3375759416aead...3f8f0f89a2b72ec2f9494fa5f14206591a5cde49 + +## [0.3.29] + +- perf(eval): implement adaptive checkpoint intervals for hybrid models + - Dynamically scale checkpoint frequency during large prompt pre-filling (max 3 triggers per eval) to minimize I/O bottlenecks and stuttering. + - Add success validation to `save_checkpoint`, ensuring the `last_ckpt_pos` tracker is only updated when the state is successfully saved to disk/memory. + - Enhance verbose logging to track dynamic interval calculations and save failures. + +- fix(eval): make context shift mathematically robust and architecture-safe + - Added a `memory_can_shift()` pre-flight check to proactively intercept and abort gracefully on architectures that physically forbid shifting (e.g., multimodal mmproj where `n_pos_per_embd > 1` or incompatible M-RoPE), preventing fatal `GGML_ASSERT` C++ crashes. + - Implemented dynamic mathematical bounds for `n_keep` and `n_discard` to guarantee that enough space is always freed, completely eliminating the edge-case where `n_discard` evaluates to 0 (causing a dead-loop when `n_ctx` is extremely small). + - Wrapped underlying C++ memory shift operations in a try-except block for defense-in-depth against unexpected backend failures. + - Expanded in-code documentation to clarify the arithmetic constraints and architectural limitations of the KV shift mechanism. + +- Add the memory_can_shift API to class LlamaContext + +- feat(eval): enable native context shift for hybrid/recurrent models + - Removed the `RuntimeError` that previously blocked context shifting for hybrid and SWA architectures. + - Delegated the shift logic to the underlying C++ backend, which automatically handles Attention KV removal and RNN `pos` shifting. + - Added dynamic verbose logging to clearly identify the model type (Transformer vs. Hybrid/Recurrent/SWA) during a context shift event. 
+ +- fix(eval): prevent batch size from halving below 1 during KV slot exhaustion + - Added an explicit guard to break the dynamic batch downgrade loop when `current_batch_size` is exactly 1 and a Code 1 (No KV slot) is returned. + - Prevents the engine from executing an invalid `1 // 2` operation and generating the confusing "Halving batch size from 1 to 0" verbose log. + - Ensures the evaluation process fails fast and aborts gracefully when physical VRAM is completely depleted and no further fallback is mathematically possible. + +- feat(hybrid): add periodic checkpointing and adaptive batch handling + - Increase default `ctx_checkpoints` from 16 to 32 + - Add new parameter `checkpoint_interval` (default: 4096) for hybrid model state snapshots + - Implement robust dynamic batch downgrade on KV cache exhaustion (status=1) + - Introduce periodic checkpoint saves during eval in hybrid mode + - Improve error handling and logging around context shifts and decoding failures + +- Optimization (decode): treat KV slot exhaustion (code 1) as a recoverable return value + - Updated the `decode` wrapper to explicitly return `1` instead of raising a `RuntimeError` when `llama_decode` indicates no KV slots are available. + - Aligned Python API behavior with the underlying C++ contract, treating code 1 as a recoverable signal rather than a fatal crash. + - Enabled upper-level caller loops (like `eval`) to gracefully handle VRAM fragmentation via dynamic batch halving without relying on clumsy try-except block string parsing. + - Retained strict `RuntimeError` exceptions for truly fatal backend failures (e.g., codes -1, -2, -3). + - Added comprehensive docstrings detailing return codes and exception scenarios. + +- feat(core): overhaul generate and eval for hybrid model support (Qwen3-next, Qwen3.5, etc.) + - Integrated `HybridCheckpointCache` into the generation loop to support state rollback for recurrent/hybrid architectures. 
+ - Implemented Context Shift (sliding window) in `eval` to gracefully prevent OOM when exceeding `n_ctx`. + - Adapted `eval` to use the newly vectorized `LlamaBatch.add_sequence` API with dynamic `logits_array` configuration. + - Fixed the full prefix match bug by forcing a 1-token re-evaluation to refresh logits. + - Disabled speculative decoding for hybrid models to prevent irreversible state pollution. + - Wrapped the generation loop in a `try...finally` block to guarantee safe checkpoint saving. + +- refactor(LlamaBatch): replace set_batch with granular add_token + vectorized add_sequence + - Introduce high-performance add_token() for single-token append in generation loop + - Add flexible add_sequence() with per-token pos/seq_ids/logits arrays + - Remove old set_batch() that assumed single-seq + forced last logit + - Better support for multi-sequence and precise logit control + +## [0.3.28] + +- fix(HybridCheckpointCache): ValueError: bytes must be in range(0, 256) + +- feat: add HybridCheckpointCache detect support for recurrent/hybrid/SWA models + - Introduce ctx_checkpoints parameter (default 16) + - Detect recurrent / hybrid / n_swa > 0 models in __init__ + - Automatically use HybridCheckpointCache when hybrid architecture is detected + - Properly close and clear HybridCheckpointCache in __del__ + +- fix(cache): add safety guards to checkpoint restore and optimize API calls + - Replaced direct `llama_cpp` API calls with cached function pointers (`self._get_size_ext`, etc.) for better performance and consistency. + - Added sequence ID validation with verbose error logging to prevent cross-sequence contamination. + - Added strict state size validation before restoration to prevent buffer overflows and backend segmentation faults. + +- Remove redundant seq_id and add resource cleanup + - Removed `seq_id` from `HybridCheckpointCache` initialization to make it a stateless, global multi-sequence manager. 
+ - Added `close()` and `__del__()` methods to safely release C++ context references and prevent memory leaks. + +- feat(cache): implement HybridCheckpointCache for hybrid/recurrent models +Introduces a dedicated caching mechanism to support state rollback for +models that cannot physically truncate their KV cache (e.g., Qwen3-Next, Qwen3.5, +etc.). + + Key additions and changes: + - Add `HybridCheckpoint` dataclass to store RNN state snapshots along with their binary data and metadata. + - Implement `HybridCheckpointCache` to manage sequence-specific states using the `llama_state_seq_*_ext` C++ APIs. + - Introduce `_hash_prefix` using SHA-256 to guarantee cryptographic certainty when matching prompt histories, preventing state corruption. + - Add `save_checkpoint` with a FIFO eviction policy to strictly bound memory usage based on `max_checkpoints`. + - Add `restore_checkpoint` to securely inject valid RNN states back into the C++ backend. + - Explicitly disable incompatible dictionary interfaces (`__getitem__`, `__setitem__`, `__contains__`) inherited from `BaseLlamaCache`. + - Refactor module imports (alphabetical sorting) and relocate `LlamaDiskCache` for better structural consistency. 
+ +- Remove the hack code in llama_chat_format.py + +- LLama: Optimize KV cache management for multi-round conversations + - Implements prefix-matching logic to truncate stale "ghost" tokens in C++ KV cache + - Prevents attention misalignment and context poisoning during multi-turn interactions + - Reduces memory overhead by reusing matched prefixes efficiently + ## [0.3.27] - feat: add `PaddleOCR-VL-1.5` multimodal chat handler `PaddleOCRChatHandler` diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 369f24ca88..b72459f653 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.27" +__version__ = "0.3.30" From f5f76a1d13ecfe7ea5ae67c5eaca36e580226dac Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 4 Mar 2026 20:53:51 +0800 Subject: [PATCH 223/518] Update Submodule vendor/llama.cpp 2afcdb9..7f5ee54 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2afcdb9777..7f5ee54968 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2afcdb9777b1bac79fa4bfe284b9bf23085b0469 +Subproject commit 7f5ee549683d600ad41db6a295a232cdd2d8eb9f From b342f70f7b309cb247317afcb8ff41e3193162d6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 4 Mar 2026 22:41:56 +0800 Subject: [PATCH 224/518] refactor(mtmd): introduce omni-modal media pipeline with experimental audio support This commit significantly overhauls the media parsing and loading pipeline in `MTMDChatHandler` to gracefully handle both vision and audio inputs, establishing a true omni-modal architecture. Key structural changes: - Hardware Capability Sniffing: `_init_mtmd_context` now actively probes the C++ backend for `ctx_v` (vision) and `ctx_a` (audio) encoders, enabling proactive fail-fast validation before media processing. 
- Unified Media Extraction: Replaced `get_image_urls` and `split_text_on_image_urls` with a robust `_get_media_items` method. This safely parses `image_url`, `input_audio`, and `audio_url` while strictly maintaining the chronological order of user prompts and enforcing OpenAI format specs. - Media Dispatcher & Magic Bytes: Introduced a unified `load_media` dispatcher. Added a new `_load_audio` method and a rigorous `detect_audio_format` static method that accurately mimics `llama.cpp`'s C++ magic bytes sniffing (RIFF/WAVE, ID3/MPEG, fLaC) to prevent fatal backend crashes. - Concurrent Omni-Decoding: The ThreadPoolExecutor in `_process_mtmd_prompt` has been upgraded to concurrently fetch and decode both image and audio payloads into unified `mtmd_bitmap` structures. Note: Audio processing capabilities in the underlying llama.cpp engine are currently in an experimental stage. Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 251 ++++++++++++++++++++++++--------- 1 file changed, 183 insertions(+), 68 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index cff03918d6..780f32d21d 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,13 +1,14 @@ from __future__ import annotations -import os -import sys -import json +import base64 import ctypes import dataclasses import datetime +import json +import os import random import string +import sys from contextlib import ExitStack from typing import ( @@ -29,6 +30,9 @@ import numpy as np import numpy.typing as npt +import urllib.request +from urllib.error import URLError, HTTPError + import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama as llama import llama_cpp.llama_types as llama_types @@ -2900,16 +2904,22 @@ def _init_mtmd_context(self, llama_model: llama.Llama): raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") # Check if vision is supported - if 
self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): + self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) + if self.is_support_vision: if self.verbose: print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) else: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Vision is not supported by this model") + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) # Check if audio is supported - if self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx): + self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) + if self.is_support_audio: if self.verbose: print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) def close(self) -> None: """Explicitly free the mtmd context and vision model resources.""" @@ -2930,6 +2940,72 @@ def close(self) -> None: def __del__(self) -> None: self.close() + def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]: + """ + Extracts all media payloads (images, audio) sequentially to maintain exact chronological order. + Strictly enforces capability checks, raising exceptions if unsupported media is passed. + + Returns: + media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio). + """ + media_items: List[Dict[str, str]] = [] + for message in messages: + if isinstance(message.get("content"), list): + for content in message["content"]: + content_type = content.get("type", "") + + # 1. 
Vision Processing + if content_type == "image_url": + if not self.is_support_vision: + raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.") + + url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"] + media_items.append({"url": url, "type": "image"}) + + # 2. Audio Processing + elif content_type in ["audio_url", "input_audio"]: + if not self.is_support_audio: + raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") + + # Case A: Handle custom/forward-compatible audio_url format + if content == "audio_url": + url = content["audio_url"] if isinstance(content["audio_url"], str) else content["audio_url"]["url"] + media_items.append({"url": url, "type": "audio"}) + # Case B: Handle OpenAI standard input_audio format + else: + input_audio = content.get("input_audio", {}) + if isinstance(input_audio, dict) and "data" in input_audio: + # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic + # input_audio: { + # data: audio.base64Data, + # format: audio.mimeType.includes('wav') ? 'wav' : 'mp3' + # } + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + + # Strictly align with llama.cpp (require wav/mp3) + if audio_format not in ["wav", "mp3"]: + raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'") + + # Format as a Data URI to reuse the unified load_media logic + media_items.append({ + "url": f"data:audio/{audio_format};base64,{audio_data}", + "type": "audio" + }) + else: + # Just a raw base64 data + url = input_audio if isinstance(input_audio, str) else "" + if url: + media_items.append({"url": url, "type": "audio"}) + + # 3. 
Text & Unknown Types + elif content_type == "text": + continue + else: + if self.verbose: + print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr) + return media_items + def _create_bitmap_from_bytes(self, media_bytes: bytes): """ Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. @@ -2992,7 +3068,7 @@ def _process_mtmd_prompt( if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages - image_urls = self.get_image_urls(messages) + media_items = self._get_media_items(messages) media_marker = self.media_marker # 2. Render the chat template and replace actual URLs with C++ media markers @@ -3004,31 +3080,31 @@ def _process_mtmd_prompt( **getattr(self, 'extra_template_arguments', {}) ) # Replace image_url by media_marker in text - for url in image_urls: - text = text.replace(url, media_marker) + for item in media_items: + text = text.replace(item["url"], media_marker) if self.verbose: - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Image count: {len(image_urls)}.", file=sys.stderr) + print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr) print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) # 3. 
Pre-allocate bitmap array to guarantee chronological order during concurrent decoding - bitmaps = [None] * len(image_urls) + bitmaps = [None] * len(media_items) bitmap_cleanup = [] chunks = None try: # Concurrent Media Decoding import concurrent.futures - if image_urls: - def _create_bitmap_func(idx: int, url: str): - media_bytes = self.load_image(url) + if media_items: + def _create_bitmap_func(idx: int, item: str): + media_bytes = self.load_media(item["url"], item["type"]) bitmap = self._create_bitmap_from_bytes(media_bytes) return idx, bitmap - # This method uses multi-threaded parallel processing to convert images to bitmaps, + # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, # which can be used in the future to process large numbers of video frames. - max_workers = min(llama.n_threads, len(image_urls)) + max_workers = min(llama.n_threads, len(media_items)) with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(_create_bitmap_func, i, url) for i, url in enumerate(image_urls)] + futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] for future in concurrent.futures.as_completed(futures): idx, bitmap = future.result() @@ -3040,7 +3116,7 @@ def _create_bitmap_func(idx: int, url: str): raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") else: if self.verbose: - print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(image_urls)} bitmaps were successfully created.") + print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.") else: # If there are no images, set the bitmaps to empty. 
bitmaps = [] @@ -3423,8 +3499,95 @@ def __call__( ) return _convert_completion_to_chat(completion_or_chunks, stream=stream) - def load_image(self, image_url: str) -> bytes: - return self._load_image(image_url) + def load_media(self, media_url: str, media_type: str) -> bytes: + """ + Unified dispatcher for loading media payloads. + Routes the URL/URI to the specific image or audio processor based on the media_type. + """ + if media_type == "image": + return self._load_image(media_url) + elif media_type == "audio": + audio_bytes = self._load_audio(media_url) + # Apply ironclad magic bytes validation before returning + try: + self.detect_audio_format(audio_bytes) + except ValueError as e: + raise ValueError(f"{self.log_prefix}(load_media): {e}") + return audio_bytes + else: + raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") + + @staticmethod + def detect_audio_format(audio_bytes: bytes) -> str: + """ + Pure utility function: Detects the audio format from magic bytes. + Strictly translated from llama.cpp's `is_audio_file` to ensure 100% compatibility + and avoid false positives (e.g., AVI files disguised as RIFF). + """ + length = len(audio_bytes) + + if length < 12: + raise ValueError("Audio data is corrupted or too small (less than 12 bytes).") + + # RIFF & WAVE magic bytes verification + is_wav = audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE" + + # ID3 metadata or MPEG sync word verification + is_mp3 = length >= 3 and ( + audio_bytes.startswith(b"ID3") or + (audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0) + ) + + # FLAC magic bytes verification + is_flac = audio_bytes.startswith(b"fLaC") + + if is_wav: + return "wav" + elif is_mp3: + return "mp3" + elif is_flac: + return "flac" + else: + raise ValueError( + "Unsupported audio format detected via magic bytes. " + "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." 
+ ) + + @staticmethod + def _load_audio(audio_url: str) -> bytes: + """ + Load audio from either a URL, local path, or a data URI and return raw bytes. + """ + + audio_bytes = b"" + + # 1. Handle data URI (base64) + if audio_url.strip().startswith("data:"): + comma_pos = audio_url.find(",") + if comma_pos == -1: + raise ValueError("Invalid data URI: missing comma separator") + base64_data = audio_url[comma_pos + 1 :] + audio_bytes = base64.b64decode(base64_data) + + # 2. Handle local file path + elif os.path.exists(audio_url): + with open(audio_url, "rb") as f: + audio_bytes = f.read() + + # 3. Handle remote URL via HTTP/HTTPS + else: + headers = {"User-Agent": "Mozilla/5.0"} + req = urllib.request.Request(audio_url, headers=headers) + try: + with urllib.request.urlopen(req, timeout=15) as f: + audio_bytes = f.read() + except (URLError, HTTPError) as e: + raise ConnectionError(f"Failed to download audio from {audio_url}: {e}") + + if not audio_bytes: + raise ValueError("Empty audio data received") + + return audio_bytes @staticmethod def _load_image(image_url: str) -> bytes: @@ -3444,7 +3607,6 @@ def _load_image(image_url: str) -> bytes: # 1. Handle data URI (base64) if image_url.strip().startswith("data:"): - import base64 # Split only once from the right to correctly handle mime types containing commas comma_pos = image_url.find(",") if comma_pos == -1: @@ -3454,9 +3616,6 @@ def _load_image(image_url: str) -> bytes: # 2. 
Handle local/remote URL else: - import urllib.request - from urllib.error import URLError, HTTPError - headers = {"User-Agent": "Mozilla/5.0"} req = urllib.request.Request(image_url, headers=headers) @@ -3506,50 +3665,6 @@ def _load_image(image_url: str) -> bytes: image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) return output.getvalue() - @staticmethod - def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): - image_urls: List[str] = [] - for message in messages: - if message["role"] == "user": - if message["content"] is None: - continue - for content in message["content"]: - if isinstance(content, dict) and "type" in content: - if content["type"] == "image_url": - if ( - isinstance(content["image_url"], dict) - and "url" in content["image_url"] - ): - image_urls.append(content["image_url"]["url"]) - else: - image_urls.append(content["image_url"]) - return image_urls - - @staticmethod - def split_text_on_image_urls(text: str, image_urls: List[str]): - """This method is no longer used in the new implementation.""" - def find_first(s: str, substrs: List[str]): - for i, substr in enumerate(substrs): - pos = s.find(substr) - if pos != -1: - return pos, i - return None, None - - split_text: List[Tuple[Literal["text", "image_url"], str]] = [] - remaining = text - while remaining: - # Find first image_url - pos, i = find_first(remaining, image_urls) - if pos is not None and i is not None: - if pos > 0: - split_text.append(("text", remaining[:pos])) - split_text.append(("image_url", image_urls[i])) - remaining = remaining[pos + len(image_urls[i]) :] - else: - split_text.append(("text", remaining)) - remaining = "" - return split_text - @classmethod def from_pretrained( cls, From c03ce2284d25c0464da00c023b96e915e7cdc246 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 5 Mar 2026 08:03:30 +0800 Subject: [PATCH 225/518] fix(hybrid): implement N-1 checkpointing to support 1-token rollbacks Forces an N-1 state snapshot 
during prompt prefilling for hybrid models. This ensures the engine can safely perform a 1-token rollback to refresh logits upon 100% cache matches (e.g., changing seeds on identical prompts), preventing RNN state desyncs and empty outputs. --- llama_cpp/llama.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1d67f12af0..32ab5b1185 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1306,7 +1306,27 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): try: while True: if len(tokens) > 0: - self.eval(tokens) + # For hybrid models processing a prompt (len > 1), force an N-1 checkpoint + # to safely allow 1-token rollbacks (e.g., for seed changes on 100% prompt matches). + if self.is_hybrid and self._hybrid_cache_mgr is not None and len(tokens) > 1: + body_tokens = tokens[:-1] + last_token = [tokens[-1]] + + # 1. Evaluate up to N-1 + self.eval(body_tokens) + + # 2. Save the N-1 state snapshot + current_history = self._input_ids[:self.n_tokens].tolist() + self._hybrid_cache_mgr.save_checkpoint( + current_pos=self.n_tokens, + tokens=current_history, + seq_id=0 + ) + # 3. 
Evaluate the final token to refresh logits + self.eval(last_token) + else: + # Standard evaluation or single-token generation step + self.eval(tokens) while sample_idx < self.n_tokens: token = self._sampling_ctx.sample(self._ctx, idx=-1) self._sampling_ctx.accept(token, False if grammar is None else True) From f4ed6f33e79e3deccd158281d3075ba6493eb461 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 6 Mar 2026 06:05:35 +0800 Subject: [PATCH 226/518] Update Submodule vendor/llama.cpp 7f5ee54..a0ed91a --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7f5ee54968..a0ed91a442 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7f5ee549683d600ad41db6a295a232cdd2d8eb9f +Subproject commit a0ed91a442ea6b013bd42ebc3887a81792eaefa1 From 118a1a89dcf0753129b74e75baf4dddaea4df89c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 6 Mar 2026 07:32:11 +0800 Subject: [PATCH 227/518] fix(mtmd): remove OS-level log suppression to expose critical C++ errors Removed the `suppress_stdout_stderr` context manager around critical C++ backend calls (`_init_mtmd_context`, `_create_bitmap_from_bytes`, and `close`). Previously, when `verbose=False`, this OS-level file descriptor redirection was swallowing fatal C++ backend errors (e.g., `stb_image` decoding failures, corrupted `.mmproj` model weights, or CUDA Out-Of-Memory aborts), resulting in silent crashes that were impossible to debug. The framework now correctly relies on the native C-API `llama_log_callback` to route logs to Python gracefully, ensuring that critical decoding and hardware exceptions remain visible to the developer. 
--- llama_cpp/llama_chat_format.py | 125 ++++++++++++++++----------------- 1 file changed, 61 insertions(+), 64 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 780f32d21d..3acb10e233 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2868,65 +2868,63 @@ def _init_mtmd_context(self, llama_model: llama.Llama): if self.mtmd_ctx is not None: return # Already initialized - with suppress_stdout_stderr(disable=self.verbose): - self._mtmd_cpp.mtmd_helper_log_set(llama_log_callback, ctypes.c_void_p(0)) - - # Get default parameters - self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() - self.mctx_params.use_gpu = self.use_gpu - self.mctx_params.print_timings = self.verbose - self.mctx_params.n_threads = llama_model.n_threads - self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO - self.mctx_params.warmup = True - if self.image_min_tokens > 0: - self.mctx_params.image_min_tokens = self.image_min_tokens - if self.image_max_tokens > 0: - self.mctx_params.image_max_tokens = self.image_max_tokens - if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! 
image_max_tokens ({self.image_max_tokens}) " - f"cannot be less than image_min_tokens ({self.image_min_tokens}).") - - # Cache the model's eos token and bos token - self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') - self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') - - # Cache the mtmd_default_marker - self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') - - # Initialize mtmd context - self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.clip_model_path.encode(), - llama_model.model, - self.mctx_params - ) - - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") + self._mtmd_cpp.mtmd_helper_log_set(llama_log_callback, ctypes.c_void_p(0)) + + # Get default parameters + self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() + self.mctx_params.use_gpu = self.use_gpu + self.mctx_params.print_timings = self.verbose + self.mctx_params.n_threads = llama_model.n_threads + self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + self.mctx_params.warmup = True + if self.image_min_tokens > 0: + self.mctx_params.image_min_tokens = self.image_min_tokens + if self.image_max_tokens > 0: + self.mctx_params.image_max_tokens = self.image_max_tokens + if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! 
image_max_tokens ({self.image_max_tokens}) " + f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + + # Cache the model's eos token and bos token + self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') + self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') + + # Cache the mtmd_default_marker + self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + + # Initialize mtmd context + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.clip_model_path.encode(), + llama_model.model, + self.mctx_params + ) - # Check if vision is supported - self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) - if self.is_support_vision: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") - # Check if audio is supported - self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) - if self.is_support_audio: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) + # Check if vision is supported + self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) + if self.is_support_vision: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check 
if audio is supported + self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) + if self.is_support_audio: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) def close(self) -> None: """Explicitly free the mtmd context and vision model resources.""" if getattr(self, "mtmd_ctx", None) is not None: try: - with suppress_stdout_stderr(disable=getattr(self, "verbose", True)): - self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) except Exception: pass self.mtmd_ctx = None @@ -3027,20 +3025,19 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): if self.mtmd_ctx is None: raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") - with suppress_stdout_stderr(disable=self.verbose): - # Create bitmap from buffer using helper function - bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( - self.mtmd_ctx, - (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), - len(media_bytes) - ) + # Create bitmap from buffer using helper function + bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + self.mtmd_ctx, + (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), + len(media_bytes) + ) - if bitmap is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): " - "Failed to load image or audio file from media bytes " - "(unsupported media format or corrupted data).") + if bitmap is None: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): " + "Failed to load image or audio file from media bytes " + "(unsupported media format or corrupted data).") - return bitmap + return bitmap def _process_mtmd_prompt( From 6db4b83b22e020534085b15de8a2469b3c277953 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 6 Mar 
2026 19:22:00 +0800 Subject: [PATCH 228/518] Update Submodule vendor/llama.cpp a0ed91a..f5ddcd1 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a0ed91a442..f5ddcd1696 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a0ed91a442ea6b013bd42ebc3887a81792eaefa1 +Subproject commit f5ddcd1696eca5069dc7915f4d4c03c9a709afea From 2c191cb0aa10d22e3aa9de10d26451a328ccb909 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 6 Mar 2026 20:43:21 +0800 Subject: [PATCH 229/518] Bump version to 0.3.31 Signed-off-by: JamePeng --- CHANGELOG.md | 25 +++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7904b39e4d..85ad0f9f5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.31] Omni-Modal Media Pipeline, Hybrid 1-Token Rollback and Enhanced Logging + +- refactor(mtmd): introduce omni-modal media pipeline with experimental audio support +This commit significantly overhauls the media parsing and loading pipeline in `MTMDChatHandler` to gracefully handle both vision and audio inputs, establishing a true omni-modal architecture. + + Key structural changes: + - Hardware Capability Sniffing: `_init_mtmd_context` now actively probes the C++ backend for `ctx_v` (vision) and `ctx_a` (audio) encoders, enabling proactive fail-fast validation before media processing. + - Unified Media Extraction: Replaced `get_image_urls` and `split_text_on_image_urls` with a robust `_get_media_items` method. This safely parses `image_url`, `input_audio`, and `audio_url` while strictly maintaining the chronological order of user prompts and enforcing OpenAI format specs. + - Media Dispatcher & Magic Bytes: Introduced a unified `load_media` dispatcher. 
Added a new `_load_audio` method and a rigorous `detect_audio_format` static method that accurately mimics `llama.cpp`'s C++ magic bytes sniffing (RIFF/WAVE, ID3/MPEG, fLaC) to prevent fatal backend crashes. + - Concurrent Omni-Decoding: The ThreadPoolExecutor in `_process_mtmd_prompt` has been upgraded to concurrently fetch and decode both image and audio payloads into unified `mtmd_bitmap` structures. + + - **Note**: Audio processing capabilities in the underlying llama.cpp engine are currently in an experimental stage. + +- fix(hybrid): implement N-1 checkpointing to support 1-token rollbacks + - Forces an N-1 state snapshot during prompt prefilling for hybrid models. This ensures the engine can safely perform a 1-token rollback to refresh logits upon 100% cache matches (e.g., changing seeds on identical prompts), preventing RNN state desyncs and empty outputs. + + - **Note**: For ComfyUI plugin developers, I recommend performing a reset operation before feeding in the prompt. This way, the seed will be included as one of the factors in the initial complete recalculation. + +- fix(mtmd): remove OS-level log suppression to expose critical C++ errors + - Removed the `suppress_stdout_stderr` context manager around critical C++ backend calls (`_init_mtmd_context`, `_create_bitmap_from_bytes`, and `close`). + + - Previously, when `verbose=False`, this OS-level file descriptor redirection was swallowing fatal C++ backend errors (e.g., `stb_image` decoding failures, corrupted `.mmproj` model weights, or CUDA Out-Of-Memory aborts), resulting in silent crashes that were impossible to debug. The framework now correctly relies on the native C-API `llama_log_callback` to route logs to Python gracefully, ensuring that critical decoding and hardware exceptions remain visible to the developer.
+ +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/f5ddcd1696eca5069dc7915f4d4c03c9a709afea](https://github.com/ggml-org/llama.cpp/commit/f5ddcd1696eca5069dc7915f4d4c03c9a709afea) + ## [0.3.30] Milestone Release I will update the release notes for version 0.3.30 in the [discussion](https://github.com/JamePeng/llama-cpp-python/discussions). diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index b72459f653..ed3c342f20 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.30" +__version__ = "0.3.31" From 591356379d73f446033b57a247f28f8c7c99e599 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 7 Mar 2026 19:17:19 +0800 Subject: [PATCH 230/518] Update Submodule vendor/llama.cpp f5ddcd1..c5a7788 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f5ddcd1696..c5a778891b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f5ddcd1696eca5069dc7915f4d4c03c9a709afea +Subproject commit c5a778891ba0ddbd4cbb507c823f970595b1adc2 From 7289b548c186d2c07c787f573c2447a704fd8fa6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 7 Mar 2026 20:06:11 +0800 Subject: [PATCH 231/518] perf(hybrid): eliminate PCIe I/O latency for single-turn workflows This commit introduces critical performance optimizations and log tracing improvements for HybridCheckpointCache in single-turn workflows (e.g., ComfyUI or single-turn conversation mode): - Now supports disabling HybridCheckpointCache (zero checkpoints) for single-turn conversations (set `ctx_checkpoints=0` when initializing Llama). - Added early-exit intercepts for `max_checkpoints <= 0` in `save_checkpoint` and `find_best_checkpoint`. This prevents massive (e.g., 150MB+) synchronous VRAM-to-RAM state extractions over the PCIe bus when rollback capabilities are disabled, eliminating a ~3-second blocking delay at the end of generation.
- Added a non-empty check in `clear()` to prevent log spam when the cache is already empty or disabled. - Standardized logging prefixes (e.g., `HybridCheckpointCache(save_checkpoint)`) for better observability. - Fixed a potential `UnicodeEncodeError` hazard in warning logs by replacing a non-standard arrow character with standard ASCII (`->`). --- llama_cpp/llama_cache.py | 51 +++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index c211ce6888..ec6b576746 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -336,7 +336,7 @@ class HybridCheckpointCache(BaseLlamaCache): """ def __init__(self, ctx: llama_cpp.llama_context_p, max_checkpoints: int = 16, verbose: bool = False): if ctx is None: - raise ValueError("HybridCheckpointCache: Failed to create HybridCheckpointCache with model context") + raise ValueError("HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with model context") self._ctx = ctx self.max_checkpoints = max_checkpoints self.checkpoints: list[HybridCheckpoint] = [] @@ -350,6 +350,13 @@ def __init__(self, ctx: llama_cpp.llama_context_p, max_checkpoints: int = 16, ve self.verbose = verbose + if self.max_checkpoints <= 0: + if self.verbose: + import sys + print("HybridCheckpointCache(__init__): Cache is DISABLED (max_checkpoints <= 0). " + "Rollback capabilities are turned off. This is optimal for single-turn workflows.", + file=sys.stderr) + @property def cache_size(self) -> int: """Returns the total memory used by all stored checkpoints in bytes.""" @@ -357,6 +364,9 @@ def cache_size(self) -> int: def clear(self): """Clears all stored checkpoints and resets memory tracking.""" + if not self.checkpoints: + # Empty Checkpoint: Return immediately, no need to clear. 
+ return self.checkpoints.clear() self._current_size = 0 if self.verbose: @@ -392,6 +402,10 @@ def find_best_checkpoint(self, tokens: List[int], seq_id: int = 0) -> Optional[H Finds the longest valid checkpoint that perfectly matches the provided token prefix. Returns None if no matching checkpoint is found. """ + # Empty Checkpoint: Instant return, no hash calculation needed. + if self.max_checkpoints <= 0 or len(self.checkpoints) == 0: + return None + best_cp = None best_pos = -1 for cp in self.checkpoints: @@ -417,27 +431,42 @@ def save_checkpoint( Extracts the RNN hidden state from the C++ backend and saves it as a checkpoint. Manages eviction (FIFO) if the maximum number of checkpoints is exceeded. """ + + # 0. Early Exit / Feature Toggle + # If the user disables checkpoints (max_checkpoints <= 0), we immediately return. + # This absolutely critical bypass prevents massive (e.g., 150MB+) synchronous + # VRAM-to-RAM copies over the PCIe bus, eliminating multi-second delays at the + # end of generation for single-turn workflows. + # This is more friendly to the single-call ComfyUI ecosystem. :) + if self.max_checkpoints <= 0: + if self.verbose: + print("HybridCheckpointCache(save_checkpoint): Cache is DISABLED (max_checkpoints <= 0). " + "Operating in single-turn conversation mode. Skipping state extraction to optimize generation latency.", + file=sys.stderr) + return False + flags = self._flag_partial - # 1. Query the required buffer size + # 1. Query the required buffer size from the underlying C++ context size = self._get_size_ext(self._ctx, seq_id, flags) if size == 0: if self.verbose: - print("HybridCheckpointCache: size=0, skip") + print("HybridCheckpointCache(save_checkpoint): size=0, skip") return False - # 2. Allocate buffer and extract data + # 2. 
Allocate buffer and extract raw state data buffer = (ctypes.c_uint8 * size)() n_written = self._get_data_ext(self._ctx, buffer, size, seq_id, flags) if n_written != size: if self.verbose: - print(f"HybridCheckpointCache: get failed {n_written}/{size}") + print(f"HybridCheckpointCache(save_checkpoint): get failed {n_written}/{size}") return False + # Note: This deep copy isolates the state from subsequent C++ backend mutations data_bytes = bytes(buffer[:n_written]) hash_val = self._hash_prefix(tokens, current_pos) - # 3. Store the checkpoint + # 3. Store the newly extracted checkpoint self.checkpoints.append(HybridCheckpoint( pos=current_pos, data=data_bytes, @@ -454,10 +483,10 @@ def save_checkpoint( old_cp = self.checkpoints.pop(0) self._current_size -= old_cp.size if self.verbose: - print(f"HybridCheckpointCache: evicted pos={old_cp.pos}") + print(f"HybridCheckpointCache(save_checkpoint): evicted pos={old_cp.pos}") if self.verbose: - print(f"HybridCheckpointCache: Saved checkpoint at pos {current_pos} ({size / 1024 / 1024:.2f} MiB) " + print(f"HybridCheckpointCache(save_checkpoint): Saved checkpoint at pos {current_pos} ({size / 1024 / 1024:.2f} MiB) " f"total={len(self.checkpoints)} used={self._current_size / 1024 / 1024:.2f} MiB", file=sys.stderr) @@ -470,7 +499,7 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: # 1. 
Verify sequence ID matches to prevent cross-sequence contamination if cp.seq_id != seq_id: if self.verbose: - print(f"HybridCheckpointCache: [Error] Sequence ID mismatch: checkpoint has {cp.seq_id}, requested {seq_id}", file=sys.stderr) + print(f"HybridCheckpointCache(restore_checkpoint): [Error] Sequence ID mismatch: checkpoint has {cp.seq_id}, requested {seq_id}", file=sys.stderr) return False flags = self._flag_partial @@ -479,7 +508,7 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: current_size = self._get_size_ext(self._ctx, seq_id, flags) if current_size != cp.size: if self.verbose: - print(f"HybridCheckpointCache: [Warning] State size mismatch before restore: expected {cp.size}, got {current_size} → possible invalidation") + print(f"HybridCheckpointCache(restore_checkpoint): [Warning] State size mismatch before restore: expected {cp.size}, got {current_size} -> possible invalidation") return False # 3. Copy data back to a ctypes buffer and push to the C++ backend @@ -490,7 +519,7 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: success = (ret == cp.size) if self.verbose: - print(f"HybridCheckpointCache: restore {'OK' if success else 'FAIL'} pos={cp.pos}") + print(f"HybridCheckpointCache(restore_checkpoint): restore {'OK' if success else 'FAIL'} pos={cp.pos}") return success # Disable BaseLlamaCache Dictionary Interfaces From 191e3343a7660f08255bc37c8e4d4346dceac7c4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 7 Mar 2026 20:40:38 +0800 Subject: [PATCH 232/518] perf(hybrid): bypass N-1 evaluation split if max_checkpoints is 0 - Prevent fragmenting the prompt evaluation into `len(tokens)-1` and `1` when hybrid caching is disabled. - Allows the underlying C++ engine to process the entire prompt in a single, efficient batch for single-turn workflows. 
--- llama_cpp/llama.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 32ab5b1185..209dabba86 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1308,7 +1308,13 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): if len(tokens) > 0: # For hybrid models processing a prompt (len > 1), force an N-1 checkpoint # to safely allow 1-token rollbacks (e.g., for seed changes on 100% prompt matches). - if self.is_hybrid and self._hybrid_cache_mgr is not None and len(tokens) > 1: + # ONLY apply this if rollback capabilities are enabled (max_checkpoints > 0). + if ( + self.is_hybrid + and self._hybrid_cache_mgr is not None + and self._hybrid_cache_mgr.max_checkpoints > 0 + and len(tokens) > 1 + ): body_tokens = tokens[:-1] last_token = [tokens[-1]] From 850ed2e1ed5049d462caca2df08eb26e629088c8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 7 Mar 2026 20:48:16 +0800 Subject: [PATCH 233/518] perf(hybrid): prevent expensive array slicing when cache is disabled Added a `max_checkpoints > 0` check to the `finally` block of the generation loop. Previously, even though the underlying C++ state extraction was bypassed, the Python layer was still executing `self._input_ids[:self.n_tokens].tolist()`. For long contexts, slicing and converting this massive array to a Python list caused unnecessary CPU overhead and garbage collection (GC) pressure. This intercept acts as a double-layer isolation, ensuring absolute zero memory allocation and zero overhead for hybrid models running in single-turn mode. 
--- llama_cpp/llama.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 209dabba86..a97b56a694 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1396,7 +1396,11 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): ] ) finally: - if self.is_hybrid and self._hybrid_cache_mgr is not None: + if ( + self.is_hybrid + and self._hybrid_cache_mgr is not None + and self._hybrid_cache_mgr.max_checkpoints > 0 + ): current_history = self._input_ids[:self.n_tokens].tolist() self._hybrid_cache_mgr.save_checkpoint( From fb3072df65c771f79aeb42535e3b611a32f9ad7a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 7 Mar 2026 21:47:48 +0800 Subject: [PATCH 234/518] perf(hybrid): optimize multimodal single-turn and fix KV clear bug - Added a 100% match "FAST PATH" in Llama.generate to bypass N-1 truncation for hybrid models when caching is disabled. - Fixed a bug where failed rollbacks on disabled caches would wipe the KV cache, causing multimodal pseudo-token crashes. - Updated MTMDChatHandler to suppress cache-related logs and anchoring logic when max_checkpoints <= 0. --- llama_cpp/llama.py | 122 +++++++++++++++++++++------------ llama_cpp/llama_chat_format.py | 27 +++++--- 2 files changed, 95 insertions(+), 54 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a97b56a694..d8390bfbd3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1170,59 +1170,91 @@ def generate( original_tokens = list(tokens) # Check for kv cache prefix match if reset and self.n_tokens > 0: - longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1]) - if longest_prefix > 0: + # 1. 
First, check for a 100% exact match of the entire sequence + full_match_prefix = self.longest_token_prefix(self._input_ids, tokens) + + # --- FAST PATH: Zero-latency bypass for Hybrid Single-Turn & Multimodal --- + # If the cache is disabled (max_checkpoints <= 0) and we have a 100% match, + # we completely skip the N-1 truncation. This ensures that multimodal handlers + # (which just finished evaluating and already hold fresh logits) don't trigger + # unnecessary N-1 rollbacks or catastrophic KV cache clears. + if ( + full_match_prefix == len(tokens) + and full_match_prefix == self.n_tokens + and self.is_hybrid + and (self._hybrid_cache_mgr is None or self._hybrid_cache_mgr.max_checkpoints <= 0) + ): reset = False + longest_prefix = len(tokens) + tokens = tokens[longest_prefix:] # Empties the tokens array to bypass evaluation + if self.verbose: + print(f"Llama.generate: Hybrid single-turn full match ({longest_prefix} tokens). Bypassing rollback/truncation.", file=sys.stderr) - if longest_prefix == len(tokens): - if self.verbose: - print(f"Llama.generate: Full match. Forcing prefix-- to evaluate 1 token.", file=sys.stderr) - longest_prefix -= 1 + # --- STANDARD PATH: Force N-1 re-evaluation --- + else: + # By matching against `tokens[:-1]`, we intentionally drop the last token. + # This forces the engine to re-evaluate the final token to refresh sampling logits. + longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1]) - # Physically erase trailing "ghost" tokens from the C++ KV cache - # to prevent attention misalignment in multi-round chats. 
- if longest_prefix < self.n_tokens: - if self.is_hybrid and self._hybrid_cache_mgr is not None: - if self.verbose: - print(f"Llama.generate: Hybrid model rollback triggered.", file=sys.stderr) + if longest_prefix > 0: + reset = False - best_ckpt = self._hybrid_cache_mgr.find_best_checkpoint(original_tokens, 0) - if best_ckpt is not None and self._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): - actual_prefix = best_ckpt.pos + # Note: Kept for legacy compatibility. Triggers if the prefix matching + # somehow equals the full token length (e.g., edge cases in tokenization). + if longest_prefix == len(tokens): + if self.is_hybrid and (self._hybrid_cache_mgr is None or self._hybrid_cache_mgr.max_checkpoints <= 0): + if self.verbose: + print(f"Llama.generate: Full match on disabled hybrid cache. Skipping prefix-- to use existing fresh logits.", file=sys.stderr) else: - actual_prefix = 0 - self._hybrid_cache_mgr.clear() - self._ctx.memory_clear(True) + if self.verbose: + print(f"Llama.generate: Full match. Forcing prefix-- to evaluate 1 token.", file=sys.stderr) + longest_prefix -= 1 - self.n_tokens = actual_prefix - tokens = original_tokens[actual_prefix:] - if self.verbose: - print( - f"Llama.generate: {actual_prefix} prefix-match hit, " - f"remaining {len(tokens)} prompt tokens to eval", - file=sys.stderr, - ) - else: - if self.verbose: - print(f"Llama.generate: Truncating KV cache size from {self.n_tokens} to {longest_prefix}", file=sys.stderr) - self._ctx.memory_seq_rm(0, longest_prefix, -1) + # Physically erase trailing "ghost" tokens from the C++ KV cache + # to prevent attention misalignment in multi-round chats. 
+ if longest_prefix < self.n_tokens: + if self.is_hybrid and self._hybrid_cache_mgr is not None: + if self.verbose: + print(f"Llama.generate: Hybrid model rollback triggered.", file=sys.stderr) - # Adjust the tokens array and cursor to reuse the matched cache - self.n_tokens = longest_prefix - tokens = tokens[longest_prefix:] + best_ckpt = self._hybrid_cache_mgr.find_best_checkpoint(original_tokens, 0) + if best_ckpt is not None and self._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): + actual_prefix = best_ckpt.pos + else: + # Fallback: No checkpoint found, must fully clear the context to prevent poisoning + actual_prefix = 0 + self._hybrid_cache_mgr.clear() + self._ctx.memory_clear(True) - if self.verbose: - print( - f"Llama.generate: {longest_prefix} prefix-match hit, " - f"remaining {len(tokens)} prompt tokens to eval", - file=sys.stderr, - ) - else: - # No prefix matched. Completely clear the KV cache to prevent context poisoning. - self.n_tokens = 0 - self._ctx.memory_clear(True) - if self.is_hybrid and self._hybrid_cache_mgr is not None: - self._hybrid_cache_mgr.clear() + self.n_tokens = actual_prefix + tokens = original_tokens[actual_prefix:] + if self.verbose: + print( + f"Llama.generate: {actual_prefix} prefix-match hit, " + f"remaining {len(tokens)} prompt tokens to eval", + file=sys.stderr, + ) + else: + if self.verbose: + print(f"Llama.generate: Truncating KV cache size from {self.n_tokens} to {longest_prefix}", file=sys.stderr) + self._ctx.memory_seq_rm(0, longest_prefix, -1) + + # Adjust the tokens array and cursor to reuse the matched cache + self.n_tokens = longest_prefix + tokens = tokens[longest_prefix:] + + if self.verbose: + print( + f"Llama.generate: {longest_prefix} prefix-match hit, " + f"remaining {len(tokens)} prompt tokens to eval", + file=sys.stderr, + ) + else: + # No prefix matched at all. Completely clear the KV cache to prevent context poisoning. 
+ self.n_tokens = 0 + self._ctx.memory_clear(True) + if self.is_hybrid and self._hybrid_cache_mgr is not None: + self._hybrid_cache_mgr.clear() # Reset mirostat sampling params = LlamaSamplingParams( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 3acb10e233..da470920e7 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3271,15 +3271,20 @@ def __call__( if longest_prefix < llama.n_tokens: if llama.is_hybrid and llama._hybrid_cache_mgr is not None: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " - f"Searching for nearest checkpoint...", file=sys.stderr) - - best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) - if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): - llama.n_tokens = best_ckpt.pos + if llama._hybrid_cache_mgr.max_checkpoints > 0: if self.verbose: - print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) + print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). 
" + f"Searching for nearest checkpoint...", file=sys.stderr) + + best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) + if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): + llama.n_tokens = best_ckpt.pos + if self.verbose: + print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) + else: + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 else: llama._hybrid_cache_mgr.clear() llama._ctx.memory_clear(True) @@ -3382,7 +3387,11 @@ def __call__( # End-of-Turn Checkpoint # Anchors the state ONLY after the entire multi-modal turn is processed - if llama.is_hybrid and llama._hybrid_cache_mgr is not None: + if ( + llama.is_hybrid + and llama._hybrid_cache_mgr is not None + and llama._hybrid_cache_mgr.max_checkpoints > 0 + ): if self.verbose: print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) From ad10cfd4bc459c476d606db2e66506e6601a7bda Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 8 Mar 2026 10:44:41 +0800 Subject: [PATCH 235/518] fix(sampling): pass seed to sampling context and remove global mutation - Add `seed` parameter to `generate` and `sample` method signatures. - Pass the resolved seed directly to `LlamaSamplingParams` to ensure the underlying C++ sampler uses it. - Remove thread-unsafe `self.set_seed()` calls in `_create_completion` to prevent global state pollution during concurrent requests. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d8390bfbd3..a4587b63ea 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1019,6 +1019,7 @@ def sample( grammar: Optional[LlamaGrammar] = None, # optional BNF-like grammar to constrain sampling grammar_lazy: bool = False, idx: Optional[int] = None, + seed: Optional[int] = None, ): """Sample a token from the model. Returns: @@ -1040,6 +1041,7 @@ def sample( temp=temp, top_n_sigma=top_n_sigma, min_keep=min_keep, + seed=seed if seed is not None else self._seed, # Dynamic Temp dynatemp_range=dynatemp_range, @@ -1146,7 +1148,8 @@ def generate( logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, grammar: Optional[LlamaGrammar] = None, - grammar_lazy :bool = False, + grammar_lazy: bool = False, + seed: Optional[int] = None, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. 
@@ -1302,6 +1305,7 @@ def generate( logit_bias=self._convert_logit_bias(logit_bias), grammar=grammar._grammar if grammar else "", grammar_lazy=grammar_lazy, + seed=seed if seed is not None else self._seed, ) if logits_processor: @@ -1635,7 +1639,6 @@ def _create_completion( dynatemp_exponent: float = 1.0, min_keep: int = 0, stream: bool = False, - seed: Optional[int] = None, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -1654,7 +1657,8 @@ def _create_completion( logit_bias: Optional[Dict[int, float]] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - grammar_lazy: bool = False + grammar_lazy: bool = False, + seed: Optional[int] = None, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: @@ -1798,11 +1802,6 @@ def _create_completion( if self.verbose: print("Llama._create_completion: cache miss", file=sys.stderr) - if seed is not None: - self.set_seed(seed) - else: - self.set_seed(random.Random(self._seed).randint(0, 2 ** 32)) - finish_reason = "length" multibyte_fix = 0 for token in self.generate( @@ -1838,6 +1837,7 @@ def _create_completion( logits_processor=logits_processor, grammar=grammar, grammar_lazy=grammar_lazy, + seed=seed if seed is not None else self._seed, ): if llama_cpp.llama_token_is_eog(self._model.vocab, token): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) From 8ec1763c7b4526b9a7366c74f833c6e881666019 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 8 Mar 2026 18:02:58 +0800 Subject: [PATCH 236/518] docs(issue-template): modernize bug report for efficiency Completely revamped the legacy bug report template to streamline troubleshooting. Added an anti-AI-spam policy, a detailed OS/Hardware matrix, forced `verbose=True` logging requirements with code examples, and new sections for model parameters and AI-assisted brainstorming. 
Signed-off-by: JamePeng --- .github/ISSUE_TEMPLATE/bug_report.md | 145 +++++++++++++++------------ 1 file changed, 81 insertions(+), 64 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index eb0fb9662e..715aad89ee 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,96 +1,113 @@ --- -name: Bug report -about: Create a report to help us improve -title: '' -labels: '' -assignees: '' +name: 🚀 Bug Report (Efficiency & Runtime) +about: Report a runtime crash, logic error, or performance issue. +title: "[Bug]: " +labels: ["bug", "triage"] +assignees: "" +--- + +### ⚠️ IMPORTANT: HUMAN-ONLY SUBMISSION POLICY +**AI-generated Issues or Pull Requests will be closed without review.** +- Please use AI (artificial intelligence) only as an auxiliary tool to assist in brainstorming, code analysis, or adding comments. +- A human developer must verify the accuracy, necessity, and urgency of this report before submission. +- It's cool to learn about AI and how it works through a project to improve yourself, right? ;) --- -# Prerequisites +### Prerequisites -Please answer the following questions for yourself before submitting an issue. +* [ ] I am running the latest code from the **JamePeng/llama-cpp-python** branch. +* [ ] I carefully followed the [README.md](https://github.com/JamePeng/llama-cpp-python/blob/main/README.md). +* [ ] I have verified the issue is not a duplicate. +* [ ] I have tested it using the official binary `llama-cli` or `llama-server` provided by `llama.cpp`, and the problem (exists/does not exist) still exists. +* [ ] I reviewed the [Discussions](https://github.com/JamePeng/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. -- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. -- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md).
-- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). -- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. +### Environment & Hardware Configuration -# Expected Behavior +Please provide your specific setup. Use the suggested commands for your OS to verify. -Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. +| Category | Windows 10/11 | Ubuntu / Linux | macOS 15/18/26+ | +| --- | --- | --- | --- | +| **OS Version** | `winver` or System > About | `lsb_release -a` | `sw_vers` | +| **CPU** | Task Manager > Performance | `lscpu` | `sysctl -n machdep.cpu.brand_string` | +| **RAM Size** | Task Manager > Performance | `free -h` | Activity Monitor > Memory | +| **GPU/Multi-Card** | Device Manager / `nvidia-smi` | `nvidia-smi` or `lspci` | System Report > Graphics/Displays | -# Current Behavior +* **Multi-GPU Setup**: (e.g., 2x RTX 4090 / SLI / None) +* **Specific Hardware Screenshots**: [Insert screenshot of Task Manager / `nvidia-smi` / System Info here] -Please provide a detailed written description of what `llama-cpp-python` did, instead. +### Toolchain Versions -# Environment and Context +Provide the exact versions or commit hashes: -Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. 
+* **Python**: `python --version` +* **Python Library**: `pip list` +* **Compiler**: (e.g., `g++ --version`, `msvc` via VS Installer, or `xcode-select -v`) +* **llama-cpp-python Commit**: `git rev-parse HEAD` +* **vendor/llama.cpp Commit**: `cd vendor/llama.cpp && git rev-parse HEAD` -* Physical (or virtual) hardware you are using, e.g. for Linux: +### Model & Logic Context -`$ lscpu` +* **Model Source**: (e.g., HuggingFace, ModelScope, Custom Conversion) +* **Model Path**: `Qwen3.5-9B-Q4_K_M.gguf` +* **Multimodal Path**: `mmproj-BF16.gguf` (if applicable) -* Operating System, e.g. for Linux: +### Failure Timing & Logs -`$ uname -a` +**Is the issue occurring during Build (Compilation) or Runtime?** -* SDK version, e.g. for Linux: +#### Runtime Debugging Requirement: + +When reporting a runtime bug, you **must** set `verbose=True` in both the `Llama` class and the `ChatHandler` to capture internal logs. + +```python +# Required Debugging Configuration +llm = Llama( + model_path="./Qwen3.5-9B-Q4_K_M.gguf", + chat_handler=Qwen35ChatHandler( + clip_model_path="./mmproj-BF16.gguf", + enable_thinking=True, + verbose=True # SET TO TRUE + ), + n_gpu_layers=-1, + n_ctx=40960, + verbose=True, # SET TO TRUE + ctx_checkpoints=0 +) -``` -$ python3 --version -$ make --version -$ g++ --version ``` -# Failure Information (for bugs) +### Steps to Reproduce & Reproduction Logic -Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. +1. Provide the full `CMAKE_ARGS` used during installation (e.g., `CMAKE_ARGS="-DGGML_CUDA=ON" pip install .`). +2. Provide the full runtime Python script. + - If privacy is a concern, `pseudo-paths` and `business logic code` can be used to hide the script, while preserving the initialization and runtime code surrounding bug triggers. + - Relatively complete code is best for tracking issues. Your choice :) +3. 
**Reproduction Screenshot**: [Insert screenshot of the error or the unexpected behavior here] -# Steps to Reproduce +### Analysis & Brainstorming -Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. +> Use this section to include any insights gained from AI-assisted code analysis or your own/team's brainstorming. -1. step 1 -2. step 2 -3. step 3 -4. etc. +* **Potential Root Cause**: (e.g., buffer overflows, inaccurate memory releases, kv cache management, unnecessary memory allocations, redundant mergeable runtime logic, etc.) +* **Code Comments/Fix Ideas**: (Paste analyzed code snippets with your added comments) -**Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** +```text + -Try the following: +``` -1. `git clone https://github.com/abetlen/llama-cpp-python` -2. `cd llama-cpp-python` -3. `rm -rf _skbuild/` # delete any old builds -4. `python -m pip install .` -5. `cd ./vendor/llama.cpp` -6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp -7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues) +--- -# Failure Logs +### Expected Behavior -Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. +Describe the expected outcome. -Also, please try to **avoid using screenshots** if at all possible. 
Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. +### Current Behavior + +Describe the actual outcome and paste the **Verbose Logs** below: + +```text + -Example environment info: -``` -llama-cpp-python$ git log | head -1 -commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 - -llama-cpp-python$ python3 --version -Python 3.10.10 - -llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy" -fastapi 0.95.0 -numpy 1.24.3 -sse-starlette 1.3.3 -uvicorn 0.21.1 - -llama-cpp-python/vendor/llama.cpp$ git log | head -3 -commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 -Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> -Date: Thu May 25 20:18:01 2023 -0600 ``` From 3e41921cb4e3b48f842601ceda3678eaaf0105c1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 8 Mar 2026 18:27:43 +0800 Subject: [PATCH 237/518] Update Submodule vendor/llama.cpp c5a7788..b283f6d --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c5a778891b..b283f6d5b3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c5a778891ba0ddbd4cbb507c823f970595b1adc2 +Subproject commit b283f6d5b3d2d079019ae5ed3cbbdb4b3be03b25 From e7e1d48065ba53846f290cfe563c8c839a062ebe Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 8 Mar 2026 18:40:43 +0800 Subject: [PATCH 238/518] Bump version to 0.3.32 --- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85ad0f9f5e..4656d87129 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,39 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.32] Hybrid/Multimodal 
Model Single-Turn Optimizations & Fix Sampling Seed + +- perf(hybrid): optimize multimodal single-turn and fix KV clear bug + - Added a 100% match "FAST PATH" in Llama.generate to bypass N-1 truncation for hybrid models when caching is disabled. + - Fixed a bug where failed rollbacks on disabled caches would wipe the KV cache, causing multimodal pseudo-token crashes. + - Updated MTMDChatHandler to suppress cache-related logs and anchoring logic when max_checkpoints <= 0. + +- perf(hybrid): prevent expensive array slicing when cache is disabled + - Added a `max_checkpoints > 0` check to the `finally` block of the generation loop. + - Previously, even though the underlying C++ state extraction was bypassed, the Python layer was still executing `self._input_ids[:self.n_tokens].tolist()`. For long contexts, slicing and converting this massive array to a Python list caused unnecessary CPU overhead and garbage collection (GC) pressure. This intercept acts as a double-layer isolation, ensuring absolute zero memory allocation and zero overhead for hybrid models running in single-turn mode. + +- perf(hybrid): bypass N-1 evaluation split if max_checkpoints is 0 + - Prevent fragmenting the prompt evaluation into `len(tokens)-1` and `1` when hybrid caching is disabled. + - Allows the underlying C++ engine to process the entire prompt in a single, efficient batch for single-turn workflows. + +- perf(hybrid): eliminate PCIe I/O latency for single-turn workflows + - This commit introduces critical performance optimizations and log tracing improvements for HybridCheckpointCache in single-turn workflows (e.g., ComfyUI or single-turn conversation mode): + - Now supports disabling HybridCheckpointCache for single-turn conversations (set `ctx_checkpoints=0` when initializing `Llama`). + - Added early-exit intercepts for `max_checkpoints <= 0` in `save_checkpoint` and `find_best_checkpoint`.
This prevents massive (e.g., 150MB+) synchronous VRAM-to-RAM state extractions over the PCIe bus when rollback capabilities are disabled, eliminating a ~3-second blocking delay at the end of generation. + - Added a non-empty check in `clear()` to prevent log spam when the cache is already empty or disabled. + - Standardized logging prefixes (e.g., `HybridCheckpointCache(save_checkpoint)`) for better observability. + - Fixed a potential `UnicodeEncodeError` hazard in warning logs by replacing a non-standard arrow character with standard ASCII (`->`). + +- fix(sampling): pass seed to sampling context and remove global mutation + - Add `seed` parameter to `generate` and `sample` method signatures. + - Pass the resolved seed directly to `LlamaSamplingParams` to ensure the underlying C++ sampler uses it. + - Remove thread-unsafe `self.set_seed()` calls in `_create_completion` to prevent global state pollution during concurrent requests. + +- docs(issue-template): modernize bug report for efficiency + - Completely revamped the legacy bug report template to streamline troubleshooting. Added an anti-AI-spam policy, a detailed OS/Hardware matrix, forced `verbose=True` logging requirements with code examples, and new sections for model parameters and AI-assisted brainstorming. 
+ +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/b283f6d5b3d2d079019ae5ed3cbbdb4b3be03b25](https://github.com/ggml-org/llama.cpp/commit/b283f6d5b3d2d079019ae5ed3cbbdb4b3be03b25) + ## [0.3.31] Omni-Modal Media Pipeline, Hybrid 1-Token Rollback and Enhanced Logging - refactor(mtmd): introduce omni-modal media pipeline with experimental audio support diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index ed3c342f20..45f9c8f27f 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.31" +__version__ = "0.3.32" From 5bd087046e199214476e318b9021db85a1783bfc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 9 Mar 2026 02:24:32 +0800 Subject: [PATCH 239/518] Update Submodule vendor/llama.cpp b283f6d..35bee03 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b283f6d5b3..35bee031e1 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b283f6d5b3d2d079019ae5ed3cbbdb4b3be03b25 +Subproject commit 35bee031e17ed2b2e8e7278b284a6c8cd120d9f8 From 964160dd22dcb2179eb9bcdce0d4eebda94d9ffc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 9 Mar 2026 02:25:34 +0800 Subject: [PATCH 240/518] feat(MTMDChatHandler): support audio inputs and fix interleaved media ordering - Refactored `CHAT_FORMAT` to use a single loop for `message.content`, preserving the exact chronological order of interleaved text, images, and audio. - Added template routing for `audio_url`. - Added template routing for OpenAI's `input_audio` format, properly formatting it as a Data URI. 
--- llama_cpp/llama_chat_format.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index da470920e7..d13258a346 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2803,10 +2803,15 @@ class MTMDChatHandler: "{% for content in message.content %}" "{% if content.type == 'image_url' %}" "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" + "{% elif content.type == 'audio_url' %}" + "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" + "{% elif content.type == 'input_audio' %}" + "{% if content.input_audio is string %}" + "{{ content.input_audio }}" + "{% else %}" + "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" + "{% endif %}" + "{% elif content.type == 'text' %}" "{{ content.text }}" "{% endif %}" "{% endfor %}" From 9cc2efb4054cdd4053d6cf01ffd47099a4648838 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 10 Mar 2026 05:56:00 +0800 Subject: [PATCH 241/518] Update Submodule vendor/llama.cpp 35bee03..59db9a3 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 35bee031e1..59db9a357d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 35bee031e17ed2b2e8e7278b284a6c8cd120d9f8 +Subproject commit 59db9a357d9a247009c70fda34050661b17a1a5c From 5e285fecd0c8d476fffbc2d8ce8748aeeedde12d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 10 Mar 2026 06:16:07 +0800 Subject: [PATCH 242/518] fix(cache): fix namespace shadowing to prevent AttributeError - Renamed `llama_cpp.llama` import to `llama_core` and `llama_cpp.llama_cpp` to `llama_cpp_lib` to prevent namespace collision. 
- Fixed `AttributeError` thrown when accessing `llama_cpp.llama.Llama.longest_token_prefix`. - Updated all associated type hints and C-API bindings in cache classes to use the new isolated aliases. --- llama_cpp/llama_cache.py | 41 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index ec6b576746..2169780326 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -14,9 +14,8 @@ Tuple, ) -import llama_cpp.llama -import llama_cpp._internals as _internals -import llama_cpp.llama_cpp as llama_cpp +import llama_cpp.llama as llama_core +import llama_cpp.llama_cpp as llama_cpp_lib from .llama_types import * @@ -39,7 +38,7 @@ def _find_longest_prefix_key( pass @abstractmethod - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": raise NotImplementedError @abstractmethod @@ -48,7 +47,7 @@ def __contains__(self, key: Sequence[int]) -> bool: @abstractmethod def __setitem__( - self, key: Sequence[int], value: "llama_cpp.llama.LlamaState" + self, key: Sequence[int], value: "llama_core.LlamaState" ) -> None: raise NotImplementedError @@ -73,18 +72,18 @@ def _find_longest_prefix_key( min_len = 0 min_key: Optional[Tuple[int, ...]] = None for k in self.cache.iterkeys(): # type: ignore - prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) + prefix_len = llama_core.Llama.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len min_key = k # type: ignore return min_key - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: raise KeyError("Key not found") - value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore + value: "llama_core.LlamaState" = 
self.cache.pop(_key) # type: ignore # NOTE: This puts an integer as key in cache, which breaks, # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens # self.cache.push(_key, side="front") # type: ignore @@ -93,7 +92,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): + def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): print("LlamaDiskCache.__setitem__: called", file=sys.stderr) key = tuple(key) if key in self.cache: @@ -114,7 +113,7 @@ def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes self.cache_state: OrderedDict[ - Tuple[int, ...], "llama_cpp.llama.LlamaState" + Tuple[int, ...], "llama_core.LlamaState" ] = OrderedDict() @property @@ -128,7 +127,7 @@ def _find_longest_prefix_key( min_len = 0 min_key = None keys = ( - (k, llama_cpp.llama.Llama.longest_token_prefix(k, key)) + (k, llama_core.Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() ) for k, prefix_len in keys: @@ -137,7 +136,7 @@ def _find_longest_prefix_key( min_key = k return min_key - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -149,7 +148,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): + def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] @@ -164,7 
+163,7 @@ def __init__(self): # Child nodes: {token_id: TrieNode} self.children: Dict[int, "TrieNode"] = {} # Stores the LlamaState if this node marks the end of a cached sequence. - self.state: Optional["llama_cpp.llama.LlamaState"] = None + self.state: Optional["llama_core.LlamaState"] = None class LlamaTrieCache(BaseLlamaCache): @@ -228,7 +227,7 @@ def _find_longest_prefix_node( return longest_prefix_node, longest_prefix_key - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": """ Retrieves the state for the longest matching prefix in O(K) time. Updates the LRU status. @@ -282,7 +281,7 @@ def _prune(self, key: Tuple[int, ...]): # Node is still in use, stop pruning break - def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): + def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): """ Adds a (key, state) pair to the cache in O(K) time. Handles LRU updates and eviction. @@ -334,7 +333,7 @@ class HybridCheckpointCache(BaseLlamaCache): Manager for RNN state snapshots (Checkpoints) tailored for Hybrid/Recurrent models. Provides rollback capabilities for models that cannot physically truncate KV cache. 
""" - def __init__(self, ctx: llama_cpp.llama_context_p, max_checkpoints: int = 16, verbose: bool = False): + def __init__(self, ctx: llama_cpp_lib.llama_context_p, max_checkpoints: int = 16, verbose: bool = False): if ctx is None: raise ValueError("HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with model context") self._ctx = ctx @@ -343,10 +342,10 @@ def __init__(self, ctx: llama_cpp.llama_context_p, max_checkpoints: int = 16, ve self._current_size = 0 # Cache C-type API function pointers for performance - self._get_size_ext = llama_cpp.llama_state_seq_get_size_ext - self._get_data_ext = llama_cpp.llama_state_seq_get_data_ext - self._set_data_ext = llama_cpp.llama_state_seq_set_data_ext - self._flag_partial = llama_cpp.LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY + self._get_size_ext = llama_cpp_lib.llama_state_seq_get_size_ext + self._get_data_ext = llama_cpp_lib.llama_state_seq_get_data_ext + self._set_data_ext = llama_cpp_lib.llama_state_seq_set_data_ext + self._flag_partial = llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY self.verbose = verbose From 7efcb2b049cadef4f40e69392793add02841f8b9 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 10 Mar 2026 06:35:48 +0800 Subject: [PATCH 243/518] fix(chat_format): fix namespace and variable shadowing of llama modules - Changed imports to use `llama_cpp_lib` and `llama_core` to avoid namespace collisions. - Fixed severe variable shadowing where the `llama` module was being overshadowed by the `llama` parameter in function signatures. - Updated associated type hints and C-API bindings to use the new isolated aliases. - Corrected `LlamaGrammar` type definitions to point to the `llama_grammar` module. 
--- llama_cpp/llama_chat_format.py | 54 +++++++++++++++++----------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index d13258a346..9b96beaea4 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -33,8 +33,8 @@ import urllib.request from urllib.error import URLError, HTTPError -import llama_cpp.llama_cpp as llama_cpp -import llama_cpp.llama as llama +import llama_cpp.llama_cpp as llama_cpp_lib +import llama_cpp.llama as llama_core import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar @@ -85,7 +85,7 @@ def __call__( self, *, # llama.cpp instance - llama: llama.Llama, + llama: llama_core.Llama, # openai api parameters messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, @@ -124,8 +124,8 @@ def __call__( adaptive_target : float = -1.0, adaptive_decay : float = 0.9, use_infill: bool = False, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, **kwargs, # type: ignore @@ -199,7 +199,7 @@ class ChatFormatterResponse: prompt: str stop: Optional[Union[str, List[str]]] = None - stopping_criteria: Optional[llama.StoppingCriteriaList] = None + stopping_criteria: Optional[llama_core.StoppingCriteriaList] = None added_special: bool = False @@ -281,7 +281,7 @@ def stop_on_last_token( ) -> bool: return tokens[-1] in self.stop_token_ids - stopping_criteria = llama.StoppingCriteriaList([stop_on_last_token]) + stopping_criteria = llama_core.StoppingCriteriaList([stop_on_last_token]) return ChatFormatterResponse( prompt=prompt, @@ -585,7 +585,7 @@ def chat_formatter_to_chat_completion_handler( ) -> 
LlamaChatCompletionHandler: def chat_completion_handler( *, - llama: llama.Llama, + llama: llama_core.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, @@ -621,8 +621,8 @@ def chat_completion_handler( adaptive_decay : float = 0.9, use_infill: bool = False, model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, @@ -1467,7 +1467,7 @@ def format_gemma( @register_chat_completion_handler("functionary") def functionary_chat_handler( - llama: llama.Llama, + llama: llama_core.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, @@ -1500,8 +1500,8 @@ def functionary_chat_handler( adaptive_decay : float = 0.9, use_infill: bool = False, model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" @@ -1856,7 +1856,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): @register_chat_completion_handler("functionary-v1") @register_chat_completion_handler("functionary-v2") def functionary_v1_v2_chat_handler( - llama: llama.Llama, + llama: llama_core.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, @@ -1889,8 +1889,8 @@ def functionary_v1_v2_chat_handler( adaptive_decay : float = 0.9, use_infill: bool = False, model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" @@ -2868,7 +2868,7 @@ def __init__( self._exit_stack = ExitStack() - def _init_mtmd_context(self, llama_model: llama.Llama): + def _init_mtmd_context(self, llama_model: llama_core.Llama): """Initialize mtmd context with the llama model.""" if self.mtmd_ctx is not None: return # Already initialized @@ -3047,7 +3047,7 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): def _process_mtmd_prompt( self, - llama: llama.Llama, + llama: llama_core.Llama, messages: List[llama_types.ChatCompletionRequestMessage], ) -> Tuple[List[int], List[tuple], Any, List[Any]]: """ @@ -3212,7 +3212,7 @@ def _create_bitmap_func(idx: int, item: str): def __call__( self, *, - llama: llama.Llama, + llama: llama_core.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, @@ -3248,8 +3248,8 @@ def __call__( adaptive_decay : float = 0.9, use_infill: bool = False, model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, @@ -3367,13 +3367,13 @@ def __call__( llama.n_tokens = n_past # Execute C++ Multimodal Black-box Extraction - new_n_past = llama_cpp.llama_pos(0) + new_n_past = llama_cpp_lib.llama_pos(0) result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( self.mtmd_ctx, llama._ctx.ctx, chunk_ptr, - llama_cpp.llama_pos(n_past), - llama_cpp.llama_seq_id(0), + llama_cpp_lib.llama_pos(n_past), + llama_cpp_lib.llama_seq_id(0), llama.n_batch, True, # logits_last = True, drastically saves computational overhead ctypes.byref(new_n_past) @@ -5022,7 
+5022,7 @@ def __call__(self, **kwargs): @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( - llama: llama.Llama, + llama: llama_core.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, @@ -5055,8 +5055,8 @@ def chatml_function_calling( adaptive_decay : float = 0.9, use_infill: bool = False, model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, **kwargs, # type: ignore From 955ac33925542771ec863a21861d3588ee87de52 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 10 Mar 2026 07:19:35 +0800 Subject: [PATCH 244/518] Update llama.cpp API 20260310 --- llama_cpp/llama_cpp.py | 74 +++++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 9e05d52baa..f435658046 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1232,6 +1232,47 @@ def llama_numa_init(numa: int, /): # TODO: Add llama_detach_threadpool +# typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata); +llama_model_set_tensor_data_t = ctypes.CFUNCTYPE( + None, + ctypes.c_void_p, + ctypes.c_void_p +) + + +# // Create a new model from GGUF metadata as well as a function to set the tensor data +# // - tensors are created as GGML_TYPE_F32 by default, +# // override by adding a tensor with the same name but a different name to the context +# LLAMA_API struct llama_model * llama_model_init_from_user( +# struct gguf_context * metadata, +# llama_model_set_tensor_data_t set_tensor_data, // function 
to initialize tensor data with +# void * set_tensor_data_ud, // userdata for function +# struct llama_model_params params); +@ctypes_function( + "llama_model_init_from_user", + [ + ctypes.c_void_p, + llama_model_set_tensor_data_t, + ctypes.c_void_p, + llama_model_params + ], + llama_model_p_ctypes, +) +def llama_model_init_from_user( + metadata: ctypes.c_void_p, + set_tensor_data: llama_model_set_tensor_data_t, + set_tensor_data_ud: ctypes.c_void_p, + params: llama_model_params, + / +) -> Optional[llama_model_p]: + """ + Create a new model from GGUF metadata as well as a function to set the tensor data + - tensors are created as GGML_TYPE_F32 by default, + override by adding a tensor with the same name but a different name to the context + """ + ... + + # DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file( # const char * path_model, # struct llama_model_params params), @@ -1247,7 +1288,7 @@ def llama_load_model_from_file( ... -# // Load the model from a file +# // Load a model from a file # // If the file is split into multiple parts, the file name must follow this pattern: -%05d-of-%05d.gguf # // If the split file name does not follow this pattern, use llama_model_load_from_splits # LLAMA_API struct llama_model * llama_model_load_from_file( @@ -1261,15 +1302,15 @@ def llama_load_model_from_file( def llama_model_load_from_file( path_model: bytes, params: llama_model_params, / ) -> Optional[llama_model_p]: - """Load the model from a file - + """ + Load a model from a file If the file is split into multiple parts, the file name must follow this pattern: -%05d-of-%05d.gguf - - If the split file name does not follow this pattern, use llama_model_load_from_splits""" + If the split file name does not follow this pattern, use llama_model_load_from_splits + """ ... 
-# // Load the model from multiple splits (support custom naming scheme) +# // Load a model from multiple splits (support custom naming scheme) # // The paths must be in the correct order # LLAMA_API struct llama_model * llama_model_load_from_splits( # const char ** paths, @@ -1283,9 +1324,10 @@ def llama_model_load_from_file( def llama_model_load_from_splits( paths: list[bytes], n_paths: int, params: llama_model_params, / ) -> Optional[llama_model_p]: - """Load the model from multiple splits (support custom naming scheme) - - The paths must be in the correct order""" + """ + Load a model from multiple splits (support custom naming scheme) + The paths must be in the correct order + """ ... @@ -2982,7 +3024,7 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: # // Logits for the ith token. For positive indices, Equivalent to: # // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab -# // Negative indicies can be used to access logits in reverse order, -1 is the last logit. +# // Negative indices can be used to access logits in reverse order, -1 is the last logit. # // returns NULL for invalid ids. # LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); @ctypes_function( @@ -3017,7 +3059,7 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float] # // Get the embeddings for the ith token. For positive indices, Equivalent to: # // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd -# // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding. +# // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding. # // shape: [n_embd] (1-dimensional) # // returns NULL for invalid ids. # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); @@ -3076,9 +3118,9 @@ def llama_get_sampled_token_ith( ... 
-# // Get the backend sampled probabilites for the ith token +# // Get the backend sampled probabilities for the ith token # // The index matches llama_get_sampled_token_ith(). -# // Returns NULL if no probabilites were generated. +# // Returns NULL if no probabilities were generated. # LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i); @ctypes_function( "llama_get_sampled_probs_ith", @@ -3089,9 +3131,9 @@ def llama_get_sampled_probs_ith( ctx: llama_context_p, i: ctypes.c_int32, / ) -> CtypesArray[ctypes.c_float]: """ - Get the backend sampled probabilites for the ith token + Get the backend sampled probabilities for the ith token The index matches llama_get_sampled_token_ith(). - Returns NULL if no probabilites were generated. + Returns NULL if no probabilities were generated. """ ... @@ -4345,7 +4387,7 @@ def llama_sampler_init_mirostat_v2( ... -# /// @details Intializes a GBNF grammar, see grammars/README.md for details. +# /// @details Initializes a GBNF grammar, see grammars/README.md for details. # /// @param vocab The vocabulary that this grammar will be used with. # /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails. # /// @param grammar_root The name of the start symbol for the grammar. From 9acc5070dfe33e81d49cc5894aab50e88a03a64d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 10 Mar 2026 07:51:18 +0800 Subject: [PATCH 245/518] fix(core): disable swa_full for non-SWA models (sync llama.cpp upstream #20291) - Fallback `context_params.swa_full` to False if `_n_swa == 0` and emit a warning. - Updated `is_hybrid` validation to use the resolved `self.context_params.swa_full` state. 
--- llama_cpp/llama.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a4587b63ea..cde5491f47 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -479,15 +479,22 @@ def __init__( _is_recurrent = self._model.is_recurrent() _is_hybrid = self._model.is_hybrid() _n_swa = self._model.n_swa() + # Sync llama.cpp upstream (#20291): warn swa-full is not supported for non-SWA models. + if _n_swa == 0: + if (self.context_params.swa_full): + self.context_params.swa_full = False + if self.verbose: + print("Llama.__init__: swa_full is not supported by this model, it will be disabled", file=sys.stderr) + # checkpoints are created only if: # - the model uses SWA and we are not using `swa_full` # - the model architecture is marked as recurrent or hybrid - self.is_hybrid = _is_recurrent or _is_hybrid or (_n_swa > 0 and not swa_full) + self.is_hybrid = _is_recurrent or _is_hybrid or (_n_swa > 0 and not self.context_params.swa_full) if self.is_hybrid: if self.verbose: print(f"Llama.__init__: Hybrid/Recurrent model detected." - f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, n_swa: {_n_swa}, swa_full: {swa_full}). " + f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, n_swa: {_n_swa}, swa_full: {self.context_params.swa_full}). 
" f" Enabling HybridCheckpointCache(ctx_checkpoints={ctx_checkpoints}, checkpoint_interval={checkpoint_interval}).", file=sys.stderr) self.ctx_checkpoints = ctx_checkpoints From 4bec16509d9ac122053d04c8cafe0115d56dc949 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 10 Mar 2026 19:59:28 +0800 Subject: [PATCH 246/518] Update Submodule vendor/llama.cpp 59db9a3..ec947d2 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 59db9a357d..ec947d2b16 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 59db9a357d9a247009c70fda34050661b17a1a5c +Subproject commit ec947d2b16218580dfad3f2f8bd589190955efaf From 5c8e056bc1c7a1949d3946299d0786bb0ffded2a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 10 Mar 2026 21:26:18 +0800 Subject: [PATCH 247/518] perf(cache): upgrade `LlamaRAMCache` to O(1) eviction and set `LlamaTrieCache` as default Addressed severe performance bottlenecks in legacy RAM caching components: - Refactored `LlamaRAMCache` to use an O(1) `_current_size` tracker instead of an O(N) dynamic sum. This eliminates massive CPU spikes and O(N^2) complexity during LRU eviction cycles. - Added strict OOM safeguards to `LlamaRAMCache`: The current size is explicitly clamped to 0 during evictions, and hard-reset to 0 if the cache empties, preventing catastrophic capacity drift. - Introduced early-exit O(1) short-circuits in `__getitem__` and `__contains__` to bypass expensive prefix searches when the cache is empty. - Updated the `LlamaCache` backward-compatibility alias to point to the highly optimized `LlamaTrieCache` instead of the legacy `LlamaRAMCache`. 
Signed-off-by: JamePeng --- llama_cpp/llama_cache.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index 2169780326..c703604dce 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -107,7 +107,10 @@ def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): class LlamaRAMCache(BaseLlamaCache): - """Cache for a llama.cpp model using RAM.""" + """ + RAM cache for a llama.cpp model. + Maintains an LRU eviction policy with O(1) size tracking. + """ def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) @@ -115,10 +118,11 @@ def __init__(self, capacity_bytes: int = (2 << 30)): self.cache_state: OrderedDict[ Tuple[int, ...], "llama_core.LlamaState" ] = OrderedDict() + self._current_size = 0 @property def cache_size(self): - return sum([state.llama_state_size for state in self.cache_state.values()]) + return self._current_size def _find_longest_prefix_key( self, @@ -137,6 +141,9 @@ def _find_longest_prefix_key( return min_key def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": + if not self.cache_state: + raise KeyError("Cache is empty") + key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -146,15 +153,26 @@ def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": return value def __contains__(self, key: Sequence[int]) -> bool: + if not self.cache_state: + return False + return self._find_longest_prefix_key(tuple(key)) is not None def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] + self.cache_state[key] = value - while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0: - self.cache_state.popitem(last=False) + self._current_size += value.llama_state_size + + while self._current_size > self.capacity_bytes and len(self.cache_state) > 0: + 
_, popped_state = self.cache_state.popitem(last=False) + self._current_size -= popped_state.llama_state_size + self._current_size = max(0, self._current_size) + + if len(self.cache_state) == 0: + self._current_size = 0 class TrieNode: @@ -316,7 +334,7 @@ def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): self._prune(evicted_key) # Alias for backwards compatibility -LlamaCache = LlamaRAMCache +LlamaCache = LlamaTrieCache @dataclass From 054f04e85b80288c87bcaa75266e6964a669e253 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 11 Mar 2026 05:35:44 +0800 Subject: [PATCH 248/518] Update Submodule vendor/llama.cpp ec947d2..10e5b14 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ec947d2b16..10e5b148b0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ec947d2b16218580dfad3f2f8bd589190955efaf +Subproject commit 10e5b148b061569aaee8ae0cf72a703129df0eab From 8e8f36fb8f993761a1e007b2e8d16d42e264f875 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 11 Mar 2026 05:36:16 +0800 Subject: [PATCH 249/518] Sync examples : fix empty items in json_schema_to_grammar.py (#19968) --- llama_cpp/llama_grammar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 46ae4ba1ce..21bb688dee 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -890,7 +890,7 @@ def add_component(comp_schema, is_required): return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): - items = schema.get('items') or schema['prefixItems'] + items = schema.get('items', schema.get('prefixItems')) if isinstance(items, list): return self._add_rule( rule_name, From a36e8d2c52afe20d5724988b2bb7331cce817013 Mon Sep 17 00:00:00 2001 From: 
JamePeng Date: Wed, 11 Mar 2026 06:58:21 +0800 Subject: [PATCH 250/518] perf(cache): optimize LlamaDiskCache I/O and fix LRU behavior - Delegated LRU and size limits to native `diskcache` SQLite engine, removing the slow manual eviction loop. - Added an O(1) early exit in `_find_longest_prefix_key` to prevent unnecessary full-table disk scans. - Fixed a destructive read bug by replacing `.pop()` with standard access to properly update LRU timestamps. - Added fast-path empty checks to bypass disk queries entirely when the cache is empty. --- llama_cpp/llama_cache.py | 45 +++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index c703604dce..da00cbfb05 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -53,57 +53,68 @@ def __setitem__( class LlamaDiskCache(BaseLlamaCache): - """Cache for a llama.cpp model using disk.""" + """ + Disk cache for a llama.cpp model. + Delegates LRU and size management natively to the SQLite-backed `diskcache` library. 
+ """ def __init__( self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) ): super().__init__(capacity_bytes) - self.cache = diskcache.Cache(cache_dir) + self.cache_dir = cache_dir + # Native SQLite size limit and LRU eviction + self.cache = diskcache.Cache(cache_dir, size_limit=capacity_bytes) @property def cache_size(self): - return int(self.cache.volume()) # type: ignore + # Native O(1) volume calculation + return self.cache.volume() # type: ignore def _find_longest_prefix_key( self, key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: + # Early exit if cache is empty + if len(self.cache) == 0: + return None + min_len = 0 min_key: Optional[Tuple[int, ...]] = None + target_len = len(key) for k in self.cache.iterkeys(): # type: ignore prefix_len = llama_core.Llama.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len min_key = k # type: ignore + # Perfect match found, break to prevent full-table disk scan + if min_len == target_len: + break + return min_key def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": + print("LlamaDiskCache.__getitem__: called", file=sys.stderr) + if len(self.cache) == 0: + raise KeyError("Cache is empty") + key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: raise KeyError("Key not found") - value: "llama_core.LlamaState" = self.cache.pop(_key) # type: ignore - # NOTE: This puts an integer as key in cache, which breaks, - # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens - # self.cache.push(_key, side="front") # type: ignore + # Non-destructive read: automatically updates access time for LRU + value: "llama_core.LlamaState" = self.cache[_key] # type: ignore return value def __contains__(self, key: Sequence[int]) -> bool: + if len(self.cache) == 0: + return False return self._find_longest_prefix_key(tuple(key)) is not None def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): 
print("LlamaDiskCache.__setitem__: called", file=sys.stderr) - key = tuple(key) - if key in self.cache: - print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) - del self.cache[key] - self.cache[key] = value - print("LlamaDiskCache.__setitem__: set", file=sys.stderr) - while self.cache_size > self.capacity_bytes and len(self.cache) > 0: - key_to_remove = next(iter(self.cache)) - del self.cache[key_to_remove] - print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) + # diskcache natively handles capacity check and eviction upon assignment + self.cache[tuple(key)] = value class LlamaRAMCache(BaseLlamaCache): From b827932bad0945f130ce780b7d9e7f58f343f165 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 11 Mar 2026 22:08:47 +0800 Subject: [PATCH 251/518] Update Submodule vendor/llama.cpp 10e5b14..182acfe --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 10e5b148b0..182acfe5c5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 10e5b148b061569aaee8ae0cf72a703129df0eab +Subproject commit 182acfe5c5eb17a4f82d9181fa7bd91510e3b93d From 8641667e6600e9f31e5723be4349b27aec430df8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 11 Mar 2026 22:23:29 +0800 Subject: [PATCH 252/518] fix(chat_handler): fix tools and function calling in MTMDChatHandler. 
--- llama_cpp/llama_chat_format.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 9b96beaea4..b82b653e7a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3049,6 +3049,10 @@ def _process_mtmd_prompt( self, llama: llama_core.Llama, messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, ) -> Tuple[List[int], List[tuple], Any, List[Any]]: """ Core multimodal preprocessing pipeline. @@ -3079,6 +3083,10 @@ def _process_mtmd_prompt( add_generation_prompt=True, eos_token=self.mtmd_eos_token, bos_token=self.mtmd_bos_token, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, **getattr(self, 'extra_template_arguments', {}) ) # Replace image_url by media_marker in text @@ -3263,7 +3271,14 @@ def __call__( assert self.mtmd_ctx is not None # 2. 
Concurrent Preprocessing & Ledger Construction - full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt(llama, messages) + full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( + llama=llama, + messages=messages, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice + ) if self.verbose: print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) From 21c4c872d8cfaa6129156b9982ad9ebf04df54e3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Mar 2026 06:17:53 +0800 Subject: [PATCH 253/518] Update Submodule vendor/llama.cpp 182acfe..fdb1764 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 182acfe5c5..fdb17643d3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 182acfe5c5eb17a4f82d9181fa7bd91510e3b93d +Subproject commit fdb17643d379cd35bf6acf0f57cfaa500f88a145 From c420a2aeec9db138cb16989e713206d93f636a74 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Mar 2026 07:37:14 +0800 Subject: [PATCH 254/518] Update llama.cpp API 20260312 --- llama_cpp/llama_cpp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index f435658046..e6dc1567d0 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -348,6 +348,7 @@ # LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors # LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors # LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors # # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; @@ -387,6 +388,7 @@ LLAMA_FTYPE_MOSTLY_TQ1_0 = 36 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38 +LLAMA_FTYPE_MOSTLY_NVFP4 = 39 LLAMA_FTYPE_GUESSED = 1024 # enum llama_rope_scaling_type { From 
2bb5cd9c44ee067dc371a12467c23da837317dbb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Mar 2026 07:44:29 +0800 Subject: [PATCH 255/518] feat(_ggml): extend ctypes bindings with more ggml constants, enums, and structs --- llama_cpp/_ggml.py | 536 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 533 insertions(+), 3 deletions(-) diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index 35fc077d4e..d53272f6c4 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -33,6 +33,34 @@ # // ====== ggml.h ====== +GGML_FILE_MAGIC = 0x67676d6c # b"ggml" +GGML_FILE_VERSION = 2 + +GGML_QNT_VERSION = 2 # bump this on quantization format changes +GGML_QNT_VERSION_FACTOR = 1000 # do not change this + +GGML_MAX_DIMS = 4 +GGML_MAX_PARAMS = 2048 +GGML_MAX_SRC = 10 +GGML_MAX_N_THREADS = 512 +GGML_MAX_OP_PARAMS = 64 + +GGML_MAX_NAME = 64 + +GGML_DEFAULT_N_THREADS = 4 +GGML_DEFAULT_GRAPH_SIZE = 2048 + +GGML_EXIT_SUCCESS = 0 +GGML_EXIT_ABORTED = 1 + +GGML_ROPE_TYPE_NORMAL = 0 +GGML_ROPE_TYPE_NEOX = 2 +GGML_ROPE_TYPE_MROPE = 8 +GGML_ROPE_TYPE_VISION = 24 +GGML_ROPE_TYPE_IMROPE = 40 # binary: 101000 + +GGML_MROPE_SECTIONS = 4 + # enum ggml_status { # GGML_STATUS_ALLOC_FAILED = -2, # GGML_STATUS_FAILED = -1, @@ -88,7 +116,8 @@ class GGMLStatus(enum.IntEnum): # // GGML_TYPE_IQ4_NL_4_8 = 37, # // GGML_TYPE_IQ4_NL_8_8 = 38, # GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) -# GGML_TYPE_COUNT = 40, +# GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) +# GGML_TYPE_COUNT = 41, # }; class GGMLType(enum.IntEnum): GGML_TYPE_F32 = 0 @@ -123,7 +152,8 @@ class GGMLType(enum.IntEnum): GGML_TYPE_TQ1_0 = 34 GGML_TYPE_TQ2_0 = 35 GGML_TYPE_MXFP4 = 39 - GGML_TYPE_COUNT = 40 + GGML_TYPE_NVFP4 = 40 + GGML_TYPE_COUNT = 41 # // precision @@ -163,6 +193,7 @@ class GGMLPrec(enum.IntEnum): # GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors # GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors # GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors +# GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d 
tensors # }; class GGMLFType(enum.IntEnum): GGML_FTYPE_UNKNOWN = -1 @@ -190,7 +221,302 @@ class GGMLFType(enum.IntEnum): GGML_FTYPE_MOSTLY_IQ1_M = 23 GGML_FTYPE_MOSTLY_BF16 = 24 GGML_FTYPE_MOSTLY_MXFP4 = 25 + GGML_FTYPE_MOSTLY_NVFP4 = 26 + + +# // available tensor operations: +# enum ggml_op { +# GGML_OP_NONE = 0, + +# GGML_OP_DUP, +# GGML_OP_ADD, +# GGML_OP_ADD_ID, +# GGML_OP_ADD1, +# GGML_OP_ACC, +# GGML_OP_SUB, +# GGML_OP_MUL, +# GGML_OP_DIV, +# GGML_OP_SQR, +# GGML_OP_SQRT, +# GGML_OP_LOG, +# GGML_OP_SIN, +# GGML_OP_COS, +# GGML_OP_SUM, +# GGML_OP_SUM_ROWS, +# GGML_OP_CUMSUM, +# GGML_OP_MEAN, +# GGML_OP_ARGMAX, +# GGML_OP_COUNT_EQUAL, +# GGML_OP_REPEAT, +# GGML_OP_REPEAT_BACK, +# GGML_OP_CONCAT, +# GGML_OP_SILU_BACK, +# GGML_OP_NORM, // normalize +# GGML_OP_RMS_NORM, +# GGML_OP_RMS_NORM_BACK, +# GGML_OP_GROUP_NORM, +# GGML_OP_L2_NORM, + +# GGML_OP_MUL_MAT, +# GGML_OP_MUL_MAT_ID, +# GGML_OP_OUT_PROD, + +# GGML_OP_SCALE, +# GGML_OP_SET, +# GGML_OP_CPY, +# GGML_OP_CONT, +# GGML_OP_RESHAPE, +# GGML_OP_VIEW, +# GGML_OP_PERMUTE, +# GGML_OP_TRANSPOSE, +# GGML_OP_GET_ROWS, +# GGML_OP_GET_ROWS_BACK, +# GGML_OP_SET_ROWS, +# GGML_OP_DIAG, +# GGML_OP_DIAG_MASK_INF, +# GGML_OP_DIAG_MASK_ZERO, +# GGML_OP_SOFT_MAX, +# GGML_OP_SOFT_MAX_BACK, +# GGML_OP_ROPE, +# GGML_OP_ROPE_BACK, +# GGML_OP_CLAMP, +# GGML_OP_CONV_TRANSPOSE_1D, +# GGML_OP_IM2COL, +# GGML_OP_IM2COL_BACK, +# GGML_OP_IM2COL_3D, +# GGML_OP_CONV_2D, +# GGML_OP_CONV_3D, +# GGML_OP_CONV_2D_DW, +# GGML_OP_CONV_TRANSPOSE_2D, +# GGML_OP_POOL_1D, +# GGML_OP_POOL_2D, +# GGML_OP_POOL_2D_BACK, +# GGML_OP_UPSCALE, +# GGML_OP_PAD, +# GGML_OP_PAD_REFLECT_1D, +# GGML_OP_ROLL, +# GGML_OP_ARANGE, +# GGML_OP_TIMESTEP_EMBEDDING, +# GGML_OP_ARGSORT, +# GGML_OP_TOP_K, +# GGML_OP_LEAKY_RELU, +# GGML_OP_TRI, +# GGML_OP_FILL, + +# GGML_OP_FLASH_ATTN_EXT, +# GGML_OP_FLASH_ATTN_BACK, +# GGML_OP_SSM_CONV, +# GGML_OP_SSM_SCAN, +# GGML_OP_WIN_PART, +# GGML_OP_WIN_UNPART, +# GGML_OP_GET_REL_POS, +# GGML_OP_ADD_REL_POS, +# GGML_OP_RWKV_WKV6, +# 
GGML_OP_GATED_LINEAR_ATTN, +# GGML_OP_RWKV_WKV7, +# GGML_OP_SOLVE_TRI, +# GGML_OP_GATED_DELTA_NET, + +# GGML_OP_UNARY, + +# GGML_OP_MAP_CUSTOM1, +# GGML_OP_MAP_CUSTOM2, +# GGML_OP_MAP_CUSTOM3, + +# GGML_OP_CUSTOM, + +# GGML_OP_CROSS_ENTROPY_LOSS, +# GGML_OP_CROSS_ENTROPY_LOSS_BACK, +# GGML_OP_OPT_STEP_ADAMW, +# GGML_OP_OPT_STEP_SGD, + +# GGML_OP_GLU, + +# GGML_OP_COUNT, +# }; +class GGML_OP(enum.IntEnum): + GGML_OP_NONE = 0 + + GGML_OP_DUP = 1 + GGML_OP_ADD = 2 + GGML_OP_ADD_ID = 3 + GGML_OP_ADD1 = 4 + GGML_OP_ACC = 5 + GGML_OP_SUB = 6 + GGML_OP_MUL = 7 + GGML_OP_DIV = 8 + GGML_OP_SQR = 9 + GGML_OP_SQRT = 10 + GGML_OP_LOG = 11 + GGML_OP_SIN = 12 + GGML_OP_COS = 13 + GGML_OP_SUM = 14 + GGML_OP_SUM_ROWS = 15 + GGML_OP_CUMSUM = 16 + GGML_OP_MEAN = 17 + GGML_OP_ARGMAX = 18 + GGML_OP_COUNT_EQUAL = 19 + GGML_OP_REPEAT = 20 + GGML_OP_REPEAT_BACK = 21 + GGML_OP_CONCAT = 22 + GGML_OP_SILU_BACK = 23 + GGML_OP_NORM = 24 # // normalize + GGML_OP_RMS_NORM = 25 + GGML_OP_RMS_NORM_BACK = 26 + GGML_OP_GROUP_NORM = 27 + GGML_OP_L2_NORM = 28 + + GGML_OP_MUL_MAT = 29 + GGML_OP_MUL_MAT_ID = 30 + GGML_OP_OUT_PROD = 31 + + GGML_OP_SCALE = 32 + GGML_OP_SET = 33 + GGML_OP_CPY = 34 + GGML_OP_CONT = 35 + GGML_OP_RESHAPE = 36 + GGML_OP_VIEW = 37 + GGML_OP_PERMUTE = 38 + GGML_OP_TRANSPOSE = 39 + GGML_OP_GET_ROWS = 40 + GGML_OP_GET_ROWS_BACK = 41 + GGML_OP_SET_ROWS = 42 + GGML_OP_DIAG = 43 + GGML_OP_DIAG_MASK_INF = 44 + GGML_OP_DIAG_MASK_ZERO = 45 + GGML_OP_SOFT_MAX = 46 + GGML_OP_SOFT_MAX_BACK = 47 + GGML_OP_ROPE = 48 + GGML_OP_ROPE_BACK = 49 + GGML_OP_CLAMP = 50 + GGML_OP_CONV_TRANSPOSE_1D = 51 + GGML_OP_IM2COL = 52 + GGML_OP_IM2COL_BACK = 53 + GGML_OP_IM2COL_3D = 54 + GGML_OP_CONV_2D = 55 + GGML_OP_CONV_3D = 56 + GGML_OP_CONV_2D_DW = 57 + GGML_OP_CONV_TRANSPOSE_2D = 58 + GGML_OP_POOL_1D = 59 + GGML_OP_POOL_2D = 60 + GGML_OP_POOL_2D_BACK = 61 + GGML_OP_UPSCALE = 62 + GGML_OP_PAD = 63 + GGML_OP_PAD_REFLECT_1D = 64 + GGML_OP_ROLL = 65 + GGML_OP_ARANGE = 66 + GGML_OP_TIMESTEP_EMBEDDING = 67 + 
GGML_OP_ARGSORT = 68 + GGML_OP_TOP_K = 69 + GGML_OP_LEAKY_RELU = 70 + GGML_OP_TRI = 71 + GGML_OP_FILL = 72 + + GGML_OP_FLASH_ATTN_EXT = 73 + GGML_OP_FLASH_ATTN_BACK = 74 + GGML_OP_SSM_CONV = 75 + GGML_OP_SSM_SCAN = 76 + GGML_OP_WIN_PART = 77 + GGML_OP_WIN_UNPART = 78 + GGML_OP_GET_REL_POS = 79 + GGML_OP_ADD_REL_POS = 80 + GGML_OP_RWKV_WKV6 = 81 + GGML_OP_GATED_LINEAR_ATTN = 82 + GGML_OP_RWKV_WKV7 = 83 + GGML_OP_SOLVE_TRI = 84 + GGML_OP_GATED_DELTA_NET = 85 + + GGML_OP_UNARY = 86 + + GGML_OP_MAP_CUSTOM1 = 87 + GGML_OP_MAP_CUSTOM2 = 88 + GGML_OP_MAP_CUSTOM3 = 89 + + GGML_OP_CUSTOM = 90 + + GGML_OP_CROSS_ENTROPY_LOSS = 91 + GGML_OP_CROSS_ENTROPY_LOSS_BACK = 92 + GGML_OP_OPT_STEP_ADAMW = 93 + GGML_OP_OPT_STEP_SGD = 94 + + GGML_OP_GLU = 95 + + GGML_OP_COUNT = 96 + +# enum ggml_unary_op { +# GGML_UNARY_OP_ABS, +# GGML_UNARY_OP_SGN, +# GGML_UNARY_OP_NEG, +# GGML_UNARY_OP_STEP, +# GGML_UNARY_OP_TANH, +# GGML_UNARY_OP_ELU, +# GGML_UNARY_OP_RELU, +# GGML_UNARY_OP_SIGMOID, +# GGML_UNARY_OP_GELU, +# GGML_UNARY_OP_GELU_QUICK, +# GGML_UNARY_OP_SILU, +# GGML_UNARY_OP_HARDSWISH, +# GGML_UNARY_OP_HARDSIGMOID, +# GGML_UNARY_OP_EXP, +# GGML_UNARY_OP_EXPM1, +# GGML_UNARY_OP_SOFTPLUS, +# GGML_UNARY_OP_GELU_ERF, +# GGML_UNARY_OP_XIELU, +# GGML_UNARY_OP_FLOOR, +# GGML_UNARY_OP_CEIL, +# GGML_UNARY_OP_ROUND, +# GGML_UNARY_OP_TRUNC, + +# GGML_UNARY_OP_COUNT, +# }; +class GGMLUnaryOp(enum.IntEnum): + GGML_UNARY_OP_ABS = 0 + GGML_UNARY_OP_SGN = 1 + GGML_UNARY_OP_NEG = 2 + GGML_UNARY_OP_STEP = 3 + GGML_UNARY_OP_TANH = 4 + GGML_UNARY_OP_ELU = 5 + GGML_UNARY_OP_RELU = 6 + GGML_UNARY_OP_SIGMOID = 7 + GGML_UNARY_OP_GELU = 8 + GGML_UNARY_OP_GELU_QUICK = 9 + GGML_UNARY_OP_SILU = 10 + GGML_UNARY_OP_HARDSWISH = 11 + GGML_UNARY_OP_HARDSIGMOID = 12 + GGML_UNARY_OP_EXP = 13 + GGML_UNARY_OP_EXPM1 = 14 + GGML_UNARY_OP_SOFTPLUS = 15 + GGML_UNARY_OP_GELU_ERF = 16 + GGML_UNARY_OP_XIELU = 17 + GGML_UNARY_OP_FLOOR = 18 + GGML_UNARY_OP_CEIL = 19 + GGML_UNARY_OP_ROUND = 20 + GGML_UNARY_OP_TRUNC = 21 + + 
GGML_UNARY_OP_COUNT = 22 + +# enum ggml_glu_op { +# GGML_GLU_OP_REGLU, +# GGML_GLU_OP_GEGLU, +# GGML_GLU_OP_SWIGLU, +# GGML_GLU_OP_SWIGLU_OAI, +# GGML_GLU_OP_GEGLU_ERF, +# GGML_GLU_OP_GEGLU_QUICK, +# GGML_GLU_OP_COUNT, +# }; +class GGMLGluOp(enum.IntEnum): + GGML_GLU_OP_REGLU = 0 + GGML_GLU_OP_GEGLU = 1 + GGML_GLU_OP_SWIGLU = 2 + GGML_GLU_OP_SWIGLU_OAI = 3 + GGML_GLU_OP_GEGLU_ERF = 4 + GGML_GLU_OP_GEGLU_QUICK = 5 + GGML_GLU_OP_COUNT = 6 + +# // +# // ggml object +# // # enum ggml_object_type { # GGML_OBJECT_TYPE_TENSOR, @@ -203,6 +529,71 @@ class GGMLObjectType(enum.IntEnum): GGML_OBJECT_TYPE_WORK_BUFFER = 2 +# struct ggml_object { +# size_t offs; +# size_t size; +# struct ggml_object * next; +# enum ggml_object_type type; +# char padding[4]; +# }; +class ggml_object(ctypes.Structure): + if TYPE_CHECKING: + offs: ctypes.c_size_t + size: ctypes.c_size_t + next: "ctypes.POINTER(ggml_object)" + type: int + padding: ctypes.Array[ctypes.c_char] + +ggml_object_p = ctypes.POINTER(ggml_object) + +ggml_object._fields_ = [ + ("offs", ctypes.c_size_t), + ("size", ctypes.c_size_t), + ("next", ggml_object_p), + ("type", ctypes.c_int), + ("padding", ctypes.c_char * 4), +] + +GGML_OBJECT_SIZE = ctypes.sizeof(ggml_object) + + +# // +# // ggml context +# // + +# struct ggml_context { +# size_t mem_size; +# void * mem_buffer; +# bool mem_buffer_owned; +# bool no_alloc; +# int n_objects; +# struct ggml_object * objects_begin; +# struct ggml_object * objects_end; +# }; +class ggml_context(ctypes.Structure): + + if TYPE_CHECKING: + mem_size: ctypes.c_size_t + mem_buffer: ctypes.c_void_p + mem_buffer_owned: bool + no_alloc: bool + n_objects: int + objects_begin: ggml_object_p + objects_end: ggml_object_p + + _fields_ = [ + ("mem_size", ctypes.c_size_t), + ("mem_buffer", ctypes.c_void_p), + ("mem_buffer_owned", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), + ("n_objects", ctypes.c_int), + ("objects_begin", ggml_object_p), + ("objects_end", ggml_object_p), + ] + +ggml_context_p = 
ctypes.POINTER(ggml_context) + + # enum ggml_log_level { # GGML_LOG_LEVEL_NONE = 0, # GGML_LOG_LEVEL_DEBUG = 1, @@ -211,7 +602,6 @@ class GGMLObjectType(enum.IntEnum): # GGML_LOG_LEVEL_ERROR = 4, # GGML_LOG_LEVEL_CONT = 5, // continue previous log # }; - class GGMLLogLevel(enum.IntEnum): GGML_LOG_LEVEL_NONE = 0 GGML_LOG_LEVEL_DEBUG = 1 @@ -262,6 +652,67 @@ class ggml_init_params(ctypes.Structure): ] +# // n-dimensional tensor +# struct ggml_tensor { +# enum ggml_type type; +# struct ggml_backend_buffer * buffer; +# int64_t ne[GGML_MAX_DIMS]; // number of elements +# size_t nb[GGML_MAX_DIMS]; // stride in bytes: +# // nb[0] = ggml_type_size(type) +# // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding +# // nb[i] = nb[i-1] * ne[i-1] +# // compute data +# enum ggml_op op; +# // op params - allocated as int32_t for alignment +# int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; +# int32_t flags; +# struct ggml_tensor * src[GGML_MAX_SRC]; +# // source tensor and offset for views +# struct ggml_tensor * view_src; +# size_t view_offs; +# void * data; +# char name[GGML_MAX_NAME]; +# void * extra; // extra things e.g. 
for ggml-cuda.cu +# char padding[8]; +# }; +class ggml_tensor(ctypes.Structure): + """n-dimensional tensor""" + + if TYPE_CHECKING: + type: int + buffer: ctypes.c_void_p + ne: ctypes.Array[ctypes.c_int64] + nb: ctypes.Array[ctypes.c_size_t] + op: int + op_params: ctypes.Array[ctypes.c_int32] + flags: int + src: "ctypes.Array[ctypes.POINTER(ggml_tensor)]" + view_src: "ctypes.POINTER(ggml_tensor)" + view_offs: ctypes.c_size_t + data: ctypes.c_void_p + name: ctypes.Array[ctypes.c_char] + extra: ctypes.c_void_p + padding: ctypes.Array[ctypes.c_char] + +ggml_tensor_p = ctypes.POINTER(ggml_tensor) + +ggml_tensor._fields_ = [ + ("type", ctypes.c_int), + ("buffer", ctypes.c_void_p), + ("ne", ctypes.c_int64 * GGML_MAX_DIMS), + ("nb", ctypes.c_size_t * GGML_MAX_DIMS), + ("op", ctypes.c_int), + ("op_params", ctypes.c_int32 * (GGML_MAX_OP_PARAMS // ctypes.sizeof(ctypes.c_int32))), + ("flags", ctypes.c_int32), + ("src", ggml_tensor_p * GGML_MAX_SRC), + ("view_src", ggml_tensor_p), + ("view_offs", ctypes.c_size_t), + ("data", ctypes.c_void_p), + ("name", ctypes.c_char * GGML_MAX_NAME), + ("extra", ctypes.c_void_p), + ("padding", ctypes.c_char * 8), +] + # // Abort callback # // If not NULL, called before ggml computation # // If it returns true, the computation is aborted @@ -389,3 +840,82 @@ def ggml_backend_load_all(): def ggml_backend_load_all_from_path(dir_path: ctypes.c_char_p): """Load all known backends from path""" ... 
+ +# // +# // GGML internal header from ggml-impl.h +# // + +# typedef uint32_t ggml_bitset_t; +ggml_bitset_t = ctypes.c_uint32 + +# // computation graph + +# enum ggml_cgraph_eval_order { +# GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, +# GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, +# GGML_CGRAPH_EVAL_ORDER_COUNT +# }; +class GGMLCgraphEvalOrder(enum.IntEnum): + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0 + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT = 1 + GGML_CGRAPH_EVAL_ORDER_COUNT = 2 + + +# struct ggml_hash_set { +# size_t size; +# ggml_bitset_t * used; // whether or not the keys are in use i.e. set +# struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) +# }; +class ggml_hash_set(ctypes.Structure): + if TYPE_CHECKING: + size: int + used: ctypes.POINTER(ggml_bitset_t) + keys: "ctypes.POINTER(ggml_tensor_p)" + + _fields_ = [ + ("size", ctypes.c_size_t), + ("used", ctypes.POINTER(ggml_bitset_t)), + ("keys", ctypes.POINTER(ggml_tensor_p)), + ] + + +# struct ggml_cgraph { +# int size; // maximum number of nodes/leafs/grads/grad_accs +# int n_nodes; // number of nodes currently in use +# int n_leafs; // number of leafs currently in use + +# struct ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated +# struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes +# struct ggml_tensor ** grad_accs; // accumulators for node gradients +# struct ggml_tensor ** leafs; // tensors with constant data +# int32_t * use_counts;// number of uses of each tensor, indexed by hash table slot + +# struct ggml_hash_set visited_hash_set; + +# enum ggml_cgraph_eval_order order; +# }; +class ggml_cgraph(ctypes.Structure): + if TYPE_CHECKING: + size: int + n_nodes: int + n_leafs: int + nodes: "ctypes.POINTER(ggml_tensor_p)" + grads: "ctypes.POINTER(ggml_tensor_p)" + grad_accs: "ctypes.POINTER(ggml_tensor_p)" + leafs: "ctypes.POINTER(ggml_tensor_p)" + use_counts: 
ctypes.POINTER(ctypes.c_int32) + visited_hash_set: ggml_hash_set + order: int + + _fields_ = [ + ("size", ctypes.c_int), + ("n_nodes", ctypes.c_int), + ("n_leafs", ctypes.c_int), + ("nodes", ctypes.POINTER(ggml_tensor_p)), + ("grads", ctypes.POINTER(ggml_tensor_p)), + ("grad_accs", ctypes.POINTER(ggml_tensor_p)), + ("leafs", ctypes.POINTER(ggml_tensor_p)), + ("use_counts", ctypes.POINTER(ctypes.c_int32)), + ("visited_hash_set", ggml_hash_set), + ("order", ctypes.c_int), + ] From 380c64312a7e20d9db3483ed3ff46a864e33f04a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Mar 2026 22:57:30 +0800 Subject: [PATCH 256/518] Update Submodule vendor/llama.cpp fdb1764..557fe2d --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index fdb17643d3..557fe2d913 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit fdb17643d379cd35bf6acf0f57cfaa500f88a145 +Subproject commit 557fe2d9132913eaf08c8abf21b0cff61addb9ac From aa38850e7b18339595ae27436ccc45c5ca3c792e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Mar 2026 22:59:32 +0800 Subject: [PATCH 257/518] fix(sampling): prevent memory drift and hallucinations in logits view Previously, the numpy view for `logits_ptr` in `LlamaSamplingContext.sample` was only initialized once. If the underlying C++ buffer was reallocated or shifted (e.g., due to dynamic batch sizes or KV cache shifts), the numpy array would point to stale memory, leading to severe model hallucinations or segfaults. - Added explicit `_logits_ptr_addr` tracking to monitor the physical C memory address. - The zero-copy numpy view (`_logits_view`) is now safely recreated on-the-fly whenever the backend memory address changes. - Added proper initialization and garbage collection for the new tracker in `__init__` and `close`. 
Signed-off-by: JamePeng --- llama_cpp/_internals.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 9e89893efa..007cc0b82d 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1099,6 +1099,7 @@ def __init__( # reusable numpy logits view self._logits_view = None + self._logits_ptr_addr = None self._single_token = llama_cpp.llama_token_data() self._single_array = llama_cpp.llama_token_data_array( @@ -1281,12 +1282,14 @@ def sample( # 3. build cur_p logits_ptr = llama_cpp.llama_get_logits_ith(ctx.ctx, idx) + cur_addr = ctypes.addressof(logits_ptr.contents) - if self._logits_view is None: + if self._logits_ptr_addr != cur_addr: self._logits_view = np.ctypeslib.as_array( logits_ptr, shape=(self.n_vocab,), ) + self._logits_ptr_addr = cur_addr logits_array = self._logits_view cur_p = self._cur_p @@ -1395,6 +1398,7 @@ def close(self): # Remove NumPy view pointing to llama logits buffer. self._logits_view = None + self._logits_ptr_addr = None # Break references to small C structs used in grammar rejection sampling. self._single_token = None From d422e82403f733f7a7ee3129c739b0ca076b96cd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 12 Mar 2026 23:57:12 +0800 Subject: [PATCH 258/518] docs(README.md): add sampling parameter guide and strategic project tips to README - Sampling Documentation: Added a comprehensive guide for `LlamaSamplingParams`. It covers core, advanced (XTC, Dynatemp, Adaptive-P), entropy (Mirostat), and DRY repetition penalty configurations with a clean Python usage example. - Project Tips: Added a new "Quick tips" section to explicitly communicate the semi-deprecated status of `llama_cpp.server` in favor of the upstream `llama-server`. - Backend Recommendations: Added practical advice for AMD and Intel GPU users, officially recommending the Vulkan backend for cross-platform stability and faster updates. 
Signed-off-by: JamePeng --- README.md | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 111 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0d651dd4d9..b200d086a7 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,8 @@ # Python Bindings for [`llama.cpp`](https://github.com/ggml-org/llama.cpp) [![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) -[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) +[![Tests](https://github.com/JamePeng/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/JamePeng/llama-cpp-python/actions/workflows/test.yaml) ![GitHub Tag](https://img.shields.io/github/v/tag/JamePeng/llama-cpp-python) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Downloads](https://static.pepy.tech/badge/llama-cpp-python/month)](https://pepy.tech/projects/llama-cpp-python) [![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]() @@ -520,7 +519,108 @@ llm = Llama.from_pretrained( **NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. Thus, the messages should contain just the chat messages and/or system messages that provide additional context for the model (e.g.: datetime, etc.). -### Multi-modal Models +--- + +## Sampling Configuration & Usage (LlamaSamplingParams) + +The `Llama` class provides extensive control over the `llama.cpp` sampling chain during text generation. 
You can configure state-of-the-art sampling algorithms, dynamic temperature, and advanced repetition penalties directly via the `generate`, `create_completion`, or `__call__` methods. + +### Core Sampling Parameters + +These are the most common parameters used to control the randomness and focus of the model's output. + +* **`temperature`** (`float`, default: `0.80`): Controls the randomness of the generation. Higher values (e.g., `1.0`) make output more random, while lower values (e.g., `0.2`) make it more deterministic. Set to `<= 0.0` for greedy decoding. +* **`top_k`** (`int`, default: `40`): Limits the next token selection to the K most probable tokens. Set to `<= 0` to use the full vocabulary size. +* **`top_p`** (`float`, default: `0.95`): Nucleus sampling. Limits selection to a cumulative probability of P. Set to `1.0` to disable. +* **`min_p`** (`float`, default: `0.05`): Minimum P sampling. Drops tokens with a probability less than `min_p` relative to the most likely token. Set to `0.0` to disable. +* **`typical_p`** (`float`, default: `1.0`): Locally typical sampling. Adjusts probabilities based on the entropy of the distribution. Set to `1.0` to disable. + + +### Advanced & Experimental Samplers + +* **XTC (Exclude Top Choice)**: Removes the most likely tokens to force the model to take creative alternative paths. +* **`xtc_probability`** (`float`, default: `0.0`): The chance for token removal. `0.0` disables XTC. +* **`xtc_threshold`** (`float`, default: `0.1`): The minimum probability threshold for a token to be considered for removal. + + +* **Dynamic Temperature**: Adjusts the temperature dynamically based on the entropy of the current token distribution. +* **`dynatemp_range`** (`float`, default: `0.0`): The range of the dynamic temperature. `0.0` disables it. +* **`dynatemp_exponent`** (`float`, default: `1.0`): Controls how entropy maps to temperature. 
+ + +* **`top_n_sigma`** (`float`, default: `-1.0`): Limits selection to tokens with pre-softmax logits within $n * \sigma$ of the max logit. Set to `-1.0` to disable. +* **Adaptive-P**: Dynamically adjusts the target probability using an exponential moving average (EMA). +* **`adaptive_target`** (`float`, default: `-1.0`): The target probability (0.0 to 1.0). Negative values disable it. +* **`adaptive_decay`** (`float`, default: `0.9`): The EMA decay rate (0.0 to 0.99). + + +### Target Entropy (Mirostat) + +Mirostat actively maintains a target entropy (`tau`) during generation to prevent text from becoming too boring or too chaotic. + +* **`mirostat_mode`** (`int`, default: `0`): `0` = disabled, `1` = Mirostat 1.0, `2` = Mirostat 2.0. +* **`mirostat_tau`** (`float`, default: `5.0`): The target cross-entropy (surprisal) value. +* **`mirostat_eta`** (`float`, default: `0.1`): The learning rate used to update the algorithm's internal state. + + +### Repetition Penalties + +* **Standard Penalties**: +* **`repeat_penalty`** (`float`, default: `1.0`): General penalty for repeated tokens. `1.0` = disabled. +* **`frequency_penalty`** (`float`, default: `0.0`): Penalty based on the absolute frequency of a token in the prompt. +* **`present_penalty`** (`float`, default: `0.0`): Flat penalty applied if a token is present anywhere in the context. +* **`penalty_last_n`** (`int`, default: `64`): The number of recent tokens to consider for standard penalties. `0` = disabled, `-1` = full context size. + + +* **DRY (Don't Repeat Yourself)**: An advanced exponential penalty specifically designed to break exact repeating sequences. +* **`dry_multiplier`** (`float`, default: `0.0`): The multiplier for the penalty. `0.0` disables DRY. +* **`dry_base`** (`float`, default: `1.75`): The base value for the exponential penalty. +* **`dry_allowed_length`** (`int`, default: `2`): Sequences extending beyond this length receive the penalty. 
+* **`dry_penalty_last_n`** (`int`, default: `0`): Tokens to scan for repetitions. `0` = disabled, `-1` = context size. +* **`dry_seq_breakers`** (`list[str]`, default: `["\n", ":", "\"", "*"]`): Tokens that reset the DRY sequence matching. + + +### Constraints & Callbacks + +* **`logit_bias`** (`Dict[int, float]`, optional): Manually boost or penalize specific token IDs. +* **`grammar`** (`LlamaGrammar`, optional): Force the model to generate text matching a specific BNF-like grammar (e.g., valid JSON). +* **`logits_processor`** (`LogitsProcessorList`, optional): Custom Python callbacks to modify the logits tensor in-place before sampling. +* **`stopping_criteria`** (`StoppingCriteriaList`, optional): Custom Python callbacks to halt generation based on the current sequence or scores. + +### 🛠️ Usage Example + +You can pass these parameters directly when calling the model to generate text. + +```python +from llama_cpp import Llama + +# Load the model +model = Llama(model_path="path/to/your/model.gguf") + +# Generate text with advanced sampling +response = model.create_completion( + prompt="The secret to a happy life is", + max_tokens=100, + # Adjust core randomness + temperature=0.85, + top_p=0.90, + min_p=0.05, + # Prevent the model from repeating specific phrases + dry_multiplier=0.8, + dry_base=1.75, + dry_allowed_length=3, + # Standard repetition penalty + repeat_penalty=1.1, + penalty_last_n=256, +) + +print(response["choices"][0]["text"]) + +``` + +--- + +## Multi-modal Models `llama-cpp-python` supports such as llava1.5 which allow the language model to read information from both text and images. @@ -968,7 +1068,7 @@ For instance, if you want to work with larger contexts, you can expand the conte llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048) ``` -## OpenAI Compatible Web Server +## OpenAI Compatible Web Server (Deprecated) `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. 
This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). @@ -1157,8 +1257,14 @@ For example: The reason libraries from other authors are smaller is that they often **only compile for a single architecture** (e.g., targeting only the RTX 30 series [SM86] or the RTX 40 series [SM89]). To maximize convenience, I provide an **integrated compilation** covering a wide range of hardware; you simply need to select the CUDA version that matches your environment to load and run it. +### Quick tips for develop/user (continuously updated): + +* 1. I've determined that `llama_cpp.server` is currently in a semi-deprecated state (meaning it won't be maintained unless absolutely necessary, and I might even consider deleting or separating it to reduce the library size). I highly recommend using the `llama-server` program maintained by the upstream `llama.cpp` project, which offers a lower-level implementation, more frequent maintenance and optimization, and more reliable API calls. + +* 2. Regarding AMD and Intel graphics cards, AMD can certainly use ROCm as the primary backend (but the drawback is that it's basically only stable on Linux platforms), and Intel's Sycl will also encounter some compilation difficulties. I consistently recommend using the Vulkan backend for these two types of graphics cards for greater efficiency and stability, because the upstream `llama.cpp` Vulkan backend is actively maintained by many developers, generally allowing you to enjoy new feature optimizations and bug fixes earlier and faster. + -Any contributions and changes to this package will be made with these goals in mind. +### Any suggestions, contributions, and modifications to this package will be directed toward building a user-friendly, efficient, and secure Python library. 
## License From 6f81466d36d1d739c11461d753f84770f02dab6d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 13 Mar 2026 08:16:37 +0800 Subject: [PATCH 259/518] fix(Llama.generate): add explicit fallback context reset and expand `Llama.generate` docstrings - Added a fallback `if reset:` block in `Llama.generate` to ensure the KV cache and hybrid cache manager are explicitly cleared when `reset=True` is passed and no prefix match is found. This prevents potential context poisoning from previous runs. - Added comprehensive docstrings to the `generate` method for all newly integrated sampler parameters (e.g., XTC, Mirostat, DRY penalties etc.). - Added explicit verbose logging for cache resetting, rollback events, and speculative decoding behaviors to improve debuggability. Signed-off-by: JamePeng --- llama_cpp/llama.py | 60 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index cde5491f47..1b6bb9ee1b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1167,12 +1167,41 @@ def generate( ... print(llama.detokenize([token])) Args: - tokens: The prompt tokens. - top_k: The top-k sampling parameter. - top_p: The top-p sampling parameter. - temp: The temperature parameter. - repeat_penalty: The repeat penalty parameter. - reset: Whether to reset the model state. + tokens: The prompt tokens to evaluate. + top_k: Limit the next token selection to the K most probable tokens. (<=0 to use vocab size) + top_p: Nucleus sampling. Limits selection to a cumulative probability of P. + min_p: Minimum P sampling. Drops tokens with a probability less than min_p relative to the most likely token. + typical_p: Locally typical sampling. (1.0 = disabled) + temp: Temperature. Controls randomness. (<=0.0 greedy, 0.0 no probabilities) + dynatemp_range: Range of dynamic temperature. + dynatemp_exponent: Exponent of dynamic temperature. 
+ top_n_sigma: Limit selection to tokens within n * sigma of the max logit. (-1.0 = disabled) + min_keep: Minimum tokens to keep for sampling. + penalty_last_n: Last n tokens to penalize (0 = disable penalty, -1 = context size). + repeat_penalty: General penalty for repeated tokens. (1.0 = disabled) + frequency_penalty: Penalty based on the absolute frequency of a token in the prompt. + present_penalty: Flat penalty applied if a token is present anywhere in the context. + reset: If True, attempts to automatically match the KV cache prefix to avoid re-evaluation. If False, blindly appends tokens to existing context. + mirostat_mode: Mirostat sampling mode (0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0). + mirostat_tau: Target cross-entropy (surprisal) for Mirostat. + mirostat_eta: Learning rate for Mirostat. + xtc_threshold: Minimum probability threshold for XTC token removal. + xtc_probability: Chance for token removal in XTC sampling. + dry_multiplier: DRY (Don't Repeat Yourself) repetition penalty multiplier (0.0 = disabled). + dry_base: DRY repetition penalty base value. + dry_allowed_length: DRY maximum allowed sequence length without penalty. + dry_penalty_last_n: DRY tokens to scan for repetitions (0 = disabled, -1 = context size). + dry_seq_breakers: Array of sequence breakers for DRY sampling. + adaptive_target: Adaptive-p target probability (0.0 to 1.0, negative = disabled). + adaptive_decay: Adaptive-p decay rate (0.0 to 0.99). + use_infill: Activate specialized fill-in-the-middle sampler. + ignore_eos: If True, ignore the End-of-Sequence token. + logit_bias: Dictionary mapping token IDs to their bias values. + logits_processor: List of custom Python callbacks to modify logits in-place. + stopping_criteria: List of custom callbacks to halt generation dynamically. + grammar: Optional BNF-like grammar (GBNF) to constrain sampling syntax. + grammar_lazy: If True, activates grammar constraints only on specific trigger tokens. + seed: RNG seed for sampling. 
Overrides the instance seed. Yields: The generated tokens. @@ -1259,12 +1288,14 @@ def generate( f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr, ) - else: + if reset: # No prefix matched at all. Completely clear the KV cache to prevent context poisoning. self.n_tokens = 0 self._ctx.memory_clear(True) if self.is_hybrid and self._hybrid_cache_mgr is not None: self._hybrid_cache_mgr.clear() + if self.verbose: + print("Llama.generate: Context reset requested or no prefix match. Cleared KV cache.", file=sys.stderr) # Reset mirostat sampling params = LlamaSamplingParams( @@ -1315,6 +1346,7 @@ def generate( seed=seed if seed is not None else self._seed, ) + # Register custom python-level logits processors if provided if logits_processor: def adapter(token_data_array: llama_cpp.llama_token_data_array): if self._logits_all: @@ -1336,6 +1368,7 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): if CommonSamplerType.CUSTOM not in params.samplers: params.samplers.insert(3, CommonSamplerType.CUSTOM) + # Free previous sampling context to prevent memory leaks if getattr(self, "_sampling_ctx", None) is not None: self._sampling_ctx.close() self._sampling_ctx = None @@ -1345,7 +1378,7 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): sample_idx = self.n_tokens + len(tokens) - 1 tokens = list(tokens) - # Eval and sample + # Main evaluation and generation loop try: while True: if len(tokens) > 0: @@ -1376,12 +1409,15 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): else: # Standard evaluation or single-token generation step self.eval(tokens) + + # Sample loop while sample_idx < self.n_tokens: token = self._sampling_ctx.sample(self._ctx, idx=-1) self._sampling_ctx.accept(token, False if grammar is None else True) sample_idx += 1 + # Halt generation if custom stopping criteria are met if stopping_criteria is not None: if self._logits_all: logits_idx = sample_idx - self.n_tokens @@ -1399,13 +1435,17 @@ def 
adapter(token_data_array: llama_cpp.llama_token_data_array): ): return + # Yield the generated token to the caller tokens_or_none = yield token + tokens.clear() tokens.append(token) if tokens_or_none is not None: tokens.extend(tokens_or_none) + # Rollback Check: A previously evaluated token (e.g. from speculative decoding) + # mismatched the newly sampled token. We must rollback the KV cache. if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]: self.n_tokens = sample_idx if self.is_hybrid: @@ -1420,10 +1460,13 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): self._ctx.memory_clear(True) self.n_tokens = 0 else: + if self.verbose: + print(f"Llama.generate: Draft token rejected. Truncating context to {self.n_tokens}.", file=sys.stderr) self._ctx.memory_seq_rm(0, self.n_tokens, -1) break + # Speculative Decoding (Draft Model) logic if self.draft_model is not None: if self.is_hybrid: if self.verbose: @@ -1439,6 +1482,7 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): ] ) finally: + # Ensure the final state is checkpointed for hybrid models when generation finishes or is interrupted if ( self.is_hybrid and self._hybrid_cache_mgr is not None From a6db3f212001152319f6cc5e084f43200ab3af05 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 13 Mar 2026 21:33:47 +0800 Subject: [PATCH 260/518] Update Submodule vendor/llama.cpp 557fe2d..8f974d2 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 557fe2d913..8f974d2392 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 557fe2d9132913eaf08c8abf21b0cff61addb9ac +Subproject commit 8f974d2392da4e6fa422a67050e90f1471d72966 From afd06eaf599d8eab82295f7fc269d225956c8185 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 13 Mar 2026 21:36:35 +0800 Subject: [PATCH 261/518] Update mtmd_cpp API 20260313 --- llama_cpp/mtmd_cpp.py | 12 ++++++++---- 1 file changed, 8 
insertions(+), 4 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 111345706e..3368f848e8 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -252,12 +252,16 @@ def mtmd_support_vision(ctx: mtmd_context_p) -> c_bool: def mtmd_support_audio(ctx: mtmd_context_p) -> c_bool: ... -# // get audio bitrate in Hz, for example 16000 for Whisper +# // get audio sample rate in Hz, for example 16000 for Whisper # // return -1 if audio is not supported -# MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx); +# MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx); @ctypes_function_mtmd( - "mtmd_get_audio_bitrate", [mtmd_context_p_ctypes], c_int) -def mtmd_get_audio_bitrate(ctx: mtmd_context_p) -> c_int: + "mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int) +def mtmd_get_audio_sample_rate(ctx: mtmd_context_p) -> c_int: + """ + get audio sample rate in Hz, for example 16000 for Whisper + return -1 if audio is not supported + """ ... # // mtmd_bitmap From ec2cbeacfe8f704d880f3dea4fcabcbef865095e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 14 Mar 2026 08:25:38 +0800 Subject: [PATCH 262/518] Update Submodule vendor/llama.cpp 8f974d2..463b6a9 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8f974d2392..463b6a963c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8f974d2392da4e6fa422a67050e90f1471d72966 +Subproject commit 463b6a963c2de376e102d878a50d26802f15833c From dd111ebc358589bd9f15f9907ceb26e07021a174 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 14 Mar 2026 08:26:24 +0800 Subject: [PATCH 263/518] fix(mtmd): prevent multimodal image freeze by injecting deterministic media IDs Fixed a critical "image freeze" bug (report by @KLL535) where the model would continuously reuse the first cached image regardless of new inputs. 
- The issue occurred because the C++ `self._mtmd_cpp.mtmd_input_chunk_get_id(chunk)` parser returns an empty ID (`b''`) for placeholder tokens like `<__media__>`, causing all media to fall back to the same magic number (`-314159`). This resulted in false-positive KV cache prefix matches. - Replaced the C++ chunk ID extraction with a Python-side cursor (`media_items_cur`) that generates a deterministic negative ID from `zlib.crc32(real_media_url)` (reduced to 24 bits and offset by -100). - This ensures `longest_token_prefix` correctly identifies and reuses identical images while instantly breaking the cache match when the image content changes. Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index b82b653e7a..652cb15043 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -9,6 +9,7 @@ import random import string import sys +import zlib from contextlib import ExitStack from typing import ( @@ -3161,6 +3162,10 @@ def _create_bitmap_func(idx: int, item: str): current_idx = 0 n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + # Cursor to track the actual media contents (URLs or base64 data) provided by the user + media_items_count = len(media_items) + media_items_cur = 0 + for i in range(n_chunks): chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) if chunk is None: continue @@ -3181,17 +3186,27 @@ def _create_bitmap_func(idx: int, item: str): ]: # Extract media properties chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - chunk_id_bytes = self._mtmd_cpp.mtmd_input_chunk_get_id(chunk) - if chunk_id_bytes: + if media_items_cur < media_items_count: + # The C++ parser only sees identical placeholders (e.g., "<__media__>"). + # We MUST inject the actual media content's identity here.
+ real_media_url = media_items[media_items_cur]["url"] # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) - # Create Negative Reverse Vocabulary ID: -100 to -16,777,316 - # Improved longest_token_prefix search matching performance - media_id = - (abs(hash(chunk_id_bytes.decode('utf-8', errors='ignore'))) % (2**24)) - 100 + # Generate a deterministic, unique negative ID for this specific image/audio. + # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()). + # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with + # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). + # This empowers `longest_token_prefix` to correctly identify and reuse cached images, + # while instantly breaking the match if the image content changes. + media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 + media_items_cur += 1 else: # Magic Negative Number as fallback :) media_id = -314159 + if self.verbose: + print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens:{chunk_n_tokens}, media_id: {media_id}, ") + chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache From bd0b069f8874e9ab7597177eff8c8059cf2174ed Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 14 Mar 2026 09:02:37 +0800 Subject: [PATCH 264/518] perf(mtmd): optimize media_id masking with bitwise AND - Replaced the modulo operation (`% (2**24)`) with a bitwise AND mask (`& 0xFFFFFF`) when calculating the deterministic `media_id` from the CRC32 hash. This is a micro-optimization that leverages faster native CPU bitwise instructions instead of division, resulting in more idiomatic and performant low-level bit masking. - When processing 1-2 images, the difference between % and & is only a few nanoseconds, imperceptible to the user. 
In future video processing, you might need to frantically calculate IDs within a for loop for 100 or even 300 frames of data. In this case, the extremely low CPU overhead of the bitwise operation & 0xFFFFFF ensures that the main thread will not experience any computational blockage at the Python layer when building a virtual ledger of tens of thousands of characters. Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 652cb15043..6c8e2146e6 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3198,7 +3198,8 @@ def _create_bitmap_func(idx: int, item: str): # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). # This empowers `longest_token_prefix` to correctly identify and reuse cached images, # while instantly breaking the match if the image content changes. - media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 + # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 + media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 media_items_cur += 1 else: # Magic Negative Number as fallback :) From b0f00a96d3803863dd7d479f0cb1305f76f741b3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 15 Mar 2026 06:16:38 +0800 Subject: [PATCH 265/518] Update Submodule vendor/llama.cpp 463b6a9..d23355a --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 463b6a963c..d23355afc3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 463b6a963c2de376e102d878a50d26802f15833c +Subproject commit d23355afc319f598d0e588a2d16a4da82e14ff41 From 6bbc8d2306319c67c9f7d0d2d0576496f3587a3c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 15 Mar 2026 13:15:28 +0900 Subject: [PATCH 266/518] Bump version to 0.3.33 Signed-off-by: JamePeng --- 
CHANGELOG.md | 73 +++++++++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4656d87129..75c6c0398b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,79 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.33] Fixing Multimodal Image Freezes, Stabilizing Logits, and Optimized Legacy Cache Logic + +- perf(mtmd): optimize media_id masking with bitwise AND + - Replaced the modulo operation (`% (2**24)`) with a bitwise AND mask (`& 0xFFFFFF`) when calculating the deterministic `media_id` from the CRC32 hash. This is a micro-optimization that leverages faster native CPU bitwise instructions instead of division, resulting in more idiomatic and performant low-level bit masking. + - When processing 1-2 images, the difference between % and & is only a few nanoseconds, imperceptible to the user. In future video processing, you might need to frantically calculate IDs within a for loop for 100 or even 300 frames of data. In this case, the extremely low CPU overhead of the bitwise operation & 0xFFFFFF ensures that the main thread will not experience any computational blockage at the Python layer when building a virtual ledger of tens of thousands of characters. + +- fix(mtmd): prevent multimodal image freeze by injecting deterministic media IDs + - Fixed a critical "image freeze" bug (report by **@KLL535**) where the model would continuously reuse the first cached image regardless of new inputs. + - The issue occurred because the C++ `self._mtmd_cpp.mtmd_input_chunk_get_id(chunk)` parser returns an empty ID (`b''`) for placeholder tokens like `<__media__>`, causing all media to fallback to the same magic number (`-314159`). This resulted in false-positive KV cache prefix matches. 
+ - Replaced the C++ chunk ID extraction with a Python-side media item cursor (`media_items_cur`) that generates a deterministic negative ID from the low 24 bits of `zlib.crc32(real_media_url)`. + - This ensures `longest_token_prefix` correctly identifies and reuses identical images while instantly breaking the cache match when the image content changes. + * Chat Structure: **[System Prompt] + [Image] + [Question]** + - The computational cost should be significantly reduced for the same image but different questions. + +- fix(Llama.generate): add explicit fallback context reset and expand Llama.generate docstrings + - Added a fallback `if reset:` block in `Llama.generate` to ensure the KV cache and hybrid cache manager are explicitly cleared when `reset=True` is passed and no prefix match is found. This prevents potential context poisoning from previous runs. + - Added comprehensive docstrings to the `generate` method for all newly integrated sampler parameters (e.g., XTC, Mirostat, DRY penalties, etc.). + - Added explicit verbose logging for cache resetting, rollback events, and speculative decoding behaviors to improve debuggability. + +- docs(README.md): add sampling parameter guide and strategic project tips to README + - Sampling Documentation: Added a comprehensive guide for `LlamaSamplingParams`. It covers core, advanced (XTC, Dynatemp, Adaptive-P), entropy (Mirostat), and DRY repetition penalty configurations with a clean Python usage example. + - Project Tips: Added a new "Quick tips" section to explicitly communicate the semi-deprecated status of `llama_cpp.server` in favor of the upstream `llama-server`. + - Backend Recommendations: Added practical advice for AMD and Intel GPU users, officially recommending the Vulkan backend for cross-platform stability and faster updates. + +- fix(sampling): prevent memory drift and hallucinations in logits view + - Previously, the numpy view for `logits_ptr` in `LlamaSamplingContext.sample` was only initialized once.
If the underlying C++ buffer was reallocated or shifted (e.g., due to dynamic batch sizes or KV cache shifts), the numpy array would point to stale memory, leading to severe model hallucinations or segfaults. + - Added explicit `_logits_ptr_addr` tracking to monitor the physical C memory address. + - The zero-copy numpy view (`_logits_view`) is now safely recreated on-the-fly whenever the backend memory address changes. + - Added proper initialization and garbage collection for the new tracker in `__init__` and `close`. + +- feat(_ggml): extend ctypes bindings with more ggml constants, enums, and structs + +- fix(chat_handler): fix tools and function calling in MTMDChatHandler.(Issue reported by **@alcoftTAO**) + +- perf(cache): optimize LlamaDiskCache I/O and fix LRU behavior + - Delegated LRU and size limits to native `diskcache` SQLite engine, removing the slow manual eviction loop. + - Added an O(1) early exit in `_find_longest_prefix_key` to prevent unnecessary full-table disk scans. + - Fixed a destructive read bug by replacing `.pop()` with standard access to properly update LRU timestamps. + - Added fast-path empty checks to bypass disk queries entirely when the cache is empty. + +- perf(cache): upgrade LlamaRAMCache to O(1) eviction and set LlamaTrieCache as default +Addressed severe performance bottlenecks in legacy RAM caching components: + - Refactored `LlamaRAMCache` to use an O(1) `_current_size` tracker instead of an O(N) dynamic sum. This eliminates massive CPU spikes and O(N^2) complexity during LRU eviction cycles. + - Added strict OOM safeguards to `LlamaRAMCache`: The current size is explicitly clamped to 0 during evictions, and hard-reset to 0 if the cache empties, preventing catastrophic capacity drift. + - Introduced early-exit O(1) short-circuits in `__getitem__` and `__contains__` to bypass expensive prefix searches when the cache is empty. 
+ - Updated the `LlamaCache` backward-compatibility alias to point to the highly optimized `LlamaTrieCache` instead of the legacy `LlamaRAMCache`. + +- fix(core): disable swa_full for non-SWA models (sync llama.cpp upstream #20291) + - Fallback `context_params.swa_full` to False if `_n_swa == 0` and emit a warning. + - Updated `is_hybrid` validation to use the resolved `self.context_params.swa_full` state. + +- fix(chat_format): fix namespace and variable shadowing of llama modules + - Changed imports to use `llama_cpp_lib` and `llama_core` to avoid namespace collisions. + - Fixed severe variable shadowing where the `llama` module was being overshadowed by the `llama` parameter in function signatures. + - Updated associated type hints and C-API bindings to use the new isolated aliases. + - Corrected `LlamaGrammar` type definitions to point to the `llama_grammar` module. + +- fix(cache): fix namespace shadowing to prevent AttributeError (Issue reported by **@kantan-kanto**) + - Renamed `llama_cpp.llama` import to `llama_core` and `llama_cpp.llama_cpp` to `llama_cpp_lib` to prevent namespace collision. + - Fixed `AttributeError` thrown when accessing `llama_cpp.llama.Llama.longest_token_prefix`. + - Updated all associated type hints and C-API bindings in cache classes to use the new isolated aliases. + +- feat(MTMDChatHandler): support audio inputs and fix interleaved media ordering + - Refactored `CHAT_FORMAT` to use a single loop for `message.content`, preserving the exact chronological order of interleaved text, images, and audio. + - Added template routing for `audio_url`. + - Added template routing for OpenAI's `input_audio` format, properly formatting it as a Data URI. 
+ +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/d23355afc319f598d0e588a2d16a4da82e14ff41](https://github.com/ggml-org/llama.cpp/commit/d23355afc319f598d0e588a2d16a4da82e14ff41) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260313 + +For more information, see: https://github.com/JamePeng/llama-cpp-python/compare/e7e1d48065ba53846f290cfe563c8c839a062ebe...b0f00a96d3803863dd7d479f0cb1305f76f741b3 + ## [0.3.32] Hybrid/Multimodal Model Single-Turn Optimizations & Fix Sampling Seed - perf(hybrid): optimize multimodal single-turn and fix KV clear bug diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 45f9c8f27f..ff142e6bf6 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.32" +__version__ = "0.3.33" From 9c29c274191a2da3ea2a3405cdcf96cbf2596cea Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 15 Mar 2026 14:35:01 +0900 Subject: [PATCH 267/518] Update CMakeLists.txt --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d3dfa00d0..04d3ec1fff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,8 +131,10 @@ if (LLAMA_BUILD) ggml-metal ggml-musa ggml-opencl + ggml-openvino ggml-rpc ggml-sycl + ggml-virtgpu ggml-vulkan ggml-webgpu ggml-zdnn From 35957bd7aa7d21f77a07d838f1308bffa33573ff Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 15 Mar 2026 14:41:59 +0900 Subject: [PATCH 268/518] Update MIT license copyright to collective authorship (2023-2026) Change single-author copyright to "The llama-cpp-python authors" and apply standard multi-line formatting for better readability.
Signed-off-by: JamePeng --- LICENSE.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/LICENSE.md b/LICENSE.md index 3a1d7180d5..1cc9dc9793 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,9 +1,21 @@ MIT License -Copyright (c) 2023 Andrei Betlen +Copyright (c) 2023-2026 The llama-cpp-python authors -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From 90d46dcab450afe1a29d4ed78e1a1e1d99511440 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 15 Mar 2026 20:48:07 +0900 Subject: [PATCH 269/518] feat(core): add verbose debug logging to longest_token_prefix fast paths - Added an optional `verbose` parameter to `Llama.longest_token_prefix` to explicitly log early-exit conditions. This provides crucial visibility into cache-miss behaviors during debugging by outputting the specific reason for a fast exit (e.g., empty sequence vs. mismatched first token) along with the offending sequence lengths or token values. --- llama_cpp/llama.py | 24 +++++++++++++++++++----- llama_cpp/llama_cache.py | 10 ++++++---- llama_cpp/llama_chat_format.py | 4 ++-- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1b6bb9ee1b..eda22164a5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1210,7 +1210,7 @@ def generate( # Check for kv cache prefix match if reset and self.n_tokens > 0: # 1. First, check for a 100% exact match of the entire sequence - full_match_prefix = self.longest_token_prefix(self._input_ids, tokens) + full_match_prefix = self.longest_token_prefix(self._input_ids, tokens, self.verbose) # --- FAST PATH: Zero-latency bypass for Hybrid Single-Turn & Multimodal --- # If the cache is disabled (max_checkpoints <= 0) and we have a 100% match, @@ -1233,7 +1233,7 @@ def generate( else: # By matching against `tokens[:-1]`, we intentionally drop the last token. 
# This forces the engine to re-evaluate the final token to refresh sampling logits. - longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1]) + longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1], self.verbose) if longest_prefix > 0: reset = False @@ -1840,10 +1840,10 @@ def _create_completion( try: cache_item = self.cache[prompt_tokens] cache_prefix_len = Llama.longest_token_prefix( - cache_item.input_ids, prompt_tokens + cache_item.input_ids, prompt_tokens, self.verbose ) eval_prefix_len = Llama.longest_token_prefix( - self._input_ids, prompt_tokens + self._input_ids, prompt_tokens, self.verbose ) if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) @@ -2996,7 +2996,8 @@ def logits_to_logprobs( @staticmethod def longest_token_prefix( current_ids: Union[Sequence[int], npt.NDArray[np.intc]], - new_tokens: Union[Sequence[int], npt.NDArray[np.intc]] + new_tokens: Union[Sequence[int], npt.NDArray[np.intc]], + verbose: bool = False ) -> int: """ Calculates the length of the longest common prefix between two token sequences. @@ -3008,12 +3009,19 @@ def longest_token_prefix( Args: current_ids: The existing token sequence (e.g., KV cache). new_tokens: The new input token sequence. + verbose: If True, prints detailed debug information to stderr. Returns: int: The number of matching tokens from the start. """ # Fast exit for empty sequences to avoid unnecessary processing if len(current_ids) == 0 or len(new_tokens) == 0: + if verbose: + print( + f"Llama.longest_token_prefix [Fast Exit 1]: Empty sequence detected. " + f"len(current_ids)={len(current_ids)}, len(new_tokens)={len(new_tokens)}", + file=sys.stderr + ) return 0 # Determine the comparison range (limited by the shorter sequence) @@ -3022,6 +3030,12 @@ def longest_token_prefix( # Probe inspection: Use Python to quickly compare the first token # If the tokens are different from the beginning, return immediately to avoid any NumPy overhead. 
if current_ids[0] != new_tokens[0]: + if verbose: + print( + f"Llama.longest_token_prefix [Fast Exit 2]: First token mismatch. " + f"current_ids[0]={current_ids[0]} vs new_tokens[0]={new_tokens[0]}", + file=sys.stderr + ) return 0 # Accelerating SIMD for Large Data Volumes diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index da00cbfb05..dc1dd20d7c 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -59,12 +59,13 @@ class LlamaDiskCache(BaseLlamaCache): """ def __init__( - self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) + self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30), verbose: bool = False ): super().__init__(capacity_bytes) self.cache_dir = cache_dir # Native SQLite size limit and LRU eviction self.cache = diskcache.Cache(cache_dir, size_limit=capacity_bytes) + self.verbose = verbose @property def cache_size(self): @@ -83,7 +84,7 @@ def _find_longest_prefix_key( min_key: Optional[Tuple[int, ...]] = None target_len = len(key) for k in self.cache.iterkeys(): # type: ignore - prefix_len = llama_core.Llama.longest_token_prefix(k, key) + prefix_len = llama_core.Llama.longest_token_prefix(k, key, self.verbose) if prefix_len > min_len: min_len = prefix_len min_key = k # type: ignore @@ -123,13 +124,14 @@ class LlamaRAMCache(BaseLlamaCache): Maintains an LRU eviction policy with O(1) size tracking. 
""" - def __init__(self, capacity_bytes: int = (2 << 30)): + def __init__(self, capacity_bytes: int = (2 << 30), verbose: bool = False): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes self.cache_state: OrderedDict[ Tuple[int, ...], "llama_core.LlamaState" ] = OrderedDict() self._current_size = 0 + self.verbose = verbose @property def cache_size(self): @@ -142,7 +144,7 @@ def _find_longest_prefix_key( min_len = 0 min_key = None keys = ( - (k, llama_core.Llama.longest_token_prefix(k, key)) + (k, llama_core.Llama.longest_token_prefix(k, key, self.verbose)) for k in self.cache_state.keys() ) for k, prefix_len in keys: diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 6c8e2146e6..4b0d268b0b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3206,7 +3206,7 @@ def _create_bitmap_func(idx: int, item: str): media_id = -314159 if self.verbose: - print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens:{chunk_n_tokens}, media_id: {media_id}, ") + print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ") chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) @@ -3303,7 +3303,7 @@ def __call__( # 3. KV Cache Synchronization & State Rollback # Compares the virtual ledger with physical history to prevent Cache Poisoning. 
current_history = llama.input_ids[:llama.n_tokens].tolist() - longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids) + longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) if longest_prefix < llama.n_tokens: if llama.is_hybrid and llama._hybrid_cache_mgr is not None: From ed5e212223525429a6a4cc7484b6a6b7bbb95876 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 16 Mar 2026 06:19:12 +0800 Subject: [PATCH 270/518] Update Submodule vendor/llama.cpp d23355a..312cf03 Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 6 ++---- vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e6dc1567d0..3e03e09dce 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1913,7 +1913,6 @@ def llama_model_quantize( # // Load a LoRA adapter from file # // The adapter is valid as long as the associated model is not freed -# // All adapters must be loaded before context creation # LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( # struct llama_model * model, # const char * path_lora); @@ -2016,9 +2015,8 @@ def llama_adapter_meta_val_str_by_index( # // Manually free a LoRA adapter -# // Note: loaded adapters will be free when the associated model is deleted -# LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter), -# "adapters are now freed together with the associated model"); +# // NOTE: loaded adapters that are not manually freed will be freed when the associated model is deleted +# LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); @ctypes_function( "llama_adapter_lora_free", [llama_adapter_lora_p_ctypes], diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d23355afc3..312cf03328 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d23355afc319f598d0e588a2d16a4da82e14ff41 +Subproject commit 
312cf03328496df9c56fe843952cf02a8964c59a From bdc2d7c7da947e21c1c2b4f2fad5269da0e08ea7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 20 Mar 2026 07:48:50 +0800 Subject: [PATCH 271/518] Update Submodule vendor/llama.cpp 312cf03..fb78ad2 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 312cf03328..fb78ad29bb 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 312cf03328496df9c56fe843952cf02a8964c59a +Subproject commit fb78ad29bbe7ae00619b2ce31b0a71e95fdbfc43 From d2318a4c04ab9efbfd46cf0915983d217f02ae6c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 20 Mar 2026 22:41:31 +0800 Subject: [PATCH 272/518] chore: enhance hybrid cache logging and document M-RoPE token usage - Added explanatory comments detailing why n_tokens is used instead of chunk_n_pos for M-RoPE models (to prevent the system from skipping evaluation). - Added verbose logging for hybrid cache clearance scenarios (when checkpoints are missing, restore fails, or max_checkpoints is 0). Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 4b0d268b0b..ed5261b05b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3185,6 +3185,10 @@ def _create_bitmap_func(idx: int, item: str): self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO ]: # Extract media properties + # Note(JamePeng): + # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). + # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample. 
+ # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) if media_items_cur < media_items_count: @@ -3318,10 +3322,14 @@ def __call__( if self.verbose: print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) else: + if self.verbose: + print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr) llama._hybrid_cache_mgr.clear() llama._ctx.memory_clear(True) llama.n_tokens = 0 else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr) llama._hybrid_cache_mgr.clear() llama._ctx.memory_clear(True) llama.n_tokens = 0 From 801755ef29276f3567d4c1a0dad6015b3cc31a79 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 22 Mar 2026 10:51:57 +0800 Subject: [PATCH 273/518] Update Submodule vendor/llama.cpp fb78ad2..3306dba --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index fb78ad29bb..3306dbaef7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit fb78ad29bbe7ae00619b2ce31b0a71e95fdbfc43 +Subproject commit 3306dbaef7553da03971c617e48cd27d00328bb4 From 87157f36002fb04f4233f9f5eb46a29f00ea63d7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 22 Mar 2026 11:30:16 +0800 Subject: [PATCH 274/518] refactor: remove legacy static LoRA initialization - Removed `lora_base`, `lora_path`, and `lora_scale` from `Llama` init parameters and state. - Dropped outdated `llama_adapter_lora_init` and `llama_set_adapters_lora` bindings in the constructor. - Restored default `use_mmap` behavior (no longer forced to False when LoRA is present). 
This removes the global context pollution and paves the way for the new dynamic, per-request LoRA routing architecture. Signed-off-by: JamePeng --- llama_cpp/llama.py | 44 +----------------------------------- llama_cpp/server/model.py | 3 --- llama_cpp/server/settings.py | 9 -------- 3 files changed, 1 insertion(+), 55 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index eda22164a5..8e1bc17608 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -116,10 +116,6 @@ def __init__( checkpoint_interval: int = 4096, # Sampling Params last_n_tokens_size: int = 64, - # LoRA Params - lora_base: Optional[str] = None, - lora_scale: float = 1.0, - lora_path: Optional[str] = None, # Backend Params numa: Union[bool, int] = False, # Chat Format Params @@ -206,8 +202,6 @@ def __init__( ctx_checkpoints: max number of context checkpoints to create per slot (default: 16)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293) checkpoint_interval: Hybrid model checkpoint token intervals, and archiving of text with interval sizes along the way. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. - lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. - lora_path: Path to a LoRA file to apply to the model. numa: numa policy chat_format: String specifying the chat format to use when calling create_chat_completion. chat_handler: Optional chat handler to use when calling create_chat_completion. 
@@ -270,7 +264,7 @@ def __init__( ) # keep a reference to the array so it is not gc'd self.model_params.tensor_split = self._c_tensor_split self.model_params.vocab_only = vocab_only - self.model_params.use_mmap = use_mmap if lora_path is None else False + self.model_params.use_mmap = use_mmap self.model_params.use_direct_io = use_direct_io self.model_params.use_mlock = use_mlock self.model_params.check_tensors = check_tensors @@ -416,10 +410,6 @@ def __init__( self.cache: Optional[BaseLlamaCache] = None - self.lora_base = lora_base - self.lora_scale = lora_scale - self.lora_path = lora_path - self.spm_infill = spm_infill if not os.path.exists(model_path): @@ -514,34 +504,6 @@ def __init__( ) ) - self._lora_adapter: Optional[llama_cpp.llama_adapter_lora_p] = None - - if self.lora_path: - self._lora_adapter = llama_cpp.llama_adapter_lora_init( - self._model.model, - self.lora_path.encode("utf-8"), - ) - if self._lora_adapter is None: - raise RuntimeError( - f"Failed to initialize LoRA adapter from lora path: {self.lora_path}" - ) - - def free_lora_adapter(): - if self._lora_adapter is None: - return - llama_cpp.llama_adapter_lora_free(self._lora_adapter) - self._lora_adapter = None - - self._stack.callback(free_lora_adapter) - - # Todo(JamePeng): The current LoRa loading logic is outdated and needs to be refactored. 
- if llama_cpp.llama_set_adapters_lora( - self._ctx.ctx, self._lora_adapter, self.lora_scale - ): - raise RuntimeError( - f"Failed to set LoRA adapter from lora path: {self.lora_path}" - ) - if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) @@ -2802,10 +2764,6 @@ def __getstate__(self): # Sampling Params no_perf=self.context_params.no_perf, last_n_tokens_size=self.last_n_tokens_size, - # LoRA Params - lora_base=self.lora_base, - lora_scale=self.lora_scale, - lora_path=self.lora_path, # Backend Params numa=self.numa, # Chat Format Params diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 62d491a76f..37c5195687 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -306,9 +306,6 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: kv_unified=settings.kv_unified, # Sampling Params last_n_tokens_size=settings.last_n_tokens_size, - # LoRA Params - lora_base=settings.lora_base, - lora_path=settings.lora_path, # Backend Params numa=settings.numa, # Chat Format Params diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index bb16527d8b..db96a41705 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -153,15 +153,6 @@ class ModelSettings(BaseSettings): ge=0, description="Last n tokens to keep for repeat penalty calculation.", ) - # LoRA Params - lora_base: Optional[str] = Field( - default=None, - description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.", - ) - lora_path: Optional[str] = Field( - default=None, - description="Path to a LoRA file to apply to the model.", - ) # Backend Params numa: Union[bool, int] = Field( default=False, From e42cf392aebc6a444ac11bc265cac9197e5b8856 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 22 Mar 2026 15:07:28 +0800 Subject: [PATCH 275/518] fix(types): correct `llama_set_adapters_lora` LoRA adapter ctypes 
signature and use pointer for scales - change scale: float to float* (POINTER(c_float)) - make adapters and scales optional arrays to match C API Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3e03e09dce..b4cf4f4d6f 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1911,6 +1911,10 @@ def llama_model_quantize( ... +# // +# // Adapters +# // + # // Load a LoRA adapter from file # // The adapter is valid as long as the associated model is not freed # LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( @@ -2058,14 +2062,23 @@ def llama_adapter_get_alora_invocation_tokens(adapter: llama_adapter_lora_p, /) # float * scales); @ctypes_function( "llama_set_adapters_lora", - [llama_context_p_ctypes, ctypes.POINTER(llama_adapter_lora_p_ctypes), ctypes.c_size_t, ctypes.c_float], + [llama_context_p_ctypes, + ctypes.POINTER(llama_adapter_lora_p_ctypes), + ctypes.c_size_t, + ctypes.POINTER(ctypes.c_float) + ], ctypes.c_int32, ) def llama_set_adapters_lora( - ctx: llama_context_p, adapters: CtypesArray[llama_adapter_lora_p], n_adapters: ctypes.c_size_t, scale: float, / + ctx: llama_context_p, + adapters: Optional[CtypesArray[llama_adapter_lora_p]], + n_adapters: ctypes.c_size_t, + scales: Optional[CtypesArray[ctypes.c_float]], / ) -> int: - """Set LoRa adapters on the context. - Will only modify if the adapters currently in context are different.""" + """ + Set LoRa adapters on the context. + Will only modify if the adapters currently in context are different. + """ ... 
From 65f0d825ae97cda38637bcf1ea6a97e34289ecbf Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 23 Mar 2026 20:59:59 +0800 Subject: [PATCH 276/518] Update Submodule vendor/llama.cpp 3306dba..f93c09e --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 3306dbaef7..f93c09e267 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 3306dbaef7553da03971c617e48cd27d00328bb4 +Subproject commit f93c09e2673090b59802f2552848a1cac70598b0 From ceb544cffe8c2d6cb780b7a748c7e93c90c5f616 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 24 Mar 2026 07:02:15 +0800 Subject: [PATCH 277/518] fix(types): correct llama_adapter_get_alora_invocation_tokens ctypes signature and use pointer for llama_token Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index b4cf4f4d6f..6218f52cb3 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2045,7 +2045,7 @@ def llama_adapter_get_alora_n_invocation_tokens(adapter: llama_adapter_lora_p, / @ctypes_function( "llama_adapter_get_alora_invocation_tokens", [llama_adapter_lora_p_ctypes], - ctypes.c_uint64, + ctypes.POINTER(llama_token), ) def llama_adapter_get_alora_invocation_tokens(adapter: llama_adapter_lora_p, /) -> CtypesPointer[llama_token]: ... 
From aa41462e68fc6d741db5e481c67586d4c04c963d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 25 Mar 2026 07:28:58 +0800 Subject: [PATCH 278/518] Update Submodule vendor/llama.cpp f93c09e..914eb5f --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f93c09e267..914eb5ff0c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f93c09e2673090b59802f2552848a1cac70598b0 +Subproject commit 914eb5ff0c74c88c7ef8aec115878d8f64c81e56 From 6898d8e4e93bb733eaa4531214504286b70e3864 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 25 Mar 2026 20:18:19 +0800 Subject: [PATCH 279/518] Sync llama: fix llama-model-saver (#20503) --- llama_cpp/llama_cpp.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 6218f52cb3..610a3f0274 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1312,6 +1312,24 @@ def llama_model_load_from_file( ... +# // Load a model from an open FILE pointer +# LLAMA_API struct llama_model * llama_model_load_from_file_ptr( +# FILE * file, +# struct llama_model_params params); +@ctypes_function( + "llama_model_load_from_file_ptr", + [ctypes.c_void_p, llama_model_params], + llama_model_p_ctypes, +) +def llama_model_load_from_file_ptr( + file: ctypes.c_void_p, params: llama_model_params, / +) -> Optional[llama_model_p]: + """ + Load a model from an open FILE pointer + """ + ... 
+ + # // Load a model from multiple splits (support custom naming scheme) # // The paths must be in the correct order # LLAMA_API struct llama_model * llama_model_load_from_splits( From 0d379eb43bca73961ea3d7c4080a38ff3db358e4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 26 Mar 2026 07:01:30 +0800 Subject: [PATCH 280/518] feat(internals): implement dynamic LoRA routing and Control Vector support This commit overhauls the adapter management architecture in `_internals.py` to support dynamic, per-request LoRA routing and Control Vector (CVec) injection with strict C++ memory safety. Key changes: - Secure Memory Management: Introduced the `LlamaLoraAdapter` wrapper class to securely handle the lifecycle of `llama_adapter_lora_p` pointers, preventing VRAM leaks. Also added support for extracting ALoRA invocation tokens. - Model-Level Registry: Added `_lora_registry` to `LlamaModel` with robust methods (`load_lora`, `unload_lora`, `unload_all_loras`) to preload adapters into VRAM. Integrated cleanup into the model's `ExitStack` and `close()` methods for deterministic memory release. - Context-Level Dynamic Routing: Implemented `apply_loras` and `clear_loras` in `LlamaContext` to dynamically swap compute graph weights using contiguous C arrays, enabling zero-delay multi-tenant LoRA switching. - Control Vector Integration: Added `apply_cvec` and `clear_cvec` to `LlamaContext` for representation engineering. Includes strict C++ memory layout validation (enforcing buffer zero-padding up to `n_embd * il_end`) to prevent silent write failures in the GGML backend. - Observability & Docs: Added verbose logging for adapter/CVec application and expanded docstrings for context utility methods (e.g., threading, causal attention, warmup). 
Signed-off-by: JamePeng --- llama_cpp/_internals.py | 258 +++++++++++++++++++++++++++++++++++++++- llama_cpp/llama_cpp.py | 55 ++++++++- 2 files changed, 311 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 007cc0b82d..b9cb46e078 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -75,8 +75,10 @@ def __init__( self.model = model self.vocab = vocab + self._lora_registry: Dict[str, LlamaLoraAdapter] = {} + def close(self): - """Manually free LlamaModel and Vocab resources.""" + """Manually free LlamaModel and Vocab/Lora resources.""" if getattr(self, "model", None) is not None: try: llama_cpp.llama_model_free(self.model) @@ -85,6 +87,10 @@ def close(self): self.model = None self.vocab = None + if hasattr(self, "_lora_registry") and self._lora_registry: + self.unload_all_loras() + self._lora_registry = None + if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): self._exit_stack.close() self._exit_stack = None @@ -311,8 +317,62 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: return bytes(buffer[:n_chars]) + # Lora + + def load_lora(self, name: str, path: str): + """Loads a LoRA adapter into VRAM without applying it yet.""" + # Skip if it's already loaded + if name in self._lora_registry: + return + + adapter = LlamaLoraAdapter(self.model, path) + self._lora_registry[name] = adapter + + self._exit_stack.callback(adapter.free) + + if self.verbose: + print(f"Loaded LoRA '{name}' into memory.") + + def unload_lora(self, name: str): + """Actively unloads a specific LoRA to free up VRAM.""" + if name in self._lora_registry: + adapter = self._lora_registry.pop(name) + adapter.free() + if self.verbose: + print(f"Unloaded LoRA '{name}' and freed memory.") + + @property + def loaded_lora_count(self) -> int: + """ + Returns the total number of LoRA adapters currently loaded in VRAM. 
+ """ + return len(self._lora_registry) + + def list_loras(self) -> List[str]: + """ + Returns a list of all registered LoRA names. + """ + return list(self._lora_registry.keys()) + + def unload_all_loras(self): + """ + Iterates through the registry and forces VRAM release for all loaded LoRAs. + """ + if not self._lora_registry: + return + + # Cast keys to a list first to avoid RuntimeError: + # 'dictionary changed size during iteration' when pop() is called inside unload_lora. + loaded_names = list(self._lora_registry.keys()) + + for name in loaded_names: + self.unload_lora(name) + + if self.verbose: + print(f"Successfully unloaded all {len(loaded_names)} LoRA adapters and cleared the registry.") # Extra + def metadata(self) -> Dict[str, str]: metadata: Dict[str, str] = {} # Pre-allocate a 16KB buffer. This is large enough to handle almost all @@ -356,6 +416,61 @@ def default_params(): return llama_cpp.llama_model_default_params() +class LlamaLoraAdapter: + """Wrapper for llama_adapter_lora_p to safely manage C++ memory lifecycle.""" + def __init__(self, model: llama_cpp.llama_model_p, path: str): + """ + Initializes and loads the LoRA adapter into memory. + + Args: + model: The pointer to the base Llama model. + path: The file path to the LoRA adapter (.gguf). + """ + self.path = path + # Load the LoRA adapter from file into memory via llama.cpp API + # Note: The path string must be encoded to UTF-8 bytes for ctypes compatibility. + self.adapter = llama_cpp.llama_adapter_lora_init( + model, + path.encode("utf-8") + ) + if not self.adapter: + raise RuntimeError(f"Failed to load LoRA from {path}") + + def free(self): + """ + Explicitly frees the underlying C++ memory allocated for the LoRA adapter. + Should be called when the adapter is actively unloaded to instantly release VRAM. 
+ """ + # Check if the adapter exists and hasn't been freed yet + if getattr(self, "adapter", None) is not None: + llama_cpp.llama_adapter_lora_free(self.adapter) + self.adapter = None + self.path = None + + def __del__(self): + self.free() + + @property + def alora_invocation_tokens(self) -> List[int]: + """ + Retrieves the list of invocation (trigger) tokens if this adapter is an ALoRA (Activation LoRA). + Returns an empty list for standard LoRA adapters. + """ + if getattr(self, "adapter", None) is None: + return [] + + # 1. Query the C++ backend for the exact number of trigger tokens + n_tokens = llama_cpp.llama_adapter_get_alora_n_invocation_tokens(self.adapter) + if n_tokens == 0: + return [] + + # 2. Retrieve the underlying C pointer to the contiguous array of tokens + tokens_ptr = llama_cpp.llama_adapter_get_alora_invocation_tokens(self.adapter) + + # 3. Safely iterate through the C memory block and convert it into a native Python list + return [tokens_ptr[i] for i in range(n_tokens)] + + class LlamaContext: """Intermediate Python wrapper for a llama.cpp llama_context. 
NOTE: For stability it's recommended you use the Llama class instead.""" @@ -577,21 +692,43 @@ def decode(self, batch: 'LlamaBatch') -> int: raise RuntimeError(f"llama_decode failed (code {return_code}): {msg}") def set_n_threads(self, n_threads: int, n_threads_batch: int): + """ + Set the number of threads used for decoding + + Args: + n_threads: the number of threads used for generation (single token) + n_threads_batch: the number of threads used for prompt and batch processing (multiple tokens) + """ llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) def n_threads(self) -> int: + """Get the number of threads used for generation of a single token.""" return llama_cpp.llama_n_threads(self.ctx) def n_threads_batch(self) -> int: + """Get the number of threads used for prompt and batch processing (multiple tokens).""" return llama_cpp.llama_n_threads_batch(self.ctx) def set_causal_attn(self, causal_attn: bool): + """ + Set whether to use causal attention or not + If set to true, the model will only attend to the past tokens + """ llama_cpp.llama_set_causal_attn(self.ctx, causal_attn) def set_warmup(self, warmup: bool): + """ + Set whether the model is in warmup mode or not + If true, all model tensors are activated during llama_decode() to load and cache their weights.
+ """ llama_cpp.llama_set_warmup(self.ctx, warmup) def synchronize(self): + """ + Wait until all computations are finished + This is automatically done when using one of the functions below to obtain the computation results + and is not necessary to call it explicitly in most cases + """ llama_cpp.llama_synchronize(self.ctx) def get_logits(self): @@ -619,9 +756,128 @@ def print_timings(self): llama_cpp.llama_perf_context_print(self.ctx) def print_memory_breakdown(self): + """print a breakdown of per-device memory use via LLAMA_LOG""" llama_cpp.llama_memory_breakdown_print(self.ctx) + # LoRA / ALoRA Dynamic Routing Methods + + def clear_loras(self): + """ + Clears all currently applied LoRA weights from the context. + Restores the computational graph to the base model state. + """ + llama_cpp.llama_set_adapters_lora(self.ctx, None, 0, None) + + def apply_loras(self, active_loras: List[Tuple["LlamaLoraAdapter", float]]): + """ + Dynamically mounts a combination of LoRAs and their scales to the current context. + This must be called immediately before evaluating/decoding the computation graph. + + Args: + active_loras: A list of tuples containing (LlamaLoraAdapter instance, scale float). + """ + # If no LoRAs are requested, ensure the context is wiped clean to prevent contamination + if not active_loras: + self.clear_loras() + return + + n_adapters = len(active_loras) + + # 1. Dynamically construct contiguous C-array types required by the C++ backend + AdapterArrayType = llama_cpp.llama_adapter_lora_p_ctypes * n_adapters + ScaleArrayType = ctypes.c_float * n_adapters + + # 2. Instantiate the arrays in memory + c_adapters = AdapterArrayType() + c_scales = ScaleArrayType() + + # 3. Populate the C-arrays with the underlying adapter pointers and float scales + for i, (adapter_obj, scale) in enumerate(active_loras): + c_adapters[i] = adapter_obj.adapter + c_scales[i] = scale + + # 4. 
Atomically apply the requested adapters to the computation graph + ret = llama_cpp.llama_set_adapters_lora( + self.ctx, + c_adapters, + n_adapters, + c_scales + ) + + if ret != 0: + raise RuntimeError("LlamaContext(apply_loras): Failed to set LoRA adapters dynamically.") + + if self.verbose: + print(f"LlamaContext(apply_loras): Successfully applied {n_adapters} LoRA adapter(s) to the compute graph.") + + # Control Vector (CVec) Methods + + def clear_cvec(self): + """ + Clears the currently loaded control vector from the context. + Passing NULL (None) and zeros safely resets the graph. + """ + llama_cpp.llama_set_adapter_cvec(self.ctx, None, 0, 0, 0, 0) + + def apply_cvec(self, data: List[float], n_embd: int, il_start: int, il_end: int): + """ + Dynamically applies a Control Vector (CVec) to the specified layer range. + + Args: + data: Flattened 1D list of floats. + [CRITICAL_LAYOUT_RULE]: Based on llama.cpp source, the data buffer + is strictly mapped starting from Layer 1. Even if il_start > 1, + the `data` array must contain zero-padding for the skipped early layers. + Total length MUST be >= n_embd * il_end. + n_embd: The embedding dimension of the model. + il_start: The starting layer to apply the vector (inclusive, 1-indexed). + il_end: The ending layer to apply the vector (inclusive). + """ + if not data: + self.clear_cvec() + return + + length = len(data) + + # Strictly validate length based on C++ buffer mapping rules + # The C++ backend uses offset: off = n_embd * (il - 1). + # To successfully write up to il_end, the buffer length must be at least n_embd * il_end. + minimum_required_len = n_embd * il_end + if length < minimum_required_len: + raise ValueError( + f"LlamaContext(apply_cvec): " + f"[Memory Layout Error] Control vector data length ({length}) is too short. " + f"llama.cpp requires the buffer to map continuously from Layer 1. " + f"To apply up to layer {il_end}, length must be at least {minimum_required_len}." + ) + + # 1. 
Convert to C Array + CFloatArrayType = ctypes.c_float * length + c_data = CFloatArrayType(*data) + + # 2. Inject into graph + ret = llama_cpp.llama_set_adapter_cvec( + self.ctx, + c_data, + length, + n_embd, + il_start, + il_end + ) + + # 3. Handle specific C++ boolean false (converted to -1) + if ret != 0: + raise RuntimeError( + f"LlamaContext(apply_cvec): " + f"C++ backend rejected the Control Vector. " + f"Usually indicates n_embd ({n_embd}) does not match the model's actual embedding dimension." + ) + + if self.verbose: + print(f"LlamaContext(apply_cvec): Applied Control Vector to layers {il_start}-{il_end} (Buffer size matched C++ layout).") + # Utility functions + @staticmethod def default_params(): """Get the default llama_context_params.""" diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 610a3f0274..43527fa7c6 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1115,10 +1115,63 @@ class llama_chat_message(ctypes.Structure): # // lora adapter -# struct llama_adapter_lora; +# struct llama_adapter_lora { +# llama_model * model = nullptr; + +# // map tensor name to lora_a_b +# std::unordered_map ab_map; + +# std::vector ctxs; +# std::vector bufs; + +# float alpha; + +# // gguf metadata +# std::unordered_map gguf_kv; + +# // activated lora (aLoRA) +# std::vector alora_invocation_tokens; + +# explicit llama_adapter_lora(llama_model * model) : model(model) {} +# ~llama_adapter_lora() = default; + +# llama_adapter_lora_weight * get_weight(ggml_tensor * w); + +# uint32_t get_n_nodes() const { +# return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat +# } +# }; llama_adapter_lora_p = ctypes.c_void_p llama_adapter_lora_p_ctypes = ctypes.POINTER(ctypes.c_void_p) +# // llama_adapter_cvec +# struct llama_adapter_cvec { +# ggml_tensor * tensor_for(int il) const; + +# ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const; + +# bool apply( +# const llama_model & model, +# const float * data, +# size_t len, +# 
int32_t n_embd, +# int32_t il_start, +# int32_t il_end); + +# private: +# bool init(const llama_model & model); + +# int32_t layer_start = -1; +# int32_t layer_end = -1; + +# std::vector ctxs; +# std::vector bufs; + +# std::vector tensors; // per layer +# }; +llama_adapter_cvec_p = ctypes.c_void_p +llama_adapter_cvec_p_ctypes = ctypes.POINTER(ctypes.c_void_p) + # // Helpers for getting default parameters # LLAMA_API struct llama_model_params llama_model_default_params(void); From eaec2c1a551b5d6405c0cae8f04c77af9caadf68 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 26 Mar 2026 21:33:11 +0800 Subject: [PATCH 281/518] Update Submodule vendor/llama.cpp 914eb5f..59d8402 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 914eb5ff0c..59d840209a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 914eb5ff0c74c88c7ef8aec115878d8f64c81e56 +Subproject commit 59d840209a5195c2f6e2e81b5f8339a0637b59d9 From 82ef99567c5f0a038522b41e4e3c17ac0e28bd6e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 28 Mar 2026 07:03:06 +0800 Subject: [PATCH 282/518] perf(context): debounce loras adapter clearing to prevent compute graph reallocation overhead - This commit introduces a state-tracking mechanism (`_loras_applied` and `_cvec_applied` boolean flags) inside `LlamaContext` to short-circuit redundant calls to `clear_loras` and `clear_cvec`. - Previously, unconditionally calling the clear functions during `eval()` would invoke the C++ backend with NULL pointers. This inadvertently triggered `sched_need_reserve = true` in `llama-context.cpp`, forcing an expensive compute graph reallocation on every single token generation even when no adapters were active, causing severe latency and performance degradation. - By tracking whether adapters are actually mounted, the Python layer now gracefully bypasses the C++ API calls if the context is already clean. 
This restores maximum inference speed for standard generation while maintaining absolute safety against weight contamination in multi-tenant/multi-LoRA scenarios. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b9cb46e078..20648cf74d 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -495,6 +495,9 @@ def __init__( self.ctx = ctx + self._loras_applied: bool = False + self._cvec_applied: bool = False + def close(self): """Manually free LlamaContext resources.""" if getattr(self, "ctx", None) is not None: @@ -766,7 +769,10 @@ def clear_loras(self): Clears all currently applied LoRA weights from the context. Restores the computational graph to the base model state. """ + if not self._loras_applied: + return llama_cpp.llama_set_adapters_lora(self.ctx, None, 0, None) + self._loras_applied = False def apply_loras(self, active_loras: List[Tuple["LlamaLoraAdapter", float]]): """ @@ -807,6 +813,8 @@ def apply_loras(self, active_loras: List[Tuple["LlamaLoraAdapter", float]]): if ret != 0: raise RuntimeError("LlamaContext(apply_loras): Failed to set LoRA adapters dynamically.") + self._loras_applied = True + if self.verbose: print(f"LlamaContext(apply_loras): Successfully applied {n_adapters} LoRA adapter(s) to the compute graph.") @@ -817,7 +825,10 @@ def clear_cvec(self): Clears the currently loaded control vector from the context. Passing NULL (None) and zeros safely resets the graph. """ + if not self._cvec_applied: + return llama_cpp.llama_set_adapter_cvec(self.ctx, None, 0, 0, 0, 0) + self._cvec_applied = False def apply_cvec(self, data: List[float], n_embd: int, il_start: int, il_end: int): """ @@ -873,6 +884,8 @@ def apply_cvec(self, data: List[float], n_embd: int, il_start: int, il_end: int) f"Usually indicates n_embd ({n_embd}) does not match the model's actual embedding dimension." 
) + self._cvec_applied = True + if self.verbose: print(f"LlamaContext(apply_cvec): Applied Control Vector to layers {il_start}-{il_end} (Buffer size matched C++ layout).") From 57bfdd88d9ef6724e0117aa2b6a122d9fccc7757 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 28 Mar 2026 10:22:08 +0800 Subject: [PATCH 283/518] feat(LoRA): implement JIT dynamic LoRA routing and Control Vector injection - This commit introduces a paradigm shift in how adapters are managed by exposing dynamic, per-request LoRA and Control Vector routing through the high-level `Llama` API and OpenAI-compatible endpoints (`create_chat_completion`, `create_completion`, etc.). - Previously, LoRA adapters were statically bound to the model during initialization (`lora_path` inside `__init__`), strictly limiting a loaded model instance to a single persona/task (Single-Tenant). Switching personas required reloading the entire model or duplicating it in VRAM. - With this update: - Users can preload multiple LoRAs into VRAM via `load_lora()`. - Adapters can be dynamically mounted and unmounted at runtime using the `active_loras` and `control_vector` arguments. - The `eval` method now performs Just-In-Time (JIT) weight mounting right before the compute graph executes, followed by guaranteed state wipeout (Debouncing). - This unlocks zero-latency "Multi-Tenant" serving, allowing a single base model instance to concurrently serve multiple users with entirely different LoRA personas (e.g., Role A vs. Role B) without VRAM duplication or model reloading overhead. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 120 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8e1bc17608..a3e359805e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -688,6 +688,29 @@ def eval_logits(self) -> Deque[List[float]]: maxlen=self._n_ctx if self._logits_all else 1, ) + # LoRA / Adapter Management API + + def load_lora(self, name: str, path: str): + """Loads a LoRA adapter into VRAM without applying it yet.""" + self._model.load_lora(name, path) + + def unload_lora(self, name: str): + """Actively unloads a specific LoRA to free up VRAM.""" + self._model.unload_lora(name) + + @property + def loaded_lora_count(self) -> int: + """Returns the total number of LoRA adapters currently loaded in VRAM.""" + return self._model.loaded_lora_count + + def list_loras(self) -> List[str]: + """Returns a list of all registered LoRA names.""" + return self._model.list_loras() + + def unload_all_loras(self): + """Iterates through the registry and forces VRAM release for all loaded LoRAs.""" + self._model.unload_all_loras() + def tokenize( self, text: bytes, add_bos: bool = True, special: bool = False ) -> List[int]: @@ -746,14 +769,17 @@ def reset(self): """Reset the model state.""" self.n_tokens = 0 - def eval(self, tokens: Sequence[int]): + def eval( + self, + tokens: Sequence[int], + active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, + control_vector: Optional[Dict[str, Any]] = None, + ): """Evaluate a list of tokens. Args: tokens: The list of tokens to evaluate. 
""" - if len(tokens) == 0: - return n_eval = len(tokens) if n_eval == 0: return @@ -852,6 +878,38 @@ def eval(self, tokens: Sequence[int]): logits_array=logits_array ) + # JIT Dynamic LoRAs Weights Mounting + + # Dynamic LoRA Routing + if active_loras is not None: + adapters_to_apply = [] + for lora in active_loras: + name = lora.get("name") + scale = float(lora.get("scale", 1.0)) + adapter_obj = getattr(self._model, "_lora_registry", {}).get(name) + if adapter_obj: + adapters_to_apply.append((adapter_obj, scale)) + elif self.verbose: + print(f"Llama.eval: Warning! LoRA '{name}' not found in registry. Skipping.", file=sys.stderr) + + self._ctx.apply_loras(adapters_to_apply) + else: + # Crucial Fallback: Wipe the graph clean if no LoRAs are requested. + # This guarantees zero weight contamination between different users/slots in a multiplexed environment. + self._ctx.clear_loras() + + # Dynamic Control Vector (CVec) Injection + if control_vector is not None: + data = control_vector.get("data", []) + il_start = control_vector.get("layer_start", 1) + il_end = control_vector.get("layer_end", self.n_layer()) + n_embd = self.n_embd() + + self._ctx.apply_cvec(data, n_embd, il_start, il_end) + else: + # Ensure the control vector is cleared for a clean state + self._ctx.clear_cvec() + # Dynamic Batch Downgrade: Attempt to decode, reduce batch size if KV cache is fragmented current_batch_size = n_chunk success = False @@ -1119,6 +1177,8 @@ def generate( grammar: Optional[LlamaGrammar] = None, grammar_lazy: bool = False, seed: Optional[int] = None, + active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, + control_vector: Optional[Dict[str, Any]] = None, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. @@ -1164,6 +1224,14 @@ def generate( grammar: Optional BNF-like grammar (GBNF) to constrain sampling syntax. grammar_lazy: If True, activates grammar constraints only on specific trigger tokens. 
seed: RNG seed for sampling. Overrides the instance seed. + active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. + Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) + and an optional "scale" key (float, defaults to 1.0). + Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`. + control_vector: A dictionary containing Control Vector (CVec) data for representation engineering. + Must contain a "data" key with a flattened 1D list of floats. + Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count). + Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers. Yields: The generated tokens. @@ -1357,7 +1425,7 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): last_token = [tokens[-1]] # 1. Evaluate up to N-1 - self.eval(body_tokens) + self.eval(body_tokens, active_loras=active_loras, control_vector=control_vector) # 2. Save the N-1 state snapshot current_history = self._input_ids[:self.n_tokens].tolist() @@ -1367,10 +1435,10 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): seq_id=0 ) # 3. 
Evaluate the final token to refresh logits - self.eval(last_token) + self.eval(last_token, active_loras=active_loras, control_vector=control_vector) else: # Standard evaluation or single-token generation step - self.eval(tokens) + self.eval(tokens, active_loras=active_loras, control_vector=control_vector) # Sample loop while sample_idx < self.n_tokens: @@ -1672,6 +1740,8 @@ def _create_completion( grammar: Optional[LlamaGrammar] = None, grammar_lazy: bool = False, seed: Optional[int] = None, + active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, + control_vector: Optional[Dict[str, Any]] = None, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: @@ -1851,6 +1921,8 @@ def _create_completion( grammar=grammar, grammar_lazy=grammar_lazy, seed=seed if seed is not None else self._seed, + active_loras=active_loras, + control_vector=control_vector, ): if llama_cpp.llama_token_is_eog(self._model.vocab, token): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) @@ -2303,6 +2375,8 @@ def create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, grammar_lazy: bool = False, + active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, + control_vector: Optional[Dict[str, Any]] = None, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -2347,6 +2421,14 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. grammar_lazy: If True, enables lazy evaluation. + active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. + Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) + and an optional "scale" key (float, defaults to 1.0). + Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`. 
+ control_vector: A dictionary containing Control Vector (CVec) data for representation engineering. + Must contain a "data" key with a flattened 1D list of floats. + Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count). + Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers. Raises: ValueError: If the requested tokens exceed the context window. @@ -2396,6 +2478,8 @@ def create_completion( logits_processor=logits_processor, grammar=grammar, grammar_lazy=grammar_lazy, + active_loras=active_loras, + control_vector=control_vector, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -2445,6 +2529,8 @@ def __call__( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, grammar_lazy: bool = False, + active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, + control_vector: Optional[Dict[str, Any]] = None, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -2489,6 +2575,14 @@ def __call__( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. grammar_lazy: If True, enables lazy evaluation. + active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. + Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) + and an optional "scale" key (float, defaults to 1.0). + Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`. + control_vector: A dictionary containing Control Vector (CVec) data for representation engineering. + Must contain a "data" key with a flattened 1D list of floats. + Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count). 
+ Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers. Raises: ValueError: If the requested tokens exceed the context window. @@ -2538,6 +2632,8 @@ def __call__( logits_processor=logits_processor, grammar=grammar, grammar_lazy=grammar_lazy, + active_loras=active_loras, + control_vector=control_vector, ) def create_chat_completion( @@ -2583,6 +2679,8 @@ def create_chat_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, grammar_lazy: bool = False, + active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, + control_vector: Optional[Dict[str, Any]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, ) -> Union[ @@ -2632,6 +2730,14 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. grammar_lazy: If True, enables lazy evaluation. + active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. + Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) + and an optional "scale" key (float, defaults to 1.0). + Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`. + control_vector: A dictionary containing Control Vector (CVec) data for representation engineering. + Must contain a "data" key with a flattened 1D list of floats. + Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count). + Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers. Returns: Generated chat completion or a stream of chat completion chunks. 
@@ -2686,6 +2792,8 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, grammar_lazy=grammar_lazy, + active_loras=active_loras, + control_vector=control_vector, ) def create_chat_completion_openai_v1( From ee15771312f2ae9fcff074aebb7657f5f9b31cb7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 28 Mar 2026 10:41:18 +0800 Subject: [PATCH 284/518] Update README.md for Dynamic LoRA Routing & Control Vectors --- README.md | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/README.md b/README.md index b200d086a7..78cdc4d64d 100644 --- a/README.md +++ b/README.md @@ -521,6 +521,68 @@ llm = Llama.from_pretrained( --- +## Dynamic LoRA Routing & Control Vectors (Multi-Tenant Serving) + +Historically, `llama-cpp-python` only supported "static loading" where a LoRA was permanently baked into the context during initialization. Switching personas required reloading the entire model or duplicating it in VRAM. + +`llama-cpp-python` now supports **Just-In-Time (JIT)** dynamic adapter routing. Instead of statically binding a single LoRA to a model during initialization (which locks the instance to a single task), you can now preload multiple adapters into VRAM and seamlessly apply them on-the-fly per request. + +This architecture unlocks true **Multi-Tenant Serving**: +* **Zero-Latency Switching:** Compute graph weights are atomically modified in C++ memory instantly before evaluation. +* **VRAM Efficiency:** You only load the heavy base model once. Multiple LoRAs share the same base model memory. +* **Thread-Safe & Contamination-Free:** Strict internal state debouncing ensures that weights are perfectly cleaned between requests, guaranteeing zero persona contamination. + +### Dynamic LoRA Example + +```python +from llama_cpp import Llama + +# 1. Load the pure base model once +llm = Llama(model_path="path/to/llama-3-8b.gguf") + +# 2. 
Preload multiple LoRAs into VRAM +llm.load_lora("python_coder", "path/to/python-coder-lora.gguf") +llm.load_lora("translator", "path/to/spanish-translator-lora.gguf") + +# 3. User A: Coding Task (Instantly applies the coder LoRA) +response_a = llm.create_chat_completion( + messages=[{"role": "user", "content": "Write a fast inverse square root in C."}], + active_loras=[{"name": "python_coder", "scale": 1.0}] +) + +# 4. User B: Translation Task (Zero-latency switch to the translator LoRA) +response_b = llm.create_chat_completion( + messages=[{"role": "user", "content": "Explain quantum physics in Spanish."}], + active_loras=[{"name": "translator", "scale": 0.85}] # Apply at 85% strength +) + +# 5. User C: General Query (Automatically wipes graph weights for a clean base model state) +response_c = llm.create_chat_completion( + messages=[{"role": "user", "content": "What is the capital of France?"}] +) + +# 6. Cleanup (Optional: manually free VRAM for specific LoRAs) +llm.unload_lora("python_coder") +``` + +### Control Vector Injection (Representation Engineering) + +In addition to LoRA, the API supports dynamic injection of **Control Vectors (CVec)**. This allows you to steer the model's behavior, emotion, or alignment by directly modifying the activation values at specific hidden layers, without needing `.gguf` weight files. + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Tell me a story about a futuristic city."}], + control_vector={ + "data": [...], # A flattened 1D list of floats representing the vector + "layer_start": 15, # Apply starting from this layer (inclusive) + "layer_end": 32 # Apply up to this layer (inclusive) + } +) +``` +*Note(JamePeng): Ensure your `data` array length exactly matches `embedding_length * layer_end`. 
The C++ backend maps the buffer continuously starting from layer 1, so early skipped layers must be zero-padded in your array.* + +--- + ## Sampling Configuration & Usage (LlamaSamplingParams) The `Llama` class provides extensive control over the `llama.cpp` sampling chain during text generation. You can configure state-of-the-art sampling algorithms, dynamic temperature, and advanced repetition penalties directly via the `generate`, `create_completion`, or `__call__` methods. From 6d27d08f125d36dd995bdb7f783d146b83362f90 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 29 Mar 2026 18:29:17 +0800 Subject: [PATCH 285/518] Update Submodule vendor/llama.cpp 59d8402..2405d59 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 59d840209a..2405d59cb6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 59d840209a5195c2f6e2e81b5f8339a0637b59d9 +Subproject commit 2405d59cb613f7b9f98ecbc9eb25f8a45188ee06 From 3ead0900678f8ef4878c9d05787da8cacec38331 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 29 Mar 2026 19:01:37 +0800 Subject: [PATCH 286/518] feat(chat_format): added `assistant_prefill` to seamlessly continue responses - This commit introduces the `assistant_prefill` parameter to the chat completion API, satisfying the highly requested need to continue interrupted or partially generated assistant messages. - Resolves #97 (Chat completion from unfinished response) - Usage: - Simply set `assistant_prefill=True` in `create_chat_completion` when the final item in your `messages` list is a partial `assistant` response. The engine will use it as a prompt base and continue generating seamlessly. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 2 ++ llama_cpp/llama_chat_format.py | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a3e359805e..b8a952f6ad 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2683,6 +2683,7 @@ def create_chat_completion( control_vector: Optional[Dict[str, Any]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + assistant_prefill: bool = False, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -2794,6 +2795,7 @@ def create_chat_completion( grammar_lazy=grammar_lazy, active_loras=active_loras, control_vector=control_vector, + assistant_prefill=assistant_prefill, ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ed5261b05b..5fd08572a8 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -129,6 +129,7 @@ def __call__( grammar: Optional[llama_grammar.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + assistant_prefill: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -627,11 +628,30 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + assistant_prefill: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], ]: + + # JIT Interception for Assistant Prefill (Continue Generation) + partial_assistant_text = "" + if assistant_prefill: + if not messages: + if llama.verbose: + print("Llama.create_chat_completion: Warning! 'assistant_prefill=True' but messages list is empty. 
Ignoring prefill.", file=sys.stderr) + elif messages[-1].get("role") != "assistant": + if llama.verbose: + print(f"Llama.create_chat_completion: Warning! 'assistant_prefill=True' but last message role is '{messages[-1].get('role')}'. Expected 'assistant'. Ignoring prefill.", file=sys.stderr) + else: + # Safe to prefill: pop the last message without mutating the user's original list + messages = messages.copy() + partial_message = messages.pop() + partial_assistant_text = partial_message.get("content", "") or "" + if not partial_assistant_text and llama.verbose: + print("Llama.create_chat_completion: Warning! 'assistant_prefill=True' but the assistant message has no content.", file=sys.stderr) + result = chat_formatter( messages=messages, functions=functions, @@ -639,6 +659,11 @@ def chat_completion_handler( tools=tools, tool_choice=tool_choice, ) + + # Seamlessly append the partial assistant text to the standard generated Jinja template + if partial_assistant_text: + result.prompt += partial_assistant_text + prompt = llama.tokenize( result.prompt.encode("utf-8"), add_bos=not result.added_special, From f9b531342193c3d9811fabff5917302d42adedac Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 29 Mar 2026 19:41:47 +0800 Subject: [PATCH 287/518] docs(readme): add documentation for Assistant Prefill features - Also slightly updated the `huggingface_hub` installation instructions for accuracy. 
Signed-off-by: JamePeng --- README.md | 46 ++++++++++++++++++++++++++++++---- llama_cpp/llama.py | 6 ++--- llama_cpp/llama_chat_format.py | 4 +-- 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 78cdc4d64d..41555debd7 100644 --- a/README.md +++ b/README.md @@ -344,20 +344,23 @@ By default `llama-cpp-python` generates completions in an OpenAI compatible form Text completion is available through the [`__call__`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__) and [`create_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_completion) methods of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class. -### Pulling models from Hugging Face Hub +### Pulling models from [Hugging Face Hub](https://huggingface.co/models) You can download `Llama` models in `gguf` format directly from Hugging Face using the [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) method. -You'll need to install the `huggingface-hub` package to use this feature (`pip install huggingface-hub`). + +You'll need to install the `huggingface_hub` package to use this feature (`pip install --upgrade huggingface_hub`). + + ```python llm = Llama.from_pretrained( - repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF", - filename="*q8_0.gguf", + repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", + filename="qwen2.5-0.5b-instruct-q4_k_m.gguf", verbose=False ) ``` -By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory, you can then manage installed model files with the [`huggingface-cli`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool. 
+By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the Hugging Face cache directory; you can then manage installed model files with the [`hf`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool. ### Chat Completion @@ -521,6 +524,39 @@ llm = Llama.from_pretrained( --- +## Continuing Assistant Responses (Prefill) + +`llama-cpp-python` supports native **Assistant Prefill** for seamless message continuation. You can now simply use the `assistant_prefill=True` parameter in the `create_chat_completion` function. + +This safely renders the `N-1` conversation history using standard Jinja templates (preserving exact control tokens) and flawlessly appends your partial text directly to the prompt. + +```python +from llama_cpp import Llama + +llm = Llama(model_path="path/to/model.gguf") + +# An interrupted/partial conversation +messages = [ + {"role": "user", "content": "What are the first 5 planets in the solar system?"}, + {"role": "assistant", "content": "The first 5 planets in our solar system are:\n1. Mercury\n2."} +] + +# Seamlessly continue the generation +response = llm.create_chat_completion( + messages=messages, + max_tokens=50, + assistant_prefill=True # <--- Enables seamless continuation +) + +prefilled_text = messages[-1]["content"] +# The model will flawlessly continue from " Venus\n3. Earth..." +generated_text = response["choices"][0]["message"]["content"] + +print(prefilled_text + generated_text) +``` + +--- + ## Dynamic LoRA Routing & Control Vectors (Multi-Tenant Serving) Historically, `llama-cpp-python` only supported "static loading" where a LoRA was permanently baked into the context during initialization. Switching personas required reloading the entire model or duplicating it in VRAM.
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b8a952f6ad..d6c6926e60 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -3141,8 +3141,8 @@ def from_pretrained( **kwargs: Any, ) -> "Llama": """Create a Llama model from a pretrained model name or path. - This method requires the huggingface-hub package. - You can install it with `pip install huggingface-hub`. + This method requires the huggingface_hub package. + You can install it with `pip install --upgrade huggingface_hub`. Args: repo_id: The model repo id. @@ -3160,7 +3160,7 @@ def from_pretrained( except ImportError: raise ImportError( "Llama.from_pretrained requires the huggingface-hub package. " - "You can install it with `pip install huggingface-hub`." + "You can install it with `pip install --upgrade huggingface_hub`." ) validate_repo_id(repo_id) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 5fd08572a8..62c9efa07c 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3758,8 +3758,8 @@ def from_pretrained( from huggingface_hub.utils import validate_repo_id # type: ignore except ImportError: raise ImportError( - "Llama.from_pretrained requires the huggingface-hub package. " - "You can install it with `pip install huggingface-hub`." + "Llama.from_pretrained requires the huggingface_hub package. " + "You can install it with `pip install --upgrade huggingface_hub`." 
) validate_repo_id(repo_id) From a8cec004466493db57d3cbc043cdc897b2b37f9b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 31 Mar 2026 20:55:39 +0800 Subject: [PATCH 288/518] Update Submodule vendor/llama.cpp 2405d59..0fcb376 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2405d59cb6..0fcb3760b2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2405d59cb613f7b9f98ecbc9eb25f8a45188ee06 +Subproject commit 0fcb3760b2b9a3a496ef14621a7e4dad7a8df90f From a184583e908cc138fd15794986b3581521fb9b0c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 1 Apr 2026 00:27:41 +0800 Subject: [PATCH 289/518] Bump version to 0.3.34 Signed-off-by: JamePeng --- CHANGELOG.md | 56 +++++++++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 75c6c0398b..38b4b90b50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,62 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.34] Dynamic LoRA Routing, Control Vectors, and Assistant Prefill + +- **feat(chat_format): added assistant_prefill to seamlessly continue responses** + - This commit introduces the `assistant_prefill` parameter to the chat completion API, satisfying the highly requested need to continue interrupted or partially generated assistant messages. + - Resolves #97 (Chat completion from unfinished response) + - Usage: + - Simply set `assistant_prefill=True` in `create_chat_completion` when the final item in your `messages` list is a partial `assistant` response. The engine will use it as a prompt base and continue generating seamlessly. + - docs(readme): add documentation for Assistant Prefill features + - Also slightly updated the `huggingface_hub` installation instructions for accuracy. 
+ +- **feat(internals): implement dynamic LoRA routing and Control Vector support** + * This commit overhauls the adapter management architecture in `_internals.py` to support **dynamic, per-request LoRA routing and Control Vector (CVec) injection** with strict C++ memory safety. + + * Key changes: + - Secure Memory Management: Introduced the `LlamaLoraAdapter` wrapper class to securely handle the lifecycle of `llama_adapter_lora_p` pointers, preventing VRAM leaks. Also added support for extracting ALoRA invocation tokens. + - Model-Level Registry: Added `_lora_registry` to `LlamaModel` with robust methods (`load_lora`, `unload_lora`, `unload_all_loras`) to preload adapters into VRAM. Integrated cleanup into the model's `ExitStack` and `close()` methods for deterministic memory release. + - Context-Level Dynamic Routing: Implemented `apply_loras` and `clear_loras` in `LlamaContext` to dynamically swap compute graph weights using contiguous C arrays, enabling zero-delay multi-tenant LoRA switching. + - Control Vector Integration: Added `apply_cvec` and `clear_cvec` to `LlamaContext` for representation engineering. Includes strict C++ memory layout validation (enforcing buffer zero-padding up to `n_embd * il_end`) to prevent silent write failures in the GGML backend. + - Observability & Docs: Added verbose logging for adapter/CVec application and expanded docstrings for context utility methods (e.g., threading, causal attention, warmup). 
+ - Update README.md for Dynamic LoRA Routing & Control Vectors + +- fix(types): correct llama_adapter_get_alora_invocation_tokens ctypes signature and use pointer for llama_token + +- fix(types): correct llama_set_adapters_lora LoRA adapter ctypes signature and use pointer for scales + - change scale: float to float* (POINTER(c_float)) + - make adapters and scales optional arrays to match C API + +- refactor: remove legacy static LoRA initialization + - Removed `lora_base`, `lora_path`, and `lora_scale` from `Llama` init parameters and state. + - Dropped outdated `llama_adapter_lora_init` and `llama_set_adapters_lora` bindings in the constructor. + - Restored default `use_mmap` behavior (no longer forced to False when LoRA is present). + + * This removes the global context pollution and paves the way for the new dynamic, per-request LoRA routing architecture. + +- chore: enhance hybrid cache logging and document M-RoPE token usage + - Added explanatory comments detailing why n_tokens is used instead of chunk_n_pos for M-RoPE models (to prevent the system from skipping evaluation). + - Added verbose logging for hybrid cache clearance scenarios (when checkpoints are missing, restore fails, or max_checkpoints is 0). + +- feat(core): add verbose debug logging to longest_token_prefix fast paths + - Added an optional `verbose` parameter to `Llama.longest_token_prefix` to explicitly log early-exit conditions. This provides crucial visibility into cache-miss behaviors during debugging by outputting the specific reason for a fast exit (e.g., empty sequence vs. mismatched first token) along with the offending sequence lengths or token values. + +- Update MIT license copyright to collective authorship (2023-2026) + - Change `single-author` copyright to `The llama-cpp-python authors` + and apply standard multi-line formatting for better readability. 
+ - Every contributor who participates and makes an effort makes the project more reliable, efficient, + and user-friendly, and they all deserve to be remembered. + - Everyone is welcome to join us in promoting the project and enriching the open-source community. + +- Update CMakeLists.txt + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/0fcb3760b2b9a3a496ef14621a7e4dad7a8df90f](https://github.com/ggml-org/llama.cpp/commit/0fcb3760b2b9a3a496ef14621a7e4dad7a8df90f) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260325 + +For more information, see: https://github.com/JamePeng/llama-cpp-python/compare/6bbc8d2306319c67c9f7d0d2d0576496f3587a3c...a8cec004466493db57d3cbc043cdc897b2b37f9b + ## [0.3.33] Fixing Multimodal Image Freezes, Stabilizing Logits, and Optimized Legacy Cache Logic - perf(mtmd): optimize media_id masking with bitwise AND diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index ff142e6bf6..5a0a40d108 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.33" +__version__ = "0.3.34" From 65d27507759a1ec68d2d9855595dc995872931f1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 2 Apr 2026 03:55:41 +0800 Subject: [PATCH 290/518] Update Submodule vendor/llama.cpp 0fcb376..12dbf1d --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 0fcb3760b2..12dbf1da95 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 0fcb3760b2b9a3a496ef14621a7e4dad7a8df90f +Subproject commit 12dbf1da9558524b315bc47fae976fe90ec3a95e From 08f15e7fa41557cc88b20cda149ecd4bde80c159 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 2 Apr 2026 04:32:15 +0800 Subject: [PATCH 291/518] feat(ggml): add support for ggml-base library and new function bindings - Load the new `ggml-base` shared library alongside `ggml`.
- Add `ctypes` bindings for `ggml_log_get`, `ggml_log_set`, and `ggml_set_zero` using the `ggml_base_function` decorator. Signed-off-by: JamePeng --- llama_cpp/_ggml.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index d53272f6c4..7da64e79f3 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -31,6 +31,10 @@ ggml_function = ctypes_function_for_shared_library(libggml) +libggml_base = load_shared_library("ggml-base", libggml_base_paths) + +ggml_base_function = ctypes_function_for_shared_library(libggml_base) + # // ====== ggml.h ====== GGML_FILE_MAGIC = 0x67676d6c # b"ggml" @@ -726,6 +730,62 @@ class ggml_tensor(ctypes.Structure): None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p ) +# // Set callback for all future logging events. +# // If this is not called, or NULL is supplied, everything is output on stderr. +# // The logger state is global so these functions are NOT thread safe. +# GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data); +@ggml_base_function( + "ggml_log_get", + [ctypes.POINTER(ggml_log_callback), ctypes.POINTER(ctypes.c_void_p)], + None, +) +def ggml_log_get( + log_callback: Optional[ctypes.pointer(ggml_log_callback)], + user_data: ctypes.pointer(ctypes.c_void_p), + /, +): + """ + Get callback for all future logging events. + If this is not called, or NULL is supplied, everything is output on stderr. + The logger state is global so these functions are NOT thread safe. + """ + ... + + +# GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); +@ggml_base_function( + "ggml_log_set", + [ggml_log_callback, ctypes.c_void_p], + None, +) +def ggml_log_set( + log_callback: Optional[ggml_log_callback], + user_data: ctypes.c_void_p, + /, +): + """ + Set callback for all future logging events. + If this is not called, or NULL is supplied, everything is output on stderr. 
+ The logger state is global so these functions are NOT thread safe. + """ + ... + + +# GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); +@ggml_base_function( + "ggml_set_zero", + [ggml_tensor_p], + ggml_tensor_p, +) +def ggml_set_zero( + tensor: ctypes.c_void_p, + /, +) -> ctypes.c_void_p: + """ + Memset tensor data to zero + """ + ... + # // ====== ggml-opt.h ====== From 225c7ad779fdedbe7605d41ec12c5ad7c7a7a7e2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 2 Apr 2026 04:35:10 +0800 Subject: [PATCH 292/518] refactor(logger): migrate from llama_log_callback to ggml_log_callback - Remove the deprecated `llama_log_callback` typedef from `llama_cpp.py`. - Update `_logger.py` to use `ggml_log_callback` from `_ggml`, aligning with the upstream GGML logging architecture. - Rename the callback references across the codebase, including the MTMD context initialization in `llama_chat_format.py`. Signed-off-by: JamePeng --- llama_cpp/_logger.py | 10 +++++----- llama_cpp/llama_chat_format.py | 4 ++-- llama_cpp/llama_cpp.py | 15 --------------- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 787b3f108e..022ece22bf 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -1,8 +1,8 @@ import sys import ctypes import logging - -import llama_cpp +import llama_cpp._ggml as _ggml +import llama_cpp.llama_cpp as llama_cpp_lib # enum ggml_log_level { # GGML_LOG_LEVEL_NONE = 0, @@ -26,8 +26,8 @@ _last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0] # typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); -@llama_cpp.llama_log_callback -def llama_log_callback( +@_ggml.ggml_log_callback +def ggml_log_callback( level: int, text: bytes, user_data: ctypes.c_void_p, @@ -40,7 +40,7 @@ def llama_log_callback( _last_log_level = log_level -llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0)) +llama_cpp_lib.llama_log_set(ggml_log_callback, 
ctypes.c_void_p(0)) def set_verbose(verbose: bool): diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 62c9efa07c..bceccb186b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -40,7 +40,7 @@ import llama_cpp.llama_grammar as llama_grammar from ._ggml import GGMLLogLevel -from ._logger import logger, llama_log_callback +from ._logger import logger, ggml_log_callback from ._utils import suppress_stdout_stderr, Singleton ### Common Chat Templates and Special Tokens ### @@ -2899,7 +2899,7 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): if self.mtmd_ctx is not None: return # Already initialized - self._mtmd_cpp.mtmd_helper_log_set(llama_log_callback, ctypes.c_void_p(0)) + self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) # Get default parameters self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 43527fa7c6..b37465cb1d 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -974,21 +974,6 @@ class llama_context_params(ctypes.Structure): llama_context_params_p = ctypes.POINTER(llama_context_params) -# // Signature for logging events -# // Note that text includes the new line character at the end for most events. -# // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it -# // if it exists. -# // It might not exist for progress report where '.' is output repeatedly. -# typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data); -llama_log_callback = ctypes.CFUNCTYPE( - None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p -) -"""Signature for logging events -Note that text includes the new line character at the end for most events. -If your logging mechanism cannot handle that, check if the last character is '\n' and strip it -if it exists. -It might not exist for progress report where '.' 
is output repeatedly.""" - # // model quantization parameters # typedef struct llama_model_quantize_params { From 7036ac33b1e9b088fed960e44a0ec8493602dd68 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 2 Apr 2026 04:46:30 +0800 Subject: [PATCH 293/518] fix TypeError in _ggml.py --- llama_cpp/_ggml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index 7da64e79f3..f22c9eb94d 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -740,8 +740,8 @@ class ggml_tensor(ctypes.Structure): None, ) def ggml_log_get( - log_callback: Optional[ctypes.pointer(ggml_log_callback)], - user_data: ctypes.pointer(ctypes.c_void_p), + log_callback: Optional[ctypes.POINTER(ggml_log_callback)], + user_data: ctypes.POINTER(ctypes.c_void_p), /, ): """ From 24f2562e4c45305802038b8f31a011b7c59c8df7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 2 Apr 2026 05:15:41 +0800 Subject: [PATCH 294/518] Sync llama : refactor llama_model_quantize_params to expose a pure C interface (#20346) Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 91 ++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 26 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index b37465cb1d..997ed75c84 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -677,6 +677,8 @@ class llama_model_kv_override(ctypes.Structure): key: bytes value: Union[int, float, bool, bytes] +llama_model_kv_override_p = ctypes.POINTER(llama_model_kv_override) + # struct llama_model_tensor_buft_override { # const char * pattern; # ggml_backend_buffer_type_t buft; @@ -975,22 +977,59 @@ class llama_context_params(ctypes.Structure): llama_context_params_p = ctypes.POINTER(llama_context_params) +# struct llama_model_tensor_override { +# const char * pattern; +# enum ggml_type type; +# }; +class llama_model_tensor_override(ctypes.Structure): + _fields_ = [ + ("pattern", ctypes.c_char_p), + ("type", ctypes.c_int), + ] + 
+ if TYPE_CHECKING: + pattern: ctypes.c_char_p + type: ctypes.c_int + +llama_model_tensor_override_p = ctypes.POINTER(llama_model_tensor_override) + + +# struct llama_model_imatrix_data { +# const char * name; +# const float * data; +# size_t size; +# }; +class llama_model_imatrix_data(ctypes.Structure): + _fields_ = [ + ("name", ctypes.c_char_p), + ("data", ctypes.POINTER(ctypes.c_float)), + ("size", ctypes.c_size_t), + ] + + if TYPE_CHECKING: + name: ctypes.c_char_p + data: ctypes.POINTER(ctypes.c_float) + size: ctypes.c_size_t + +llama_model_imatrix_data_p = ctypes.POINTER(llama_model_imatrix_data) + + # // model quantization parameters # typedef struct llama_model_quantize_params { -# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this llama_ftype -# enum ggml_type output_tensor_type; // output tensor type -# enum ggml_type token_embedding_type; // token embeddings tensor type -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored -# bool pure; // quantize all tensors to the default type -# bool keep_split; // quantize to the same number of shards -# bool dry_run; // calculate and show the final quantization size without performing quantization -# void * imatrix; // pointer to importance matrix data -# void * kv_overrides; // pointer to vector containing overrides -# void * tensor_types; // pointer to vector containing tensor types -# void * prune_layers; // pointer to vector containing layer indices to prune +# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum llama_ftype ftype; // quantize to this llama_ftype +# enum ggml_type output_tensor_type; // output tensor type +# enum ggml_type token_embedding_type; // 
token embeddings tensor type +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored +# bool pure; // quantize all tensors to the default type +# bool keep_split; // quantize to the same number of shards +# bool dry_run; // calculate and show the final quantization size without performing quantization +# const struct llama_model_imatrix_data * imatrix; // pointer to importance matrix data +# const struct llama_model_kv_override * kv_overrides; // pointer to kv overrides +# const struct llama_model_tensor_override * tt_overrides; // pointer to tensor overrides +# const int32_t * prune_layers; // pointer to layer indices to prune # } llama_model_quantize_params; class llama_model_quantize_params(ctypes.Structure): """Parameters for llama_model_quantize @@ -1006,10 +1045,10 @@ class llama_model_quantize_params(ctypes.Structure): pure (bool): quantize all tensors to the default type keep_split (bool): quantize to the same number of shards dry_run (bool): calculate and show the final quantization size without performing quantization - imatrix (ctypes.c_void_p): pointer to importance matrix data - kv_overrides (ctypes.c_void_p): pointer to vector containing overrides - tensor_types (ctypes.c_void_p): pointer to vector containing tensor types - prune_layers (ctypes.c_void_p): pointer to vector containing layer indices to prune + imatrix (POINTER(llama_model_imatrix_data)): Pointer to importance matrix data. + kv_overrides (POINTER(llama_model_kv_override)): Pointer to KV overrides. + tt_overrides (POINTER(llama_model_tensor_override)): Pointer to tensor overrides. + prune_layers (POINTER(c_int32)): Pointer to layer indices to prune. 
""" if TYPE_CHECKING: @@ -1023,10 +1062,10 @@ class llama_model_quantize_params(ctypes.Structure): pure: bool keep_split: bool dry_run: bool - imatrix: ctypes.c_void_p - kv_overrides: ctypes.c_void_p - tensor_types: ctypes.c_void_p - prune_layers: ctypes.c_void_p + imatrix: ctypes.POINTER(llama_model_imatrix_data) + kv_overrides: ctypes.POINTER(llama_model_kv_override) + tensor_types: ctypes.POINTER(llama_model_tensor_override) + prune_layers: ctypes.POINTER(ctypes.c_int32) _fields_ = [ ("nthread", ctypes.c_int32), @@ -1039,10 +1078,10 @@ class llama_model_quantize_params(ctypes.Structure): ("pure", ctypes.c_bool), ("keep_split", ctypes.c_bool), ("dry_run", ctypes.c_bool), - ("imatrix", ctypes.c_void_p), - ("kv_overrides", ctypes.c_void_p), - ("tensor_types", ctypes.c_void_p), - ("prune_layers", ctypes.c_void_p), + ("imatrix", ctypes.POINTER(llama_model_imatrix_data)), + ("kv_overrides", ctypes.POINTER(llama_model_kv_override)), + ("tt_overrides", ctypes.POINTER(llama_model_tensor_override)), + ("prune_layers", ctypes.POINTER(ctypes.c_int32)), ] From 9937acbca52a53903bc98eedf44c86df08403f71 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Fri, 3 Apr 2026 05:27:52 +0200 Subject: [PATCH 295/518] fix Qwen3.5 chat template bugs --- llama_cpp/llama_chat_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index bceccb186b..994a3dca03 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5001,7 +5001,7 @@ class Qwen35ChatHandler(MTMDChatHandler): " {{- '<|im_start|>' + message.role + '\n' + content -}}" " {%- endif -%}" " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" - " {%- for tool_call in message.tool_call -%}" + " {%- for tool_call in message.tool_calls -%}" " {%- if tool_call.function is defined -%}" " {%- set tool_call = tool_call.function -%}" " {%- endif -%}" @@ -5016,7 +5016,7 @@ class 
Qwen35ChatHandler(MTMDChatHandler): " {%- endif -%}" " {%- if tool_call.arguments is defined -%}" " {%- for (args_name, args_value) in tool_calls.arguments | items -%}" - " {{- '\n' -}}" + " {{- '\n' -}}" " {%- set args_value = args_value | tojson | safe if args_value is mapping or args_value is sequence and args_value is not string else args_value | string -%}" " {{- args_value -}}" " {{- '\n' -}}" From aaf1922e55785e5d820f90191ee99c2ac711428e Mon Sep 17 00:00:00 2001 From: Alcoft Date: Fri, 3 Apr 2026 06:46:52 +0200 Subject: [PATCH 296/518] Implemented 'LFM25VLChatHandler'. --- llama_cpp/llama_chat_format.py | 66 ++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 994a3dca03..5d9a676b1f 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5084,6 +5084,72 @@ def __call__(self, **kwargs): # Use parent implementation return super().__call__(**kwargs) +class LFM25VLChatHandler(MTMDChatHandler): + CHAT_FORMAT = ( + "{{- bos_token -}}" + "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}" + "{%- set ns = namespace(system_prompt='', last_assistant_index=-1) -%}" + "{%- if messages[0]['role'] == 'system' -%}" + "{%- if messages[0]['content'] is string -%}" + "{%- set ns.system_prompt = messages[0]['content'] -%}" + "{%- else -%}" + "{%- for item in sys_content -%}" + "{%- if item['type'] == 'text' or 'text' in item -%}" + "{%- set ns.system_prompt = ns.system_prompt + item['text'] -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- set messages = messages[1:] -%}" + "{%- endif -%}" + "{%- if tools -%}" + "{%- set ns.system_prompt = ns.system_prompt ~ ('\n' if ns.system_prompt else '') ~ 'List of tools: [' -%}" + "{%- for tool in tools -%}" + "{%- set tool = (tool | tojson) if tool is not string else tool -%}" + "{%- set ns.system_prompt = ns.system_prompt ~ tool ~ (', ' if not loop.last else '') -%}" + "{%- 
endfor -%}" + "{%- set ns.system_prompt = ns.system_prompt ~ ']' -%}" + "{%- endif -%}" + "{{- ('<|im_start|>system\n' ~ ns.system_prompt ~ '<|im_end|>\n') if ns.system_prompt else '' -}}" + "{%- for message in messages -%}" + "{%- if message['role'] == 'assistant' -%}" + "{%- set ns.last_assistant_index = loop.index0 -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- for message in messages -%}" + "{{- '<|im_start|>' ~ message['role'] ~ '\n' -}}" + "{%- if message['content'] is string -%}" + "{%- set content = message['content'] -%}" + "{%- else -%}" + "{%- set content = '' -%}" + "{%- for item in message['content'] -%}" + "{%- if item['type'] in ['image', 'image_url'] and item['type'] in item -%}" + "{%- set content = content ~ (item[item['type']] if item[item['type']] is string else item[item['type']]['url']) -%}" + "{%- elif item['type'] == 'text' and 'text' in item -%}" + "{%- set content = content ~ item['text'] -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index and '' in content -%}" + "{%- set content = content.split('')[-1] | trim -%}" + "{%- endif -%}" + "{{- content ~ '<|im_end|>\n' -}}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{- '<|im_start|>assistant\n' -}}" + "{%- if not enable_thinking -%}" + "{{- '\n\n\n' -}}" + "{%- endif -%}" + "{%- endif -%}" + ) # Variables: keep_past_thinking, enable_thinking + + def __init__(self, enable_thinking: bool = True, keep_past_thinking: bool = False, **kwargs): + super().__init__(**kwargs) + self.extra_template_arguments["enable_thinking"] = enable_thinking + self.extra_template_arguments["keep_past_thinking"] = keep_past_thinking + + def __call__(self, **kwargs): + super().__call__(**kwargs) + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama_core.Llama, From 2e27fd8a14cca20e2bb87d176d44480535d26a59 Mon Sep 17 00:00:00 2001 
From: JamePeng Date: Fri, 3 Apr 2026 19:17:22 +0800 Subject: [PATCH 297/518] Update Submodule vendor/llama.cpp 12dbf1d..277ff5f --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 12dbf1da95..277ff5fff7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 12dbf1da9558524b315bc47fae976fe90ec3a95e +Subproject commit 277ff5fff79d49cc3d2292ddf410ca95dd51c3a9 From 7ef09e953b62530d38784a1d852d14519c3fc21c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 3 Apr 2026 23:49:13 +0800 Subject: [PATCH 298/518] Update llama_vocab_pre_type varriable --- llama_cpp/llama_cpp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 997ed75c84..5d7fcd5fd2 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -191,7 +191,8 @@ # LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46, # LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47, # LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, -#. 
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, +# LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, +# LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -209,7 +210,7 @@ LLAMA_VOCAB_PRE_TYPE_DBRX = 13 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 LLAMA_VOCAB_PRE_TYPE_PORO = 15 -LLAMA_VOCAV_PRE_TYPE_CHATGLM3 = 16 +LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 LLAMA_VOCAB_PRE_TYPE_VIKING = 18 LLAMA_VOCAB_PRE_TYPE_JAIS = 19 @@ -243,6 +244,7 @@ LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48 LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49 +LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50 # // note: these values should be synchronized with ggml_rope From a5b47625fbcb3e6bc0ae94f65d025ca3856ef093 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 4 Apr 2026 11:17:28 +0800 Subject: [PATCH 299/518] feat(chat_format): add Gemma 4 chat handler with multimodal and tool support - Implement `Gemma4ChatHandler` with Gemma 4 specific tokens (`<|turn>`, `<|channel>`, etc.). - Add complex Jinja2 template for advanced nested tool/function schema formatting. - Support multimodal content injection for `image_url`, `audio_url`, and `input_audio` (including base64 reconstruction). - Integrate reasoning/thinking controls via `enable_thinking` toggle and `<|channel>thought` formatting. - Configure `` as the primary stop sequence for generation boundaries. 
Signed-off-by: JamePeng --- README.md | 1 + llama_cpp/llama_chat_format.py | 323 +++++++++++++++++++++++++++++++++ 2 files changed, 324 insertions(+) diff --git a/README.md b/README.md index 41555debd7..b21e379926 100644 --- a/README.md +++ b/README.md @@ -735,6 +735,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | | [minicpm-v-4.5](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) | `MiniCPMv45ChatHandler` | `minicpm-v-4.5` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | +| [gemma4](https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` | | [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | | [glm4.6v](https://huggingface.co/unsloth/GLM-4.6V-Flash-GGUF) | `GLM46VChatHandler` | `glm4.6v` | | [granite-docling](https://huggingface.co/ibm-granite/granite-docling-258M-GGUF) | `GraniteDoclingChatHandler` | `granite-docling` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 994a3dca03..915e713277 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4335,6 +4335,329 @@ class Gemma3ChatHandler(MTMDChatHandler): ) +class Gemma4ChatHandler(MTMDChatHandler): + """ + Handler for Gemma 4 models. 
+ """ + + # The special token in Gemma 4 + GEMMA4_BOI_TOKEN = "<|image>" + GEMMA4_EOI_TOKEN = "" + GEMMA4_BOA_TOKEN = "<|audio>" + GEMMA4_EOA_TOKEN = "" + GEMMA4_BOS_TOKEN = "" + GEMMA4_EOS_TOKEN = "" + GEMMA4_SOT_TOKEN = "<|turn>" + GEMMA4_EOT_TOKEN = "" + GEMMA4_SOC_TOKEN = "<|channel>" + GEMMA4_EOC_TOKEN = "" + GEMMA4_STC_TOKEN = "<|tool_call>" + GEMMA4_ETC_TOKEN = "" + GEMMA4_STD_TOKEN = "<|tool>" + GEMMA4_ETD_TOKEN = "" + GEMMA4_STR_TOKEN = "<|tool_response>" + GEMMA4_ETR_TOKEN = "" + + CHAT_FORMAT = ( + "{%- macro format_parameters(properties, required) -%}\n" + " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" + " {%- set ns = namespace(found_first=false) -%}\n" + " {%- for key, value in properties | dictsort -%}\n" + " {%- set add_comma = false -%}\n" + " {%- if key not in standard_keys -%}\n" + " {%- if ns.found_first %},{% endif -%}\n" + " {%- set ns.found_first = true -%}\n" + " {{ key }}:{\n" + " {%- if value['description'] -%}\n" + " description:<|\"|>{{ value['description'] }}<|\"|>\n" + " {%- set add_comma = true -%}\n" + " {%- endif -%}\n" + " {%- if value['nullable'] %}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " nullable:true\n" + " {%- endif -%}\n" + " {%- if value['type'] | upper == 'STRING' -%}\n" + " {%- if value['enum'] -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " enum:{{ format_argument(value['enum']) }}\n" + " {%- endif -%}\n" + " {%- elif value['type'] | upper == 'OBJECT' -%}\n" + " ,properties:{\n" + " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" + " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" + " {%- elif value is mapping -%}\n" + " {{- format_parameters(value, value['required'] | default([])) -}}\n" + " {%- endif -%}\n" + " }\n" + " {%- if value['required'] -%}\n" + " ,required:[\n" + " {%- for item in value['required'] | 
default([]) -%}\n" + " <|\"|>{{- item -}}<|\"|>\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " ]\n" + " {%- endif -%}\n" + " {%- elif value['type'] | upper == 'ARRAY' -%}\n" + " {%- if value['items'] is mapping and value['items'] -%}\n" + " ,items:{\n" + " {%- set ns_items = namespace(found_first=false) -%}\n" + " {%- for item_key, item_value in value['items'] | dictsort -%}\n" + " {%- if item_value is not none -%}\n" + " {%- if ns_items.found_first %},{% endif -%}\n" + " {%- set ns_items.found_first = true -%}\n" + " {%- if item_key == 'properties' -%}\n" + " properties:{\n" + " {%- if item_value is mapping -%}\n" + " {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n" + " {%- endif -%}\n" + " }\n" + " {%- elif item_key == 'required' -%}\n" + " required:[\n" + " {%- for req_item in item_value -%}\n" + " <|\"|>{{- req_item -}}<|\"|>\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " ]\n" + " {%- elif item_key == 'type' -%}\n" + " {%- if item_value is string -%}\n" + " type:{{ format_argument(item_value | upper) }}\n" + " {%- else -%}\n" + " type:{{ format_argument(item_value | map('upper') | list) }}\n" + " {%- endif -%}\n" + " {%- else -%}\n" + " {{ item_key }}:{{ format_argument(item_value) }}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " }\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + "{%- endmacro -%}\n" + "{%- macro format_function_declaration(tool_data) -%}\n" + " declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n" + " {%- set params = tool_data['function']['parameters'] -%}\n" + " {%- if params -%}\n" + " ,parameters:{\n" + " {%- if params['properties'] -%}\n" + " properties:{ {{- 
format_parameters(params['properties'], params['required']) -}} },\n" + " {%- endif -%}\n" + " {%- if params['required'] -%}\n" + " required:[\n" + " {%- for item in params['required'] -%}\n" + " <|\"|>{{- item -}}<|\"|>\n" + " {{- ',' if not loop.last -}}\n" + " {%- endfor -%}\n" + " ],\n" + " {%- endif -%}\n" + " {%- if params['type'] -%}\n" + " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if 'response' in tool_data['function'] -%}\n" + " {%- set response_declaration = tool_data['function']['response'] -%}\n" + " ,response:{\n" + " {%- if response_declaration['description'] -%}\n" + " description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n" + " {%- endif -%}\n" + " {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n" + " type:<|\"|>{{- response_declaration['type'] | upper -}}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " }\n" + "{%- endmacro -%}\n" + "{%- macro format_argument(argument, escape_keys=True) -%}\n" + " {%- if argument is string -%}\n" + " {{- '<|\"|>' + argument + '<|\"|>' -}}\n" + " {%- elif argument is boolean -%}\n" + " {{- 'true' if argument else 'false' -}}\n" + " {%- elif argument is mapping -%}\n" + " {{- '{' -}}\n" + " {%- set ns = namespace(found_first=false) -%}\n" + " {%- for key, value in argument | dictsort -%}\n" + " {%- if ns.found_first %},{% endif -%}\n" + " {%- set ns.found_first = true -%}\n" + " {%- if escape_keys -%}\n" + " {{- '<|\"|>' + key + '<|\"|>' -}}\n" + " {%- else -%}\n" + " {{- key -}}\n" + " {%- endif -%}\n" + " :{{- format_argument(value, escape_keys=escape_keys) -}}\n" + " {%- endfor -%}\n" + " {{- '}' -}}\n" + " {%- elif argument is sequence -%}\n" + " {{- '[' -}}\n" + " {%- for item in argument -%}\n" + " {{- format_argument(item, escape_keys=escape_keys) -}}\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " {{- ']' -}}\n" + " {%- else -%}\n" + " {{- argument -}}\n" + " {%- endif -%}\n" + "{%- endmacro 
-%}\n" + "{%- macro strip_thinking(text) -%}\n" + " {%- set ns = namespace(result='') -%}\n" + " {%- for part in text.split('') -%}\n" + " {%- if '<|channel>' in part -%}\n" + " {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n" + " {%- else -%}\n" + " {%- set ns.result = ns.result + part -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- ns.result | trim -}}\n" + "{%- endmacro -%}\n" + "\n" + "{%- set ns = namespace(prev_message_type=None) -%}\n" + "{%- set loop_messages = messages -%}\n" + "{{ bos_token }}\n" + "{#- Handle System/Tool Definitions Block -#}\n" + "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" + " {{- '<|turn>system\\n' -}}\n" + "\n" + " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" + " {%- if enable_thinking is defined and enable_thinking -%}\n" + " {{- '<|think|>' -}}\n" + " {%- set ns.prev_message_type = 'think' -%}\n" + " {%- endif -%}\n" + "\n" + " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" + " {{- messages[0]['content'] | trim -}}\n" + " {%- set loop_messages = messages[1:] -%}\n" + " {%- endif -%}\n" + "\n" + " {%- if tools -%}\n" + " {%- for tool in tools %}\n" + " {{- '<|tool>' -}}\n" + " {{- format_function_declaration(tool) | trim -}}\n" + " {{- '' -}}\n" + " {%- endfor %}\n" + " {%- set ns.prev_message_type = 'tool' -%}\n" + " {%- endif -%}\n" + "\n" + " {{- '\\n' -}}\n" + "{%- endif %}\n" + "\n" + "{#- Loop through messages -#}\n" + "{%- for message in loop_messages -%}\n" + " {%- set ns.prev_message_type = None -%}\n" + " {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}\n" + " {{- '<|turn>' + role + '\\n' }}\n" + "\n" + " {%- if message['tool_calls'] -%}\n" + " {%- for tool_call in message['tool_calls'] -%}\n" + " {%- set function = tool_call['function'] -%}\n" + " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" + " {%- if function['arguments'] is 
mapping -%}\n" + " {%- set ns_args = namespace(found_first=false) -%}\n" + " {%- for key, value in function['arguments'] | dictsort -%}\n" + " {%- if ns_args.found_first %},{% endif -%}\n" + " {%- set ns_args.found_first = true -%}\n" + " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" + " {%- endfor -%}\n" + " {%- elif function['arguments'] is string -%}\n" + " {{- function['arguments'] -}}\n" + " {%- endif -%}\n" + " {{- '}' -}}\n" + " {%- endfor -%}\n" + " {%- set ns.prev_message_type = 'tool_call' -%}\n" + " {%- endif -%}\n" + "\n" + " {%- if message['tool_responses'] -%}\n" + " {#- Tool Response handling -#}\n" + " {%- for tool_response in message['tool_responses'] -%}\n" + " {{- '<|tool_response>' -}}\n" + " {%- if tool_response['response'] is mapping -%}\n" + " {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}\n" + " {%- for key, value in tool_response['response'] | dictsort -%}\n" + " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " {{- '}' -}}\n" + " {%- else -%}\n" + " {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}\n" + " {%- endif -%}\n" + " {{- '' -}}\n" + " {%- endfor -%}\n" + " {%- set ns.prev_message_type = 'tool_response' -%}\n" + " {%- endif -%}\n" + "\n" + " {%- if message['content'] is string -%}\n" + " {%- if role == 'model' -%}\n" + " {{- strip_thinking(message['content']) -}}\n" + " {%- else -%}\n" + " {{- message['content'] | trim -}}\n" + " {%- endif -%}\n" + " {%- elif message['content'] is sequence -%}\n" + " {%- for item in message['content'] -%}\n" + " {%- if item['type'] == 'text' -%}\n" + " {%- if role == 'model' -%}\n" + " {{- strip_thinking(item['text']) -}}\n" + " {%- else -%}\n" + " {{- item['text'] | trim -}}\n" + " {%- endif -%}\n" + " {%- elif item['type'] == 'image_url' -%}\n" + " {%- set url_val = 
item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" + " {{- '\\n\\n<|image|>' + url_val + '\\n\\n' -}}\n" + " {%- set ns.prev_message_type = 'image' -%}\n" + " {%- elif item['type'] == 'audio_url' -%}\n" + " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" + " {{- '\\n\\n<|audio|>' + audio_val + '\\n\\n' -}}\n" + " {%- set ns.prev_message_type = 'audio' -%}\n" + " {%- elif item['type'] == 'input_audio' -%}\n" + " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" + " {{- '\\n\\n<|audio|>' + audio_val + '\\n\\n' -}}\n" + " {%- set ns.prev_message_type = 'audio' -%}\n" + # " {%- elif item['type'] == 'video_url' -%}\n" + # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" + # " {{- '\\n\\n<|video|>' + video_val + '\\n\\n' -}}\n" + # " {%- set ns.prev_message_type = 'video' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "\n" + " {%- if not (message['tool_responses'] and not message['content']) -%}\n" + " {{- '\\n' -}}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- if add_generation_prompt -%}\n" + " {%- if ns.prev_message_type != 'tool_response' -%}\n" + " {{- '<|turn>model\\n' -}}\n" + " {%- endif -%}\n" + " {%- if not enable_thinking | default(false) -%}\n" + " {{- '<|channel>thought\\n' -}}\n" + " {%- endif -%}\n" + "{%- endif -%}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the Gemma 4 Handler. + + Args: + enable_thinking (bool): Controls whether the <|think|> tag is injected and + manages <|channel>thought behavior. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject the thinking variable into the Jinja environment + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Set the stop token based on Gemma 4's format () + kwargs['stop'] = [self.GEMMA4_EOT_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + class GLM41VChatHandler(MTMDChatHandler): # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. From 10a6abd1e6afb0a64dc707b74b563ab2d36dd578 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 4 Apr 2026 11:23:22 +0800 Subject: [PATCH 300/518] fix(Qwen35ChatHandler): Correct CHAT_FORMAT `tool_call` typo Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 915e713277..8a44c35296 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5338,7 +5338,7 @@ class Qwen35ChatHandler(MTMDChatHandler): " {{- '\n\n\n' -}}" " {%- endif -%}" " {%- if tool_call.arguments is defined -%}" - " {%- for (args_name, args_value) in tool_calls.arguments | items -%}" + " {%- for (args_name, args_value) in tool_call.arguments | items -%}" " {{- '\n' -}}" " {%- set args_value = args_value | tojson | safe if args_value is mapping or args_value is sequence and args_value is not string else args_value | string -%}" " {{- args_value -}}" From 73c3b06ba9dd2e2c82622c50df143670c194ba19 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 4 Apr 2026 17:39:38 +0800 Subject: [PATCH 301/518] Update Submodule vendor/llama.cpp 277ff5f..b7ad48e --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 277ff5fff7..b7ad48ebda 160000 --- a/vendor/llama.cpp 
+++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 277ff5fff79d49cc3d2292ddf410ca95dd51c3a9 +Subproject commit b7ad48ebda2287c778fd826606d7b3b3570f60ab From 6e99244026996d48020f165d72f0431b9881dc44 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 4 Apr 2026 18:07:59 +0800 Subject: [PATCH 302/518] Update llama_types.py OpenAI OpenAPI Link --- llama_cpp/llama_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index f647822ff5..2a7681cf5d 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -3,7 +3,7 @@ NOTE: These types may change to match the OpenAI OpenAPI specification. Based on the OpenAI OpenAPI specification: -https://github.com/openai/openai-openapi/blob/master/openapi.yaml +https://app.stainless.com/api/spec/documented/openai/openapi.documented.yml """ From 7bd7175894956637b0df6993a880a594aadbd3d5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 4 Apr 2026 19:10:43 +0800 Subject: [PATCH 303/518] docs: clarify enable_thinking compatibility for Gemma 4 models - Update `Gemma4ChatHandler` class docstring and `__init__` args documentation. - Specify that the `enable_thinking` toggle is exclusively supported by Gemma4 31B and 26BA4B variants. - Explicitly note that E2B and E4B models do not currently support this feature to prevent configuration errors. Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 8a44c35296..dd011487d1 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4338,6 +4338,10 @@ class Gemma3ChatHandler(MTMDChatHandler): class Gemma4ChatHandler(MTMDChatHandler): """ Handler for Gemma 4 models. + + Note on `enable_thinking`: + The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. + It is NOT supported by Gemma4 E2B and E4B models. 
""" # The special token in Gemma 4 @@ -4641,6 +4645,8 @@ def __init__(self, enable_thinking: bool = True, **kwargs): Args: enable_thinking (bool): Controls whether the <|think|> tag is injected and manages <|channel>thought behavior. + Note: ONLY supported on Gemma4 31B and 26BA4B models. + NOT supported on Gemma4 E2B and E4B models. """ self.enable_thinking = enable_thinking super().__init__(**kwargs) From a1a0c9470d5721d435b175a762f77e99e34c6740 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 5 Apr 2026 07:51:36 +0800 Subject: [PATCH 304/518] Update Submodule vendor/llama.cpp b7ad48e..b863507 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b7ad48ebda..b8635075ff 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b7ad48ebda2287c778fd826606d7b3b3570f60ab +Subproject commit b8635075ffe27b135c49afb9a8b5c434bd42c502 From 1dc235f1d86296d8b00997b9efd8abf575b65a1c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 5 Apr 2026 08:59:12 +0800 Subject: [PATCH 305/518] feat(types): align with latest OpenAI OpenAPI spec (audio, structured outputs) - Add `developer` role. - Replace Anyscale-specific JSON schema with official OpenAI `json_schema` response format for Structured Outputs. - Add `input_audio` and `file` types to request message content parts. - Add `audio`, `refusal`, and `annotations` (e.g., URL citations) fields to response messages. - Add `content_filter` to finish reasons and strictly define global `ChatCompletionRole`. 
Signed-off-by: JamePeng --- llama_cpp/llama_types.py | 94 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 85 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 2a7681cf5d..336cddab0f 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -46,7 +46,7 @@ class CompletionChoice(TypedDict): text: str index: int logprobs: Optional[CompletionLogprobs] - finish_reason: Optional[Literal["stop", "length"]] + finish_reason: Optional[Literal["stop", "length", "content_filter"]] class CompletionUsage(TypedDict): @@ -61,6 +61,7 @@ class CreateCompletionResponse(TypedDict): created: int model: str choices: List[CompletionChoice] + object: Optional[Literal["text_completion"]] usage: NotRequired[CompletionUsage] @@ -69,11 +70,37 @@ class ChatCompletionResponseFunctionCall(TypedDict): arguments: str +class ChatCompletionResponseMessageFunctionCall(TypedDict): + arguments: str + name: str + + +class ChatCompletionResponseMessageAudio(TypedDict): + id: str + expires_at: int + data: str + transcript: str + + +class ChatCompletionResponseMessageAnnotationURLCitation(TypedDict): + end_index: int + start_index: int + url: str + title: str + +class ChatCompletionResponseMessageAnnotation(TypedDict): + type: Literal["url_citation"] + url_citation: ChatCompletionResponseMessageAnnotationURLCitation + + class ChatCompletionResponseMessage(TypedDict): content: Optional[str] + refusal: Optional[str] + role: Literal["assistant"] tool_calls: NotRequired["ChatCompletionMessageToolCalls"] - role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here - function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED + annotations: NotRequired[List[ChatCompletionResponseMessageAnnotation]] + function_call: NotRequired[ChatCompletionResponseMessageFunctionCall] # DEPRECATED + audio: NotRequired[Optional[ChatCompletionResponseMessageAudio]] class ChatCompletionFunction(TypedDict): @@ -137,13 
+164,23 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict): arguments: str +ChatCompletionRole = Literal[ + "developer", + "system", + "user", + "assistant", + "tool", + "function" +] + + class ChatCompletionStreamResponseDelta(TypedDict): content: NotRequired[Optional[str]] function_call: NotRequired[ Optional[ChatCompletionStreamResponseDeltaFunctionCall] ] # DEPRECATED tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]] - role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]] + role: NotRequired[Optional[Literal["developer", "system", "user", "assistant", "tool"]]] class ChatCompletionStreamResponseChoice(TypedDict): @@ -151,7 +188,7 @@ class ChatCompletionStreamResponseChoice(TypedDict): delta: Union[ ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty ] - finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] + finish_reason: Optional[Literal["stop", "length", "tool_calls", "content_filter", "function_call"]] logprobs: NotRequired[Optional[ChatCompletionLogprobs]] @@ -173,11 +210,16 @@ class ChatCompletionFunctionCallOption(TypedDict): name: str +class ChatCompletionResponseFormatJSONSchema(TypedDict): + name: str + description: NotRequired[str] + schema: NotRequired[Dict[str, Any]] + strict: NotRequired[Optional[bool]] + + class ChatCompletionRequestResponseFormat(TypedDict): - type: Literal["text", "json_object"] - schema: NotRequired[ - JsonType - ] # https://docs.endpoints.anyscale.com/guides/json_mode/ + type: Literal["text", "json_object", "json_schema"] + json_schema: NotRequired[ChatCompletionResponseFormatJSONSchema] class ChatCompletionRequestMessageContentPartText(TypedDict): @@ -195,12 +237,45 @@ class ChatCompletionRequestMessageContentPartImage(TypedDict): image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl] +class ChatCompletionRequestMessageContentPartInputAudioData(TypedDict): + data: str + format: Literal["wav", 
"mp3"] + + +class ChatCompletionRequestMessageContentPartAudio(TypedDict): + type: Literal["input_audio"] + input_audio: ChatCompletionRequestMessageContentPartInputAudioData + + +class ChatCompletionRequestMessageContentPartFileData(TypedDict): + filename: NotRequired[str] + file_data: NotRequired[str] + file_id: NotRequired[str] + + +class ChatCompletionRequestMessageContentPartFile(TypedDict): + type: Literal["file"] + file: ChatCompletionRequestMessageContentPartFileData + + +class ChatCompletionRequestMessageContentPartRefusal(TypedDict): + type: Literal["refusal"] + refusal: str + + ChatCompletionRequestMessageContentPart = Union[ ChatCompletionRequestMessageContentPartText, ChatCompletionRequestMessageContentPartImage, + ChatCompletionRequestMessageContentPartAudio, + ChatCompletionRequestMessageContentPartFile, ] +class ChatCompletionRequestDeveloperMessage(TypedDict): + role: Literal["developer"] + content: Optional[str] + + class ChatCompletionRequestSystemMessage(TypedDict): role: Literal["system"] content: Optional[str] @@ -252,6 +327,7 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionRequestMessage = Union[ + ChatCompletionRequestDeveloperMessage, ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage, ChatCompletionRequestAssistantMessage, From 17366d307b1ba85de5698d178a968b3d392d49e4 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Sun, 5 Apr 2026 08:59:33 +0200 Subject: [PATCH 306/518] updated code --- llama_cpp/llama_chat_format.py | 138 +++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 59 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index de164159b2..a020e079be 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5414,70 +5414,90 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) class LFM25VLChatHandler(MTMDChatHandler): + # Aligned with LFM2.5-VL tokenizer_config + LFM25VL_BOS_TOKEN = "<|startoftext|>" + 
LFM25VL_EOS_TOKEN = "<|im_end|>" + LFM25VL_PAD_TOKEN = "<|pad|>" + + # Image specific tokens + LFM25VL_IMAGE_TOKEN = "" + LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" + LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" + LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" + CHAT_FORMAT = ( - "{{- bos_token -}}" - "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}" - "{%- set ns = namespace(system_prompt='', last_assistant_index=-1) -%}" - "{%- if messages[0]['role'] == 'system' -%}" - "{%- if messages[0]['content'] is string -%}" - "{%- set ns.system_prompt = messages[0]['content'] -%}" - "{%- else -%}" - "{%- for item in sys_content -%}" - "{%- if item['type'] == 'text' or 'text' in item -%}" - "{%- set ns.system_prompt = ns.system_prompt + item['text'] -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- set messages = messages[1:] -%}" - "{%- endif -%}" - "{%- if tools -%}" - "{%- set ns.system_prompt = ns.system_prompt ~ ('\n' if ns.system_prompt else '') ~ 'List of tools: [' -%}" - "{%- for tool in tools -%}" - "{%- set tool = (tool | tojson) if tool is not string else tool -%}" - "{%- set ns.system_prompt = ns.system_prompt ~ tool ~ (', ' if not loop.last else '') -%}" - "{%- endfor -%}" - "{%- set ns.system_prompt = ns.system_prompt ~ ']' -%}" - "{%- endif -%}" - "{{- ('<|im_start|>system\n' ~ ns.system_prompt ~ '<|im_end|>\n') if ns.system_prompt else '' -}}" - "{%- for message in messages -%}" - "{%- if message['role'] == 'assistant' -%}" - "{%- set ns.last_assistant_index = loop.index0 -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- for message in messages -%}" - "{{- '<|im_start|>' ~ message['role'] ~ '\n' -}}" - "{%- if message['content'] is string -%}" - "{%- set content = message['content'] -%}" - "{%- else -%}" - "{%- set content = '' -%}" - "{%- for item in message['content'] -%}" - "{%- if item['type'] in ['image', 'image_url'] and item['type'] in item -%}" - "{%- set content = content ~ (item[item['type']] if 
item[item['type']] is string else item[item['type']]['url']) -%}" - "{%- elif item['type'] == 'text' and 'text' in item -%}" - "{%- set content = content ~ item['text'] -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index and '</think>' in content -%}" - "{%- set content = content.split('</think>')[-1] | trim -%}" - "{%- endif -%}" - "{{- content ~ '<|im_end|>\n' -}}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|im_start|>assistant\n' -}}" - "{%- if not enable_thinking -%}" - "{{- '\n\n\n' -}}" - "{%- endif -%}" - "{%- endif -%}" - ) # Variables: keep_past_thinking, enable_thinking + "{{- bos_token -}}\n" + "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" + "{%- set ns = namespace(system_prompt='', content='') -%}\n" + "{%- if messages[0]['role'] == 'system' -%}\n" + " {%- set ns.system_prompt = messages[0]['content'] -%}\n" + " {%- set messages = messages[1:] -%}\n" + "{%- endif -%}\n" + "{%- if tools -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" + " {%- for tool in tools -%}\n" + " {%- if tool is not string -%}\n" + " {%- set tool = tool | tojson -%}\n" + " {%- endif -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" + " {%- if not loop.last -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" + "{%- endif -%}\n" + "{%- if ns.system_prompt -%}\n" + " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" + "{%- endif -%}\n" + "{%- set ns.last_assistant_index = -1 -%}\n" + "{%- for message in messages -%}\n" + " {%- if message['role'] == 'assistant' -%}\n" + " {%- set ns.last_assistant_index = loop.index0 -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "{%- for message in messages 
-%}\n" + "
{{- '<|im_start|>' + message['role'] + '\\n' -}}\n" + " {%- set content = message['content'] -%}\n" + " {%- if content is not string -%}\n" + " {%- set ns.content = '' -%}\n" + " {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" + " {%- for item in content -%}\n" + " {%- if item['type'] == 'image_url' -%}\n" + " {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" + " {%- set ns.content = ns.content + img_val -%}\n" + " {%- elif item['type'] == 'text' -%}\n" + " {%- set ns.content = ns.content + item['text'] -%}\n" + " {%- else -%}\n" + " {%- set ns.content = ns.content + (item | tojson) -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- set content = ns.content -%}\n" + " {%- endif -%}\n" + " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" + " {%- if '</think>' in content -%}\n" + " {%- set content = content.split('</think>')[-1] | trim -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {{- content + '<|im_end|>\\n' -}}\n" + "{%- endfor -%}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n' -}}\n" + "{%- endif -%}\n" + ) - def __init__(self, enable_thinking: bool = True, keep_past_thinking: bool = False, **kwargs): + def __init__(self, keep_past_thinking: bool = False, **kwargs): + self.keep_past_thinking = keep_past_thinking super().__init__(**kwargs) - self.extra_template_arguments["enable_thinking"] = enable_thinking - self.extra_template_arguments["keep_past_thinking"] = keep_past_thinking + def __call__(self, **kwargs): - super().__call__(**kwargs) + self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking + + kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") + return super().__call__(**kwargs) @register_chat_completion_handler("chatml-function-calling") def
chatml_function_calling( From 00e67b8f527b3c5bc63131414b300408f78700a1 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Mon, 6 Apr 2026 03:18:49 +0200 Subject: [PATCH 307/518] prevent errors by setting 'image_min_tokens' to 256 if higher values are detected --- llama_cpp/llama_chat_format.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a020e079be..91e3ad699a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5491,6 +5491,12 @@ def __init__(self, keep_past_thinking: bool = False, **kwargs): def __call__(self, **kwargs): + if self.image_min_tokens > 256: + if self.verbose: + print(f"For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. Setting to **256**.") + + self.image_min_tokens = 256 + self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] From ccc2af1dbe0daa81c58a4c82bc16bd6fc842b752 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 6 Apr 2026 09:25:11 +0800 Subject: [PATCH 308/518] Update Submodule vendor/llama.cpp b863507..58190cc --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b8635075ff..58190cc84d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b8635075ffe27b135c49afb9a8b5c434bd42c502 +Subproject commit 58190cc84d846d8575ba26e8486bc29d9fd8ad55 From d7478dec2eeea7b116139e87bb9156feb6f3520a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 6 Apr 2026 10:01:15 +0800 Subject: [PATCH 309/518] Relocate LFM25VLChatHandler, add comment details, and update README.md --- README.md | 1 + llama_cpp/llama_chat_format.py | 188 +++++++++++++++++---------------- 2 files changed, 98 insertions(+), 91 deletions(-) diff --git a/README.md b/README.md index b21e379926..a30040d318 100644 --- a/README.md +++ b/README.md @@ -740,6 +740,7 @@ Below are the 
supported multi-modal models and their respective chat handlers (P | [glm4.6v](https://huggingface.co/unsloth/GLM-4.6V-Flash-GGUF) | `GLM46VChatHandler` | `glm4.6v` | | [granite-docling](https://huggingface.co/ibm-granite/granite-docling-258M-GGUF) | `GraniteDoclingChatHandler` | `granite-docling` | | [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | +| [lfm2.5-vl](https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF) | `LFM25VLChatHandler` | `lfm2.5-vl` | | [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 91e3ad699a..b18bb2b0a8 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4944,6 +4944,103 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class LFM25VLChatHandler(MTMDChatHandler): + """ + Handler for LFM2.5-VL multimodal models. + + Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing. 
+ """ + # Aligned with LFM2.5-VL tokenizer_config + LFM25VL_BOS_TOKEN = "<|startoftext|>" + LFM25VL_EOS_TOKEN = "<|im_end|>" + LFM25VL_PAD_TOKEN = "<|pad|>" + + # Image specific tokens + LFM25VL_IMAGE_TOKEN = "" + LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" + LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" + LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" + + CHAT_FORMAT = ( + "{{- bos_token -}}\n" + "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" + "{%- set ns = namespace(system_prompt='', content='') -%}\n" + "{%- if messages[0]['role'] == 'system' -%}\n" + " {%- set ns.system_prompt = messages[0]['content'] -%}\n" + " {%- set messages = messages[1:] -%}\n" + "{%- endif -%}\n" + "{%- if tools -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" + " {%- for tool in tools -%}\n" + " {%- if tool is not string -%}\n" + " {%- set tool = tool | tojson -%}\n" + " {%- endif -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" + " {%- if not loop.last -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" + "{%- endif -%}\n" + "{%- if ns.system_prompt -%}\n" + " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" + "{%- endif -%}\n" + "{%- set ns.last_assistant_index = -1 -%}\n" + "{%- for message in messages -%}\n" + " {%- if message['role'] == 'assistant' -%}\n" + " {%- set ns.last_assistant_index = loop.index0 -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "{%- for message in messages -%}\n" + " {{- '<|im_start|>' + message['role'] + '\\n' -}}\n" + " {%- set content = message['content'] -%}\n" + " {%- if content is not string -%}\n" + " {%- set ns.content = '' -%}\n" + " {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" + " {%- for item in content -%}\n" + " {%- if item['type'] == 'image_url' -%}\n" + " 
{%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" + " {%- set ns.content = ns.content + img_val -%}\n" + " {%- elif item['type'] == 'text' -%}\n" + " {%- set ns.content = ns.content + item['text'] -%}\n" + " {%- else -%}\n" + " {%- set ns.content = ns.content + (item | tojson) -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- set content = ns.content -%}\n" + " {%- endif -%}\n" + " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" + " {%- if '</think>' in content -%}\n" + " {%- set content = content.split('</think>')[-1] | trim -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {{- content + '<|im_end|>\\n' -}}\n" + "{%- endfor -%}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n' -}}\n" + "{%- endif -%}\n" + ) + + def __init__(self, keep_past_thinking: bool = False, **kwargs): + self.keep_past_thinking = keep_past_thinking + super().__init__(**kwargs) + + + def __call__(self, **kwargs): + if self.image_min_tokens > 256: + if self.verbose: + print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. Please reset it to between 64 and 256.") + self.image_min_tokens = -1 + + self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking + + kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") + return super().__call__(**kwargs) + + class PaddleOCRChatHandler(MTMDChatHandler): """ Handler for PaddleOCR 1.5 multimodal models.
@@ -5413,97 +5510,6 @@ def __call__(self, **kwargs): # Use parent implementation return super().__call__(**kwargs) -class LFM25VLChatHandler(MTMDChatHandler): - # Aligned with LFM2.5-VL tokenizer_config - LFM25VL_BOS_TOKEN = "<|startoftext|>" - LFM25VL_EOS_TOKEN = "<|im_end|>" - LFM25VL_PAD_TOKEN = "<|pad|>" - - # Image specific tokens - LFM25VL_IMAGE_TOKEN = "" - LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" - LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" - - CHAT_FORMAT = ( - "{{- bos_token -}}\n" - "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" - "{%- set ns = namespace(system_prompt='', content='') -%}\n" - "{%- if messages[0]['role'] == 'system' -%}\n" - " {%- set ns.system_prompt = messages[0]['content'] -%}\n" - " {%- set messages = messages[1:] -%}\n" - "{%- endif -%}\n" - "{%- if tools -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" - " {%- for tool in tools -%}\n" - " {%- if tool is not string -%}\n" - " {%- set tool = tool | tojson -%}\n" - " {%- endif -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" - " {%- if not loop.last -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" - "{%- endif -%}\n" - "{%- if ns.system_prompt -%}\n" - " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" - "{%- endif -%}\n" - "{%- set ns.last_assistant_index = -1 -%}\n" - "{%- for message in messages -%}\n" - " {%- if message['role'] == 'assistant' -%}\n" - " {%- set ns.last_assistant_index = loop.index0 -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- for message in messages -%}\n" - " {{- '<|im_start|>' + message['role'] + '\\n' -}}\n" - " {%- set content = message['content'] -%}\n" - " {%- if content is not string -%}\n" - " {%- set ns.content = '' -%}\n" - " {#- 
MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" - " {%- for item in content -%}\n" - " {%- if item['type'] == 'image_url' -%}\n" - " {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {%- set ns.content = ns.content + img_val -%}\n" - " {%- elif item['type'] == 'text' -%}\n" - " {%- set ns.content = ns.content + item['text'] -%}\n" - " {%- else -%}\n" - " {%- set ns.content = ns.content + (item | tojson) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set content = ns.content -%}\n" - " {%- endif -%}\n" - " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" - " {%- if '</think>' in content -%}\n" - " {%- set content = content.split('</think>')[-1] | trim -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {{- content + '<|im_end|>\\n' -}}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, keep_past_thinking: bool = False, **kwargs): - self.keep_past_thinking = keep_past_thinking - super().__init__(**kwargs) - - - def __call__(self, **kwargs): - if self.image_min_tokens > 256: - if self.verbose: - print(f"For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors.
Setting to **256**.") - - self.image_min_tokens = 256 - - self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking - - kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") - return super().__call__(**kwargs) @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( From d4fef6a268d7a489e2bcaa587909dfe664a0e16c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 6 Apr 2026 10:20:59 +0800 Subject: [PATCH 310/518] fix: expand stop sequences for Gemma4ChatHandler - Add `GEMMA4_EOS_TOKEN` and `GEMMA4_STR_TOKEN` to the generation stop criteria. - Align the stopping logic with the model's `generation_config.json` definitions. - Prevent potential over-generation by ensuring the model halts correctly at standard EOS or when initiating a tool response. Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index b18bb2b0a8..3c426b38f5 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4656,7 +4656,8 @@ def __call__(self, **kwargs): self.extra_template_arguments["enable_thinking"] = self.enable_thinking # Set the stop token based on Gemma 4's format () - kwargs['stop'] = [self.GEMMA4_EOT_TOKEN] + # generation_config.json: "eos_token_id": [ 1, 106, 50] + kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] if self.verbose: print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") From 232092e32b3563159a86aacb168da06c4937192b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 6 Apr 2026 10:59:07 +0800 Subject: [PATCH 311/518] Update README.md --- README.md | 76 +++++++++++++++++++------------------------------------ 1 file changed, 26 insertions(+), 50 deletions(-) diff --git a/README.md 
b/README.md index a30040d318..e316972b78 100644 --- a/README.md +++ b/README.md @@ -727,7 +727,6 @@ Below are the supported multi-modal models and their respective chat handlers (P | Model | `LlamaChatHandler` | `chat_format` | |:--- |:--- |:--- | | [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) | `Llava15ChatHandler` | `llava-1-5` | -| [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) | `Llava15ChatHandler` | `llava-1-5` | | [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) | `Llava16ChatHandler` | `llava-1-6` | | [moondream2](https://huggingface.co/vikhyatk/moondream2) | `MoondreamChatHandler` | `moondream2` | | [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` | @@ -751,12 +750,16 @@ Then you'll need to use a custom chat handler to load the clip model and process ```python from llama_cpp import Llama from llama_cpp.llama_chat_format import Llava15ChatHandler -chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin") + +model_path="path/to/llava/ggml-model-f16.gguf" +mmproj_path="path/to/llava/mmproj-model-f16.gguf" + llm = Llama( - model_path="./path/to/llava/llama-model.gguf", - chat_handler=chat_handler, - n_ctx=2048, # n_ctx should be increased to accommodate the image embedding + model_path=model_path, + chat_handler=Llava15ChatHandler(clip_model_path=mmproj_path), + n_ctx=2048, ) + llm.create_chat_completion( messages = [ {"role": "system", "content": "You are an assistant who perfectly describes images."}, @@ -806,38 +809,6 @@ print(response["choices"][0]["text"]) **Note**: Multi-modal models also support tool calling and JSON mode. -<details>
-<summary>Loading a Local Image</summary> - -Images can be passed as base64 encoded data URIs. The following example demonstrates how to do this. - -```python -import base64 - -def image_to_base64_data_uri(file_path): - with open(file_path, "rb") as img_file: - base64_data = base64.b64encode(img_file.read()).decode('utf-8') - return f"data:image/png;base64,{base64_data}" - -# Replace 'file_path.png' with the actual path to your PNG file -file_path = 'file_path.png' -data_uri = image_to_base64_data_uri(file_path) - -messages = [ - {"role": "system", "content": "You are an assistant who perfectly describes images."}, - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": data_uri }}, - {"type" : "text", "text": "Describe this image in detail please."} - ] - } -] - -``` - -</details>
- ## Loading a Local Image With Qwen3VL(Thinking/Instruct) This script demonstrates how to load a local image, encode it as a base64 Data URI, and pass it to a local Qwen3-VL model (with the 'force_reasoning' parameter enabled for thinking model, disabled for instruct model) for processing using the llama-cpp-python library. @@ -861,8 +832,8 @@ llm = Llama( # Set up the chat handler for Qwen3-VL, specifying the projector path chat_handler=Qwen3VLChatHandler( clip_model_path=MMPROJ_PATH, - force_reasoning=True, - image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks + force_reasoning=True, # Note: Some models use `enable_thinking` as a switch variable. See the comments in the corresponding model's chathandler for details. + image_min_tokens=1024, # Note: Qwen3-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks ), n_gpu_layers=-1, # Offload all layers to the GPU n_ctx=10240, # Set the context window size @@ -1165,9 +1136,13 @@ The context window of the Llama models determines the maximum number of tokens t For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object: ```python -llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048) +llm = Llama(model_path="./models/llama-model.gguf", n_ctx=2048) ``` +## Docker image + +See here: https://github.com/JamePeng/llama-cpp-python/tree/main/docker#cuda_simple + ## OpenAI Compatible Web Server (Deprecated) `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. 
@@ -1214,16 +1189,6 @@ python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --m - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models) - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support) -## Docker image - -A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: - -```bash -docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/llama-model.gguf ghcr.io/abetlen/llama-cpp-python:latest -``` - -[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389) - ## Low-level API [API Reference](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#low-level-api) @@ -1363,6 +1328,17 @@ The reason libraries from other authors are smaller is that they often **only co * 2. Regarding AMD and Intel graphics cards, AMD can certainly use ROCm as the primary backend (but the drawback is that it's basically only stable on Linux platforms), and Intel's Sycl will also encounter some compilation difficulties. I consistently recommend using the Vulkan backend for these two types of graphics cards for greater efficiency and stability, because the upstream `llama.cpp` Vulkan backend is actively maintained by many developers, generally allowing you to enjoy new feature optimizations and bug fixes earlier and faster. +* 3. 
If you are using hybrid multimodal model for building ComfyUI nodes or running single-turn API wrappers where you do not need multi-turn state rollbacks, simply initialize your Llama instance with `ctx_checkpoints=0`: + + ```python + llm = Llama( + model_path="./Qwen3.5-VL-9B.gguf", + chat_handler=MTMDChatHandler(clip_model_path="./mmproj.gguf"), + n_ctx=4096, + ctx_checkpoints=0 # <-- SET THIS TO 0 TO ENABLE ZERO-LATENCY FAST PATH + ) + ``` + ### Any suggestions, contributions, and modifications to this package will be directed toward building a user-friendly, efficient, and secure Python library. From e1ade17c6330e3cc46a2b08f9b48b1540521b231 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 6 Apr 2026 12:07:58 +0800 Subject: [PATCH 312/518] Bump version to 0.3.35 Signed-off-by: JamePeng --- CHANGELOG.md | 48 +++++++++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38b4b90b50..156cbd334e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,54 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.35] Gemma 4 series & LFM 2.5-VL Support, OpenAI OpenAPI Alignment and Logging Architecture Migration + +- fix: expand stop sequences for `Gemma4ChatHandler` + - Add `GEMMA4_EOS_TOKEN` and `GEMMA4_STR_TOKEN` to the generation stop criteria. + - Align the stopping logic with the model's `generation_config.json` definitions. + - Prevent potential over-generation by ensuring the model halts correctly at standard EOS or when initiating a tool response. + +- feat(types): align with latest OpenAI OpenAPI spec (audio, structured outputs) + - Update llama_types.py OpenAI [OpenAPI Link](https://app.stainless.com/api/spec/documented/openai/openapi.documented.yml) + - Add `developer` role. + - Replace Anyscale-specific JSON schema with official OpenAI `json_schema` response format for Structured Outputs. 
+ - Add `input_audio` and `file` types to request message content parts. + - Add `audio`, `refusal`, and `annotations` (e.g., URL citations) fields to response messages. + - Add `content_filter` to finish reasons and strictly define global `ChatCompletionRole`. + +- docs: clarify `enable_thinking` compatibility for **Gemma 4** models + - Update `Gemma4ChatHandler` class docstring and `__init__` args documentation. + - Specify that the `enable_thinking` toggle is exclusively supported by Gemma4 31B and 26BA4B variants. + - Explicitly note that E2B and E4B models do not currently support this feature to prevent configuration errors. + +- feat(chat_format): Implemented `Gemma4ChatHandler`, add Gemma 4 chat handler with multimodal and tool support + - Implement `Gemma4ChatHandler` with Gemma 4 specific tokens (`<|turn>`, `<|channel>`, etc.). + - Add complex Jinja2 template for advanced nested tool/function schema formatting. + - Support multimodal content injection for `image_url`, `audio_url`, and `input_audio` (including base64 reconstruction). + - Integrate reasoning/thinking controls via `enable_thinking` toggle and `<|channel>thought` formatting. + - Configure `` as the primary stop sequence for generation boundaries. + +- feat(chat_format) Implemented `LFM25VLChatHandler` for **LFM2.5-VL** (by **@alcoftTAO**) + +- fix Qwen3.5 chat template typos(reported by **@abdullah-cod9**) + +- refactor(logger): migrate from llama_log_callback to ggml_log_callback + - Remove the deprecated `llama_log_callback` typedef from `llama_cpp.py`. + - Update `_logger.py` to use `ggml_log_callback` from `_ggml`, aligning with the upstream GGML logging architecture. + - Rename the callback references across the codebase, including the MTMD context initialization in `llama_chat_format.py`. + +- feat(ggml): add support for ggml-base library and new function bindings + - Load the new `ggml-base` shared library alongside `ggml`. 
+ - Add `ctypes` bindings for `ggml_log_get`, `ggml_log_set`, and `ggml_set_zero` using the `ggml_base_function` decorator. + +- Update README.md + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/58190cc84d846d8575ba26e8486bc29d9fd8ad55](https://github.com/ggml-org/llama.cpp/commit/58190cc84d846d8575ba26e8486bc29d9fd8ad55) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260402 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/a184583e908cc138fd15794986b3581521fb9b0c...232092e32b3563159a86aacb168da06c4937192b + ## [0.3.34] Dynamic LoRA Routing, Control Vectors, and Assistant Prefill - **feat(chat_format): added assistant_prefill to seamlessly continue responses** diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 5a0a40d108..fb263e7825 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.34" +__version__ = "0.3.35" From 35366e445149fdc1f67070e2bfe40c51cc686d79 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 6 Apr 2026 22:41:05 +0800 Subject: [PATCH 313/518] Separating the GitHub action cu124-126 workflows into independent cu124 and cu126 tasks - Because a single task takes too long(5-6 hours), one of them might fail simultaneously. Recompiling would then require both tasks to run again, consuming even more time. 
--- ...126-win.yml => build-wheels-cu124-win.yml} | 4 +- .github/workflows/build-wheels-cu126-win.yml | 134 ++++++++++++++++++ 2 files changed, 136 insertions(+), 2 deletions(-) rename .github/workflows/{build-wheels-cu124-cu126-win.yml => build-wheels-cu124-win.yml} (98%) create mode 100644 .github/workflows/build-wheels-cu126-win.yml diff --git a/.github/workflows/build-wheels-cu124-cu126-win.yml b/.github/workflows/build-wheels-cu124-win.yml similarity index 98% rename from .github/workflows/build-wheels-cu124-cu126-win.yml rename to .github/workflows/build-wheels-cu124-win.yml index 6513526cde..c6800e246a 100644 --- a/.github/workflows/build-wheels-cu124-cu126-win.yml +++ b/.github/workflows/build-wheels-cu124-win.yml @@ -1,4 +1,4 @@ -name: Build Wheels (CU124-126) for Windows +name: Build Wheels (CU124) for Windows on: workflow_dispatch: @@ -14,7 +14,7 @@ jobs: matrix: os: ['windows-2022'] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] - cuda: ["12.4.1","12.6.3"] + cuda: ["12.4.1"] releasetag: ["Basic"] cudaarch: ["all"] defaults: diff --git a/.github/workflows/build-wheels-cu126-win.yml b/.github/workflows/build-wheels-cu126-win.yml new file mode 100644 index 0000000000..eec32f6f0d --- /dev/null +++ b/.github/workflows/build-wheels-cu126-win.yml @@ -0,0 +1,134 @@ +name: Build Wheels (CU126) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['windows-2022'] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] + cuda: ["12.6.3"] + releasetag: ["Basic"] + cudaarch: ["all"] + defaults: + run: + shell: pwsh + env: + CUDAVER: ${{ matrix.cuda }} + AVXVER: ${{ matrix.releasetag }} + CUDAARCHVER: ${{ matrix.cudaarch }} + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list + 
# e.g. "all" "89" "90" "100" "120" + MAX_JOBS: 8 + + steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v2 + with: + msbuild-architecture: x64 + + - uses: actions/checkout@v5 + with: + submodules: "recursive" + + # from kingbri1/flash-attention build-wheels.yml + - name: Install CUDA ${{ matrix.cuda }} + uses: N-Storm/cuda-toolkit@v0.2.28 + id: cuda-toolkit + with: + cuda: "${{ matrix.cuda }}" + use-github-cache: false + + # from astral-sh/setup-uv + - name: Install the latest version of uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install Dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Build Wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" + + if ($env:AVXVER -eq 'AVX') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' + } + if ($env:AVXVER -eq 'AVX2') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' + } + if ($env:AVXVER -eq 'AVXVNNI') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' + } + # if ($env:AVXVER -eq 'AVX512') { + # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' + # } + # Basic options for compiling without AVX instructions + if ($env:AVXVER -eq 'Basic') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS 
+ ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + } + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 + + # Split wheel filename: name-ver-py-abi-plat.whl + $parts = $wheelFile.Name.Split('-') + $distName = $parts[0] + $version = $parts[1] + $pyTag = $parts[2] + $abiTag = $parts[3] + $platTag = $parts[4] + + $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" + + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + + # write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV + + - name: Get Current Date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create Release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v2 + with: + files: dist/* + # Set tag_name to -cu--win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 0e19b5e9d52ace58c4a82e7954b4185e72f86a69 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 8 Apr 2026 00:35:22 +0800 Subject: [PATCH 314/518] Update Submodule vendor/llama.cpp 58190cc..69c28f1 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 58190cc84d..69c28f1547 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 58190cc84d846d8575ba26e8486bc29d9fd8ad55 
+Subproject commit 69c28f1547c169902f62ca48bee75fb876c4d8e6 From c3d6fdecb80e1a8389fa30acaa809496253c2ec9 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 8 Apr 2026 00:37:06 +0800 Subject: [PATCH 315/518] Workflow (metal): Try using gh to replace the unmaintained softprops/action-gh-release. --- .github/workflows/build-wheels-metal.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index abb8969247..b583d37db3 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -85,10 +85,9 @@ jobs: # Store the date in environment variable for the release step echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - name: Publish Release - uses: softprops/action-gh-release@v2.2.2 - with: - files: dist2/* - tag_name: v${{ needs.build_wheels.outputs.version }}-Metal-macos-${{ env.BUILD_DATE }} + - name: Publish Release via GitHub CLI env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: + TAG_NAME="v${{ needs.build_wheels.outputs.version }}-Metal-macos-$BUILD_DATE" + gh release create "$TAG_NAME" dist2/* --title "$TAG_NAME" --generate-notes From ac38f388cd8c52a56ad6f498193a5c0c9a0f5369 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 8 Apr 2026 01:44:35 +0800 Subject: [PATCH 316/518] ci: restrict cudaarch to Volta-Hopper to fix GitHub Actions timeout Using the `all` option for `cudaarch` on CUDA 12.4-12.6 causes the compilation process to exceed the 6-hour maximum execution limit on GitHub Actions, leading to cancelled jobs. To resolve this and reduce build times, the target architectures are now restricted to explicitly support compute capabilities 7.0 through 9.0 (`70-real` to `90-real`). This maintains support for all modern NVIDIA GPUs equipped with Tensor Cores (from Volta up to Hopper architectures) while keeping the build time safely within CI constraints. 
Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu124-win.yml | 2 +- .github/workflows/build-wheels-cu126-win.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-win.yml b/.github/workflows/build-wheels-cu124-win.yml index c6800e246a..f020cd6708 100644 --- a/.github/workflows/build-wheels-cu124-win.yml +++ b/.github/workflows/build-wheels-cu124-win.yml @@ -16,7 +16,7 @@ jobs: pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.4.1"] releasetag: ["Basic"] - cudaarch: ["all"] + cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real;90-real"] defaults: run: shell: pwsh diff --git a/.github/workflows/build-wheels-cu126-win.yml b/.github/workflows/build-wheels-cu126-win.yml index eec32f6f0d..08115a2ef5 100644 --- a/.github/workflows/build-wheels-cu126-win.yml +++ b/.github/workflows/build-wheels-cu126-win.yml @@ -16,7 +16,7 @@ jobs: pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.6.3"] releasetag: ["Basic"] - cudaarch: ["all"] + cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real;90-real"] defaults: run: shell: pwsh From 4f2a132e35e1a5eb6491e1046fb36efd9660cff5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 8 Apr 2026 02:05:31 +0800 Subject: [PATCH 317/518] Update CI Action runner version microsoft/setup-msbuild@v2 -> v3 actions/checkout@v5 -> v6 actions/upload-artifact@v4 -> v6 actions/download-artifact@v4 -> v6 Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu124-linux.yml | 2 +- .github/workflows/build-wheels-cu124-win.yml | 4 ++-- .github/workflows/build-wheels-cu126-linux.yml | 2 +- .github/workflows/build-wheels-cu126-win.yml | 4 ++-- .github/workflows/build-wheels-cu128-linux.yml | 2 +- .github/workflows/build-wheels-cu128-win.yml | 4 ++-- .github/workflows/build-wheels-cu130-linux.yml | 2 +- .github/workflows/build-wheels-cu130-win.yml | 4 ++-- .github/workflows/build-wheels-metal.yaml | 4 ++-- 9 files changed, 14 insertions(+), 14 deletions(-) 
diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 9a55248124..f14684289d 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -34,7 +34,7 @@ jobs: apt update apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - uses: actions/checkout@v4 # Checkout code + - uses: actions/checkout@v6 # Checkout code with: submodules: "recursive" diff --git a/.github/workflows/build-wheels-cu124-win.yml b/.github/workflows/build-wheels-cu124-win.yml index f020cd6708..0989afc8dd 100644 --- a/.github/workflows/build-wheels-cu124-win.yml +++ b/.github/workflows/build-wheels-cu124-win.yml @@ -32,11 +32,11 @@ jobs: steps: - name: Add MSBuild to PATH if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 + uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: submodules: "recursive" diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index bca09d2f66..1eda5d10f2 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -34,7 +34,7 @@ jobs: apt update apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - uses: actions/checkout@v4 # Checkout code + - uses: actions/checkout@v6 # Checkout code with: submodules: "recursive" diff --git a/.github/workflows/build-wheels-cu126-win.yml b/.github/workflows/build-wheels-cu126-win.yml index 08115a2ef5..19474b530d 100644 --- a/.github/workflows/build-wheels-cu126-win.yml +++ b/.github/workflows/build-wheels-cu126-win.yml @@ -32,11 +32,11 @@ jobs: steps: - name: Add MSBuild to PATH if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 + uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v5 + - uses: 
actions/checkout@v6 with: submodules: "recursive" diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index ad13b30706..a4ab9e8eb2 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -34,7 +34,7 @@ jobs: apt update apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - uses: actions/checkout@v4 # Checkout code + - uses: actions/checkout@v6 # Checkout code with: submodules: "recursive" diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index e9d36602bd..0d87e09a45 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -32,11 +32,11 @@ jobs: steps: - name: Add MSBuild to PATH if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 + uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: submodules: "recursive" diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index 574690cdf2..dbc710e18a 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -34,7 +34,7 @@ jobs: apt update apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - uses: actions/checkout@v5 # Checkout code + - uses: actions/checkout@v6 # Checkout code with: submodules: "recursive" diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index d055db43af..5b8b9c1992 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -32,11 +32,11 @@ jobs: steps: - name: Add MSBuild to PATH if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v2 + uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - 
uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: submodules: "recursive" diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index b583d37db3..094b735594 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -60,7 +60,7 @@ jobs: output-dir: wheelhouse2 - name: Upload artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: wheels-metal_${{ matrix.os }} path: ./wheelhouse2/*.whl @@ -72,7 +72,7 @@ jobs: steps: - name: Download artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v6 with: merge-multiple: true path: dist2 From e9c3013d6deb8ffcc08e43284371e9fb363ec32c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 8 Apr 2026 02:10:15 +0800 Subject: [PATCH 318/518] fix multi-line script in metal workflow --- .github/workflows/build-wheels-metal.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 094b735594..09ae8470a6 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -88,6 +88,9 @@ jobs: - name: Publish Release via GitHub CLI env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: + run: | TAG_NAME="v${{ needs.build_wheels.outputs.version }}-Metal-macos-$BUILD_DATE" + + echo "Ready to create release with tag: $TAG_NAME" + gh release create "$TAG_NAME" dist2/* --title "$TAG_NAME" --generate-notes From 58c5fe1f5a90654d1cf5e4f31f6be398801471ec Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 8 Apr 2026 02:37:47 +0800 Subject: [PATCH 319/518] Update Submodule vendor/llama.cpp 69c28f1..4eb1951 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 69c28f1547..4eb19514dd 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 
69c28f1547c169902f62ca48bee75fb876c4d8e6 +Subproject commit 4eb19514dd2984662f13aacbb052c559c8fde3b1 From fbbf2ddb3d05b06bbf8de2ef774d4f8be1f52fd9 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 8 Apr 2026 02:38:40 +0800 Subject: [PATCH 320/518] Adding the `actions/checkout` step to the release job to provide the `.git` context required by the CLI's `--generate-notes` flag. --- .github/workflows/build-wheels-metal.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 09ae8470a6..40675b4c26 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -71,6 +71,9 @@ jobs: runs-on: ubuntu-latest steps: + - name: Checkout repository + uses: actions/checkout@v6 + - name: Download artifacts uses: actions/download-artifact@v6 with: From 645c8ed91c53c00400cfc90b4d2604e94f718a84 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 8 Apr 2026 04:06:28 +0800 Subject: [PATCH 321/518] feat(types): align with latest OpenAI API spec and fix type issues - Expand `CompletionUsage` with `PromptTokensDetails` and `CompletionTokensDetails` for granular token tracking. - Add `usage` to `CreateChatCompletionStreamResponse` to support usage reporting in streaming mode. - Fix duplicate `object` field in `CreateCompletionResponse`. - Update `ChatCompletionRequestAssistantMessage` to accept `None` for `content` and introduce the new `refusal` field. - Clean up `ChatCompletionRequestMessage` Union by removing the duplicate user message type. - Broaden `ChatCompletionToolChoiceOption` to fully support `allowed_tools` and `custom` tool choice behaviors. 
Signed-off-by: JamePeng --- llama_cpp/llama_types.py | 44 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 336cddab0f..60202ae8f0 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -49,10 +49,24 @@ class CompletionChoice(TypedDict): finish_reason: Optional[Literal["stop", "length", "content_filter"]] +class PromptTokensDetails(TypedDict): + cached_tokens: NotRequired[int] + audio_tokens: NotRequired[int] + + +class CompletionTokensDetails(TypedDict): + reasoning_tokens: NotRequired[int] + audio_tokens: NotRequired[int] + accepted_prediction_tokens: NotRequired[int] + rejected_prediction_tokens: NotRequired[int] + + class CompletionUsage(TypedDict): prompt_tokens: int completion_tokens: int total_tokens: int + prompt_tokens_details: NotRequired[PromptTokensDetails] + completion_tokens_details: NotRequired[CompletionTokensDetails] class CreateCompletionResponse(TypedDict): @@ -61,7 +75,6 @@ class CreateCompletionResponse(TypedDict): created: int model: str choices: List[CompletionChoice] - object: Optional[Literal["text_completion"]] usage: NotRequired[CompletionUsage] @@ -198,6 +211,7 @@ class CreateChatCompletionStreamResponse(TypedDict): object: Literal["chat.completion.chunk"] created: int choices: List[ChatCompletionStreamResponseChoice] + usage: NotRequired[CompletionUsage] class ChatCompletionFunctions(TypedDict): @@ -307,7 +321,8 @@ class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): class ChatCompletionRequestAssistantMessage(TypedDict): role: Literal["assistant"] - content: NotRequired[str] + content: NotRequired[Optional[str]] + refusal: NotRequired[Optional[str]] tool_calls: NotRequired[ChatCompletionMessageToolCalls] function_call: NotRequired[ ChatCompletionRequestAssistantMessageFunctionCall @@ -331,7 +346,6 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionRequestSystemMessage, 
ChatCompletionRequestUserMessage, ChatCompletionRequestAssistantMessage, - ChatCompletionRequestUserMessage, ChatCompletionRequestToolMessage, ChatCompletionRequestFunctionMessage, ] @@ -359,6 +373,16 @@ class ChatCompletionTool(TypedDict): function: ChatCompletionToolFunction +class ChatCompletionAllowedTools(TypedDict): + mode: Literal["auto", "required"] + tools: List[Dict[str, Any]] + + +class ChatCompletionAllowedToolsChoice(TypedDict): + type: Literal["allowed_tools"] + allowed_tools: ChatCompletionAllowedTools + + class ChatCompletionNamedToolChoiceFunction(TypedDict): name: str @@ -368,8 +392,20 @@ class ChatCompletionNamedToolChoice(TypedDict): function: ChatCompletionNamedToolChoiceFunction +class ChatCompletionNamedToolChoiceCustomObject(TypedDict): + name: str + + +class ChatCompletionNamedToolChoiceCustom(TypedDict): + type: Literal["custom"] + custom: ChatCompletionNamedToolChoiceCustomObject + + ChatCompletionToolChoiceOption = Union[ - Literal["none", "auto", "required"], ChatCompletionNamedToolChoice + Literal["none", "auto", "required"], + ChatCompletionAllowedToolsChoice, + ChatCompletionNamedToolChoice, + ChatCompletionNamedToolChoiceCustom ] From 6ebab28a06c79cd02d3b766c61fa75b73c85203f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 8 Apr 2026 19:32:02 +0800 Subject: [PATCH 322/518] Update Submodule vendor/llama.cpp 4eb1951..d12cc3d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4eb19514dd..d12cc3d1ca 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4eb19514dd2984662f13aacbb052c559c8fde3b1 +Subproject commit d12cc3d1ca6bba741cd77887ac9c9ee18c8415c7 From 9241b0fafad5ba1ed3730893ff9e60e9b2b6c91f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 9 Apr 2026 03:09:49 +0800 Subject: [PATCH 323/518] Sync ggml: add Q1_0 1-bit quantization support (CPU) (#21273) --- llama_cpp/_ggml.py | 8 ++++++-- 
llama_cpp/llama_cpp.py | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index f22c9eb94d..733a3520c1 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -121,7 +121,8 @@ class GGMLStatus(enum.IntEnum): # // GGML_TYPE_IQ4_NL_8_8 = 38, # GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) # GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) -# GGML_TYPE_COUNT = 41, +# GGML_TYPE_Q1_0 = 41, +# GGML_TYPE_COUNT = 42, # }; class GGMLType(enum.IntEnum): GGML_TYPE_F32 = 0 @@ -157,7 +158,8 @@ class GGMLType(enum.IntEnum): GGML_TYPE_TQ2_0 = 35 GGML_TYPE_MXFP4 = 39 GGML_TYPE_NVFP4 = 40 - GGML_TYPE_COUNT = 41 + GGML_TYPE_Q1_0 = 41 + GGML_TYPE_COUNT = 42 # // precision @@ -198,6 +200,7 @@ class GGMLPrec(enum.IntEnum): # GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors # GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors # GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors # }; class GGMLFType(enum.IntEnum): GGML_FTYPE_UNKNOWN = -1 @@ -226,6 +229,7 @@ class GGMLFType(enum.IntEnum): GGML_FTYPE_MOSTLY_BF16 = 24 GGML_FTYPE_MOSTLY_MXFP4 = 25 GGML_FTYPE_MOSTLY_NVFP4 = 26 + GGML_FTYPE_MOSTLY_Q1_0 = 27 # // available tensor operations: diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 5d7fcd5fd2..1e1e40b70f 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -351,6 +351,7 @@ # LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors # LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors # LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors # # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; @@ -391,6 +392,7 @@ LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38 LLAMA_FTYPE_MOSTLY_NVFP4 = 39 +LLAMA_FTYPE_MOSTLY_Q1_0 = 40 LLAMA_FTYPE_GUESSED = 1024 # enum llama_rope_scaling_type { From abe789f0223b9d930292c7a54bb87170e088b9eb Mon Sep 17 00:00:00 2001 From: 
JamePeng Date: Thu, 9 Apr 2026 04:34:05 +0800 Subject: [PATCH 324/518] Implement `Step3VLChatHandler` for Step3-VL-10B Signed-off-by: JamePeng --- README.md | 1 + llama_cpp/llama_chat_format.py | 137 +++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/README.md b/README.md index e316972b78..77eb0e661c 100644 --- a/README.md +++ b/README.md @@ -744,6 +744,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | | [qwen3.5](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) | `Qwen35ChatHandler` | `qwen3.5` | +| [step3-vl](https://huggingface.co/JamePeng2023/Step3-VL-10B-GGUF) | `Step3VLChatHandler` | `step3-vl` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 3c426b38f5..73c186aa8d 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5512,6 +5512,143 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class Step3VLChatHandler(MTMDChatHandler): + """ + Handler for Step3-VL models. 
+ """ + + STEP3VL_BOS_TOKEN = "<|im_start|>" + STEP3VL_EOS_TOKEN = "<|im_end|>" + STEP3VL_PAD_TOKEN = "<|endoftext|>" + STEP3VL_IMAGE_TOKEN = "" + + CHAT_FORMAT = ( + "{%- macro render_content(content) -%}\n" + " {%- if content is none -%}{{- '' -}}\n" + " {%- elif content is string -%}{{- content -}}\n" + " {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n" + " {%- elif content is iterable -%}\n" + " {%- for item in content -%}\n" + " {%- if item.type == 'text' -%}\n" + " {{- item['value'] if 'value' in item else item['text'] -}}\n" + " {%- elif item.type in ['image', 'image_url'] -%}\n" + " {%- set url_val = '' -%}\n" + " {%- if item.image_url -%}\n" + " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" + " {%- endif -%}\n" + " {{- '' + url_val -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "\n" + "{%- if tools -%}\n" + " {{- '<|im_start|>system\\n' -}}\n" + " {%- if messages[0].role == 'system' -%}\n" + " {{- render_content(messages[0].content) + '\\n\\n' -}}\n" + " {%- endif -%}\n" + " {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n' -}}\n" + " {%- for tool in tools -%}\n" + " {{- '\\n' -}}\n" + " {{- tool | tojson -}}\n" + " {%- endfor -%}\n" + " {{- '\\n\\n\\nAlways adhere to this exact format for tool use:\\n\\n\\n{\"name\": , \"arguments\": }\\n\\n{additional_tool_calls}\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within XML tags.\\n- `` must be an exact match to one of the available tools.\\n- `` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n" + "{%- else -%}\n" + " {%- if messages[0].role == 'system' -%}\n" + " {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n" + " {%- endif -%}\n" 
+ "{%- endif -%}\n" + "\n" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n" + "{%- for message in messages[::-1] -%}\n" + " {%- set index = (messages|length - 1) - loop.index0 -%}\n" + " {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) -%}\n" + " {%- set ns.multi_step_tool = false -%}\n" + " {%- set ns.last_query_index = index -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- for message in messages -%}\n" + " {%- set content = render_content(message.content) -%}\n" + " {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n" + " {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n" + " {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n" + " {%- elif message.role == 'assistant' -%}\n" + " {%- if message.reasoning_content is string -%}\n" + " {%- set reasoning_content = render_content(message.reasoning_content) -%}\n" + " {%- else -%}\n" + " {%- if '' in content -%}\n" + " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') -%}\n" + " {%- set content = content.split('')[-1].lstrip('\\n') -%}\n" + " {%- else -%}\n" + " {%- set reasoning_content = '' -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if loop.index0 > ns.last_query_index -%}\n" + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n' + content -}}\n" + " {%- else -%}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content -}}\n" + " {%- endif -%}\n" + " {%- if message.tool_calls -%}\n" + " {{- '\\n' -}}\n" + " {%- for tool_call in message.tool_calls -%}\n" + " {{- '\\n' -}}\n" + " {%- if tool_call.function -%}\n" + " {%- set tool_call = tool_call.function -%}\n" + " {%- endif 
-%}\n" + " {{- '\\n{\"name\": \"' -}}\n" + " {{- tool_call.name -}}\n" + " {{- '\", \"arguments\": ' -}}\n" + " {%- if tool_call.arguments is string -%}\n" + " {{- tool_call.arguments -}}\n" + " {%- else -%}\n" + " {{- tool_call.arguments | tojson -}}\n" + " {%- endif -%}\n" + " {{- '}\\n' -}}\n" + " {%- endfor -%}\n" + " {{- '\\n' -}}\n" + " {%- endif -%}\n" + " {{- '<|im_end|>\\n' -}}\n" + " {%- elif message.role == 'tool' -%}\n" + " {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n" + " {{- '<|im_start|>tool_response' -}}\n" + " {%- endif -%}\n" + " {{- '\\n\\n' -}}\n" + " {{- content -}}\n" + " {{- '\\n' -}}\n" + " {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n" + " {{- '<|im_end|>\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n\\n\\n\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n' -}}\n" + "{%- endif -%}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the Step3-VL Handler. + + Args: + enable_thinking (bool): If False, injects an empty block to bypass reasoning. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Pass thinking toggle into Jinja + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Step3 uses standard <|im_end|> ChatML stop formatting + kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama_core.Llama, From 5a47267c1c45b5ea6002963325e51622038d04f1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 10 Apr 2026 03:40:52 +0800 Subject: [PATCH 325/518] Update Submodule vendor/llama.cpp d12cc3d..d132f22 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d12cc3d1ca..d132f22fc9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d12cc3d1ca6bba741cd77887ac9c9ee18c8415c7 +Subproject commit d132f22fc92f36848f7ccf2fc9987cd0b0120825 From 6866eba30deb12bf665339ea55ac8dce192872b5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 10 Apr 2026 03:43:24 +0800 Subject: [PATCH 326/518] Sync ggml: backend-agnostic tensor parallelism (experimental) (#19378) --- llama_cpp/llama_cpp.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 1e1e40b70f..a527904637 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -460,14 +460,16 @@ def llama_flash_attn_type_name( """ # enum llama_split_mode { -# LLAMA_SPLIT_MODE_NONE = 0, // single GPU -# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs -# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs +# LLAMA_SPLIT_MODE_NONE = 0, // single GPU +# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs +# 
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported +# LLAMA_SPLIT_MODE_TENSOR = 3, # }; -LLAMA_SPLIT_MODE_NONE = 0 -LLAMA_SPLIT_MODE_LAYER = 1 -LLAMA_SPLIT_MODE_ROW = 2 - +class llama_split_mode(enum.IntEnum): + LLAMA_SPLIT_MODE_NONE = 0 + LLAMA_SPLIT_MODE_LAYER = 1 + LLAMA_SPLIT_MODE_ROW = 2 + LLAMA_SPLIT_MODE_TENSOR = 3 # typedef struct llama_token_data { # llama_token id; // token id From 7d2c4ba6c49ff4f3eec81d6eb84b0aab9421e451 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 10 Apr 2026 04:05:29 +0800 Subject: [PATCH 327/518] Update Makefile --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 2e1cd1c71b..db99016262 100644 --- a/Makefile +++ b/Makefile @@ -48,6 +48,9 @@ build.musa: build.openblas: CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e . +build.openvino: + CMAKE_ARGS="-DGGML_OPENVINO=ON" python3 -m pip install --verbose -e . + build.rpc: CMAKE_ARGS="-DGGML_RPC=on" python3 -m pip install --verbose -e . @@ -63,6 +66,9 @@ build.webgpu: build.zdnn: CMAKE_ARGS="-DGGML_ZDNN=ON" python3 -m pip install --verbose -e . +build.zendnn : + CMAKE_ARGS="-DGGML_ZENDNN=ON" python3 -m pip install --verbose -e . 
+ build.sdist: python3 -m build --sdist --verbose From f22b819d8129d4ffe917429def81310e28d2967d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 10 Apr 2026 04:11:27 +0800 Subject: [PATCH 328/518] fix missing change of llama_cpp.llama_split_mode.LLAMA_SPLIT_MODE_LAYER, --- llama_cpp/llama.py | 2 +- llama_cpp/server/settings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d6c6926e60..59e636b56e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -71,7 +71,7 @@ def __init__( *, # Model Params n_gpu_layers: int = 0, - split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, + split_mode: int = llama_cpp.llama_split_mode.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, tensor_split: Optional[List[float]] = None, vocab_only: bool = False, diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index db96a41705..350ccc2323 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -31,7 +31,7 @@ class ModelSettings(BaseSettings): description="The number of layers to put on the GPU. The rest will be on the CPU. 
Set -1 to move all to GPU.", ) split_mode: int = Field( - default=llama_cpp.LLAMA_SPLIT_MODE_LAYER, + default=llama_cpp.llama_split_mode.LLAMA_SPLIT_MODE_LAYER, description="The split mode to use.", ) main_gpu: int = Field( From 634a712e970f86bebec5ced94d04042fffacbe9b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 10 Apr 2026 04:58:57 +0800 Subject: [PATCH 329/518] Update README.md for OpenVINO/Metal/Vulkan/SYCL Signed-off-by: JamePeng --- README.md | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 77eb0e661c..2639556ab0 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,8 @@ Installing a CUDA-supported version requires the `CUDA Toolkit` environment to b See here: https://developer.nvidia.com/cuda-toolkit-archive +More Information see: https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#cuda + Then, set the `GGML_CUDA=on` environment variable before installing: ```bash @@ -169,13 +171,64 @@ CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install "llama-cpp-p ``` +
+OpenVINO + +### Install OpenVINO Runtime + +Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html) + +- **Linux:** + +
+ 📦 Click to expand OpenVINO installation from an archive file on Ubuntu +
+ + ```bash + wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh + chmod +x install-openvino-from-archive.sh + ./install-openvino-from-archive.sh + ``` + + Verify OpenVINO is initialized properly: + ```bash + echo $OpenVINO_DIR + ``` +
+ +### Supported Devices + +The OpenVINO backend supports the following hardware: + +- Intel CPUs +- Intel GPUs (integrated and discrete) +- Intel NPUs + +Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html), the llama.cpp OpenVINO backend has been validated specifically on AI PCs such as the Intel® Core™ Ultra Series 1 and Series 2. + +For more information, see: https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENVINO.md + +To install with OpenVINO, set the `GGML_OPENVINO=ON` environment variable before installing: + +```bash +# Linux +source /opt/intel/openvino/setupvars.sh +# Windows +"C:\Program Files (x86)\Intel\openvino_2026.0\setupvars.bat" +# Build +CMAKE_ARGS="-DGGML_OPENVINO=ON" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` +
+
Metal -To install with Metal (MPS), set the `GGML_METAL=on` environment variable before installing: +On macOS, Metal is enabled by default (`GGML_METAL=ON`). Using Metal makes the computation run on the GPU. + +To disable the Metal build at compile time, use the `CMAKE_ARGS="-DGGML_METAL=OFF"` CMake option. ```bash -CMAKE_ARGS="-DGGML_METAL=on -DGGML_METAL_USE_BF16=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` **Pre-built Wheel (New)** @@ -213,7 +266,20 @@ More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/bu - For Windows User: Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings. -- For Linux User: Follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide. +- For Linux User: + * First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide. + + * After completing the first step, ensure that you have used the `source` command on the `setup-env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this. + +- For Mac User: + * Generally, follow LunarG's [Getting Started with the MacOS Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/mac/getting_started.html) guide for installation and setup of the Vulkan SDK.
There are two options of Vulkan drivers on macOS, both of which implement translation layers to map Vulkan to Metal. They can be hot-swapped by setting the `VK_ICD_FILENAMES` environment variable to point to the respective ICD JSON file. Check the box for "KosmicKrisp" during the LunarG Vulkan SDK installation. + + * Set the environment variables for the LunarG Vulkan SDK after installation (and optionally add this to your shell profile for persistence): + ```bash + source /path/to/vulkan-sdk/setup-env.sh + ``` + +For more information, see: https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#vulkan + Then install with Vulkan support by set the `GGML_VULKAN=on` environment variable before installing: @@ -226,11 +292,35 @@ CMAKE_ARGS="-DGGML_VULKAN=on" pip install "llama-cpp-python @ git+https://github
SYCL +### Supported OS + +| OS | Status | Verified | +|---------|---------|------------------------------------------------| +| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux | +| Windows | Support | Windows 11 | + +### Intel GPU + +The SYCL backend supports the following Intel GPU families: + +- Intel Data Center Max Series +- Intel Flex Series, Arc Series +- Intel Built-in Arc GPU +- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)). + +On older Intel GPUs, you may try [OpenCL](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md), although the performance is not optimal, and some GPUs may not support OpenCL or have any GPGPU capabilities. + +For more information, see: https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md + To install with SYCL support, set the `GGML_SYCL=on` environment variable before installing: ```bash -source /opt/intel/oneapi/setvars.sh +# Export relevant ENV variables +source /opt/intel/oneapi/setvars.sh +# Option 1: Use FP32 (recommended for better performance in most cases) CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +# Option 2: Use FP16 +CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ```
From e73ad59ee9b2c967f2a1012635e380d03bd5aecb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 11 Apr 2026 06:06:21 +0800 Subject: [PATCH 330/518] Update Submodule vendor/llama.cpp d132f22..073bb2c --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d132f22fc9..073bb2c20b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d132f22fc92f36848f7ccf2fc9987cd0b0120825 +Subproject commit 073bb2c20b5b2c919469653214aaa1a9895816a2 From 122d8dbb37ff60fe267d314e9744f67134dbfdfe Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 11 Apr 2026 23:58:05 +0800 Subject: [PATCH 331/518] refactor: update Gemma4ChatHandler with latest google/gemma-4-31B-it chat template from huggingface - Sync `Gemma4ChatHandler` logic with the upstream chat template, incorporating the new `format_tool_response_block` and OpenAI-compatible forward-scan tool resolution. Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 179 ++++++++++++++++++++++----------- 1 file changed, 123 insertions(+), 56 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 73c186aa8d..c0936883fc 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4376,34 +4376,15 @@ class Gemma4ChatHandler(MTMDChatHandler): " description:<|\"|>{{ value['description'] }}<|\"|>\n" " {%- set add_comma = true -%}\n" " {%- endif -%}\n" - " {%- if value['nullable'] %}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " nullable:true\n" - " {%- endif -%}\n" " {%- if value['type'] | upper == 'STRING' -%}\n" " {%- if value['enum'] -%}\n" " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" " enum:{{ format_argument(value['enum']) }}\n" " {%- endif -%}\n" - " {%- elif value['type'] | upper == 'OBJECT' -%}\n" - " ,properties:{\n" - " {%- if value['properties'] is defined and value['properties'] is 
mapping -%}\n" - " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" - " {%- elif value is mapping -%}\n" - " {{- format_parameters(value, value['required'] | default([])) -}}\n" - " {%- endif -%}\n" - " }\n" - " {%- if value['required'] -%}\n" - " ,required:[\n" - " {%- for item in value['required'] | default([]) -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- endif -%}\n" " {%- elif value['type'] | upper == 'ARRAY' -%}\n" " {%- if value['items'] is mapping and value['items'] -%}\n" - " ,items:{\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " items:{\n" " {%- set ns_items = namespace(found_first=false) -%}\n" " {%- for item_key, item_value in value['items'] | dictsort -%}\n" " {%- if item_value is not none -%}\n" @@ -4436,6 +4417,32 @@ class Gemma4ChatHandler(MTMDChatHandler): " }\n" " {%- endif -%}\n" " {%- endif -%}\n" + " {%- if value['nullable'] %}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " nullable:true\n" + " {%- endif -%}\n" + " {%- if value['type'] | upper == 'OBJECT' -%}\n" + " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " properties:{\n" + " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" + " }\n" + " {%- elif value is mapping -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " properties:{\n" + " {{- format_parameters(value, value['required'] | default([])) -}}\n" + " }\n" + " {%- endif -%}\n" + " {%- if value['required'] -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " required:[\n" + " {%- for item in value['required'] | default([]) -%}\n" + " <|\"|>{{- item -}}<|\"|>\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- 
endfor -%}\n" + " ]\n" + " {%- endif -%}\n" + " {%- endif -%}\n" " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" " {%- endif -%}\n" @@ -4514,25 +4521,35 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endfor -%}\n" " {{- ns.result | trim -}}\n" "{%- endmacro -%}\n" - "\n" + "{%- macro format_tool_response_block(tool_name, response) -%}\n" + " {{- '<|tool_response>' -}}\n" + " {%- if response is mapping -%}\n" + " {{- 'response:' + tool_name + '{' -}}\n" + " {%- for key, value in response | dictsort -%}\n" + " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " {{- '}' -}}\n" + " {%- else -%}\n" + " {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n" + " {%- endif -%}\n" + " {{- '' -}}\n" + "{%- endmacro -%}\n" "{%- set ns = namespace(prev_message_type=None) -%}\n" "{%- set loop_messages = messages -%}\n" - "{{ bos_token }}\n" + "{{- bos_token -}}\n" "{#- Handle System/Tool Definitions Block -#}\n" "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" " {{- '<|turn>system\\n' -}}\n" - "\n" " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" " {%- if enable_thinking is defined and enable_thinking -%}\n" - " {{- '<|think|>' -}}\n" + " {{- '<|think|>\\n' -}}\n" " {%- set ns.prev_message_type = 'think' -%}\n" " {%- endif -%}\n" - "\n" " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" " {{- messages[0]['content'] | trim -}}\n" " {%- set loop_messages = messages[1:] -%}\n" " {%- endif -%}\n" - "\n" " {%- if tools -%}\n" " {%- for tool in tools %}\n" " {{- '<|tool>' -}}\n" @@ -4541,16 +4558,41 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endfor %}\n" " {%- set ns.prev_message_type = 'tool' -%}\n" " {%- endif -%}\n" - "\n" " {{- '\\n' -}}\n" "{%- 
endif %}\n" - "\n" + "{#- Pre-scan: find last user message index for reasoning guard -#}\n" + "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" + "{%- for i in range(loop_messages | length) -%}\n" + " {%- if loop_messages[i]['role'] == 'user' -%}\n" + " {%- set ns_turn.last_user_idx = i -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" "{#- Loop through messages -#}\n" "{%- for message in loop_messages -%}\n" + " {%- if message['role'] != 'tool' -%}\n" " {%- set ns.prev_message_type = None -%}\n" " {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}\n" + " {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n" + " {%- set prev_nt = namespace(role=None, found=false) -%}\n" + " {%- if loop.index0 > 0 -%}\n" + " {%- for j in range(loop.index0 - 1, -1, -1) -%}\n" + " {%- if not prev_nt.found -%}\n" + " {%- if loop_messages[j]['role'] != 'tool' -%}\n" + " {%- set prev_nt.role = loop_messages[j]['role'] -%}\n" + " {%- set prev_nt.found = true -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n" + " {%- if not continue_same_model_turn -%}\n" " {{- '<|turn>' + role + '\\n' }}\n" - "\n" + " {%- endif -%}\n" + " {#- Render reasoning/reasoning_content as thinking channel -#}\n" + " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" + " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" + " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" + " {%- endif -%}\n" " {%- if message['tool_calls'] -%}\n" " {%- for tool_call in message['tool_calls'] -%}\n" " {%- set function = tool_call['function'] -%}\n" @@ -4569,26 +4611,50 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endfor -%}\n" " {%- set ns.prev_message_type = 'tool_call' -%}\n" " {%- endif -%}\n" - "\n" - 
" {%- if message['tool_responses'] -%}\n" - " {#- Tool Response handling -#}\n" + " {%- set ns_tr_out = namespace(flag=false) -%}\n" + " {%- if message.get('tool_responses') -%}\n" + " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" " {%- for tool_response in message['tool_responses'] -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- if tool_response['response'] is mapping -%}\n" - " {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}\n" - " {%- for key, value in tool_response['response'] | dictsort -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" + " {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n" + " {%- set ns_tr_out.flag = true -%}\n" + " {%- set ns.prev_message_type = 'tool_response' -%}\n" + " {%- endfor -%}\n" + " {%- elif message.get('tool_calls') -%}\n" + " {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n" + " {%- set ns_tool_scan = namespace(stopped=false) -%}\n" + " {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n" + " {%- if ns_tool_scan.stopped -%}\n" + " {%- elif loop_messages[k]['role'] != 'tool' -%}\n" + " {%- set ns_tool_scan.stopped = true -%}\n" " {%- else -%}\n" - " {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}\n" + " {%- set follow = loop_messages[k] -%}\n" + " {#- Resolve tool_call_id to function name -#}\n" + " {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n" + " {%- for tc in message['tool_calls'] -%}\n" + " {%- if tc.get('id') == follow.get('tool_call_id') -%}\n" + " {%- set ns_tname.name = tc['function']['name'] -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {#- Handle content as string or content-parts array -#}\n" + " {%- 
set tool_body = follow.get('content') -%}\n" + " {%- if tool_body is string -%}\n" + " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" + " {%- elif tool_body is sequence and tool_body is not string -%}\n" + " {%- set ns_txt = namespace(s='') -%}\n" + " {%- for part in tool_body -%}\n" + " {%- if part.get('type') == 'text' -%}\n" + " {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" + " {%- else -%}\n" + " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" + " {%- endif -%}\n" + " {%- set ns_tr_out.flag = true -%}\n" + " {%- set ns.prev_message_type = 'tool_response' -%}\n" " {%- endif -%}\n" - " {{- '' -}}\n" " {%- endfor -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" " {%- endif -%}\n" - "\n" " {%- if message['content'] is string -%}\n" " {%- if role == 'model' -%}\n" " {{- strip_thinking(message['content']) -}}\n" @@ -4605,35 +4671,36 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endif -%}\n" " {%- elif item['type'] == 'image_url' -%}\n" " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {{- '\\n\\n<|image|>' + url_val + '\\n\\n' -}}\n" + " {{- '<|image|>' + url_val -}}\n" " {%- set ns.prev_message_type = 'image' -%}\n" " {%- elif item['type'] == 'audio_url' -%}\n" " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" - " {{- '\\n\\n<|audio|>' + audio_val + '\\n\\n' -}}\n" + " {{- '<|audio|>' + audio_val -}}\n" " {%- set ns.prev_message_type = 'audio' -%}\n" " {%- elif item['type'] == 'input_audio' -%}\n" " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" - " {{- '\\n\\n<|audio|>' + audio_val + '\\n\\n' -}}\n" + " {{- '<|audio|>' + audio_val -}}\n" 
" {%- set ns.prev_message_type = 'audio' -%}\n" # " {%- elif item['type'] == 'video_url' -%}\n" # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" - # " {{- '\\n\\n<|video|>' + video_val + '\\n\\n' -}}\n" + # " {{- '<|video|>' + video_val -}}\n" # " {%- set ns.prev_message_type = 'video' -%}\n" " {%- endif -%}\n" " {%- endfor -%}\n" " {%- endif -%}\n" - "\n" - " {%- if not (message['tool_responses'] and not message['content']) -%}\n" + " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" + " {{- '<|tool_response>' -}}\n" + " {%- elif not (ns_tr_out.flag and not message.get('content')) -%}\n" " {{- '\\n' -}}\n" " {%- endif -%}\n" + " {%- endif -%}\n" "{%- endfor -%}\n" - "\n" "{%- if add_generation_prompt -%}\n" - " {%- if ns.prev_message_type != 'tool_response' -%}\n" + " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" " {{- '<|turn>model\\n' -}}\n" - " {%- endif -%}\n" - " {%- if not enable_thinking | default(false) -%}\n" - " {{- '<|channel>thought\\n' -}}\n" + " {%- if not enable_thinking | default(false) -%}\n" + " {{- '<|channel>thought\\n' -}}\n" + " {%- endif -%}\n" " {%- endif -%}\n" "{%- endif -%}\n" ) From 4ec15acb4249c18aa613d72e32af9d24478ae82a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 13 Apr 2026 22:16:18 +0800 Subject: [PATCH 332/518] Update Submodule vendor/llama.cpp 073bb2c..75f3bc9 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 073bb2c20b..75f3bc94e6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 073bb2c20b5b2c919469653214aaa1a9895816a2 +Subproject commit 75f3bc94e649616162981c322e8e6b88ca5491e8 From 5e6529ea024644a00c3a3da48e137d8f6e849124 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 14 Apr 2026 06:24:55 +0800 Subject: [PATCH 333/518] docs: add audio processing recommendation to 
Gemma4ChatHandler - Recommend BF16 mmproj for Gemma4 E2B and E4B models. - Note known degraded audio performance with other quantizations. - Add reference link to the relevant llama.cpp PR/issue comment. Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index c0936883fc..1149c7677c 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4342,6 +4342,11 @@ class Gemma4ChatHandler(MTMDChatHandler): Note on `enable_thinking`: The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. It is NOT supported by Gemma4 E2B and E4B models. + + [Important Note for Audio Processing!] + It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models. + Other quantizations are known to have degraded performance; + ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463 """ # The special token in Gemma 4 From e70304be471e0fe4fbcb3bcc08e6b2bf0e98262d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 14 Apr 2026 21:41:26 +0800 Subject: [PATCH 334/518] Update Submodule vendor/llama.cpp 75f3bc9..1f30ac0 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 75f3bc94e6..1f30ac0cea 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 75f3bc94e649616162981c322e8e6b88ca5491e8 +Subproject commit 1f30ac0ceac0e2b4400069d81857089b6e04872a From 711577248c6d4928bfa3b79111a3ef876803a272 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 14 Apr 2026 21:47:25 +0800 Subject: [PATCH 335/518] docs: add comprehensive omni multimodal example for Gemma-4 - Wrapped the existing Qwen3-VL image loading example in a `
` block to improve README readability and save vertical space. - Introduced a complete, production-ready "Omni MultiModal" example demonstrating simultaneous Vision and Audio processing using the `Gemma4ChatHandler`. - Added a universal `build_media_payload` helper function to dynamically route and encode local files into OpenAI-compatible `image_url` and `input_audio` payload structures. - Added crucial documentation clarifying multimodal capability differences across Gemma-4 variants (E2B/E4B supporting full audio/vision vs. 31B/26BA4B supporting vision only). Signed-off-by: JamePeng --- README.md | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 164 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2639556ab0..7dcd26576c 100644 --- a/README.md +++ b/README.md @@ -900,9 +900,13 @@ print(response["choices"][0]["text"]) **Note**: Multi-modal models also support tool calling and JSON mode. + ## Loading a Local Image With Qwen3VL(Thinking/Instruct) -This script demonstrates how to load a local image, encode it as a base64 Data URI, and pass it to a local Qwen3-VL model (with the 'force_reasoning' parameter enabled for thinking model, disabled for instruct model) for processing using the llama-cpp-python library. +This script demonstrates how to load a local image, encode it as a base64 Data URI, and pass it to a local Qwen3-VL model (with the 'force_reasoning' parameter enabled for thinking model, disabled for instruct model) for processing using the llama-cpp-python library.
+ + +**Example Code**:
```python # Import necessary libraries @@ -1052,6 +1056,165 @@ print(res["choices"][0]["message"]["content"]) ``` +
+ +## Comprehensive Omni MultiModal Example: Gemma-4 (Vision + Audio + Text) + +Below is a complete, production-ready example demonstrating how to dynamically route and process both image and audio files. It includes a universal media processor that automatically converts local files into the correct payload structure (Data URIs for images, and `input_audio` for audio files). + +> **⚠️ IMPORTANT: GEMMA-4 MODEL CAPABILITIES & LIMITATIONS** +> * **Gemma4 E2B / E4B:** Supports Full Multimodal (Vision + Audio + Text). `enable_thinking` **MUST** be `True`(default). +> * **Gemma4 31B / 26BA4B:** Supports Vision + Text ONLY (Audio is NOT supported). `enable_thinking` can be toggled (`True` or `False`). + +```python +from llama_cpp import Llama +from llama_cpp.llama_chat_format import Gemma4ChatHandler +import base64 +import os + +# Model and multimodal projection paths +MODEL_PATH = r"/path/to/Gemma-4-E4B-It-BF16.gguf" +# BF16 mmproj is required for audio. Other quantizations are known to have degraded performance. +MMPROJ_PATH = r"/path/to/mmproj-Gemma-4-E4B-It-BF16.gguf" + +# Initialize the Llama model with multimodal support +# Note: Since we are using E4B here, enable_thinking MUST be True, and audio is supported. +llm = Llama( + model_path=MODEL_PATH, + chat_handler=Gemma4ChatHandler( + clip_model_path=MMPROJ_PATH, + enable_thinking=True, # MUST be True for E2B/E4B models + verbose=True, # Enable Debug Info + ), + n_gpu_layers=-1, + n_ctx=10240, + verbose=True, # Enable Debug Info +) + +# 1. 
Extend the MIME dictionary to support audio formats +_MEDIA_MIME_TYPES = { + # ------ Image formats ------ + '.png': ('image', 'image/png'), + '.jpg': ('image', 'image/jpeg'), + '.jpeg': ('image', 'image/jpeg'), + '.gif': ('image', 'image/gif'), + '.webp': ('image', 'image/webp'), + '.bmp': ('image', 'image/bmp'), + + # ------ Audio formats ------ + '.wav': ('audio', 'wav'), # OpenAI standard usually uses raw format names for audio + '.mp3': ('audio', 'mp3'), + # '.flac': ('audio', 'flac'), +} + +def build_media_payload(file_path: str) -> dict: + """ + Read a local media file (image or audio) and convert it into a valid input payload for the LLM. + """ + if not os.path.isfile(file_path): + raise FileNotFoundError(f"Media file not found: {file_path}") + + extension = os.path.splitext(file_path)[1].lower() + media_category, mime_or_format = _MEDIA_MIME_TYPES.get(extension, ('unknown', 'application/octet-stream')) + + if media_category == 'unknown': + print(f"Warning: Unknown extension '{extension}'. It might not be processed correctly.") + + # Read and Base64 encode the file + with open(file_path, "rb") as f: + encoded_data = base64.b64encode(f.read()).decode("utf-8") + + # 2. Return the appropriate dictionary structure based on the media type + if media_category == 'image': + # Image format: Data URI (OpenAI compatible) + data_uri = f"data:{mime_or_format};base64,{encoded_data}" + return { + "type": "image_url", + "image_url": {"url": data_uri} + } + + elif media_category == 'audio': + # Audio format: input_audio (OpenAI compatible) + return { + "type": "input_audio", + "input_audio": { + "data": encoded_data, + "format": mime_or_format + } + } + else: + # Fallback for unsupported formats + return {"type": "text", "text": f"[Attached unsupported file: {file_path}]"} + + +def run_inference(media_paths: list, text_prompt: str): + """ + Helper function to dynamically build the payload and run inference. + """ + # 3. 
Build the user_content list + user_content = [] + + # Automatically parse each file and append to the payload + for path in media_paths: + payload = build_media_payload(path) + user_content.append(payload) + + # Append the final text instruction + user_content.append({ + "type": "text", + "text": text_prompt + }) + + print(f"\n--- Running Inference with {len(media_paths)} media file(s) ---") + + # 4. Send to the model for inference + response = llm.create_chat_completion( + messages=[ + {"role": "system", "content": """ + You are a highly capable multimodal assistant that can process both text, vision and audio. + + """}, # Note: Supported ONLY by Gemma4 E2B / E4B. + {"role": "user", "content": user_content} + ], + temperature=1.0, + top_p=0.95, + top_k=64, + max_tokens=8192, + ) + + print("\n[Model Response]:") + print(response["choices"][0]["message"]["content"]) + print("-" * 60) + + +# ============================================================================== +# Main Inference Examples +# Uncomment the example block you wish to execute. +# ============================================================================== + +# --- Example A: Image + Audio (Full Multimodal) --- +# Note: Supported ONLY by Gemma4 E2B / E4B. +run_inference( + media_paths=[r"/path/to/test.png", r"/path/to/test.wav"], + text_prompt="Introduce the content by combining the images and converting the audio to text." +) + +# --- Example B: Image Only (Vision + Text) --- +# Note: Supported by all Gemma4 variants (E2B, E4B, 31B, 26BA4B). +# run_inference( +# media_paths=[r"/path/to/test.png"], +# text_prompt="Describe the contents of this image in detail." +# ) + +# --- Example C: Audio Only (Audio + Text) --- +# Note: Supported ONLY by Gemma4 E2B / E4B. +# run_inference( +# media_paths=[r"/path/to/test.wav"], +# text_prompt="Transcribe this audio and summarize the main points." 
+# ) +``` + + --- ## Embeddings & Reranking (GGUF) From 701195e63a4a09b48dfd28936980532a03598ba3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 15 Apr 2026 20:23:27 +0800 Subject: [PATCH 336/518] Update Submodule vendor/llama.cpp 1f30ac0..8dc530b --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1f30ac0cea..8dc530b86d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1f30ac0ceac0e2b4400069d81857089b6e04872a +Subproject commit 8dc530b86d44cd0667a685539f29ded70a08ae0a From 9d2b2cb0b2a3c175681a5f720bdbb0eafd26e957 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 15 Apr 2026 23:10:56 +0800 Subject: [PATCH 337/518] Sync mtmd: add mtmd_image_tokens_get_decoder_pos() API (#21851) Signed-off-by: JamePeng --- llama_cpp/mtmd_cpp.py | 115 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 15 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 3368f848e8..ba64b21ef6 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -82,15 +82,59 @@ class mtmd_input_chunk_type(enum.IntEnum): mtmd_context_p = NewType("mtmd_context_p", int) mtmd_context_p_ctypes = c_void_p -# struct mtmd_bitmap; +# // represents raw image data, layout is RGBRGBRGB... 
+# // length of data must be nx * ny * 3 +# struct mtmd_bitmap { +# uint32_t nx; +# uint32_t ny; +# std::vector data; +# std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking +# bool is_audio = false; // true if the bitmap is audio +# }; mtmd_bitmap_p = NewType("mtmd_bitmap_p", int) mtmd_bitmap_p_ctypes = c_void_p -# struct mtmd_image_tokens; +# struct mtmd_image_tokens { +# uint32_t nx; // number of tokens in x direction +# uint32_t ny; // number of tokens in y direction +# bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position) +# uint32_t n_tokens() const { return nx * ny; } +# clip_image_f32_batch batch_f32; // preprocessed image patches +# std::string id; // optional user-defined ID, useful for KV cache tracking +# mtmd_image_tokens clone() { +# return mtmd_image_tokens{ +# nx, +# ny, +# use_mrope_pos, +# batch_f32.clone(), +# id +# }; +# } +# }; mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int) mtmd_image_tokens_p_ctypes = c_void_p -# struct mtmd_input_chunk; +# struct mtmd_audio_tokens { +# uint32_t n_tokens; // number of tokens +# clip_image_f32_batch batch_f32; // preprocessed image patches +# std::string id; // optional user-defined ID, useful for KV cache tracking +# mtmd_audio_tokens clone() { +# return mtmd_audio_tokens{ +# n_tokens, +# batch_f32.clone(), +# id +# }; +# } +# }; +mtmd_audio_tokens_p = NewType("mtmd_audio_tokens_p", int) +mtmd_audio_tokens_p_ctypes = c_void_p + +# struct mtmd_input_chunk { +# mtmd_input_chunk_type type; +# std::vector tokens_text; +# mtmd_image_tokens_ptr tokens_image; +# mtmd_audio_tokens_ptr tokens_audio; +# }; mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int) mtmd_input_chunk_p_ctypes = c_void_p @@ -487,18 +531,6 @@ def mtmd_input_chunk_free(chunk: mtmd_input_chunk_p): def mtmd_image_tokens_get_n_tokens(image_tokens: mtmd_image_tokens_p) -> c_size_t: ... 
-# MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); -@ctypes_function_mtmd( - "mtmd_image_tokens_get_nx", [mtmd_image_tokens_p_ctypes], c_size_t) -def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p) -> c_size_t: - ... - -# MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); -@ctypes_function_mtmd( - "mtmd_image_tokens_get_ny", [mtmd_image_tokens_p_ctypes], c_size_t) -def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p) -> c_size_t: - ... - # MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate @ctypes_function_mtmd( "mtmd_image_tokens_get_id", [mtmd_image_tokens_p_ctypes], c_char_p) @@ -513,6 +545,59 @@ def mtmd_image_tokens_get_n_pos(image_tokens: mtmd_image_tokens_p) -> c_int32: """number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)""" ... +# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens), +# "use mtmd_image_tokens_get_decoder_pos() instead"); +@ctypes_function_mtmd( + "mtmd_image_tokens_get_nx", [mtmd_image_tokens_p_ctypes], c_size_t) +def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p) -> c_size_t: + """ + use mtmd_image_tokens_get_decoder_pos() instead + """ + ... + +# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens), +# "use mtmd_image_tokens_get_decoder_pos() instead"); +@ctypes_function_mtmd( + "mtmd_image_tokens_get_ny", [mtmd_image_tokens_p_ctypes], c_size_t) +def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p) -> c_size_t: + """ + use mtmd_image_tokens_get_decoder_pos() instead + """ + ... 
+ +# struct mtmd_decoder_pos { +# uint32_t t; +# uint32_t x; +# uint32_t y; +# }; +class mtmd_decoder_pos(Structure): + _fields_ = [ + ("t", c_uint32), + ("x", c_uint32), + ("y", c_uint32), + ] + + if TYPE_CHECKING: + t: c_uint32 + x: c_uint32 + y: c_uint32 + +# // get position for decoder attention, to be used by M-RoPE models +# // i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1 +# // return relative position (for example, embedding 0 will have position (0, 0, 0); +# // remember to adjust it to the current absolute position) +# MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i); +@ctypes_function_mtmd( + "mtmd_image_tokens_get_decoder_pos", [mtmd_image_tokens_p_ctypes, c_size_t], mtmd_decoder_pos) +def mtmd_image_tokens_get_decoder_pos(image_tokens: mtmd_image_tokens_p, i: c_size_t) -> mtmd_decoder_pos: + """ + get position for decoder attention, to be used by M-RoPE models + i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1 + return relative position (for example, embedding 0 will have position (0, 0, 0); + remember to adjust it to the current absolute position) + """ + ... 
+ # // tokenize an input text prompt and a list of bitmaps (images/audio) # // the prompt must have the input image marker (default: "<__media__>") in it # // the default marker is defined by mtmd_default_marker() From 3ee7ff8912466485d27325645717176e4626610a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 16 Apr 2026 23:45:02 +0800 Subject: [PATCH 338/518] chore(ci): upgrade softprops/action-gh-release to v3 (Node 24 runtime) --- .github/workflows/build-wheels-cu124-linux.yml | 2 +- .github/workflows/build-wheels-cu124-win.yml | 2 +- .github/workflows/build-wheels-cu126-linux.yml | 2 +- .github/workflows/build-wheels-cu126-win.yml | 2 +- .github/workflows/build-wheels-cu128-linux.yml | 2 +- .github/workflows/build-wheels-cu128-win.yml | 2 +- .github/workflows/build-wheels-cu130-linux.yml | 2 +- .github/workflows/build-wheels-cu130-win.yml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index f14684289d..42b3d13169 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -120,7 +120,7 @@ jobs: # Store the date in environment variable for the release step echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release with: files: dist/* # Upload the generated wheel files from the dist directory # Define the release tag name using the collected environment variables diff --git a/.github/workflows/build-wheels-cu124-win.yml b/.github/workflows/build-wheels-cu124-win.yml index 0989afc8dd..135b847d32 100644 --- a/.github/workflows/build-wheels-cu124-win.yml +++ b/.github/workflows/build-wheels-cu124-win.yml @@ -125,7 +125,7 @@ jobs: - name: Create Release if: always() && env.TAG_VERSION != '' - uses: softprops/action-gh-release@v2 + uses: 
softprops/action-gh-release@v3 with: files: dist/* # Set tag_name to -cu--win- diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 1eda5d10f2..f60eb5f878 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -120,7 +120,7 @@ jobs: # Store the date in environment variable for the release step echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release with: files: dist/* # Upload the generated wheel files from the dist directory # Define the release tag name using the collected environment variables diff --git a/.github/workflows/build-wheels-cu126-win.yml b/.github/workflows/build-wheels-cu126-win.yml index 19474b530d..be7bfdc72c 100644 --- a/.github/workflows/build-wheels-cu126-win.yml +++ b/.github/workflows/build-wheels-cu126-win.yml @@ -125,7 +125,7 @@ jobs: - name: Create Release if: always() && env.TAG_VERSION != '' - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@v3 with: files: dist/* # Set tag_name to -cu--win- diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index a4ab9e8eb2..0bfe971eea 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -120,7 +120,7 @@ jobs: # Store the date in environment variable for the release step echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release with: files: dist/* # Upload the generated wheel files from the dist directory # Define the release tag name using the collected environment variables diff --git a/.github/workflows/build-wheels-cu128-win.yml 
b/.github/workflows/build-wheels-cu128-win.yml index 0d87e09a45..80dd9f2f74 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -125,7 +125,7 @@ jobs: - name: Create Release if: always() && env.TAG_VERSION != '' - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@v3 with: files: dist/* # Set tag_name to -cu--win- diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index dbc710e18a..23cd668c8d 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -120,7 +120,7 @@ jobs: # Store the date in environment variable for the release step echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - uses: softprops/action-gh-release@v2.2.2 # Action to create a GitHub Release + - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release with: files: dist/* # Upload the generated wheel files from the dist directory # Define the release tag name using the collected environment variables diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index 5b8b9c1992..b995f4f5f4 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -125,7 +125,7 @@ jobs: - name: Create Release if: always() && env.TAG_VERSION != '' - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@v3 with: files: dist/* # Set tag_name to -cu--win- From 9e00017812148a755058854d6f62458af8cdbe8c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 17 Apr 2026 00:11:53 +0800 Subject: [PATCH 339/518] Update Submodule vendor/llama.cpp 8dc530b..9db77a0 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8dc530b86d..9db77a020c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 
8dc530b86d44cd0667a685539f29ded70a08ae0a +Subproject commit 9db77a020c97ac3b13b7c1bf4e0c5787001533e7 From 7820677e65827b6f3356f651da9be8d510ba10e5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 17 Apr 2026 00:15:13 +0800 Subject: [PATCH 340/518] feat: enhance Qwen35ChatHandler with preserve_thinking and Qwen3.6 template fixes - Add `preserve_thinking` parameter to optionally retain `<think>` reasoning blocks across all historical conversational turns (defaults to False to save tokens). - Improve template robustness by adding an `is defined` safety check for `enable_thinking`. - Simplify JSON serialization logic for tool call arguments in the Jinja template. - Update class docstring to explicitly indicate support for Qwen 3.5 and Qwen 3.6 models. - Include `preserve_thinking` state in verbose processing logs. --- README.md | 1 + llama_cpp/llama_chat_format.py | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 7dcd26576c..ab643fe427 100644 --- a/README.md +++ b/README.md @@ -834,6 +834,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | | [qwen3.5](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) | `Qwen35ChatHandler` | `qwen3.5` | +| [qwen3.6](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF) | `Qwen35ChatHandler` | `qwen3.6` | | [step3-vl](https://huggingface.co/JamePeng2023/Step3-VL-10B-GGUF) | `Step3VLChatHandler` | `step3-vl` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. 
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 1149c7677c..af068f5535 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5387,6 +5387,9 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) class Qwen35ChatHandler(MTMDChatHandler): + """ + Handler for Qwen3.5/Qwen3.6 models. + """ CHAT_FORMAT = ( "{%- set image_count = namespace(value=0) -%}" "{%- set video_count = namespace(value=0) -%}" @@ -5494,7 +5497,7 @@ class Qwen35ChatHandler(MTMDChatHandler): " {%- set content = content.split('')[-1].lstrip('\n') -%}" " {%- endif -%}" " {%- set reasoning_content = reasoning_content | trim -%}" - " {%- if loop.index0 > ns.last_query_index -%}" + " {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}" " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" " {%- else -%}" " {{- '<|im_start|>' + message.role + '\n' + content -}}" @@ -5516,7 +5519,7 @@ class Qwen35ChatHandler(MTMDChatHandler): " {%- if tool_call.arguments is defined -%}" " {%- for (args_name, args_value) in tool_call.arguments | items -%}" " {{- '\n' -}}" - " {%- set args_value = args_value | tojson | safe if args_value is mapping or args_value is sequence and args_value is not string else args_value | string -%}" + " {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}" " {{- args_value -}}" " {{- '\n' -}}" " {%- endfor -%}" @@ -5543,7 +5546,7 @@ class Qwen35ChatHandler(MTMDChatHandler): "{%- endfor -%}" "{%- if add_generation_prompt -%}" " {{- '<|im_start|>assistant\n' -}}" - " {%- if enable_thinking is false -%}" + " {%- if enable_thinking is defined and enable_thinking is false -%}" " {{- '\n\n\n\n' -}}" " {%- else -%}" " {{- '\n' -}}" @@ -5553,23 +5556,29 @@ class Qwen35ChatHandler(MTMDChatHandler): def __init__( self, - enable_thinking: bool = True, add_vision_id: bool = True, + 
enable_thinking: bool = True, + preserve_thinking: bool = False, **kwargs, ): """ Parameters: - - enable_thinking (bool): - - True (default): Enables reasoning for better results. - - False: Disables reasoning for faster results. - add_vision_id (bool): - True (default): Count all the images. Recommended for multi-image. - False: Doesn't count the images. Can save tokens with single-image. + - enable_thinking (bool): + - True (default): Enables reasoning for better results. + - False: Disables reasoning for faster results. + - preserve_thinking (bool): + - True: Keeps reasoning process for ALL historical conversational turns. + - False (default): Only keeps for the latest assistant reply to save tokens. """ super().__init__(**kwargs) self.enable_thinking = enable_thinking - self.extra_template_arguments["enable_thinking"] = enable_thinking + self.preserve_thinking = preserve_thinking self.extra_template_arguments["add_vision_id"] = add_vision_id + self.extra_template_arguments["enable_thinking"] = enable_thinking + self.extra_template_arguments["preserve_thinking"] = preserve_thinking def __call__(self, **kwargs): llama = kwargs['llama'] @@ -5578,7 +5587,7 @@ def __call__(self, **kwargs): llama.input_ids.fill(0) if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing") # Use parent implementation return super().__call__(**kwargs) From b97cb637cd6124fc47f569721b1716014bd856a8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 17 Apr 2026 02:36:09 +0800 Subject: [PATCH 341/518] Bump version to 0.3.36 Signed-off-by: JamePeng --- CHANGELOG.md | 55 +++++++++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 156cbd334e..480b3c24cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 
+7,61 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.36] Gemma-4 Omni-Multimodal and ToolCall Improved, Qwen3.6 / Step3-VL Support, Compilation workflow optimization + +- feat: enhance `Qwen35ChatHandler` with preserve_thinking and `Qwen3.6` Support + - Add `preserve_thinking` parameter to optionally retain `<think>` reasoning + blocks across all historical conversational turns (defaults to False to save tokens). + - Improve template robustness by adding an `is defined` safety check for `enable_thinking`. + - Simplify JSON serialization logic for tool call arguments in the Jinja template. + - Update class docstring to explicitly indicate support for `Qwen 3.5` and `Qwen 3.6` models. + - Include `preserve_thinking` state in verbose processing logs. + +- docs: add comprehensive omni multimodal example for Gemma-4 (See here: [Gemma 4 Omni Example](https://github.com/JamePeng/llama-cpp-python?tab=readme-ov-file#comprehensive-omni-multimodal-example-gemma-4-vision--audio--text)) + - Wrapped the existing Qwen3-VL image loading example in a `<details>
` block to improve README readability and save vertical space. + - Introduced a complete, production-ready "Omni MultiModal" example demonstrating simultaneous Vision and Audio processing using the `Gemma4ChatHandler`. + - Added a universal `build_media_payload` helper function to dynamically route and encode local files into OpenAI-compatible `image_url` and `input_audio` payload structures. + - Added crucial documentation clarifying multimodal capability differences across Gemma-4 variants (E2B/E4B supporting full audio/vision vs. 31B/26BA4B supporting vision only). + + +- docs: add audio processing recommendation to Gemma4ChatHandler + - Recommend BF16 mmproj for Gemma4 E2B and E4B models. + - Note known degraded audio performance with other quantizations. + - Add reference link to the relevant llama.cpp PR/issue comment. + +- refactor: update Gemma4ChatHandler with latest google/gemma-4-31B-it chat template from huggingface + - Sync `Gemma4ChatHandler` logic with the upstream chat template, incorporating the new `format_tool_response_block` and OpenAI-compatible forward-scan tool resolution. + +- Update README.md for OpenVINO/Metal/Vulkan/SYCL + +- Implement `Step3VLChatHandler` for `Step3-VL-10B` + +- feat(types): align with latest OpenAI API spec and fix type issues + - Expand `CompletionUsage` with `PromptTokensDetails` and `CompletionTokensDetails` for granular token tracking. + - Add `usage` to `CreateChatCompletionStreamResponse` to support usage reporting in streaming mode. + - Fix duplicate `object` field in `CreateCompletionResponse`. + - Update `ChatCompletionRequestAssistantMessage` to accept `None` for `content` and introduce the new `refusal` field. + - Clean up `ChatCompletionRequestMessage` Union by removing the duplicate user message type. + - Broaden `ChatCompletionToolChoiceOption` to fully support `allowed_tools` and `custom` tool choice behaviors. 
+ +- feat(ci): Optimizing the GitHub build workflow for CUDA and METAL + - Update CI Action runner version + - microsoft/setup-msbuild@v2 -> v3 + - actions/checkout@v5 -> v6 + - actions/upload-artifact@v4 -> v6 + - actions/download-artifact@v4 -> v6 + - softprops/action-gh-release@v2 -> v3 + - ci: restrict cudaarch to Volta-Hopper to fix GitHub Actions timeout + - Using the `all` option for `cudaarch` on CUDA 12.4-12.6 causes the compilation process to exceed the 6-hour maximum execution limit on GitHub Actions, leading to cancelled jobs. + + - To resolve this and reduce build times, the target architectures are now restricted to explicitly support compute capabilities 7.0 through 9.0 (`70-real` to `90-real`). This maintains support for all modern NVIDIA GPUs equipped with Tensor Cores (from Volta up to Hopper architectures) while keeping the build time safely within CI constraints. + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/9db77a020c97ac3b13b7c1bf4e0c5787001533e7](https://github.com/ggml-org/llama.cpp/commit/9db77a020c97ac3b13b7c1bf4e0c5787001533e7) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260415 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/e1ade17c6330e3cc46a2b08f9b48b1540521b231...7820677e65827b6f3356f651da9be8d510ba10e5 + ## [0.3.35] Gemma 4 series & LFM 2.5-VL Support, OpenAI OpenAPI Alignment and Logging Architecture Migration - fix: expand stop sequences for `Gemma4ChatHandler` diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index fb263e7825..a02ec5af51 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.35" +__version__ = "0.3.36" From 7a19575ec579901c9718ec0aa77cb656bba9c47c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 18 Apr 2026 05:04:07 +0800 Subject: [PATCH 342/518] Update Submodule vendor/llama.cpp 9db77a0..45cac7c --- llama_cpp/llama_cpp.py | 20 ++++++++++---------- 
llama_cpp/mtmd_cpp.py | 38 ++++++++++++++++++++++++++++++-------- vendor/llama.cpp | 2 +- 3 files changed, 41 insertions(+), 19 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a527904637..7e2e32b4a0 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1014,7 +1014,7 @@ class llama_model_imatrix_data(ctypes.Structure): if TYPE_CHECKING: name: ctypes.c_char_p - data: ctypes.POINTER(ctypes.c_float) + data: ctypes.POINTER(ctypes.c_float) # type: ignore size: ctypes.c_size_t llama_model_imatrix_data_p = ctypes.POINTER(llama_model_imatrix_data) @@ -1068,10 +1068,10 @@ class llama_model_quantize_params(ctypes.Structure): pure: bool keep_split: bool dry_run: bool - imatrix: ctypes.POINTER(llama_model_imatrix_data) - kv_overrides: ctypes.POINTER(llama_model_kv_override) - tensor_types: ctypes.POINTER(llama_model_tensor_override) - prune_layers: ctypes.POINTER(ctypes.c_int32) + imatrix: ctypes.POINTER(llama_model_imatrix_data) # type: ignore + kv_overrides: ctypes.POINTER(llama_model_kv_override) # type: ignore + tensor_types: ctypes.POINTER(llama_model_tensor_override) # type: ignore + prune_layers: ctypes.POINTER(ctypes.c_int32) # type: ignore _fields_ = [ ("nthread", ctypes.c_int32), @@ -1345,7 +1345,7 @@ def llama_numa_init(numa: int, /): ) def llama_model_init_from_user( metadata: ctypes.c_void_p, - set_tensor_data: llama_model_set_tensor_data_t, + set_tensor_data: llama_model_set_tensor_data_t, # type: ignore set_tensor_data_ud: ctypes.c_void_p, params: llama_model_params, / @@ -4548,7 +4548,7 @@ def llama_sampler_init_grammar_lazy_patterns( vocab: llama_vocab_p, grammar_str: bytes, grammar_root: bytes, - trigger_patterns: CtypesArray[bytes], + trigger_patterns: CtypesArray[bytes], # type: ignore num_trigger_patterns: int, trigger_tokens: CtypesArray[llama_token], num_trigger_tokens: int, @@ -4803,8 +4803,8 @@ def llama_print_system_info() -> bytes: None, ) def llama_log_get( - log_callback: 
Optional[ctypes.pointer(ggml_log_callback)], - user_data: ctypes.pointer(ctypes.c_void_p), + log_callback: Optional[ctypes.pointer(ggml_log_callback)], # type: ignore + user_data: ctypes.pointer(ctypes.c_void_p), # type: ignore /, ): """Get callback for all future logging events. @@ -4819,7 +4819,7 @@ def llama_log_get( None, ) def llama_log_set( - log_callback: Optional[ggml_log_callback], + log_callback: Optional[ggml_log_callback], # type: ignore user_data: ctypes.c_void_p, /, ): diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index ba64b21ef6..57f8414b80 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -345,7 +345,7 @@ def mtmd_bitmap_init( ) def mtmd_bitmap_init_from_audio( n_samples: c_uint, - data: POINTER(c_float), + data: POINTER(c_float), # type: ignore /, ) -> mtmd_bitmap_p: ... @@ -582,6 +582,9 @@ class mtmd_decoder_pos(Structure): x: c_uint32 y: c_uint32 +mtmd_decoder_pos_p = POINTER(mtmd_decoder_pos) +mtmd_decoder_pos_p_ctypes = c_void_p + # // get position for decoder attention, to be used by M-RoPE models # // i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1 # // return relative position (for example, embedding 0 will have position (0, 0, 0); @@ -633,7 +636,7 @@ def mtmd_tokenize( ctx: mtmd_context_p, output: mtmd_input_chunks_p, text: mtmd_input_text_p, - bitmaps: POINTER(mtmd_bitmap_p), + bitmaps: POINTER(mtmd_bitmap_p), # type: ignore n_bitmaps: c_uint, /, ) -> c_int32: @@ -691,7 +694,7 @@ def mtmd_encode_chunk( # MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); @ctypes_function_mtmd( "mtmd_get_output_embd", [mtmd_context_p_ctypes], POINTER(c_float)) -def mtmd_get_output_embd(ctx: mtmd_context_p) -> POINTER(c_float): +def mtmd_get_output_embd(ctx: mtmd_context_p) -> POINTER(c_float): # type: ignore """ get output embeddings from the last encode pass """ @@ -703,7 +706,7 @@ def mtmd_get_output_embd(ctx: mtmd_context_p) -> POINTER(c_float): # MTMD_API void 
mtmd_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function_mtmd( "mtmd_log_set", [ggml_log_callback, c_void_p], None) -def mtmd_log_set(log_callback: ggml_log_callback, user_data: c_void_p): +def mtmd_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # type: ignore """ Set callback for all future logging events. """ @@ -735,7 +738,7 @@ def mtmd_test_create_input_chunks() -> mtmd_input_chunk_p: # MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function_mtmd( "mtmd_helper_log_set", [ggml_log_callback, c_void_p], None) -def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): +def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # type: ignore """ Set callback for all future logging events. """ @@ -810,6 +813,25 @@ def mtmd_helper_get_n_pos(chunks: mtmd_input_chunk_p) -> c_int32: ... +# // helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE +# // out_pos must have length == mtmd_helper_get_n_tokens(image) +# MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, struct mtmd_decoder_pos * out_pos); +@ctypes_function_mtmd("mtmd_helper_image_get_decoder_pos", [ + mtmd_image_tokens_p_ctypes, + mtmd_decoder_pos_p_ctypes + ], + None) +def mtmd_helper_image_get_decoder_pos( + image: mtmd_image_tokens_p, + out_pos: mtmd_decoder_pos_p # type: ignore +) -> c_int32: + """ + helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE + out_pos must have length == mtmd_helper_get_n_tokens(image) + """ + ... + + # // helper function that automatically: # // 1. run llama_decode() on text chunks # // 2. 
run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() @@ -844,7 +866,7 @@ def mtmd_helper_eval_chunks( seq_id: c_int32, n_batch: c_int32, logits_last: c_bool, - new_n_past: POINTER(c_int32), + new_n_past: POINTER(c_int32), # type: ignore /, ) -> c_int32: """ @@ -887,7 +909,7 @@ def mtmd_helper_eval_chunk_single( seq_id: c_int32, n_batch: c_int32, logits_last: c_bool, - new_n_past: POINTER(c_int32), + new_n_past: POINTER(c_int32), # type: ignore /, ) -> c_int32: """ @@ -923,7 +945,7 @@ def mtmd_helper_decode_image_chunk( ctx: mtmd_context_p, lctx: llama_cpp.llama_context_p, chunks: mtmd_input_chunk_p, - encoded_embd: POINTER(c_float), + encoded_embd: POINTER(c_float), # type: ignore n_past: c_int32, seq_id: c_int32, n_batch: c_int32, diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 9db77a020c..45cac7ca70 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 9db77a020c97ac3b13b7c1bf4e0c5787001533e7 +Subproject commit 45cac7ca703fb9085eae62b9121fca01d20177f6 From 33ce052ec868c1a301ced325ed9c5afbe041fb95 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 18 Apr 2026 06:05:03 +0800 Subject: [PATCH 343/518] chore(ci): upgrade astral-sh/setup-uv@v7 and Jimver/cuda-toolkit@v0.2.35 (Node 24 runtime) --- .github/workflows/build-wheels-cu124-linux.yml | 2 +- .github/workflows/build-wheels-cu124-win.yml | 4 ++-- .github/workflows/build-wheels-cu126-linux.yml | 2 +- .github/workflows/build-wheels-cu126-win.yml | 4 ++-- .github/workflows/build-wheels-cu128-linux.yml | 2 +- .github/workflows/build-wheels-cu128-win.yml | 4 ++-- .github/workflows/build-wheels-cu130-linux.yml | 2 +- .github/workflows/build-wheels-cu130-win.yml | 4 ++-- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 42b3d13169..889a1679a4 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ 
b/.github/workflows/build-wheels-cu124-linux.yml @@ -40,7 +40,7 @@ jobs: # from astral-sh/setup-uv - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true diff --git a/.github/workflows/build-wheels-cu124-win.yml b/.github/workflows/build-wheels-cu124-win.yml index 135b847d32..01bd48e7de 100644 --- a/.github/workflows/build-wheels-cu124-win.yml +++ b/.github/workflows/build-wheels-cu124-win.yml @@ -42,7 +42,7 @@ jobs: # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} - uses: N-Storm/cuda-toolkit@v0.2.28 + uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: cuda: "${{ matrix.cuda }}" @@ -50,7 +50,7 @@ jobs: # from astral-sh/setup-uv - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index f60eb5f878..568824c642 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -40,7 +40,7 @@ jobs: # from astral-sh/setup-uv - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true diff --git a/.github/workflows/build-wheels-cu126-win.yml b/.github/workflows/build-wheels-cu126-win.yml index be7bfdc72c..9330cb130b 100644 --- a/.github/workflows/build-wheels-cu126-win.yml +++ b/.github/workflows/build-wheels-cu126-win.yml @@ -42,7 +42,7 @@ jobs: # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} - uses: N-Storm/cuda-toolkit@v0.2.28 + uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: cuda: "${{ matrix.cuda }}" 
@@ -50,7 +50,7 @@ jobs: # from astral-sh/setup-uv - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index 0bfe971eea..d1c387c52a 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -40,7 +40,7 @@ jobs: # from astral-sh/setup-uv - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index 80dd9f2f74..98ebbc4127 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -42,7 +42,7 @@ jobs: # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} - uses: N-Storm/cuda-toolkit@v0.2.28 + uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: cuda: "${{ matrix.cuda }}" @@ -50,7 +50,7 @@ jobs: # from astral-sh/setup-uv - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml index 23cd668c8d..4f4305ad3e 100644 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ b/.github/workflows/build-wheels-cu130-linux.yml @@ -40,7 +40,7 @@ jobs: # from astral-sh/setup-uv - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true diff --git 
a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml index b995f4f5f4..d6187d7bf4 100644 --- a/.github/workflows/build-wheels-cu130-win.yml +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -42,7 +42,7 @@ jobs: # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} - uses: N-Storm/cuda-toolkit@v0.2.29 + uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: cuda: "${{ matrix.cuda }}" @@ -50,7 +50,7 @@ jobs: # from astral-sh/setup-uv - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true From 3984ab5c81a31f4fec7c3c0366f7ffa39c957e1e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 19 Apr 2026 15:34:42 +0800 Subject: [PATCH 344/518] Update Submodule vendor/llama.cpp 45cac7c..037bfe3 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 45cac7ca70..037bfe38d0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 45cac7ca703fb9085eae62b9121fca01d20177f6 +Subproject commit 037bfe38d0297001869df87150286952ae94cb1c From 15f8a36f00dd24dd92a8290b2a06f35fe51bcad4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 19 Apr 2026 16:07:05 +0800 Subject: [PATCH 345/518] docs: initialize LLM Wiki structure for better documentation maintenance - Create docs/wiki/ directory with full folder structure - Add SCHEMA.md, index.md and contributing guidelines - Set up core/, features/, examples/, types/ and subdirectories - Prepare for LLM-powered living documentation (Llama class, multi-modal chat handlers, vision/audio examples, etc.) - Include .gitkeep files to preserve empty directories This lays the foundation for a modern, maintainable wiki that will replace outdated static docs. 
Future commits will populate pages with up-to-date content generated from latest source code. Signed-off-by: JamePeng --- docs/wiki/.gitkeep | 0 docs/wiki/SCHEMA.md | 45 +++++++++++++++++++++ docs/wiki/contributing-to-wiki.md | 0 docs/wiki/core/.gitkeep | 0 docs/wiki/core/ChatHandler.md | 0 docs/wiki/core/Llama.md | 0 docs/wiki/core/LlamaChatFormat.md | 0 docs/wiki/core/LlamaCppBindings.md | 0 docs/wiki/core/MTMDCppBindings.md | 0 docs/wiki/development/.gitkeep | 0 docs/wiki/examples/.gitkeep | 0 docs/wiki/examples/audio/.gitkeep | 0 docs/wiki/examples/audio/audio-gemma.md | 0 docs/wiki/examples/audio/audio-qwen-omni.md | 0 docs/wiki/examples/basic-completion.md | 0 docs/wiki/examples/chat-completion.md | 0 docs/wiki/examples/speculative-decoding.md | 0 docs/wiki/examples/vision/.gitkeep | 0 docs/wiki/examples/vision/video/.gitkeep | 0 docs/wiki/examples/vision/vision-gemma.md | 0 docs/wiki/examples/vision/vision-glmv.md | 0 docs/wiki/examples/vision/vision-ocr.md | 0 docs/wiki/examples/vision/vision-qwen.md | 0 docs/wiki/features/.gitkeep | 0 docs/wiki/features/caching.md | 0 docs/wiki/features/embeddings-rerank.md | 0 docs/wiki/features/grammar.md | 0 docs/wiki/features/multi-model.md | 0 docs/wiki/features/tool-calls.md | 0 docs/wiki/index.md | 0 docs/wiki/install.md | 0 docs/wiki/troubleshooting.md | 0 docs/wiki/types/.gitkeep | 0 docs/wiki/types/common-types.md | 0 docs/wiki/types/mcp-types.md | 0 35 files changed, 45 insertions(+) create mode 100644 docs/wiki/.gitkeep create mode 100644 docs/wiki/SCHEMA.md create mode 100644 docs/wiki/contributing-to-wiki.md create mode 100644 docs/wiki/core/.gitkeep create mode 100644 docs/wiki/core/ChatHandler.md create mode 100644 docs/wiki/core/Llama.md create mode 100644 docs/wiki/core/LlamaChatFormat.md create mode 100644 docs/wiki/core/LlamaCppBindings.md create mode 100644 docs/wiki/core/MTMDCppBindings.md create mode 100644 docs/wiki/development/.gitkeep create mode 100644 docs/wiki/examples/.gitkeep create mode 
100644 docs/wiki/examples/audio/.gitkeep create mode 100644 docs/wiki/examples/audio/audio-gemma.md create mode 100644 docs/wiki/examples/audio/audio-qwen-omni.md create mode 100644 docs/wiki/examples/basic-completion.md create mode 100644 docs/wiki/examples/chat-completion.md create mode 100644 docs/wiki/examples/speculative-decoding.md create mode 100644 docs/wiki/examples/vision/.gitkeep create mode 100644 docs/wiki/examples/vision/video/.gitkeep create mode 100644 docs/wiki/examples/vision/vision-gemma.md create mode 100644 docs/wiki/examples/vision/vision-glmv.md create mode 100644 docs/wiki/examples/vision/vision-ocr.md create mode 100644 docs/wiki/examples/vision/vision-qwen.md create mode 100644 docs/wiki/features/.gitkeep create mode 100644 docs/wiki/features/caching.md create mode 100644 docs/wiki/features/embeddings-rerank.md create mode 100644 docs/wiki/features/grammar.md create mode 100644 docs/wiki/features/multi-model.md create mode 100644 docs/wiki/features/tool-calls.md create mode 100644 docs/wiki/index.md create mode 100644 docs/wiki/install.md create mode 100644 docs/wiki/troubleshooting.md create mode 100644 docs/wiki/types/.gitkeep create mode 100644 docs/wiki/types/common-types.md create mode 100644 docs/wiki/types/mcp-types.md diff --git a/docs/wiki/.gitkeep b/docs/wiki/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/SCHEMA.md b/docs/wiki/SCHEMA.md new file mode 100644 index 0000000000..4a8f700e2c --- /dev/null +++ b/docs/wiki/SCHEMA.md @@ -0,0 +1,45 @@ +# LLM Wiki Schema – llama-cpp-python + +**Purpose**: Maintain a living, always-up-to-date, structured documentation wiki for the llama-cpp-python library using LLMs as the primary maintainer. + +**Core Principles**: +- The source of truth is the latest code in `llama_cpp/` (especially `llama.py`, `llama_chat_format.py`, `llama_cpp.py`, `llama_types.py`, `mtmd_cpp.py`, `_internals.py`, `_ggml.py`). +- Never invent parameters or behavior. 
Always read the current source code before writing/updating a page. +- All examples must be complete, runnable with the latest API, and include necessary imports. +- Clearly mark any deprecated/old usage with a warning and show the modern replacement. +- Use internal wiki links (e.g. [[Llama]], [[Qwen35ChatHandler]]) for cross-referencing. +- Keep pages concise, professional, and user-friendly. + +**Page Types and Templates**: + +1. **Class / Module Page** (e.g. core/Llama.md) + - Frontmatter (YAML): + ```yaml + --- + title: Llama Class + class_name: Llama + last_updated: YYYY-MM-DD + version_target: "latest" + --- + ``` + - Sections (in order): + - Overview + - Constructor (`__init__`) – full parameter table with types, defaults, and explanations + - Core Methods (with signatures and examples) + - Best Practices & Common Patterns + - Deprecated / Changed APIs (with migration notes) + - Related Links + +2. **Feature Page** (features/xxx.md) + - Overview, When to use, Code examples, Limitations, Related features + +3. **Example Page** (examples/xxx.md) + - Goal, Prerequisites, Complete runnable code block, Expected output, Tips + +**Update Rules**: +- Before updating any page, the LLM must read the relevant source files. +- Update the `last_updated` date. +- If a new feature (e.g. new ChatHandler, new sampler) appears in code, create or expand the corresponding page. +- Maintain a high standard of readability and accuracy. + +This schema is the contract. All generated content must follow it. 
\ No newline at end of file diff --git a/docs/wiki/contributing-to-wiki.md b/docs/wiki/contributing-to-wiki.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/core/.gitkeep b/docs/wiki/core/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/core/ChatHandler.md b/docs/wiki/core/ChatHandler.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/core/LlamaChatFormat.md b/docs/wiki/core/LlamaChatFormat.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/core/LlamaCppBindings.md b/docs/wiki/core/LlamaCppBindings.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/core/MTMDCppBindings.md b/docs/wiki/core/MTMDCppBindings.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/development/.gitkeep b/docs/wiki/development/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/.gitkeep b/docs/wiki/examples/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/audio/.gitkeep b/docs/wiki/examples/audio/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/audio/audio-gemma.md b/docs/wiki/examples/audio/audio-gemma.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/audio/audio-qwen-omni.md b/docs/wiki/examples/audio/audio-qwen-omni.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/basic-completion.md b/docs/wiki/examples/basic-completion.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/chat-completion.md b/docs/wiki/examples/chat-completion.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/speculative-decoding.md b/docs/wiki/examples/speculative-decoding.md new file mode 100644 
index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/vision/.gitkeep b/docs/wiki/examples/vision/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/vision/video/.gitkeep b/docs/wiki/examples/vision/video/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/vision/vision-gemma.md b/docs/wiki/examples/vision/vision-gemma.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/vision/vision-glmv.md b/docs/wiki/examples/vision/vision-glmv.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/vision/vision-ocr.md b/docs/wiki/examples/vision/vision-ocr.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/examples/vision/vision-qwen.md b/docs/wiki/examples/vision/vision-qwen.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/features/.gitkeep b/docs/wiki/features/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/features/caching.md b/docs/wiki/features/caching.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/features/embeddings-rerank.md b/docs/wiki/features/embeddings-rerank.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/features/grammar.md b/docs/wiki/features/grammar.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/features/multi-model.md b/docs/wiki/features/multi-model.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/features/tool-calls.md b/docs/wiki/features/tool-calls.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/index.md b/docs/wiki/index.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/install.md b/docs/wiki/install.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/troubleshooting.md b/docs/wiki/troubleshooting.md new file mode 100644 index 
0000000000..e69de29bb2 diff --git a/docs/wiki/types/.gitkeep b/docs/wiki/types/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/types/common-types.md b/docs/wiki/types/common-types.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/wiki/types/mcp-types.md b/docs/wiki/types/mcp-types.md new file mode 100644 index 0000000000..e69de29bb2 From e3b7ad67901b0091662dc05079526c79d4f6ee6f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 20 Apr 2026 20:28:17 +0800 Subject: [PATCH 346/518] Update Submodule vendor/llama.cpp 037bfe3..81df3f7 --- llama_cpp/mtmd_cpp.py | 18 ++++++++++-------- vendor/llama.cpp | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 57f8414b80..4dd6d6d05f 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -587,17 +587,17 @@ class mtmd_decoder_pos(Structure): # // get position for decoder attention, to be used by M-RoPE models # // i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1 -# // return relative position (for example, embedding 0 will have position (0, 0, 0); -# // remember to adjust it to the current absolute position) -# MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i); +# // pos_0 is the absolute position of the first token +# // return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position) +# MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i); @ctypes_function_mtmd( - "mtmd_image_tokens_get_decoder_pos", [mtmd_image_tokens_p_ctypes, c_size_t], mtmd_decoder_pos) -def mtmd_image_tokens_get_decoder_pos(image_tokens: mtmd_image_tokens_p, i: c_size_t) -> mtmd_decoder_pos: + "mtmd_image_tokens_get_decoder_pos", [mtmd_image_tokens_p_ctypes, 
c_int32, c_size_t], mtmd_decoder_pos) +def mtmd_image_tokens_get_decoder_pos(image_tokens: mtmd_image_tokens_p, pos_0: c_int32, i: c_size_t) -> mtmd_decoder_pos: """ get position for decoder attention, to be used by M-RoPE models i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1 - return relative position (for example, embedding 0 will have position (0, 0, 0); - remember to adjust it to the current absolute position) + pos_0 is the absolute position of the first token + return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position) """ ... @@ -815,14 +815,16 @@ def mtmd_helper_get_n_pos(chunks: mtmd_input_chunk_p) -> c_int32: # // helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE # // out_pos must have length == mtmd_helper_get_n_tokens(image) -# MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, struct mtmd_decoder_pos * out_pos); +# MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos); @ctypes_function_mtmd("mtmd_helper_image_get_decoder_pos", [ mtmd_image_tokens_p_ctypes, + c_int32, mtmd_decoder_pos_p_ctypes ], None) def mtmd_helper_image_get_decoder_pos( image: mtmd_image_tokens_p, + pos_0: c_int32, out_pos: mtmd_decoder_pos_p # type: ignore ) -> c_int32: """ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 037bfe38d0..81df3f7cfa 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 037bfe38d0297001869df87150286952ae94cb1c +Subproject commit 81df3f7cfaa6f99de14e792b38d5771bf427383e From 8625836b703cfbd17f38105be5561e0147728a05 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 21 Apr 2026 22:07:43 +0800 Subject: [PATCH 347/518] Update Submodule vendor/llama.cpp 81df3f7..82209ef --- llama_cpp/_internals.py | 4 --- llama_cpp/llama_cpp.py | 58 
----------------------------------------- llama_cpp/mtmd_cpp.py | 26 ++++++++++++------ vendor/llama.cpp | 2 +- 4 files changed, 19 insertions(+), 71 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 20648cf74d..27dd4f80d3 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -758,10 +758,6 @@ def reset_timings(self): def print_timings(self): llama_cpp.llama_perf_context_print(self.ctx) - def print_memory_breakdown(self): - """print a breakdown of per-device memory use via LLAMA_LOG""" - llama_cpp.llama_memory_breakdown_print(self.ctx) - # LoRA / ALoRA Dynamic Routing Methods def clear_loras(self): diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 7e2e32b4a0..416e8b9357 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1519,53 +1519,6 @@ class llama_params_fit_status(enum.IntEnum): LLAMA_PARAMS_FIT_STATUS_ERROR = 2 -# // fits mparams and cparams to free device memory (assumes system memory is unlimited) -# // - returns true if the parameters could be successfully modified to fit device memory -# // - this function is NOT thread safe because it modifies the global llama logger state -# // - only parameters that have the same value as in llama_default_model_params are modified -# // with the exception of the context size which is modified if and only if equal to 0 -# LLAMA_API enum llama_params_fit_status llama_params_fit( -# const char * path_model, -# struct llama_model_params * mparams, -# struct llama_context_params * cparams, -# float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements -# struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements -# size_t margin, // margin of memory to leave per device in bytes -# uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use -# enum ggml_log_level log_level); // minimum 
log level to print during fitting, lower levels go to debug log -@ctypes_function( - "llama_params_fit", - [ - ctypes.c_char_p, - llama_model_params_p, - llama_context_params_p, - ctypes.POINTER(ctypes.c_float), - ctypes.POINTER(llama_model_tensor_buft_override), - ctypes.c_size_t, - ctypes.c_uint32, - ctypes.c_int, - ], - ctypes.c_int, -) -def llama_params_fit( - path_model: ctypes.c_char_p, - mparams: CtypesPointer[llama_model_params], - cparams: CtypesPointer[llama_context_params], - tensor_split: CtypesPointer[ctypes.c_float], - tensor_buft_overrides: CtypesPointer[llama_model_tensor_buft_override], - margin: ctypes.c_size_t, - n_ctx_min: ctypes.c_uint32, - log_level: int, - /, -) -> int: - """ - fits mparams and cparams to free device memory (assumes system memory is unlimited) - returns true if the parameters could be successfully modified to fit device memory - this function is NOT thread safe because it modifies the global llama logger state - """ - ... - - # LLAMA_API int64_t llama_time_us(void); @ctypes_function( "llama_time_us", @@ -4930,17 +4883,6 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... -# // print a breakdown of per-device memory use via LLAMA_LOG: -# LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); -@ctypes_function( - "llama_memory_breakdown_print", - [llama_context_p_ctypes], - None, -) -def llama_memory_breakdown_print(ctx: llama_context_p, /): - ... 
- - # // # // training # // diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 4dd6d6d05f..574d90e2bf 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -94,10 +94,19 @@ class mtmd_input_chunk_type(enum.IntEnum): mtmd_bitmap_p = NewType("mtmd_bitmap_p", int) mtmd_bitmap_p_ctypes = c_void_p +# // position indexing for decoder model +# enum mtmd_pos_type { +# MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens +# MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes +# }; +class mtmd_pos_type(enum.IntEnum): + MTMD_POS_TYPE_NORMAL = 0 # number of positions equals to number of tokens + MTMD_POS_TYPE_MROPE = 1 # qwen-vl mrope style, each image takes max(t,h,w) position indexes + # struct mtmd_image_tokens { # uint32_t nx; // number of tokens in x direction # uint32_t ny; // number of tokens in y direction -# bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position) +# mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL; # uint32_t n_tokens() const { return nx * ny; } # clip_image_f32_batch batch_f32; // preprocessed image patches # std::string id; // optional user-defined ID, useful for KV cache tracking @@ -269,28 +278,29 @@ def mtmd_free(ctx: mtmd_context_p): ... # // whether we need to set non-causal mask before llama_decode -# MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); +# // if chunk is nullptr, we assume the default case where chunk is an image chunk +# MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk); @ctypes_function_mtmd( - "mtmd_decode_use_non_causal", [mtmd_context_p_ctypes], c_bool) -def mtmd_decode_use_non_causal(ctx: mtmd_context_p) -> c_bool: + "mtmd_decode_use_non_causal", [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes], c_bool) +def mtmd_decode_use_non_causal(ctx: mtmd_context_p, chunk: mtmd_input_chunk_p) -> c_bool: ... 
# // whether the current model use M-RoPE for llama_decode -# MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); +# MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx); @ctypes_function_mtmd( "mtmd_decode_use_mrope", [mtmd_context_p_ctypes], c_bool) def mtmd_decode_use_mrope(ctx: mtmd_context_p) -> c_bool: ... # // whether the current model supports vision input -# MTMD_API bool mtmd_support_vision(mtmd_context * ctx); +# MTMD_API bool mtmd_support_vision(const mtmd_context * ctx); @ctypes_function_mtmd( "mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) def mtmd_support_vision(ctx: mtmd_context_p) -> c_bool: ... # // whether the current model supports audio input -# MTMD_API bool mtmd_support_audio(mtmd_context * ctx); +# MTMD_API bool mtmd_support_audio(const mtmd_context * ctx); @ctypes_function_mtmd( "mtmd_support_audio", [mtmd_context_p_ctypes], c_bool) def mtmd_support_audio(ctx: mtmd_context_p) -> c_bool: @@ -298,7 +308,7 @@ def mtmd_support_audio(ctx: mtmd_context_p) -> c_bool: # // get audio sample rate in Hz, for example 16000 for Whisper # // return -1 if audio is not supported -# MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx); +# MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx); @ctypes_function_mtmd( "mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int) def mtmd_get_audio_sample_rate(ctx: mtmd_context_p) -> c_int: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 81df3f7cfa..82209efb7e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 81df3f7cfaa6f99de14e792b38d5771bf427383e +Subproject commit 82209efb7eab9a741923897c74fbb8fd71cd17ba From dbb740780b03abe7fdabf7d4656f54b36132c765 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 23 Apr 2026 00:49:26 +0800 Subject: [PATCH 348/518] Update Submodule vendor/llama.cpp 82209ef..8bccdbb --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 
82209efb7e..8bccdbbff9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 82209efb7eab9a741923897c74fbb8fd71cd17ba +Subproject commit 8bccdbbff9d0d91d54838471f6eea182b9ab1b79 From afd038f96c59419f9c720450ed28c31e1c171682 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 23 Apr 2026 00:58:31 +0800 Subject: [PATCH 349/518] feat(types): introduce MCP definitions and align with latest OpenAI spec - Add comprehensive Model Context Protocol (MCP) type definitions, including `MCPTool`, `MCPToolCall`, `MCPListTools`, connector IDs, and approval filters to support remote server tool calling. - Add `ServiceTier` literal ("auto", "default", etc.) and include the `service_tier` field in `CreateChatCompletionResponse`. - Restrict `finish_reason` in completion responses to strict standard literals (`stop`, `length`, `tool_calls`, `content_filter`, `function_call`). - Introduce `ChatCompletionMessageCustomToolCall` to support custom tool calls generated by the model. - Update `ChatCompletionRequestAssistantMessage` to include the `name` field and add descriptive docstrings to message types. 
Signed-off-by: JamePeng --- llama_cpp/llama_types.py | 113 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 60202ae8f0..37b041ee87 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -144,7 +144,10 @@ class ChatCompletionResponseChoice(TypedDict): index: int message: "ChatCompletionResponseMessage" logprobs: Optional[ChatCompletionLogprobs] - finish_reason: Optional[str] + finish_reason: Optional[Literal["stop", "length", "tool_calls", "content_filter", "function_call"]] + + +ServiceTier = Literal["auto", "default", "flex", "scale", "priority"] class CreateChatCompletionResponse(TypedDict): @@ -152,6 +155,7 @@ class CreateChatCompletionResponse(TypedDict): object: Literal["chat.completion"] created: int model: str + service_tier: NotRequired[ServiceTier] choices: List["ChatCompletionResponseChoice"] usage: CompletionUsage @@ -300,27 +304,130 @@ class ChatCompletionRequestUserMessage(TypedDict): content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]] +# Function tool call + class ChatCompletionMessageToolCallFunction(TypedDict): + """The function that the model called.""" name: str arguments: str class ChatCompletionMessageToolCall(TypedDict): + """A call to a function tool created by the model.""" id: str type: Literal["function"] function: ChatCompletionMessageToolCallFunction +# Custom tool call -ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall] +class ChatCompletionMessageCustomToolCallCustom(TypedDict): + """The custom tool that the model called.""" + name: str + input: str +class ChatCompletionMessageCustomToolCall(TypedDict): + """A call to a custom tool created by the model.""" + id: str + type: Literal["custom"] + custom: ChatCompletionMessageCustomToolCallCustom -class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): +# The tool calls generated by the model, such as function 
calls. +ChatCompletionMessageToolCalls = Union[ + ChatCompletionMessageToolCall, + ChatCompletionMessageCustomToolCall +] + + +# MCP ToolCall + +MCPConnectorID = Literal[ + "connector_dropbox", + "connector_gmail", + "connector_googlecalendar", + "connector_googledrive", + "connector_microsoftteams", + "connector_outlookcalendar", + "connector_outlookemail", + "connector_sharepoint" +] + +MCPToolCallStatus = Literal["in_progress", "completed", "incomplete", "calling", "failed"] + +class MCPToolCall(TypedDict): + """An invocation of a tool on an MCP server.""" + type: Literal["mcp_call"] + id: str + server_label: str name: str + arguments: str # JSON string + output: NotRequired[Optional[str]] + error: NotRequired[Optional[str]] + status: NotRequired[MCPToolCallStatus] + approval_request_id: NotRequired[Optional[str]] + + +class MCPListToolsTool(TypedDict): + """A tool available on an MCP server.""" + name: str + description: Optional[str] + input_schema: Dict[str, Any] # The JSON schema describing the tool's input + annotations: Optional[Dict[str, Any]] + + +class MCPListTools(TypedDict): + """A list of tools available on an MCP server.""" + type: Literal["mcp_list_tools"] + id: str + server_label: str + tools: List[MCPListToolsTool] + error: Optional[str] + + +class MCPToolFilter(TypedDict): + """A filter object to specify which tools are allowed.""" + tool_names: NotRequired[List[str]] + read_only: NotRequired[bool] + + +class MCPToolApprovalFilter(TypedDict, total=False): + """Specify which of the MCP server's tools require approval based on filters.""" + always: MCPToolFilter + never: MCPToolFilter + + +class MCPTool(TypedDict): + """ + Give the model access to additional tools via remote Model Context Protocol (MCP) servers. + """ + # The type of the MCP tool. Always `mcp`. + type: Literal["mcp"] + # A label for this MCP server, used to identify it in tool calls. + server_label: str + # The URL for the MCP server. 
One of `server_url` or `connector_id` must be provided. + server_url: NotRequired[str] + connector_id: NotRequired[MCPConnectorID] + authorization: NotRequired[str] + server_description: NotRequired[str] + headers: NotRequired[Optional[Dict[str, str]]] + # List of allowed tool names or a filter object. + allowed_tools: NotRequired[Optional[Union[List[str], MCPToolFilter]]] + # Specify which of the MCP server's tools require approval. + require_approval: NotRequired[Optional[Union[Literal["always", "never"], MCPToolApprovalFilter]]] + # Whether this MCP tool is deferred and discovered via tool search. + defer_loading: NotRequired[bool] + + +# Assistant message + +class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): arguments: str + name: str class ChatCompletionRequestAssistantMessage(TypedDict): + """Messages sent by the model in response to user messages.""" role: Literal["assistant"] + name: Optional[str] content: NotRequired[Optional[str]] refusal: NotRequired[Optional[str]] tool_calls: NotRequired[ChatCompletionMessageToolCalls] From 852a2e972cb1521ca374de4b60f9ca1dffd0962c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 23 Apr 2026 01:36:03 +0800 Subject: [PATCH 350/518] Update /docs/wiki/core/Llama.md Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 205 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index e69de29bb2..ce2f1ba5d7 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -0,0 +1,205 @@ +```yaml +--- +title: Llama Class +class_name: Llama +last_updated: 2026-04-23 +version_target: "latest" +--- +``` + +## Overview +The `Llama` class is the core, high-level Python wrapper for a `llama.cpp` model. It handles model loading, memory management (KV cache), tokenization, and generation (both base text completion and chat formatting). 
It includes advanced features like dynamic LoRA routing, hybrid model checkpointing, speculative decoding, and context shifting. + +## Constructor (`__init__`) + +Initialize the model and context. Note that model loading will immediately allocate RAM/VRAM based on the selected offloading parameters. + +### Core Model & Hardware Parameters +| Parameter | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `model_path` | `str` | **Required** | Path to the `.gguf` model file. | +| `n_gpu_layers` | `int` | `0` | Number of layers to offload to GPU. Set to `-1` for all layers. | +| `split_mode` | `int` | `LLAMA_SPLIT_MODE_LAYER` | How to split the model across GPUs (e.g., `LLAMA_SPLIT_MODE_ROW`). | +| `main_gpu` | `int` | `0` | The primary GPU to use for intermediate results or the entire model. | +| `tensor_split` | `List[float]` | `None` | Proportional split of tensors across GPUs (max `LLAMA_MAX_DEVICES`). | +| `use_mmap` | `bool` | `True` | Whether to use memory mapping (mmap) if possible. | +| `use_mlock` | `bool` | `False` | Force the system to keep the model in RAM, preventing swapping. | +| `kv_overrides` | `Dict` | `None` | Key-value overrides for the model metadata (supports bool, int, float, str). | +| `numa` | `Union[bool, int]`| `False` | NUMA strategy (e.g., `GGML_NUMA_STRATEGY_DISTRIBUTE`). | + +### Context & Performance Parameters +| Parameter | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `n_ctx` | `int` | `512` | Text context size. Set to `0` to load from model metadata. | +| `n_batch` | `int` | `2048` | Maximum batch size for prompt processing. | +| `n_ubatch` | `int` | `512` | Physical batch size. | +| `n_threads` | `int` | `None` | Number of threads for generation (defaults to CPU count // 2). | +| `n_threads_batch`| `int` | `None` | Number of threads for batch processing (defaults to CPU count). | +| `flash_attn_type`| `int` | `AUTO` | Controls Flash Attention activation (`LLAMA_FLASH_ATTN_TYPE_AUTO`). 
| +| `swa_full` | `bool` | `None` | Whether to use full-size SWA cache | +| `kv_unified` | `bool` | `None` | Use single unified KV buffer for the KV cache of all sequences | +| `type_k` / `type_v`| `int` | `None` | KV cache data type for K and V (defaults to `f16`). | +| `offload_kqv` | `bool` | `True` | Whether to offload K, Q, V tensors to GPU. | + +### Advanced & Chat Parameters +| Parameter | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `chat_format` | `str` | `None` | String specifying the chat template (e.g., `"llama-2"`, `"chatml"`). Guessed from GGUF if None. | +| `chat_handler` | `LlamaChatCompletionHandler` | `None` | Optional custom handler. See [[ChatHandlers]]. | +| `draft_model` | `LlamaDraftModel` | `None` | Optional draft model for speculative decoding. | +| `ctx_checkpoints` | `int` | `32` | Max context checkpoints per slot (Hybrid/SWA models). | +| `checkpoint_interval`| `int`| `4096` | Token interval for saving Hybrid model checkpoints. | + +*(Note: There are numerous additional RoPE/YaRN scaling parameters available for specialized context extension. Refer to the source code for the full list).* + +--- + +## Core Methods + +### `create_chat_completion` +Generates a chat response using the configured `chat_format` or `chat_handler`. +```python +import llama_cpp + +model = llama_cpp.Llama(model_path="models/qwen2.5-7b-instruct.gguf", n_gpu_layers=-1) + +response = model.create_chat_completion( + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Explain KV caching."} + ], + temperature=0.7, + max_tokens=2048 +) +print(response["choices"][0]["message"]["content"]) +``` + +### `create_completion` / `__call__` +Generates standard text completion from a raw string prompt. 
+```python +import llama_cpp + +model = llama_cpp.Llama(model_path="models/llama-3-8b.gguf") +output = model("The capital of Japan is", max_tokens=10, stop=["\n"]) +print(output["choices"][0]["text"]) +``` + +### `generate` +A low-level generator yielding token IDs one by one. Highly customizable with sampling parameters, dynamic LoRA mounting, and control vectors. +```python +import llama_cpp + +model = llama_cpp.Llama(model_path="models/llama-3-8b.gguf") +tokens = model.tokenize(b"def fibonacci(n):") + +for token in model.generate(tokens, top_k=40, top_p=0.95, temp=0.2): + print(model.detokenize([token]).decode('utf-8'), end="", flush=True) +``` + +### `eval` +Low-level method to ingest and evaluate a sequence of tokens. Used internally to update the KV cache and logits. Handles **Context Shifting** automatically to prevent OOM when the token count exceeds `n_ctx`. +```python +# Evaluates a chunk of tokens and updates internal state +model.eval(tokens=[1, 453, 234, 987], active_loras=[{"name": "coding_adapter", "scale": 1.0}]) +``` + +### Dynamic LoRA Management +The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dynamically per-generation or per-eval. +* `load_lora(name: str, path: str)`: Loads an adapter into VRAM (does not apply it yet). +* `unload_lora(name: str)`: Releases the specific LoRA from VRAM. +* `list_loras() -> List[str]`: Returns names of all registered LoRAs. +* `unload_all_loras()`: Forces VRAM release for all loaded adapters. + +--- + +## Best Practices & Common Patterns + +1. **Context Shifting & Prompt Caching**: + + By default, when calling `.generate()` or `.create_completion(reset=True)`, the engine checks for the longest matching prefix in the existing KV cache. To maximize speed, keep system prompts static and only append new dialogue to avoid re-evaluating the entire history. 
If the context limit is reached during `eval`, the model will automatically trigger a Context Shift (discarding older tokens while attempting to keep `n_keep` tokens, usually the system prompt). + +2. **Basic Chat with JSON Mode**: + Forces the model to output valid JSON by using the `response_format` parameter. + ```python + from llama_cpp import Llama + + llm = Llama(model_path="path/to/model.gguf", n_gpu_layers=-1) + + response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Extract name and age from: John is 30."}], + response_format={"type": "json_object"}, + temperature=0.0 + ) + print(response["choices"][0]["message"]["content"]) + ``` + +3. **Speculative Decoding**: + + Accelerates generation by using a small "draft" model to predict tokens, which the larger model then validates in parallel. + ```python + from llama_cpp import Llama + from llama_cpp.llama_speculative import LlamaDraftModel + + draft = LlamaDraftModel.from_model(Llama(model_path="tiny_draft.gguf", n_gpu_layers=-1)) + main_llm = Llama(model_path="large_model.gguf", n_gpu_layers=-1, draft_model=draft) + + for chunk in main_llm.create_completion("Explain quantum physics", stream=True): + print(chunk["choices"][0]["text"], end="") + ``` + +4. **Dynamic LoRA Routing**: + + You can load multiple LoRAs using `load_lora()` at startup. Then, pass the `active_loras` parameter to `.generate()`, `.create_completion()`, or `.create_chat_completion()` to dynamically apply them to specific queries without reloading the base model. + + Multi-LoRA Dynamic Switching Example:
+ + Load multiple adapters and apply them selectively without reloading the base model. + ```python + llm = Llama(model_path="base_model.gguf") + llm.load_lora("coding", "codellama_adapter.gguf") + llm.load_lora("story", "storywriter_adapter.gguf") + llm.load_lora("sql_expert", "adapters/sql_lora.gguf") + + # Use coding adapter + llm.create_completion("def sort:", active_loras=[{"name": "coding", "scale": 1.0}]) + + # Use story adapter + llm.create_completion("Once upon a time", active_loras=[{"name": "story", "scale": 0.9}]) + + # Use sql adapter + llm.create_completion("SELECT *", active_loras=[{"name": "sql_expert", "scale": 0.8}]) + ``` + +5. **Hybrid & Recurrent Architectures**: + + The class natively detects Hybrid/Recurrent models (like LFM2VL/LFM2.5VL, Qwen3.5/3.6, Mamba or specialized SWA models (Gemma3/4)) and automatically enables the `HybridCheckpointCache`. This creates periodic save-states during large context pre-filling, allowing the model to roll back seamlessly if a generation is rejected (e.g., speculative decoding mismatches) without corrupting the recurrent state. + + * Tip: If you are using a hybrid multimodal model for building ComfyUI nodes or running single-turn API wrappers where you do not need multi-turn state rollbacks, simply initialize your Llama instance with `ctx_checkpoints=0`: + + ```python + llm = Llama( + model_path="./Qwen3.5-VL-9B.gguf", + chat_handler=MTMDChatHandler(clip_model_path="./mmproj.gguf"), + n_ctx=4096, + ctx_checkpoints=0 # <-- SET THIS TO 0 TO ENABLE ZERO-LATENCY FAST PATH + ) + ``` + +--- + +## Deprecated / Changed APIs + +> ⚠️ **Warning:** The internal embedding methods on the `Llama` class are deprecated and will be removed. + +* `embed()` ➔ **Deprecated.** +* `create_embedding()` ➔ **Deprecated.** + +**Migration Note:** Do not use `Llama(..., embeddings=True)` combined with `model.create_embedding(...)`. Instead, use the dedicated `LlamaEmbedding` class, which offers optimized batching and reranking support.
+*See: [[LlamaEmbedding]]* + +--- + +## Related Links +* [[LlamaEmbedding]] - Dedicated class for text embeddings and reranking. +* [[ChatHandlers]] - Customizing `LlamaChatCompletionHandler` for function calling and vision/omni models (e.g., `[[Gemma4ChatHandler]]`, `[[Qwen35ChatHandler]]`). +* [[LlamaCache]] - Implementing disk or RAM-based prompt caching (LlamaRAMCache, **TrieCache**, **HybridCheckpointCache**). \ No newline at end of file From 7a02d4d213a420d70c091019367e45759a94ff9a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 25 Apr 2026 01:55:58 +0800 Subject: [PATCH 351/518] Update Submodule vendor/llama.cpp 8bccdbb..13d36cf --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8bccdbbff9..13d36cf891 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8bccdbbff9d0d91d54838471f6eea182b9ab1b79 +Subproject commit 13d36cf89178354d9aa6732e5930d89d64caf718 From ddf577dd5bcf46657e4429d8ab774261b683b647 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 25 Apr 2026 04:02:50 +0800 Subject: [PATCH 352/518] feat(speculative): introduce O(1) hash-based N-Gram speculative decoding - Add `LlamaNGramMapDecoding` to `llama_speculative.py`, implementing an ultra-fast speculative decoder based on a hash inverted index and incremental updates. - Achieve O(1) time complexity for draft token generation, completely eliminating the CPU bottleneck present in the legacy Numpy sliding window approach. - Update `README.md` and `docs/wiki/core/Llama.md` to recommend `LlamaNGramMapDecoding` as the default and fastest speculative decoding method, along with updated initialization examples. - Add docs comment to the speculative decoding classes for better developer experience. - Add warnings to the legacy `LlamaPromptLookupDecoding` class regarding its high computational overhead for long contexts. 
--- README.md | 18 ++++- docs/wiki/core/Llama.md | 17 ++++- llama_cpp/llama_speculative.py | 129 ++++++++++++++++++++++++++++++++- 3 files changed, 153 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ab643fe427..8873c39039 100644 --- a/README.md +++ b/README.md @@ -1370,19 +1370,29 @@ emb = llm.create_embedding("text") `llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model. -The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class. +The fastest way to use speculative decoding is through the `LlamaNGramMapDecoding` (**recommended**) or `LlamaPromptLookupDecoding` class. Just pass this as a draft model to the `Llama` class during initialization. ```python from llama_cpp import Llama -from llama_cpp.llama_speculative import LlamaPromptLookupDecoding +from llama_cpp.llama_speculative import LlamaNGramMapDecoding llama = Llama( - model_path="path/to/model.gguf", - draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. + model_path="path/to/qwen-3.6-27b.gguf", + n_ctx=4096, + n_gpu_layers=-1, + draft_model=LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10 + ) +) + +response = llama.create_chat_completion( + messages=[{"role": "user", "content": "Write a python script..."}] ) ``` +Note: `LlamaPromptLookupDecoding.num_pred_tokens` is the number of tokens to predict; 10 is the default and is generally good for GPU, while 2 performs better on CPU-only machines. With `LlamaNGramMapDecoding` and its hash-map algorithm, draft generation is effectively $O(1)$, so its cost is negligible whether you predict 2 or 10 tokens.
### Adjusting the Context Window diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index ce2f1ba5d7..6c8c47b7fe 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -136,16 +136,25 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn 3. **Speculative Decoding**: Accelerates generation by using a small "draft" model to predict tokens, which the larger model then validates in parallel. + The fastest way to use speculative decoding is through the `LlamaNGramMapDecoding` (**recommended**) or `LlamaPromptLookupDecoding` class. ```python from llama_cpp import Llama - from llama_cpp.llama_speculative import LlamaDraftModel - - draft = LlamaDraftModel.from_model(Llama(model_path="tiny_draft.gguf", n_gpu_layers=-1)) - main_llm = Llama(model_path="large_model.gguf", n_gpu_layers=-1, draft_model=draft) + from llama_cpp.llama_speculative import LlamaNGramMapDecoding + + main_llm = Llama( + model_path="path/to/qwen-3.6-27b.gguf", + n_ctx=4096, + n_gpu_layers=-1, + draft_model=LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10 + ) + ) for chunk in main_llm.create_completion("Explain quantum physics", stream=True): print(chunk["choices"][0]["text"], end="") ``` + Note: `LlamaPromptLookupDecoding.num_pred_tokens` is the number of tokens to predict; 10 is the default and is generally good for GPU, while 2 performs better on CPU-only machines. With `LlamaNGramMapDecoding` and its hash-map algorithm, draft generation is effectively $O(1)$, so its cost is negligible whether you predict 2 or 10 tokens. 4.
**Dynamic LoRA Routing**: diff --git a/llama_cpp/llama_speculative.py b/llama_cpp/llama_speculative.py index 39dfb903ba..c3814aaf42 100644 --- a/llama_cpp/llama_speculative.py +++ b/llama_cpp/llama_speculative.py @@ -1,6 +1,7 @@ import abc +import collections -from typing import Any +from typing import Any, Dict, List, Tuple import numpy as np import numpy.typing as npt @@ -14,10 +15,120 @@ def __call__( raise NotImplementedError() +class LlamaNGramMapDecoding(LlamaDraftModel): + """ + Ultra-fast speculative decoder based on hash inverted index and incremental updates. + O(1) time complexity, aligned with llama.cpp's underlying ngram-map algorithm. + """ + + def __init__(self, ngram_size: int = 3, num_pred_tokens: int = 10): + """ + Initializes the N-Gram Map speculative decoder. + + Args: + ngram_size (int): The length of the token sequence used as the search key. + Larger values provide strictly accurate context matching but may result + in fewer cache hits. Defaults to 3. + num_pred_tokens (int): The maximum number of future tokens to draft (predict) + and return once a match is found in the history. Defaults to 10. + """ + self.ngram_size = ngram_size + self.num_pred_tokens = num_pred_tokens + + # Core state cache + # Mapping format: (token_1, ..., token_N) -> [index_1, index_2, ...] + self._ngram_map: Dict[Tuple[int, ...], List[int]] = collections.defaultdict(list) + self._history: List[int] = [] + + def _update_cache(self, input_ids: npt.NDArray[np.intc]) -> None: + """ + Smart state synchronization and incremental build (Extreme O(1) optimization). + + Args: + input_ids (npt.NDArray[np.intc]): The complete sequence of current token IDs + generated or processed so far. 
+ """ + new_len = len(input_ids) + old_len = len(self._history) + + # Check if it's a perfect incremental append (verify if the previous token matches) + is_incremental = False + if new_len > old_len and old_len > 0: + if self._history[-1] == input_ids[old_len - 1]: + is_incremental = True + + if is_incremental: + # Only extract, convert, and append new tokens. + # Never copy or touch the entire historical array! + new_tokens = input_ids[old_len:].tolist() + self._history.extend(new_tokens) + start_idx = max(0, old_len - self.ngram_size) + else: + # Rollback occurred (wrong prediction) or a completely new Prompt. Trigger full rebuild. + self._ngram_map.clear() + self._history = input_ids.tolist() + start_idx = 0 + + # Build/update the hash inverted index + for i in range(start_idx, new_len - self.ngram_size): + key = tuple(self._history[i : i + self.ngram_size]) + self._ngram_map[key].append(i) + + def __call__( + self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any + ) -> npt.NDArray[np.intc]: + """ + Generates draft tokens based on historical N-Gram frequency. + + Args: + input_ids (npt.NDArray[np.intc]): The current sequence of token IDs. + **kwargs: Additional generation arguments (ignored in this implementation). + + Returns: + npt.NDArray[np.intc]: An array of predicted draft tokens. Returns an empty + array if no matching context is found. + """ + # 1. Ultra-fast state synchronization + self._update_cache(input_ids) + + # 2. Cannot speculate if the history is too short + if len(self._history) < self.ngram_size: + return np.array([], dtype=np.intc) + + # 3. Extract the Search Key (the last N tokens) + search_key = tuple(self._history[-self.ngram_size:]) + + # 4. O(1) instant lookup + match_indices = self._ngram_map.get(search_key) + + if not match_indices: + return np.array([], dtype=np.intc) + + # 5. 
Get the context of the last match and extract draft tokens + best_match_idx = match_indices[-1] + draft_start = best_match_idx + self.ngram_size + draft_end = min(draft_start + self.num_pred_tokens, len(self._history)) + + return np.array(self._history[draft_start:draft_end], dtype=np.intc) + + +# Legacy Numpy sliding window implementation class LlamaPromptLookupDecoding(LlamaDraftModel): - """Based on https://github.com/apoorvumang/prompt-lookup-decoding""" + """ + Stateless speculative decoding based on Numpy sliding window + Warning: High computational overhead for long contexts. + + Based on https://github.com/apoorvumang/prompt-lookup-decoding + """ + + def __init__(self, max_ngram_size: int = 3, num_pred_tokens: int = 10): + """ + Initializes the legacy sliding window speculative decoder. - def __init__(self, max_ngram_size: int = 2, num_pred_tokens: int = 10): + Args: + max_ngram_size (int): The maximum n-gram size to search for. Defaults to 3. + num_pred_tokens (int): The maximum number of tokens to predict. Defaults to 10. + """ self.max_ngram_size = max_ngram_size self.num_pred_tokens = num_pred_tokens @@ -27,6 +138,17 @@ def find_candidate_pred_tokens( max_ngram_size: int, num_pred_tokens: int, ): + """ + Linearly scans the input_ids using sliding windows to find pattern matches. + + Args: + input_ids (npt.NDArray[np.intc]): The complete sequence of token IDs. + max_ngram_size (int): Maximum size of the n-gram window. + num_pred_tokens (int): Maximum draft tokens to return. + + Returns: + npt.NDArray[np.intc]: The predicted draft tokens. 
+ """ input_length = input_ids.shape[0] for ngram_size in range(min(max_ngram_size, input_length - 1), 0, -1): @@ -57,6 +179,7 @@ def find_candidate_pred_tokens( def __call__( self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any ) -> npt.NDArray[np.intc]: + """Generates draft tokens using the legacy sliding window search.""" return self.find_candidate_pred_tokens( input_ids=input_ids, max_ngram_size=self.max_ngram_size, From 3ec636d3a98f9b1d62f38cd90ca82d51293b8c5c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 26 Apr 2026 02:35:13 +0800 Subject: [PATCH 353/518] Update Submodule vendor/llama.cpp 13d36cf..b760272 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 13d36cf891..b760272f1a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 13d36cf89178354d9aa6732e5930d89d64caf718 +Subproject commit b760272f1a25fcae065d827ce2cbcaa035597b02 From 2cccd2eee3c7ce7476080d8ff06870542722add3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 26 Apr 2026 11:29:10 +0800 Subject: [PATCH 354/518] feat(core): implement thread-safe generation abort mechanism - Add `AbortCriteria` class and a thread-safe `Llama.abort()` method to allow graceful interruption of ongoing text generation from external threads (e.g., UI or async environments). - Automatically inject `AbortCriteria` into the stopping criteria sequence at the start of `_create_completion`. - Ensure that when an abort is triggered, the partially generated `completion_tokens` are correctly detokenized and preserved. - Set `finish_reason` to `"abort"` when generation is interrupted, allowing downstream streaming clients to correctly identify manual cancellations. - Simplify and optimize the stopping criteria evaluation logic within the core `generate` loop. - Reorganize and sort module imports for better readability. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 81 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 59e636b56e..be5d259868 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1,17 +1,20 @@ from __future__ import annotations +import contextlib +import ctypes +import fnmatch +import json +import multiprocessing import os import sys -import uuid import time -import json -import ctypes +import threading import typing -import random -import fnmatch +import uuid import warnings -import contextlib -import multiprocessing + +import numpy as np +import numpy.typing as npt from typing import ( Any, @@ -29,7 +32,6 @@ from collections import deque from pathlib import Path - from .llama_types import * from .llama_grammar import LlamaGrammar from .llama_cache import ( @@ -46,9 +48,6 @@ from llama_cpp.llama_speculative import LlamaDraftModel -import numpy as np -import numpy.typing as npt - import llama_cpp._internals as internals from ._internals import ( LlamaSamplingContext, @@ -60,6 +59,19 @@ from ._utils import suppress_stdout_stderr +class AbortCriteria: + """ + Listen for external interruption signals to trigger a stop condition. + When an external thread calls `llama.abort()`, a loop interrupt is generated. + """ + def __init__(self, abort_event: threading.Event): + self.abort_event = abort_event + + def __call__(self, _input_ids: npt.NDArray[np.intc], _logits: npt.NDArray[np.single]) -> bool: + # Note: _input_ids and _logits are required by the signature but unused here. 
+ return self.abort_event.is_set() + + class Llama: """High-level Python wrapper for a llama.cpp model.""" @@ -623,6 +635,9 @@ def __init__( self._sampling_ctx: Optional[LlamaSamplingContext] = None + # Create a thread-safe interrupt event + self._abort_event = threading.Event() + def close(self) -> None: """Explicitly free the model from memory.""" if getattr(self, "_sampling_ctx", None) is not None: @@ -769,6 +784,15 @@ def reset(self): """Reset the model state.""" self.n_tokens = 0 + def abort(self) -> None: + """ + Safely aborts any ongoing text generation. + Useful for async API environments or UI interruption buttons. + """ + if self.verbose: + print(f"Llama.abort: Abort signal received. Terminating generation...", file=sys.stderr) + self._abort_event.set() + def eval( self, tokens: Sequence[int], @@ -1442,26 +1466,18 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array): # Sample loop while sample_idx < self.n_tokens: + if self._abort_event.is_set(): + return + token = self._sampling_ctx.sample(self._ctx, idx=-1) self._sampling_ctx.accept(token, False if grammar is None else True) sample_idx += 1 - # Halt generation if custom stopping criteria are met if stopping_criteria is not None: - if self._logits_all: - logits_idx = sample_idx - self.n_tokens - check_stopping = True - else: - if sample_idx == self.n_tokens: - logits_idx = 0 - check_stopping = True - else: - check_stopping = False - - if check_stopping and stopping_criteria( + if stopping_criteria( self._input_ids[: sample_idx], - self._scores[logits_idx, :] + self._scores[0 if not self._logits_all else sample_idx - self.n_tokens, :] ): return @@ -1746,6 +1762,8 @@ def _create_completion( Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: assert suffix is None or suffix.__class__ is str + # Each time a new request is initiated, the previous abort state must be cleared. 
+ self._abort_event.clear() completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) @@ -1885,6 +1903,11 @@ def _create_completion( if self.verbose: print("Llama._create_completion: cache miss", file=sys.stderr) + if stopping_criteria is None: + stopping_criteria = StoppingCriteriaList([AbortCriteria(self._abort_event)]) + else: + stopping_criteria.append(AbortCriteria(self._abort_event)) + finish_reason = "length" multibyte_fix = 0 for token in self.generate( @@ -1929,6 +1952,11 @@ def _create_completion( finish_reason = "stop" break + if self._abort_event.is_set(): + text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + finish_reason = "abort" + break + completion_tokens.append(token) all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) @@ -2108,6 +2136,11 @@ def _create_completion( text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" + # If the abort is triggered externally, force the `finish_reason` to be changed to "abort". + if self._abort_event.is_set(): + text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + finish_reason = "abort" + if self.verbose: self._ctx.print_timings() From 7a4dd5b1e5dfa2a58108e79e337fcc0c606d5c0a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 26 Apr 2026 18:02:16 +0800 Subject: [PATCH 355/518] Update /docs/wiki/core/Llama.md for abort() and example code Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 102 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index 6c8c47b7fe..eacda189e3 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -2,7 +2,7 @@ --- title: Llama Class class_name: Llama -last_updated: 2026-04-23 +last_updated: 2026-04-26 version_target: "latest" --- ``` @@ -103,6 +103,10 @@ Low-level method to ingest and evaluate a sequence of tokens. 
Used internally to model.eval(tokens=[1, 453, 234, 987], active_loras=[{"name": "coding_adapter", "scale": 1.0}]) ``` +### `abort` +Immediately halts an active generation loop safely. +* **Usage**: Typically called from a separate monitoring thread (like a timer). When triggered, the running stream will exit and the final chunk will contain `"finish_reason": "abort"`. + ### Dynamic LoRA Management The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dynamically per-generation or per-eval. * `load_lora(name: str, path: str)`: Loads an adapter into VRAM (does not apply it yet). @@ -193,6 +197,102 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn ctx_checkpoints=0 # <-- SET THIS TO 0 TO ENABLE ZERO-LATENCY FAST PATH ) ``` +6. **Assistant Prefill**: + + `llama-cpp-python` supports native **Assistant Prefill** for seamless message continuation. You can now simply use the `assistant_prefill=True` parameter in the `create_chat_completion` function. + + This safely renders the `N-1` conversation history using standard Jinja templates (preserving exact control tokens) and flawlessly appends your partial text directly to the prompt. + + ```python + from llama_cpp import Llama + + llm = Llama(model_path="path/to/model.gguf") + + # An interrupted/partial conversation + messages = [ + {"role": "user", "content": "What are the first 5 planets in the solar system?"}, + {"role": "assistant", "content": "The first 5 planets in our solar system are:\n1. Mercury\n2."} + ] + + # Seamlessly continue the generation + response = llm.create_chat_completion( + messages=messages, + max_tokens=50, + assistant_prefill=True # <--- Enables seamless continuation + ) + + prefilled_text = messages[-1]["content"] + # The model will flawlessly continue from " Venus\n3. Earth..." + generated_text = response["choices"][0]["message"]["content"] + + print(prefilled_text + generated_text) + ``` + +7. 
**Interrupting Reasoning & Assistant Prefill (Time-boxing)**: + + Use the `abort()` method alongside `assistant_prefill=True` to forcefully stop a reasoning model (like Qwen or DeepSeek) if it thinks for too long, inject a bridge text, and force it to output the final answer. + ```python + import threading + from llama_cpp import Llama + + llm = Llama(model_path="Qwen3.6-27B.gguf", n_ctx=4096, n_gpu_layers=-1) + + def run_controlled_generation(prompt: str, timeout_seconds: int = 10): + messages = [{"role": "user", "content": prompt}] + + # 1. Set a time bomb to interrupt long phases + def timeout_handler(): + llm.abort() + + timer = threading.Timer(timeout_seconds, timeout_handler) + timer.start() + + stream = llm.create_chat_completion( + messages=messages, max_tokens=2048, stream=True + ) + + partial_response = "" + finish_reason = None + + for chunk in stream: + finish_reason = chunk["choices"][0].get("finish_reason") + + if finish_reason is not None and finish_reason != "abort": + timer.cancel() + break + + if finish_reason == "abort": + break + + delta = chunk["choices"][0]["delta"].get("content", "") + if delta: + partial_response += delta + print(delta, end="", flush=True) + + # 2. 
Forced Intervention and Prefill Continuation + if finish_reason == "abort": + # Inject bridge text to forcefully close the reasoning tag + bridge_text = "\n...Wait, I have thought long enough, let's start answering the user.\n\n\n" + print(bridge_text, end="", flush=True) + + prefilled_content = partial_response + bridge_text + messages.append({"role": "assistant", "content": prefilled_content}) + + # Use assistant_prefill=True to seamlessly continue the text block + stream_part2 = llm.create_chat_completion( + messages=messages, + max_tokens=2048, + stream=True, + assistant_prefill=True + ) + + for chunk in stream_part2: + delta = chunk["choices"][0]["delta"].get("content", "") + if delta: + print(delta, end="", flush=True) + + run_controlled_generation("Explain quantum mechanics in a way that relates to bugs in code.", timeout_seconds=8) + ``` --- From 820d1839ef4303c26b9785a8e5eca684cb07e33c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 26 Apr 2026 18:23:06 +0800 Subject: [PATCH 356/518] Update README.md Signed-off-by: JamePeng --- README.md | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 8873c39039..1d56558d23 100644 --- a/README.md +++ b/README.md @@ -4,28 +4,38 @@ # Python Bindings for [`llama.cpp`](https://github.com/ggml-org/llama.cpp) -[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) [![Tests](https://github.com/JamePeng/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/JamePeng/llama-cpp-python/actions/workflows/test.yaml) ![GitHub Tag](https://img.shields.io/github/v/tag/JamePeng/llama-cpp-python) [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - 
Downloads](https://static.pepy.tech/badge/llama-cpp-python/month)](https://pepy.tech/projects/llama-cpp-python) [![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]() -Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggml-org/llama.cpp) library. +Efficiency Python bindings for **ggml-org's** [`llama.cpp`](https://github.com/ggml-org/llama.cpp) library. This package provides: - Low-level access to C API via `ctypes` interface. + - [llama_cpp_lib](https://github.com/JamePeng/llama-cpp-python/blob/main/llama_cpp/llama_cpp.py) + - [mtmd_cpp_lib](https://github.com/JamePeng/llama-cpp-python/blob/main/llama_cpp/mtmd_cpp.py) - High-level Python API for text completion - - OpenAI-like API - - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp) - - [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html) -- OpenAI compatible web server - - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion) - - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling) - - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models) - - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support) - -Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). 
+ - OpenAI-like API and Type([llama_types.py](https://github.com/JamePeng/llama-cpp-python/blob/main/llama_cpp/llama_types.py)) + - [High-level API](https://github.com/JamePeng/llama-cpp-python#high-level-api) + - [Continuing Assistant Responses (Prefill)](https://github.com/JamePeng/llama-cpp-python#continuing-assistant-responses-prefill) + - [Dynamic LoRA Routing & Control Vectors (Multi-Tenant Serving)](https://github.com/JamePeng/llama-cpp-python#dynamic-lora-routing--control-vectors-multi-tenant-serving) + - [Dynamic LoRA Example](https://github.com/JamePeng/llama-cpp-python#dynamic-lora-example) + - [Control Vector Injection (Representation Engineering)](https://github.com/JamePeng/llama-cpp-python#control-vector-injection-representation-engineering) + - [Sampling Configuration & Usage (LlamaSamplingParams)](https://github.com/JamePeng/llama-cpp-python#sampling-configuration--usage-llamasamplingparams) + - [Multi-modal Models Support](https://github.com/JamePeng/llama-cpp-python#multi-modal-models) + - Support Models Lists + - [Loading a Local Image With Qwen3VL(Thinking/Instruct)](https://github.com/JamePeng/llama-cpp-python#loading-a-local-image-with-qwen3vlthinkinginstruct) + - [Comprehensive Omni MultiModal Example: Gemma-4 (Vision + Audio + Text)](https://github.com/JamePeng/llama-cpp-python#comprehensive-omni-multimodal-example-gemma-4-vision--audio--text) + - [Embeddings & Reranking (GGUF)](https://github.com/JamePeng/llama-cpp-python#embeddings--reranking-gguf) + - [1. Text Embeddings (Vector Search)](https://github.com/JamePeng/llama-cpp-python#1-text-embeddings-vector-search) + - [2. Reranking (Cross-Encoder Scoring)](https://github.com/JamePeng/llama-cpp-python#2-reranking-cross-encoder-scoring) + - [3. 
Normalization](https://github.com/JamePeng/llama-cpp-python#3-normalization) + - [Speculative Decoding](https://github.com/JamePeng/llama-cpp-python#speculative-decoding) +- [FAQ](https://github.com/JamePeng/llama-cpp-python#faq) + +The new documentation will be maintained in the [docs/wiki](https://github.com/JamePeng/llama-cpp-python/tree/main/docs/wiki) directory based on the LLM Wiki approach. Interested volunteers are welcome to participate in its maintenance and updates :) ## Discussions From 9d231991242a2c4288719786db664a705b1d8bed Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 27 Apr 2026 22:58:36 +0800 Subject: [PATCH 357/518] Update Submodule vendor/llama.cpp b760272..4414c04 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b760272f1a..4414c04b9a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b760272f1a25fcae065d827ce2cbcaa035597b02 +Subproject commit 4414c04b9a23bed188e0318fb1e8812cf175b5b4 From 5068a8072e44aebd2ad124b2db5defa2b102f3c3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 27 Apr 2026 22:59:26 +0800 Subject: [PATCH 358/518] Upload /docs/wiki/LlamaEmbedding.md for llama_embedding.py Signed-off-by: JamePeng --- docs/wiki/SCHEMA.md | 2 +- docs/wiki/core/ChatHandler.md | 0 docs/wiki/core/LlamaEmbedding.md | 263 +++++++++++++++++++++++++++++++ 3 files changed, 264 insertions(+), 1 deletion(-) delete mode 100644 docs/wiki/core/ChatHandler.md create mode 100644 docs/wiki/core/LlamaEmbedding.md diff --git a/docs/wiki/SCHEMA.md b/docs/wiki/SCHEMA.md index 4a8f700e2c..f5676442cb 100644 --- a/docs/wiki/SCHEMA.md +++ b/docs/wiki/SCHEMA.md @@ -3,7 +3,7 @@ **Purpose**: Maintain a living, always-up-to-date, structured documentation wiki for the llama-cpp-python library using LLMs as the primary maintainer. 
**Core Principles**: -- The source of truth is the latest code in `llama_cpp/` (especially `llama.py`, `llama_chat_format.py`, `llama_cpp.py`, `llama_types.py`, `mtmd_cpp.py`, `_internals.py`, `_ggml.py`). +- The source of truth is the latest code in `llama_cpp/` (especially `llama.py`, `llama_chat_format.py`, `llama_cpp.py`, `llama_types.py`, `llama_embedding.py`, `mtmd_cpp.py`, `_internals.py`, `_ggml.py`). - Never invent parameters or behavior. Always read the current source code before writing/updating a page. - All examples must be complete, runnable with the latest API, and include necessary imports. - Clearly mark any deprecated/old usage with a warning and show the modern replacement. diff --git a/docs/wiki/core/ChatHandler.md b/docs/wiki/core/ChatHandler.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/wiki/core/LlamaEmbedding.md b/docs/wiki/core/LlamaEmbedding.md new file mode 100644 index 0000000000..b28772b45e --- /dev/null +++ b/docs/wiki/core/LlamaEmbedding.md @@ -0,0 +1,263 @@ +--- +title: LlamaEmbedding +class_name: LlamaEmbedding +last_updated: 2026-04-27 +version_target: "latest" +--- + +# LlamaEmbedding + +## Overview + +`LlamaEmbedding` is a specialized class for high-performance Text Embedding and Reranking. It inherits from the base `Llama` class but is optimized for vector operations. + +### Support Embeddings & Rerank Model: + + +| Model | Type | Link | Status | +|--------------------|-----------|--------------------------------------------------------|--------------| +| `bge-m3` | Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`bge-reranker-v2-m3`| Rerank |[bge-reranker-v2-m3-GGUF](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF) | Useful ✅ | +|`qwen3-reranker`| Rerank |[Qwen3-Reranker-GGUF](https://huggingface.co/JamePeng2023/Qwen3-Reranker-GGUF) | Useful ✅ | + +**Core Features:** +1. **Auto-configuration**: Automatically sets `embeddings=True`. +2. 
**Streaming Batch**: Handles massive datasets without OOM (Out Of Memory). +3. **Native Reranking Support**: Specifically handles `LLAMA_POOLING_TYPE_RANK` models (like BGE-Reranker, Qwen3-Reranker). It correctly identifies classification heads to output scalar relevance scores instead of high-dimensional vectors. +4. **Advanced Normalization**: Implements MaxInt16, Taxicab (L1), and Euclidean (L2) normalization strategies using NumPy for optimal performance and compatibility with various vector databases. + +## Constructor `__init__` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `model_path` | str | Required | Path to the GGUF model file. | +| `n_ctx` | int | 0 | Text context window size (0 = model default). | +| `n_batch` | int | 512 | Maximum prompt processing batch size. | +| `n_ubatch` | int | 512 | Physical batch size. | +| `pooling_type` | int | `LLAMA_POOLING_TYPE_UNSPECIFIED` (-1) | Pooling strategy used by the model: `LLAMA_POOLING_TYPE_RANK` (4) for rerankers, `LLAMA_POOLING_TYPE_UNSPECIFIED` (-1) for embeddings. | +| `n_gpu_layers` | int | 0 | Number of layers offloaded to GPU (0 = CPU only, -1 = all layers). | +| `verbose` | bool | True | Whether to print debug information. | +| `**kwargs` | Any | — | Extra arguments passed to the `Llama` base class (e.g., `n_batch`, `n_ctx`, `verbose`). | + +### Initialization Logic + +1. Forces `embeddings=True` to enable embedding support. +2. Sets `kv_unified=True` to enable unified KV Cache, allowing arbitrary sequence IDs in a batch without "invalid seq_id" errors. +3. Passes `pooling_type` to the parent class constructor. + +## Core Methods + +### `embed(input, normalize=NORM_MODE_EUCLIDEAN, truncate=True, separator=None, return_count=False)` + +**Description**: Computes embedding vectors for input text (standard embeddings or reranking scores). 
+ +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `input` | `Union[str, List[str], List[List[int]]]` | — | Input format: string (can be split), list of strings, or list of integer lists (token IDs). | +| `normalize` | int | `NORM_MODE_EUCLIDEAN` (2) | Vector normalization mode (see below). | +| `truncate` | bool | True | Whether to truncate input. | +| `separator` | str | None | Separator for splitting string input into multiple documents. | +| `return_count` | bool | False | If True, returns `(embeddings, token_count)`. | + +**Normalization Modes:** +- `NORM_MODE_NONE` (-1): No normalization. +- `NORM_MODE_MAX_INT16` (0): Max absolute value normalization (scaled to 32760). +- `NORM_MODE_TAXICAB` (1): L1 Taxicab norm. +- `NORM_MODE_EUCLIDEAN` (2): L2 Euclidean norm. +- `NORM_MODE_PNORM` (>2): p-norm normalization. + +**Returns:** +- `return_count=False`: List of embedding vectors. +- `return_count=True`: Tuple `(embeddings, token_count)`. + +**Internal Logic:** +1. Determines mode based on `pooling_type`: `LLAMA_POOLING_TYPE_NONE` (token-level), `LLAMA_POOLING_TYPE_RANK` (rerank), or other (sequence-level). +2. Uses streaming batch decoding to process embeddings in chunks. +3. For token-level mode, extracts and normalizes per-token vectors. +4. For sequence-level mode, extracts sequence vectors and normalizes. +5. Supports `separator` for splitting input into multiple documents. + +### `rank(query, documents)` + +**Description**: Calculates relevance scores for a list of documents against a query using a Reranking model. + +| Parameter | Type | Description | +|-----------|------|-------------| +| `query` | str | Search query string. | +| `documents` | `List[str]` | List of candidate document strings to be scored. | + +**Returns**: List of float scores, where higher values indicate greater relevance. + +**Internal Logic:** +1. Checks if model is a reranker (`pooling_type == LLAMA_POOLING_TYPE_RANK`). +2. 
Attempts to retrieve the built-in 'rerank' chat template. +3. If template exists: dynamically replaces `{query}` and `{document}` and tokenizes; otherwise, manually constructs `[BOS] Query [SEP] Doc [EOS]` sequence. +4. Executes embedding inference (`embed`), returning raw logits/scores. +5. For generative rerankers (e.g., Qwen3-Reranker, output dim = 2), uses `yes_logit` as relevance score. + +### `create_embedding(input, model=None, normalize=NORM_MODE_EUCLIDEAN, output_format="json")` + +**Description**: High-level API compatible with OpenAI format. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `input` | `Union[str, List[str]]` | — | Input text or list of texts. | +| `model` | str | None | Model name (optional, uses `self.model_path` if None). | +| `normalize` | int | `NORM_MODE_EUCLIDEAN` (2) | Normalization mode. | +| `output_format` | str | "json" | Output format: 'json', 'json+', or 'array'. | + +**Output Formats:** +- `'json'`: OpenAI-style dictionary list. +- `'json+'`: OpenAI dictionary list + cosine similarity matrix. +- `'array'`: Raw Python list (`List[float]` or `List[List[float]]`). + +**Returns**: Data structure according to `output_format`. + +## Deprecated / Changed APIs + +- **Note**: The TODO comments `# TODO(JamePeng): Needs more extensive testing with various embedding and reranking models.` indicate that support for various embedding and reranking models may be incomplete. Further testing is recommended. + +## Best Practices & Common Patterns + +1. **Select Correct `pooling_type`**: + - Standard embeddings: `LLAMA_POOLING_TYPE_UNSPECIFIED (-1)`. + - Reranker models: `LLAMA_POOLING_TYPE_RANK (4)`. + - Token-level embeddings: `LLAMA_POOLING_TYPE_NONE (0)`. + +2. **Batch Optimization for Large Datasets**: + - Adjust `n_batch` and `n_ubatch` to balance performance and memory. + - Streaming processing avoids OOM for large datasets. + +3. 
**Normalization Selection**: + - Vector databases typically prefer L2 normalization (Euclidean), but other norms may be needed in specific scenarios. + +4. **Reranker Models**: + - Ensure `pooling_type` is set to `LLAMA_POOLING_TYPE_RANK`. + - Note that output is scalar scores, not vectors. + +5. **Performance Tuning**: + - For GPU acceleration, set `n_gpu_layers` to -1 (recommended). + - Use `verbose=True` for debugging configuration. + +## Example Code + +### 1. Text Embeddings (Vector Search) + +To generate embeddings, use the `LlamaEmbedding` class. It automatically configures the model for vector generation. + +```python +from llama_cpp.llama_embedding import LlamaEmbedding, LLAMA_POOLING_TYPE_NONE + +# Initialize the model (automatically sets embeddings=True) +llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1, pooling_type=LLAMA_POOLING_TYPE_NONE) + +# 1. Simple usage (OpenAI-compatible format) +response = llm.create_embedding("Hello, world!") +print(response['data'][0]['embedding']) + +# 2. Batch processing (High Performance) +# You can pass a large list of strings; the streaming batcher handles memory automatically. 
+documents = ["Hello, world!", "Goodbye, world!", "Llama is cute."] * 100 +embeddings = llm.embed(documents) # Returns a list of lists (vectors) + +print(f"Generated {len(embeddings)} vectors.") +``` + +**Advanced Output Formats:** +You can request raw arrays or cosine similarity matrices directly: + +```python +from llama_cpp.llama_embedding import LlamaEmbedding, LLAMA_POOLING_TYPE_NONE + +# Initialize the model (automatically sets embeddings=True) +llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1, pooling_type=LLAMA_POOLING_TYPE_NONE) + +# Returns raw List[float] instead of a dictionary wrapper +vector = llm.create_embedding("Text", output_format="array") + +# Returns a similarity matrix (A @ A.T) in the response +# Note: Requires numpy installed +response = llm.create_embedding( + ["apple", "fruit", "car"], + output_format="json+" +) +print(response["cosineSimilarity"]) +``` + +### 2. Reranking (Cross-Encoder Scoring) + +Reranking models (like `bge-reranker`) take a **Query** and a list of **Documents** as input and output a relevance score (scalar) for each document. + +> **Important:** You must explicitly set `pooling_type` to `LLAMA_POOLING_TYPE_RANK` (4) when initializing the model. + +```python +import llama_cpp +from llama_cpp.llama_embedding import LlamaEmbedding + +# Initialize a Reranking model +ranker = LlamaEmbedding( + model_path="path/to/qwen3-reranker-0.6b-q8_0.gguf", + pooling_type=llama_cpp.LLAMA_POOLING_TYPE_RANK, # Crucial for Rerankers! + n_gpu_layers=-1, + n_ctx=0 +) + +query = "What causes Rain?" +docs = [ + "Clouds are made of water droplets...", # Relevant + "To bake a cake you need flour...", # Irrelevant + "Rain is liquid water in the form of droplets..." 
# Highly Relevant +] + +# Calculate relevance scores +# Logic: Constructs inputs like "[BOS] query [SEP] doc [EOS]" automatically +scores = ranker.rank(query, docs) + +# Result: List of floats (higher means more relevant) +print(scores) +# e.g., [0.0011407170677557588, 5.614783731289208e-05, 0.7173627614974976] -> The 3rd doc is the best match +``` + +### 3. Normalization + +The `embed` method supports various mathematical normalization strategies via the `normalize` parameter. + +| Normalization modes | $Integer$ | Description | Formula | +|---------------------|-----------|---------------------|---------| +| NORM_MODE_NONE | $-1$ | none | +| NORM_MODE_MAX_INT16 | $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$ +| NORM_MODE_TAXICAB | $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$ +| NORM_MODE_EUCLIDEAN | $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$ +| NORM_MODE_PNORM | $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$ + +This is useful for optimizing storage or preparing vectors for cosine similarity search (which requires L2 normalization). 
+ +```python +from llama_cpp.llama_embedding import ( + LlamaEmbedding, LLAMA_POOLING_TYPE_NONE, + NORM_MODE_NONE, NORM_MODE_MAX_INT16, + NORM_MODE_TAXICAB, + NORM_MODE_EUCLIDEAN +) + +# Initialize the model (automatically sets embeddings=True) +llm = LlamaEmbedding(model_path="path/to/bge-m3.gguf", n_gpu_layers=-1, pooling_type=LLAMA_POOLING_TYPE_NONE) + +# Taxicab (L1) +vec_l1 = llm.embed("text", normalize=NORM_MODE_TAXICAB) + +# Default is Euclidean (L2) - Standard for vector databases +vec_l2 = llm.embed("text", normalize=NORM_MODE_EUCLIDEAN) + +# Max Absolute Int16 - Useful for quantization/compression +vec_int16 = llm.embed("text", normalize=NORM_MODE_MAX_INT16) + +# Raw Output (No Normalization) - Get the raw floating point values from the model +embeddings_raw = llm.embed(["search query", "document text"], normalize=NORM_MODE_NONE) +``` + +## Notes + +- This class is in development; some features may be unstable, especially reranking model support.
Signed-off-by: JamePeng --- llama_cpp/llama.py | 2 ++ llama_cpp/llama_chat_format.py | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index be5d259868..5ad8364a9b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2717,6 +2717,7 @@ def create_chat_completion( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, assistant_prefill: bool = False, + add_generation_prompt: bool = True, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -2829,6 +2830,7 @@ def create_chat_completion( active_loras=active_loras, control_vector=control_vector, assistant_prefill=assistant_prefill, + add_generation_prompt=add_generation_prompt, ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index af068f5535..a0d8d25db4 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3079,6 +3079,7 @@ def _process_mtmd_prompt( function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + add_generation_prompt: bool = True, ) -> Tuple[List[int], List[tuple], Any, List[Any]]: """ Core multimodal preprocessing pipeline. @@ -3106,7 +3107,7 @@ def _process_mtmd_prompt( # 2. 
Render the chat template and replace actual URLs with C++ media markers text = self.chat_template.render( messages=messages, - add_generation_prompt=True, + add_generation_prompt=add_generation_prompt, eos_token=self.mtmd_eos_token, bos_token=self.mtmd_bos_token, functions=functions, @@ -3306,6 +3307,7 @@ def __call__( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + add_generation_prompt: bool = True, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -3322,7 +3324,8 @@ def __call__( functions=functions, function_call=function_call, tools=tools, - tool_choice=tool_choice + tool_choice=tool_choice, + add_generation_prompt=add_generation_prompt, ) if self.verbose: From db8bf453c980776433dd5f23075711a5896a7b5c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 30 Apr 2026 03:02:03 +0800 Subject: [PATCH 360/518] Update Submodule vendor/llama.cpp 4414c04..660b1b4 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4414c04b9a..660b1b4bdc 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4414c04b9a23bed188e0318fb1e8812cf175b5b4 +Subproject commit 660b1b4bdc6fedc18e8c3d87a945ffb51f91c547 From 66dd88b0ad8126c322a110d22fd90811a8cb64b6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 1 May 2026 05:55:23 +0800 Subject: [PATCH 361/518] feat(_ggml): implement ggml-backend API bindings and fix type hints - Introduces extensive ctypes bindings for `ggml-backend.h` (devices, buffers, registries, and CPU buffer types) to support advanced memory routing like MoE CPU offloading. Also fixes various static typing warnings by adding `# type: ignore` to pointer annotations. 
Signed-off-by: JamePeng --- llama_cpp/_ggml.py | 525 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 499 insertions(+), 26 deletions(-) diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index 733a3520c1..8f4cb1187f 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -548,7 +548,7 @@ class ggml_object(ctypes.Structure): if TYPE_CHECKING: offs: ctypes.c_size_t size: ctypes.c_size_t - next: "ctypes.POINTER(ggml_object)" + next: "ctypes.POINTER(ggml_object)" # type: ignore type: int padding: ctypes.Array[ctypes.c_char] @@ -586,8 +586,8 @@ class ggml_context(ctypes.Structure): mem_buffer_owned: bool no_alloc: bool n_objects: int - objects_begin: ggml_object_p - objects_end: ggml_object_p + objects_begin: ggml_object_p # type: ignore + objects_end: ggml_object_p # type: ignore _fields_ = [ ("mem_size", ctypes.c_size_t), @@ -694,8 +694,8 @@ class ggml_tensor(ctypes.Structure): op: int op_params: ctypes.Array[ctypes.c_int32] flags: int - src: "ctypes.Array[ctypes.POINTER(ggml_tensor)]" - view_src: "ctypes.POINTER(ggml_tensor)" + src: "ctypes.Array[ctypes.POINTER(ggml_tensor)]" # type: ignore + view_src: "ctypes.POINTER(ggml_tensor)" # type: ignore view_offs: ctypes.c_size_t data: ctypes.c_void_p name: ctypes.Array[ctypes.c_char] @@ -744,8 +744,8 @@ class ggml_tensor(ctypes.Structure): None, ) def ggml_log_get( - log_callback: Optional[ctypes.POINTER(ggml_log_callback)], - user_data: ctypes.POINTER(ctypes.c_void_p), + log_callback: Optional[ctypes.POINTER(ggml_log_callback)], # type: ignore + user_data: ctypes.POINTER(ctypes.c_void_p), # type: ignore /, ): """ @@ -763,7 +763,7 @@ def ggml_log_get( None, ) def ggml_log_set( - log_callback: Optional[ggml_log_callback], + log_callback: Optional[ggml_log_callback], # type: ignore user_data: ctypes.c_void_p, /, ): @@ -875,36 +875,509 @@ class ggml_opt_optimizer_params(ctypes.Structure): ) -# from ggml-backend.h -# // Evaluation callback for each node in the graph (set with 
ggml_backend_sched_set_eval_callback) -# // when ask == true, the scheduler wants to know if the user wants to observe this node -# // this allows the scheduler to batch nodes together in order to evaluate them in a single call # // -# // when ask == false, the scheduler is passing the node tensor to the user for observation -# // if the user returns false, the scheduler will cancel the graph compute +# // GGML Backend from ggml-backend.h # // -# typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); -ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( - ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p + +# typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; +ggml_backend_buffer_type_t = NewType( + "ggml_backend_buffer_type_t", + ctypes.c_void_p, +) + +# typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +ggml_backend_buffer_t = NewType( + "ggml_backend_buffer_t", + ctypes.c_void_p, +) + +# typedef struct ggml_backend_event * ggml_backend_event_t; +ggml_backend_event_t = NewType( + "ggml_backend_event_t", + ctypes.c_void_p, +) + +# typedef struct ggml_backend * ggml_backend_t; +ggml_backend_t = NewType( + "ggml_backend_t", + ctypes.c_void_p, +) + +# typedef struct ggml_backend_reg * ggml_backend_reg_t; +ggml_backend_reg_t = NewType( + "ggml_backend_reg_t", + ctypes.c_void_p, +) + +# typedef struct ggml_backend_device * ggml_backend_dev_t; +ggml_backend_dev_t = NewType( + "ggml_backend_dev_t", + ctypes.c_void_p, +) + +# // +# // Backend buffer type +# // + +# GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); +@ggml_base_function("ggml_backend_buft_name", [ctypes.c_void_p], ctypes.c_char_p) +def ggml_backend_buft_name(buft: ggml_backend_buffer_type_t) -> ctypes.c_char_p: + """ + Get ggml_backend_buffer name + """ + ... 
+ + +# GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); +@ggml_base_function("ggml_backend_buft_alloc_buffer", [ctypes.c_void_p, ctypes.c_size_t], ctypes.c_void_p) +def ggml_backend_buft_alloc_buffer( + buft: ggml_backend_buffer_type_t, + size: ctypes.c_size_t +) -> ggml_backend_buffer_t: + """ + Alloc ggml_backend_buffer with size + """ + ... + + +# GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); +@ggml_base_function("ggml_backend_buft_get_alignment", [ctypes.c_void_p], ctypes.c_size_t) +def ggml_backend_buft_get_alignment(buft: ggml_backend_buffer_type_t) -> ctypes.c_size_t: + """ + Get tensor alignment by ggml_backend_buffer + """ + ... + + +# GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); +@ggml_base_function("ggml_backend_buft_get_max_size", [ctypes.c_void_p], ctypes.c_size_t) +def ggml_backend_buft_get_max_size(buft: ggml_backend_buffer_type_t) -> ctypes.c_size_t: + """ + Get ggml_backend_buffer max buffer size that can be allocated (defaults to SIZE_MAX) + """ + ... + + +# GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); +@ggml_base_function("ggml_backend_buft_get_alloc_size", [ + ctypes.c_void_p, + ggml_tensor_p, +], ctypes.c_size_t) +def ggml_backend_buft_get_alloc_size( + buft: ggml_backend_buffer_type_t, + tensor: ggml_tensor_p, # type: ignore +) -> ctypes.c_size_t: + """ + Get alloc data size needed to allocate the tensor, including padding (defaults to ggml_nbytes) + """ + ... + + +# GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); +@ggml_base_function("ggml_backend_buft_is_host", [ctypes.c_void_p], ctypes.c_bool) +def ggml_backend_buft_is_host(buft: ggml_backend_buffer_type_t) -> ctypes.c_bool: + """ + Check if ggml_backend_buffer is host + """ + ... 
+ + +# GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); +@ggml_base_function("ggml_backend_buft_get_device", [ctypes.c_void_p], ctypes.c_void_p) +def ggml_backend_buft_get_device(buft: ggml_backend_buffer_type_t) -> ggml_backend_dev_t: + """ + Get the device associated with a ggml_backend_buffer_type + """ + ... + +# // +# // Backend buffer +# // + +# enum ggml_backend_buffer_usage { +# GGML_BACKEND_BUFFER_USAGE_ANY = 0, +# GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, +# GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2, +# }; +class GGMLBackendBufferUsage(enum.IntEnum): + GGML_BACKEND_BUFFER_USAGE_ANY = 0 + GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1 + GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2 + + +# GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_name", [ctypes.c_void_p], ctypes.c_char_p) +def ggml_backend_buffer_name(buffer: ggml_backend_buffer_t) -> ctypes.c_char_p: + """ + Get ggml_backend_buffer name + """ + ... + + +# GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_free", [ctypes.c_void_p], None) +def ggml_backend_buffer_free(buffer: ggml_backend_buffer_t): + """ + Free ggml_backend_buffer + """ + ... + + +# GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_get_base", [ctypes.c_void_p], ctypes.c_void_p) +def ggml_backend_buffer_get_base(buffer: ggml_backend_buffer_t) -> ctypes.c_void_p: + """ + Get ggml_backend_buffer base address (the C function returns void *, so the restype must be a pointer, not None) + """ + ... + + +# GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_get_size", [ctypes.c_void_p], ctypes.c_size_t) +def ggml_backend_buffer_get_size(buffer: ggml_backend_buffer_t) -> ctypes.c_size_t: + """ + Get ggml_backend_buffer size + """ + ...
+ + +# GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); +@ggml_base_function("ggml_backend_buffer_init_tensor", [ + ctypes.c_void_p, + ggml_tensor_p, +], ctypes.c_int32) +def ggml_backend_buffer_init_tensor( + buffer: ggml_backend_buffer_t, + tensor: ggml_tensor_p, # type: ignore +) -> ctypes.c_int32: + """ + Init tensor by ggml_backend_buffer + """ + ... + + +# GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_get_alignment", [ctypes.c_void_p], ctypes.c_size_t) +def ggml_backend_buffer_get_alignment(buffer: ggml_backend_buffer_t) -> ctypes.c_size_t: + """ + Get tensor alignment by ggml_backend_buffer + """ + ... + + +# GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_get_max_size", [ctypes.c_void_p], ctypes.c_size_t) +def ggml_backend_buffer_get_max_size(buffer: ggml_backend_buffer_t) -> ctypes.c_size_t: + """ + Get max buffer size that can be allocated (defaults to SIZE_MAX) + """ + ... + + +# GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor); +@ggml_base_function("ggml_backend_buffer_get_alloc_size", [ + ctypes.c_void_p, + ggml_tensor_p, +], ctypes.c_size_t) +def ggml_backend_buffer_get_alloc_size( + buffer: ggml_backend_buffer_t, + tensor: ggml_tensor_p, # type: ignore +) -> ctypes.c_size_t: + """ + Get alloc data size needed to allocate the tensor, including padding (defaults to ggml_nbytes) + """ + ... + + +# GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); +@ggml_base_function("ggml_backend_buffer_clear", [ctypes.c_void_p, ctypes.c_uint8], None) +def ggml_backend_buffer_clear(buffer: ggml_backend_buffer_t, value: ctypes.c_uint8): + """ + Clear ggml_backend_buffer + """ + ... 
+ + +# GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_is_host", [ctypes.c_void_p], ctypes.c_bool) +def ggml_backend_buffer_is_host(buffer: ggml_backend_buffer_t) -> ctypes.c_bool: + """ + Check if ggml_backend_buffer is host + """ + ... + + +# GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); +@ggml_base_function("ggml_backend_buffer_set_usage", [ctypes.c_void_p, ctypes.c_int32], None) +def ggml_backend_buffer_set_usage(buffer: ggml_backend_buffer_t, usage: ctypes.c_int32): + """ + Set ggml_backend_buffer usage + """ + ... + + +# GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_get_usage", [ctypes.c_void_p], ctypes.c_int32) +def ggml_backend_buffer_get_usage(buffer: ggml_backend_buffer_t) -> ctypes.c_int32: + """ + Get ggml_backend_buffer usage + """ + ... + + +# GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_get_type", [ctypes.c_void_p], ctypes.c_void_p) +def ggml_backend_buffer_get_type(buffer: ggml_backend_buffer_t) -> ggml_backend_buffer_type_t: + """ + Get the ggml_backend_buffer_type of a ggml_backend_buffer + """ + ... + + +# GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); +@ggml_base_function("ggml_backend_buffer_reset", [ctypes.c_void_p], None) +def ggml_backend_buffer_reset(buffer: ggml_backend_buffer_t): + """ + Reset ggml_backend_buffer + """ + ... + + +# // +# // Backend device +# // + +# enum ggml_backend_dev_type { +# // CPU device using system memory +# GGML_BACKEND_DEVICE_TYPE_CPU, +# // GPU device using dedicated memory +# GGML_BACKEND_DEVICE_TYPE_GPU, +# // integrated GPU device using host memory +# GGML_BACKEND_DEVICE_TYPE_IGPU, +# // accelerator devices intended to be used together with the CPU backend (e.g.
BLAS or AMX) +# GGML_BACKEND_DEVICE_TYPE_ACCEL, +# // "meta" device wrapping multiple other devices for tensor parallelism +# GGML_BACKEND_DEVICE_TYPE_META, +# }; +class GGMLBackendDevType(enum.IntEnum): + GGML_BACKEND_DEVICE_TYPE_CPU = 0 # CPU device using system memory + GGML_BACKEND_DEVICE_TYPE_GPU = 1 # GPU device using dedicated memory + GGML_BACKEND_DEVICE_TYPE_IGPU = 2 # integrated GPU device using host memory + GGML_BACKEND_DEVICE_TYPE_ACCEL = 3 # accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX) + GGML_BACKEND_DEVICE_TYPE_META = 4 # "meta" device wrapping multiple other devices for tensor parallelism + +ggml_backend_dev_type_t = NewType( + "ggml_backend_dev_type_t", + ctypes.c_void_p, ) # // # // Backend registry # // +# GGML_API void ggml_backend_register(ggml_backend_reg_t reg); +@ggml_function("ggml_backend_register", [ctypes.c_void_p], None) +def ggml_backend_register(reg: ctypes.c_void_p): + """ + Register ggml backend + """ + ... + +# GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); +@ggml_function("ggml_backend_device_register", [ctypes.c_void_p], None) +def ggml_backend_device_register(device: ctypes.c_void_p): + """ + Register ggml backend device + """ + ... + + +# // Backend (reg) enumeration + +# GGML_API size_t ggml_backend_reg_count(void); +@ggml_function("ggml_backend_reg_count", [], ctypes.c_size_t) +def ggml_backend_reg_count() -> ctypes.c_size_t: + """ + Get ggml_backend_reg count + """ + ... 
+ +# GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index); +@ggml_function("ggml_backend_reg_get", [ctypes.c_size_t], ctypes.c_void_p) +def ggml_backend_reg_get(index: ctypes.c_size_t) -> ggml_backend_reg_t: + """ + Get ggml_backend_reg by index + """ + +# GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name); +@ggml_function("ggml_backend_reg_by_name", [ctypes.c_char_p], ctypes.c_void_p) +def ggml_backend_reg_by_name(name: ctypes.c_char_p) -> ggml_backend_reg_t: + """ + Get ggml_backend_reg by name + """ + ... + +# // Device enumeration + +# GGML_API size_t ggml_backend_dev_count(void); +@ggml_function("ggml_backend_dev_count", [], ctypes.c_size_t) +def ggml_backend_dev_count() -> ctypes.c_size_t: + """ + Get ggml_backend_dev count + """ + ... + +# GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index); +@ggml_function("ggml_backend_dev_get", [ctypes.c_size_t], ctypes.c_void_p) +def ggml_backend_dev_get(index: ctypes.c_size_t) -> ggml_backend_dev_t: + """ + Get ggml_backend_dev by index + """ + ... + +# GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name); +@ggml_function("ggml_backend_dev_by_name", [ctypes.c_char_p], ctypes.c_void_p) +def ggml_backend_dev_by_name(name: ctypes.c_char_p) -> ggml_backend_dev_t: + """ + Get ggml_backend_dev by name + """ + ... + +# GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type); +@ggml_function("ggml_backend_dev_by_type", [ctypes.c_int32], ctypes.c_void_p) +def ggml_backend_dev_by_type(type: ctypes.c_int32) -> ggml_backend_dev_t: + """ + Get ggml_backend_dev by type + """ + ... 
+ +# // Direct backend (stream) initialization + +# // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params) +# GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); +@ggml_function("ggml_backend_init_by_name", [ctypes.c_char_p, ctypes.c_char_p], ctypes.c_void_p) +def ggml_backend_init_by_name(name: ctypes.c_char_p, params: ctypes.c_char_p) -> ggml_backend_t: + """ + = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params) + """ + ... + + +# // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) +# GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params); +@ggml_function("ggml_backend_init_by_type", [ctypes.c_int32, ctypes.c_char_p], ctypes.c_void_p) +def ggml_backend_init_by_type(type: ctypes.c_int32, params: ctypes.c_char_p) -> ggml_backend_t: + """ + = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) + """ + ... + + +# // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) +# GGML_API ggml_backend_t ggml_backend_init_best(void); +@ggml_function("ggml_backend_init_best", [], ctypes.c_void_p) +def ggml_backend_init_best() -> ggml_backend_t: + """ + = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) + """ + ... + + +# // Load a backend from a dynamic library and register it +# GGML_API ggml_backend_reg_t ggml_backend_load(const char * path); +@ggml_function("ggml_backend_load", [ctypes.c_char_p], ctypes.c_void_p) +def ggml_backend_load(path: ctypes.c_char_p) -> ggml_backend_reg_t: + """ + Load a backend from a dynamic library and register it + """ + ... 
+ + +# // Unload a backend if loaded dynamically and unregister it +# GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); +@ggml_function("ggml_backend_unload", [ctypes.c_void_p], None) +def ggml_backend_unload(reg: ggml_backend_reg_t): + """ + Unload a backend if loaded dynamically and unregister it + """ + ... + + # // Load all known backends from dynamic libraries + # GGML_API void ggml_backend_load_all(void); @ggml_function("ggml_backend_load_all", [], None) def ggml_backend_load_all(): - """Load all known backends from dynamic libraries""" + """ + Load all known backends from dynamic libraries + """ ... + # GGML_API void ggml_backend_load_all_from_path(const char * dir_path); @ggml_function("ggml_backend_load_all_from_path", [ctypes.c_char_p], None) def ggml_backend_load_all_from_path(dir_path: ctypes.c_char_p): - """Load all known backends from path""" + """ + Load all known backends from path + """ ... + +# // CPU buffer types are always available + +# GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); +@ggml_base_function( + "ggml_backend_cpu_buffer_from_ptr", + [ctypes.c_void_p, ctypes.c_size_t], + ctypes.c_void_p, +) +def ggml_backend_cpu_buffer_from_ptr( + ptr: ctypes.c_void_p, + size: ctypes.c_size_t +) -> ggml_backend_buffer_t: + """ + Return a CPU backend buffer from ptr. + """ + ... + + +# GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); +@ggml_base_function( + "ggml_backend_cpu_buffer_type", + [], + ctypes.c_void_p, +) +def ggml_backend_cpu_buffer_type() -> ggml_backend_buffer_type_t: + """ + Return the CPU backend buffer type. + """ + ... 
+ + +# // +# // Backend scheduler +# // + +# typedef struct ggml_backend_sched * ggml_backend_sched_t; +ggml_backend_sched_t = NewType( + "ggml_backend_sched_t", + ctypes.c_void_p, +) + + +# // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback) +# // when ask == true, the scheduler wants to know if the user wants to observe this node +# // this allows the scheduler to batch nodes together in order to evaluate them in a single call +# // +# // when ask == false, the scheduler is passing the node tensor to the user for observation +# // if the user returns false, the scheduler will cancel the graph compute +# // +# typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); +ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( + ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p +) + + # // # // GGML internal header from ggml-impl.h # // @@ -933,8 +1406,8 @@ class GGMLCgraphEvalOrder(enum.IntEnum): class ggml_hash_set(ctypes.Structure): if TYPE_CHECKING: size: int - used: ctypes.POINTER(ggml_bitset_t) - keys: "ctypes.POINTER(ggml_tensor_p)" + used: ctypes.POINTER(ggml_bitset_t) # type: ignore + keys: ctypes.POINTER(ggml_tensor_p) # type: ignore _fields_ = [ ("size", ctypes.c_size_t), @@ -963,11 +1436,11 @@ class ggml_cgraph(ctypes.Structure): size: int n_nodes: int n_leafs: int - nodes: "ctypes.POINTER(ggml_tensor_p)" - grads: "ctypes.POINTER(ggml_tensor_p)" - grad_accs: "ctypes.POINTER(ggml_tensor_p)" - leafs: "ctypes.POINTER(ggml_tensor_p)" - use_counts: ctypes.POINTER(ctypes.c_int32) + nodes: ctypes.POINTER(ggml_tensor_p) # type: ignore + grads: ctypes.POINTER(ggml_tensor_p) # type: ignore + grad_accs: ctypes.POINTER(ggml_tensor_p) # type: ignore + leafs: ctypes.POINTER(ggml_tensor_p) # type: ignore + use_counts: ctypes.POINTER(ctypes.c_int32) # type: ignore visited_hash_set: ggml_hash_set order: int From b7064d7fc27d33d28837e7885e8e744feb2be3b1 Mon Sep 17 00:00:00 
2001 From: JamePeng Date: Fri, 1 May 2026 07:44:49 +0800 Subject: [PATCH 362/518] feat(llama): add fine-grained MoE CPU offloading controls - Introduce `cpu_moe` (bool) and `n_cpu_moe` (int) parameters to `Llama.__init__` for precise Mixture of Experts (MoE) weight offloading. - `cpu_moe=True` forces all MoE expert weights to the CPU memory, regardless of `n_gpu_layers`. - `n_cpu_moe=N` offloads the expert weights of the first N layers to the CPU, while keeping attention and router weights on the GPU. - Enhance `n_gpu_layers` to accept string literals "auto" (equivalent to -1) and "all" (equivalent to -2) alongside exact integers, improving configuration readability. - Update internal module aliases (e.g., `llama_cpp` to `llama_cpp_lib`) to avoid naming conflicts with the underlying C library. - Integrate `ggml_backend_cpu_buffer_type` to map specific tensor overrides (via regex) directly to CPU buffers during model load. Signed-off-by: JamePeng --- llama_cpp/llama.py | 195 +++++++++++++++++++++++++++++++-------------- 1 file changed, 137 insertions(+), 58 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5ad8364a9b..1241f81e26 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -43,7 +43,7 @@ HybridCheckpointCache, # type: ignore ) from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer -import llama_cpp.llama_cpp as llama_cpp +import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama_chat_format as llama_chat_format from llama_cpp.llama_speculative import LlamaDraftModel @@ -55,6 +55,9 @@ CommonSamplerType, CustomSampler, ) +from ._ggml import ( + ggml_backend_cpu_buffer_type, +) from ._logger import set_verbose from ._utils import suppress_stdout_stderr @@ -77,13 +80,17 @@ class Llama: __backend_initialized = False + LLM_FFN_EXPS_REGEX = rb"\.ffn_(up|down|gate|gate_up)_(ch|)exps" + def __init__( self, model_path: str, *, # Model Params - n_gpu_layers: int = 0, - split_mode: int = 
llama_cpp.llama_split_mode.LLAMA_SPLIT_MODE_LAYER, + n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", + cpu_moe: bool = False, + n_cpu_moe: int = 0, + split_mode: int = llama_cpp_lib.llama_split_mode.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, tensor_split: Optional[List[float]] = None, vocab_only: bool = False, @@ -95,7 +102,7 @@ def __init__( no_host: bool = False, kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None, # Context Params - seed: int = llama_cpp.LLAMA_DEFAULT_SEED, + seed: int = llama_cpp_lib.LLAMA_DEFAULT_SEED, n_ctx: int = 512, n_keep: int = 256, n_batch: int = 2048, @@ -105,10 +112,10 @@ def __init__( n_threads_batch: Optional[int] = None, rope_scaling_type: Optional[ int - ] = llama_cpp.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, - pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, - attention_type: Optional[int] = llama_cpp.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED, - flash_attn_type: Optional[int] = llama_cpp.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO, + ] = llama_cpp_lib.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + pooling_type: int = llama_cpp_lib.LLAMA_POOLING_TYPE_UNSPECIFIED, + attention_type: Optional[int] = llama_cpp_lib.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED, + flash_attn_type: Optional[int] = llama_cpp_lib.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO, rope_freq_base: float = 0.0, rope_freq_scale: float = 0.0, yarn_ext_factor: float = -1.0, @@ -174,7 +181,14 @@ def __init__( Args: model_path: Path to the model. - n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded. + n_gpu_layers: Max number of model layers to store in VRAM (-ngl). + Accepts an exact integer, "auto", or "all". + "auto" / -1 lets llama.cpp choose automatically. + "all" / -2 stores all possible layers in VRAM. + 0 disables model layer offload. 
+ cpu_moe: Keep all Mixture of Experts (MoE) weights in the CPU + n_cpu_moe: Keep the MoE expert weights of the first N layers on CPU. + Useful when VRAM is insufficient for MoE models. split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options. main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split. @@ -237,40 +251,38 @@ def __init__( if not Llama.__backend_initialized: with suppress_stdout_stderr(disable=verbose): - llama_cpp.llama_backend_init() + llama_cpp_lib.llama_backend_init() Llama.__backend_initialized = True if isinstance(numa, bool): self.numa = ( - llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE + llama_cpp_lib.GGML_NUMA_STRATEGY_DISTRIBUTE if numa - else llama_cpp.GGML_NUMA_STRATEGY_DISABLED + else llama_cpp_lib.GGML_NUMA_STRATEGY_DISABLED ) else: self.numa = numa - if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED: + if self.numa != llama_cpp_lib.GGML_NUMA_STRATEGY_DISABLED: with suppress_stdout_stderr(disable=verbose): - llama_cpp.llama_numa_init(self.numa) + llama_cpp_lib.llama_numa_init(self.numa) self.model_path = model_path # Model Params - self.model_params = llama_cpp.llama_model_default_params() - self.model_params.n_gpu_layers = ( - 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers - ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers + self.model_params = llama_cpp_lib.llama_model_default_params() + self.model_params.n_gpu_layers = self._parse_n_gpu_layers(n_gpu_layers) self.model_params.split_mode = split_mode self.model_params.main_gpu = main_gpu self.tensor_split = tensor_split self._c_tensor_split = None if self.tensor_split is not None: - if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES: + if len(self.tensor_split) 
> llama_cpp_lib.LLAMA_MAX_DEVICES: raise ValueError( - f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}" + f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp_lib.LLAMA_MAX_DEVICES}" ) # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES - FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES + FloatArray = ctypes.c_float * llama_cpp_lib.LLAMA_MAX_DEVICES self._c_tensor_split = FloatArray( *tensor_split # type: ignore ) # keep a reference to the array so it is not gc'd @@ -283,13 +295,61 @@ def __init__( self.model_params.use_extra_bufts = use_extra_bufts self.model_params.no_host = no_host + # Logic of cpu_moe, n_cpu_moe + # Reference from llama.cpp/tools/llama-bench/llama-bench.cpp + self.cpu_moe = cpu_moe + self.n_cpu_moe = n_cpu_moe + self._cpu_moe_patterns = None + self._cpu_moe_tensor_buft_overrides = None + + if self.n_cpu_moe < 0: + raise ValueError("n_cpu_moe must be >= 0") + + if self.cpu_moe and self.n_cpu_moe != 0 and self.verbose: + print( + "Llama.__init__: cpu_moe=True already keeps all MoE expert weights on CPU; " + "n_cpu_moe is redundant.", + file=sys.stderr, + ) + + if self.cpu_moe or self.n_cpu_moe > 0: + cpu_buft = ggml_backend_cpu_buffer_type() + + if self.cpu_moe: + patterns = [self.LLM_FFN_EXPS_REGEX] + else: + patterns = [ + self._make_cpu_moe_pattern(i) + for i in range(self.n_cpu_moe) + ] + + # keep pattern bytes alive + self._cpu_moe_patterns = patterns + + TensorBuftOverrideArray = ( + llama_cpp_lib.llama_model_tensor_buft_override + * (len(patterns) + 1) + ) + self._cpu_moe_tensor_buft_overrides = TensorBuftOverrideArray() + + for i, pattern in enumerate(self._cpu_moe_patterns): + self._cpu_moe_tensor_buft_overrides[i].pattern = pattern + self._cpu_moe_tensor_buft_overrides[i].buft = cpu_buft + + self._cpu_moe_tensor_buft_overrides[len(patterns)].pattern = None + 
self._cpu_moe_tensor_buft_overrides[len(patterns)].buft = None + + self.model_params.tensor_buft_overrides = ( + self._cpu_moe_tensor_buft_overrides + ) + # kv_overrides is the original python dict self.kv_overrides = kv_overrides if kv_overrides is not None: # _kv_overrides_array is a ctypes.Array of llama_model_kv_override Structs kvo_array_len = len(kv_overrides) + 1 # for sentinel element self._kv_overrides_array = ( - llama_cpp.llama_model_kv_override * kvo_array_len + llama_cpp_lib.llama_model_kv_override * kvo_array_len )() for i, (k, v) in enumerate(kv_overrides.items()): @@ -297,17 +357,17 @@ def __init__( if isinstance(v, bool): self._kv_overrides_array[ i - ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_BOOL.value + ].tag = llama_cpp_lib.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_BOOL.value self._kv_overrides_array[i].value.val_bool = v elif isinstance(v, int): self._kv_overrides_array[ i - ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_INT.value + ].tag = llama_cpp_lib.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_INT.value self._kv_overrides_array[i].value.val_i64 = v elif isinstance(v, float): self._kv_overrides_array[ i - ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_FLOAT.value + ].tag = llama_cpp_lib.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_FLOAT.value self._kv_overrides_array[i].value.val_f64 = v elif isinstance(v, str): # type: ignore v_bytes = v.encode("utf-8") @@ -316,12 +376,12 @@ def __init__( v_bytes = v_bytes.ljust(128, b"\0") self._kv_overrides_array[ i - ].tag = llama_cpp.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_STR.value + ].tag = llama_cpp_lib.LlamaModelKVOverrideType.LLAMA_KV_OVERRIDE_TYPE_STR.value # copy min(v_bytes, 128) to str_value address = typing.cast( int, ctypes.addressof(self._kv_overrides_array[i].value) - + llama_cpp.llama_model_kv_override_value.val_str.offset, + + llama_cpp_lib.llama_model_kv_override_value.val_str.offset, ) buffer_start = 
ctypes.cast(address, ctypes.POINTER(ctypes.c_char)) ctypes.memmove( @@ -344,10 +404,10 @@ def __init__( self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() # Used by the sampler - self._seed = seed or llama_cpp.LLAMA_DEFAULT_SEED + self._seed = seed or llama_cpp_lib.LLAMA_DEFAULT_SEED # Context Params - self.context_params = llama_cpp.llama_context_default_params() + self.context_params = llama_cpp_lib.llama_context_default_params() self.context_params.n_ctx = n_ctx self.context_params.n_batch = self.n_batch self.context_params.n_ubatch = min(self.n_batch, n_ubatch) @@ -357,22 +417,22 @@ def __init__( self.context_params.rope_scaling_type = ( rope_scaling_type if rope_scaling_type is not None - else llama_cpp.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED + else llama_cpp_lib.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ) self.context_params.pooling_type = ( pooling_type if pooling_type is not None - else llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED + else llama_cpp_lib.LLAMA_POOLING_TYPE_UNSPECIFIED ) self.context_params.attention_type = ( attention_type if attention_type is not None - else llama_cpp.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED + else llama_cpp_lib.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED ) self.context_params.flash_attn_type = ( flash_attn_type if flash_attn_type is not None - else llama_cpp.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO + else llama_cpp_lib.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO ) self.context_params.rope_freq_base = ( rope_freq_base if rope_freq_base != 0.0 else 0 @@ -517,7 +577,7 @@ def __init__( ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + print(llama_cpp_lib.llama_print_system_info().decode("utf-8"), file=sys.stderr) self.chat_format = chat_format self.chat_handler = chat_handler @@ -539,11 +599,6 @@ def __init__( self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) 
self.scores: npt.NDArray[np.single] = np.ndarray((n_ctx if self._logits_all else 1, self._n_vocab), dtype=np.single) - - self._mirostat_mu = ctypes.c_float( - 2.0 * 5.0 - ) # TODO: Move this to sampling context - try: self.metadata = self._model.metadata() except Exception as e: @@ -673,12 +728,34 @@ def close(self) -> None: def __del__(self) -> None: self.close() + @staticmethod + def _parse_n_gpu_layers(n_gpu_layers: Union[int, str]) -> int: + if isinstance(n_gpu_layers, str): + value = n_gpu_layers.strip().lower() + if value == "auto": + return -1 + if value == "all": + return -2 + try: + return int(value) + except ValueError as exc: + raise ValueError("n_gpu_layers must be an int, 'auto', or 'all'") from exc + + if isinstance(n_gpu_layers, int): + return n_gpu_layers + + raise TypeError("n_gpu_layers must be an int, 'auto', or 'all'") + + @staticmethod + def _make_cpu_moe_pattern(i: int) -> bytes: + return f"blk\\.{i}".encode("utf-8") + Llama.LLM_FFN_EXPS_REGEX + @property - def ctx(self) -> llama_cpp.llama_context_p: + def ctx(self) -> llama_cpp_lib.llama_context_p: return self._ctx.ctx @property - def model(self) -> llama_cpp.llama_model_p: + def model(self) -> llama_cpp_lib.llama_model_p: return self._model.model @property @@ -1016,12 +1093,12 @@ def eval( self.scores[0, :] = logits_view # Helper method: Convert dict logit_bias to List[llama_logit_bias] - def _convert_logit_bias(self, logit_bias: Optional[Dict[int, float]]) -> List[llama_cpp.llama_logit_bias]: + def _convert_logit_bias(self, logit_bias: Optional[Dict[int, float]]) -> List[llama_cpp_lib.llama_logit_bias]: if not logit_bias: return [] bias_list = [] for token, bias in logit_bias.items(): - lb = llama_cpp.llama_logit_bias() + lb = llama_cpp_lib.llama_logit_bias() lb.token = token lb.bias = bias bias_list.append(lb) @@ -1133,7 +1210,7 @@ def sample( # LogitsProcessor Adapter if logits_processor: - def adapter(token_data_array: llama_cpp.llama_token_data_array): + def adapter(token_data_array: 
llama_cpp_lib.llama_token_data_array): if self._logits_all: current_scores = self._scores[self.n_tokens - 1, :] else: @@ -1402,7 +1479,7 @@ def generate( # Register custom python-level logits processors if provided if logits_processor: - def adapter(token_data_array: llama_cpp.llama_token_data_array): + def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): if self._logits_all: current_scores = self._scores[self.n_tokens - 1, :] else: @@ -1613,7 +1690,7 @@ def embed( # get pooling information pooling_type = self.pooling_type() - logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE + logits_all = pooling_type == llama_cpp_lib.LLAMA_POOLING_TYPE_NONE if self.context_params.embeddings is False: raise RuntimeError( @@ -1621,7 +1698,7 @@ def embed( ) if self.verbose: - llama_cpp.llama_perf_context_reset(self._ctx.ctx) + llama_cpp_lib.llama_perf_context_reset(self._ctx.ctx) if isinstance(input, str): inputs = [input] @@ -1635,15 +1712,15 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(self._ctx.ctx), True) + llama_cpp_lib.llama_memory_clear(llama_cpp_lib.llama_get_memory(self._ctx.ctx), True) self._ctx.decode(self._batch) self._batch.reset() # store embeddings - if pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE: + if pooling_type == llama_cpp_lib.LLAMA_POOLING_TYPE_NONE: pos: int = 0 for i, size in enumerate(seq_sizes): - ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx) + ptr = llama_cpp_lib.llama_get_embeddings(self._ctx.ctx) embedding: List[List[float]] = [ ptr[pos + j * n_embd : pos + (j + 1) * n_embd] for j in range(size) @@ -1656,7 +1733,7 @@ def decode_batch(seq_sizes: List[int]): pos += size else: for i in range(len(seq_sizes)): - ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i) + ptr = llama_cpp_lib.llama_get_embeddings_seq(self._ctx.ctx, i) embedding: List[float] = ptr[:n_embd] if normalize: 
embedding = internals.normalize_embedding(embedding) @@ -1702,11 +1779,11 @@ def decode_batch(seq_sizes: List[int]): decode_batch(s_batch) if self.verbose: - llama_cpp.llama_perf_context_print(self._ctx.ctx) + llama_cpp_lib.llama_perf_context_print(self._ctx.ctx) output = data[0] if isinstance(input, str) else data - llama_cpp.llama_memory_clear(llama_cpp.llama_get_memory(self._ctx.ctx), True) + llama_cpp_lib.llama_memory_clear(llama_cpp_lib.llama_get_memory(self._ctx.ctx), True) self.reset() if return_count: @@ -1862,7 +1939,7 @@ def _create_completion( if len(prompt_tokens) >= self._n_ctx: raise ValueError( - f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp_lib.llama_n_ctx(self.ctx)}" ) if max_tokens is None or max_tokens <= 0: @@ -1947,7 +2024,7 @@ def _create_completion( active_loras=active_loras, control_vector=control_vector, ): - if llama_cpp.llama_token_is_eog(self._model.vocab, token): + if llama_cpp_lib.llama_token_is_eog(self._model.vocab, token): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" break @@ -2871,6 +2948,8 @@ def __getstate__(self): model_path=self.model_path, # Model Params n_gpu_layers=self.model_params.n_gpu_layers, + cpu_moe=self.cpu_moe, + n_cpu_moe=self.n_cpu_moe, split_mode=self.model_params.split_mode, main_gpu=self.model_params.main_gpu, tensor_split=self.tensor_split, @@ -2932,7 +3011,7 @@ def save_state(self) -> LlamaState: print("Llama.save_state: saving llama state", file=sys.stderr) # Query the backend for the required buffer size to store the current state. 
- state_size = llama_cpp.llama_state_get_size(self._ctx.ctx) + state_size = llama_cpp_lib.llama_state_get_size(self._ctx.ctx) if self.verbose: print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) @@ -2943,7 +3022,7 @@ def save_state(self) -> LlamaState: # Copy the raw state data from the internal C context into our Python-managed buffer. # Returns the actual number of bytes written (n_bytes). - n_bytes = llama_cpp.llama_state_get_data(self._ctx.ctx, llama_state, state_size) + n_bytes = llama_cpp_lib.llama_state_get_data(self._ctx.ctx, llama_state, state_size) if self.verbose: print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) @@ -2995,7 +3074,7 @@ def load_state(self, state: LlamaState) -> None: # Copy the raw bytes from the Python object into a C-compatible buffer. llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) - if llama_cpp.llama_state_set_data(self._ctx.ctx, llama_state, state_size) != state_size: + if llama_cpp_lib.llama_state_set_data(self._ctx.ctx, llama_state, state_size) != state_size: raise RuntimeError("Failed to set llama state data") def n_ctx(self) -> int: From cfa01586fb4ff56bf31a83a63dc8329c85826dfc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 1 May 2026 08:03:29 +0800 Subject: [PATCH 363/518] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1d56558d23..c9aba7d42d 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,8 @@ This package provides: - Low-level access to C API via `ctypes` interface. 
- [llama_cpp_lib](https://github.com/JamePeng/llama-cpp-python/blob/main/llama_cpp/llama_cpp.py) - [mtmd_cpp_lib](https://github.com/JamePeng/llama-cpp-python/blob/main/llama_cpp/mtmd_cpp.py) + - [ggml_cpp_lib](https://github.com/JamePeng/llama-cpp-python/blob/main/llama_cpp/_ggml.py) + - *Note: Synchronize ggml's ctypes calls as needed, but won't fully implement it, because most of it is called at the lower level in the upstream llama.cpp.* - High-level Python API for text completion - OpenAI-like API and Type([llama_types.py](https://github.com/JamePeng/llama-cpp-python/blob/main/llama_cpp/llama_types.py)) - [High-level API](https://github.com/JamePeng/llama-cpp-python#high-level-api) From 120c5e2d03fe77a961d9f26885d4d1c0a17edeb7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 1 May 2026 09:39:32 +0800 Subject: [PATCH 364/518] Update /docs/wiki/Llama.md --- docs/wiki/core/Llama.md | 12 +++++++----- docs/wiki/core/LlamaEmbedding.md | 8 ++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index eacda189e3..8db9d57626 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -2,7 +2,7 @@ --- title: Llama Class class_name: Llama -last_updated: 2026-04-26 +last_updated: 2026-05-01 version_target: "latest" --- ``` @@ -17,9 +17,11 @@ Initialize the model and context. Note that model loading will immediately alloc ### Core Model & Hardware Parameters | Parameter | Type | Default | Description | | :--- | :--- | :--- | :--- | -| `model_path` | `str` | **Required** | Path to the `.gguf` model file. | -| `n_gpu_layers` | `int` | `0` | Number of layers to offload to GPU. Set to `-1` for all layers. | -| `split_mode` | `int` | `LLAMA_SPLIT_MODE_LAYER` | How to split the model across GPUs (e.g., `LLAMA_SPLIT_MODE_ROW`). 
| +| `model_path` | `str` | **Required** | Model file path (GGUF format) | +| `n_gpu_layers` | `Union[int, Literal["auto", "all"]]` | `"auto"` | Number of model layers stored in VRAM:
• `auto`/`-1`: auto-selected by llama.cpp
• `all`/`-2`: all layers
• integer N: first N layers
• `0`: disable layer offload | +| `cpu_moe` | `bool` | `False` | Whether to keep all MoE expert weights on CPU, regardless of `n_gpu_layers` | +| `n_cpu_moe` | `int` | `0` | Keep the MoE expert weights of the first N layers on CPU (redundant when `cpu_moe=True`) | +| `split_mode` | `int` | `LLAMA_SPLIT_MODE_LAYER` | Model GPU split mode:
• `LLAMA_SPLIT_MODE_NONE`: single GPU
• `LLAMA_SPLIT_MODE_ROW`: row-level split
• `LLAMA_SPLIT_MODE_LAYER`: layer-level split | | `main_gpu` | `int` | `0` | The primary GPU to use for intermediate results or the entire model. | | `tensor_split` | `List[float]` | `None` | Proportional split of tensors across GPUs (max `LLAMA_MAX_DEVICES`). | | `use_mmap` | `bool` | `True` | Whether to use memory mapping (mmap) if possible. | @@ -309,6 +311,6 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn --- ## Related Links -* [[LlamaEmbedding]] - Dedicated class for text embeddings and reranking. +* [[LlamaEmbedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/LlamaEmbedding.md)] - Dedicated class for text embeddings and reranking. * [[ChatHandlers]] - Customizing `LlamaChatCompletionHandler` for function calling and vision/omni models (e.g., `[[Gemma4ChatHandler]]`, `[[Qwen35ChatHandler]]`). * [[LlamaCache]] - Implementing disk or RAM-based prompt caching (LlamaRAMCache, **TrieCache**, **HybridCheckpointCache**). \ No newline at end of file diff --git a/docs/wiki/core/LlamaEmbedding.md b/docs/wiki/core/LlamaEmbedding.md index b28772b45e..fd74862d99 100644 --- a/docs/wiki/core/LlamaEmbedding.md +++ b/docs/wiki/core/LlamaEmbedding.md @@ -1,7 +1,7 @@ --- title: LlamaEmbedding class_name: LlamaEmbedding -last_updated: 2026-04-27 +last_updated: 2026-05-01 version_target: "latest" --- @@ -260,4 +260,8 @@ embeddings_raw = llm.embed(["search query", "document text"], normalize=NORM_MOD - This class is in development; some features may be unstable, especially reranking model support. - Performance issues can be addressed by adjusting `n_batch`, `n_ubatch`, and `n_gpu_layers`. -- For custom models, manual `pooling_type` configuration may be required to match model behavior. \ No newline at end of file +- For custom models, manual `pooling_type` configuration may be required to match model behavior. 
+ +## Related Links + +* [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] From b875c9a1c5b68284a5badf60d797855dc50cbc2c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 2 May 2026 09:17:18 +0800 Subject: [PATCH 365/518] Update Submodule vendor/llama.cpp 660b1b4..b97ebdc --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 660b1b4bdc..b97ebdc98f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 660b1b4bdc6fedc18e8c3d87a945ffb51f91c547 +Subproject commit b97ebdc98f6053604a19d861c08d8087601b96e0 From ef618a696f4eb4dfbd1344953d3f603713165cf5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 2 May 2026 11:12:01 +0800 Subject: [PATCH 366/518] Update docs/wiki/modules/LlamaCache.md and Separate the modules folder --- docs/wiki/core/Llama.md | 6 +- docs/wiki/modules/LlamaCache.md | 1392 +++++++++++++++++ .../wiki/{core => modules}/LlamaChatFormat.md | 0 .../{core => modules}/LlamaCppBindings.md | 0 docs/wiki/{core => modules}/LlamaEmbedding.md | 6 +- .../wiki/{core => modules}/MTMDCppBindings.md | 0 6 files changed, 1400 insertions(+), 4 deletions(-) create mode 100644 docs/wiki/modules/LlamaCache.md rename docs/wiki/{core => modules}/LlamaChatFormat.md (100%) rename docs/wiki/{core => modules}/LlamaCppBindings.md (100%) rename docs/wiki/{core => modules}/LlamaEmbedding.md (98%) rename docs/wiki/{core => modules}/MTMDCppBindings.md (100%) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index 8db9d57626..299d546e57 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -1,6 +1,8 @@ ```yaml --- title: Llama Class +module_name: llama_cpp.llama +source_file: llama_cpp/llama.py class_name: Llama last_updated: 2026-05-01 version_target: "latest" @@ -311,6 +313,6 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn --- ## Related Links -* 
[[LlamaEmbedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/LlamaEmbedding.md)] - Dedicated class for text embeddings and reranking. +* [[LlamaEmbedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] - Dedicated class for text embeddings and reranking. +* [[LlamaCache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] - Implementing disk or RAM-based prompt caching (LlamaRAMCache, **TrieCache**, **HybridCheckpointCache**). * [[ChatHandlers]] - Customizing `LlamaChatCompletionHandler` for function calling and vision/omni models (e.g., `[[Gemma4ChatHandler]]`, `[[Qwen35ChatHandler]]`). -* [[LlamaCache]] - Implementing disk or RAM-based prompt caching (LlamaRAMCache, **TrieCache**, **HybridCheckpointCache**). \ No newline at end of file diff --git a/docs/wiki/modules/LlamaCache.md b/docs/wiki/modules/LlamaCache.md new file mode 100644 index 0000000000..ef02fd40e4 --- /dev/null +++ b/docs/wiki/modules/LlamaCache.md @@ -0,0 +1,1392 @@ +--- +title: Llama Cache +module_name: llama_cpp.llama_cache +source_file: llama_cpp/llama_cache.py +last_updated: 2026-05-02 +version_target: "latest" +--- + +# Llama Cache + +## Overview + +`llama_cpp.llama_cache` provides cache implementations for storing and restoring `LlamaState` objects or recurrent model state checkpoints. + +The module is mainly used to speed up repeated inference workflows by reusing previously computed model state for matching token prefixes. + +It defines several cache classes: + +| Class | Purpose | +|---|---| +| `BaseLlamaCache` | Abstract base class for llama.cpp state caches. | +| `LlamaRAMCache` | In-memory LRU cache for `LlamaState` objects. | +| `LlamaDiskCache` | Disk-backed cache using the `diskcache` library. | +| `LlamaTrieCache` | Trie-based cache optimized for fast longest-prefix lookup. | +| `HybridCheckpointCache` | Checkpoint manager for RNN/Hybrid model hidden states. 
| +| `HybridCheckpoint` | Dataclass representing one saved hybrid model checkpoint. | +| `TrieNode` | Internal trie node used by `LlamaTrieCache`. | + +The public compatibility alias is: + +```python +LlamaCache = LlamaTrieCache +``` + +This means that code importing `LlamaCache` receives the trie-based cache implementation. + +Defined in: `llama_cpp/llama_cache.py` + +Related pages: [[Llama]], [[Caching]], [[State Save Load]], [[Hybrid Models]] + +--- + +## Role in the API + +The cache module provides reusable storage for model runtime state. + +There are two main caching strategies: + +1. **Token-prefix state caching** + + Used by: + + * `LlamaRAMCache` + * `LlamaDiskCache` + * `LlamaTrieCache` + * `LlamaCache` + + These caches map token sequences to `llama_core.LlamaState` objects. When queried, they do not require an exact match. Instead, they return the state associated with the longest cached token prefix. + +2. **Hybrid / recurrent checkpoint caching** + + Used by: + + * `HybridCheckpoint` + * `HybridCheckpointCache` + + This is designed for Hybrid or recurrent models where rollback requires saving and restoring hidden state snapshots through low-level llama.cpp state APIs. + +--- + +## Public API Summary + +| API | Type | Public | Description | +| ----------------------- | -------------- | -------: | ----------------------------------------------- | +| `BaseLlamaCache` | Abstract class | Yes | Base interface for cache implementations. | +| `LlamaRAMCache` | Class | Yes | In-memory LRU cache with linear prefix lookup. | +| `LlamaDiskCache` | Class | Yes | Disk-backed cache using `diskcache.Cache`. | +| `LlamaTrieCache` | Class | Yes | Trie-based cache with efficient prefix lookup. | +| `LlamaCache` | Alias | Yes | Backward-compatible alias for `LlamaTrieCache`. | +| `HybridCheckpoint` | Dataclass | Yes | Represents one saved Hybrid/RNN checkpoint. | +| `HybridCheckpointCache` | Class | Yes | Manages Hybrid/RNN state checkpoints.
| +| `TrieNode` | Class | Internal | Trie node used by `LlamaTrieCache`. | + +--- + +# `BaseLlamaCache` + +## Overview + +`BaseLlamaCache` is the abstract base class for llama.cpp cache implementations. + +It defines a common dictionary-like interface for storing and retrieving `llama_core.LlamaState` objects by token sequence. + +Subclasses are expected to implement: + +* `cache_size` +* `__getitem__` +* `__contains__` +* `__setitem__` + +Defined in: `llama_cpp/llama_cache.py` + +--- + +## Role in the API + +`BaseLlamaCache` acts as the shared contract for cache implementations used by higher-level llama-cpp-python runtime code. + +It is not intended to be used directly. Users should instantiate one of the concrete cache classes instead: + +* `LlamaRAMCache` +* `LlamaDiskCache` +* `LlamaTrieCache` +* `LlamaCache` + +--- + +## Constructor: `__init__` + +```python +def __init__(self, capacity_bytes: int = (2 << 30)): + ... +``` + +| Parameter | Type | Default | Required | Description | +| ---------------- | ----- | --------: | -------: | -------------------------------------------------------------------- | +| `capacity_bytes` | `int` | `2 << 30` | No | Maximum cache capacity in bytes. The default is approximately 2 GiB. | + +--- + +## Instance Variables + +| Name | Type | Description | +| ---------------- | ----- | ------------------------------------------------------------------------------------------------------------ | +| `capacity_bytes` | `int` | Maximum allowed cache size in bytes. Concrete subclasses use this value to decide when eviction is required. | + +--- + +## Properties + +### `cache_size` + +```python +@property +@abstractmethod +def cache_size(self) -> int: + ... +``` + +Returns the current cache size in bytes. + +Concrete implementations define how this value is calculated. + +--- + +## Core Methods + +### `_find_longest_prefix_key` + +```python +def _find_longest_prefix_key( + self, + key: Tuple[int, ...], +) -> Optional[Tuple[int, ...]]: + ... 
+``` + +Finds the cached key with the longest token prefix matching the requested key. + +In `BaseLlamaCache`, this method is only a placeholder and does not implement behavior. + +Concrete subclasses may override it. + +--- + +### `__getitem__` + +```python +@abstractmethod +def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": + ... +``` + +Retrieves a cached `LlamaState`. + +The expected behavior is longest-prefix matching rather than strict exact-key lookup. + +--- + +### `__contains__` + +```python +@abstractmethod +def __contains__(self, key: Sequence[int]) -> bool: + ... +``` + +Returns whether the cache contains a matching token prefix for the given key. + +--- + +### `__setitem__` + +```python +@abstractmethod +def __setitem__( + self, + key: Sequence[int], + value: "llama_core.LlamaState" +) -> None: + ... +``` + +Stores a `LlamaState` under a token sequence. + +--- + +# `LlamaRAMCache` + +## Overview + +`LlamaRAMCache` is an in-memory cache for `llama_core.LlamaState` objects. + +It stores token sequences in an `OrderedDict` and maintains an LRU eviction policy. Lookup is based on the longest cached token prefix. + +Defined in: `llama_cpp/llama_cache.py` + +--- + +## Role in the API + +`LlamaRAMCache` is useful when users want fast in-process caching without writing state to disk. + +It keeps all cached states in Python memory. This makes retrieval simple, but memory usage can grow quickly depending on the size of saved `LlamaState` objects. + +--- + +## Constructor: `__init__` + +```python +def __init__(self, capacity_bytes: int = (2 << 30), verbose: bool = False): + ... +``` + +| Parameter | Type | Default | Required | Description | +| ---------------- | ------ | --------: | -------: | ----------------------------------------------------------------------------------------------------------------------------- | +| `capacity_bytes` | `int` | `2 << 30` | No | Maximum total size of cached states in bytes. 
| +| `verbose` | `bool` | `False` | No | Whether to enable verbose behavior when computing token-prefix matches. This value is passed to `Llama.longest_token_prefix`. | + +--- + +## Instance Variables + +| Name | Type | Description | +| ---------------- | ----------------------------------------------------- | --------------------------------------------------------------------------------------------------------------- | +| `capacity_bytes` | `int` | Maximum cache capacity in bytes. | +| `cache_state` | `OrderedDict[Tuple[int, ...], llama_core.LlamaState]` | Stores cached token sequences and their corresponding `LlamaState` objects. The order is used for LRU eviction. | +| `_current_size` | `int` | Current total size of cached states in bytes. | +| `verbose` | `bool` | Passed to `llama_core.Llama.longest_token_prefix` during prefix comparison. | + +--- + +## Properties + +### `cache_size` + +```python +@property +def cache_size(self): + return self._current_size +``` + +Returns the current tracked memory usage of the cache in bytes. + +--- + +## Core Methods + +### `_find_longest_prefix_key` + +```python +def _find_longest_prefix_key( + self, + key: Tuple[int, ...], +) -> Optional[Tuple[int, ...]]: + ... +``` + +Finds the cached token sequence with the longest prefix match against `key`. + +This implementation scans every key in `cache_state` and calls: + +```python +llama_core.Llama.longest_token_prefix(k, key, self.verbose) +``` + +### Complexity + +| Operation | Complexity | +| ------------- | ---------: | +| Prefix lookup | `O(N * K)` | +| LRU update | `O(1)` | +| Size tracking | `O(1)` | + +Where: + +* `N` is the number of cached entries. +* `K` is the token sequence length. + +--- + +### `__getitem__` + +```python +def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": + ... +``` + +Returns the cached `LlamaState` for the longest matching token prefix. + +Behavior: + +1. Raises `KeyError("Cache is empty")` if the cache has no entries. +2. 
Converts the input key to a tuple. +3. Finds the longest cached prefix. +4. Raises `KeyError("Key not found")` if no matching prefix exists. +5. Moves the matched key to the end of `cache_state` to mark it as recently used. +6. Returns the matched `LlamaState`. + +--- + +### `__contains__` + +```python +def __contains__(self, key: Sequence[int]) -> bool: + ... +``` + +Returns `True` if any cached key is a prefix match for the requested token sequence. + +Returns `False` if the cache is empty. + +--- + +### `__setitem__` + +```python +def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): + ... +``` + +Stores a `LlamaState` in memory. + +Behavior: + +1. Converts `key` to a tuple. +2. If the key already exists, deletes the old entry. +3. Inserts the new `LlamaState`. +4. Adds `value.llama_state_size` to `_current_size`. +5. Evicts least-recently-used entries while `_current_size > capacity_bytes`. +6. Resets `_current_size` to `0` if the cache becomes empty. + +> Note: The current implementation increments `_current_size` by the new value size when replacing an existing key, but it does not subtract the old value size before deletion. This may cause size tracking to overcount replaced entries. + +--- + +## Example + +```python +from llama_cpp import Llama +from llama_cpp.llama_cache import LlamaRAMCache + +llm = Llama( + model_path="./models/model.gguf", + cache=LlamaRAMCache(capacity_bytes=1 << 30), +) + +response = llm("Q: What is llama.cpp?\nA:", max_tokens=64) + +print(response["choices"][0]["text"]) +``` + +--- + +## Best Practices + +* Use `LlamaRAMCache` when cache speed is more important than persistence. +* Keep `capacity_bytes` below available system memory. +* Reuse the same cache instance across repeated prompts when prefix reuse is expected. +* Prefer `LlamaTrieCache` or `LlamaCache` when many cached entries are expected and prefix lookup cost matters. 
+ +--- + +# `LlamaDiskCache` + +## Overview + +`LlamaDiskCache` is a disk-backed cache for `llama_core.LlamaState` objects. + +It delegates storage, size limits, and LRU-style eviction behavior to the external `diskcache` library. + +Defined in: `llama_cpp/llama_cache.py` + +--- + +## Role in the API + +`LlamaDiskCache` is useful when cached model states should persist beyond the current Python process or when RAM usage should be limited. + +Compared with `LlamaRAMCache`, it may reduce memory pressure but can be slower due to disk I/O. + +--- + +## Constructor: `__init__` + +```python +def __init__( + self, + cache_dir: str = ".cache/llama_cache", + capacity_bytes: int = (2 << 30), + verbose: bool = False +): + ... +``` + +| Parameter | Type | Default | Required | Description | +| ---------------- | ------ | ---------------------: | -------: | ---------------------------------------------------------------------------------------------- | +| `cache_dir` | `str` | `".cache/llama_cache"` | No | Directory used by `diskcache.Cache` to store cached state data. | +| `capacity_bytes` | `int` | `2 << 30` | No | Maximum disk cache size in bytes. Passed to `diskcache.Cache(..., size_limit=capacity_bytes)`. | +| `verbose` | `bool` | `False` | No | Passed to `Llama.longest_token_prefix` when searching for the best prefix match. | + +--- + +## Instance Variables + +| Name | Type | Description | +| ---------------- | ----------------- | ---------------------------------------------------------------------------- | +| `cache_dir` | `str` | Filesystem directory for the disk cache. | +| `cache` | `diskcache.Cache` | SQLite-backed disk cache object. | +| `verbose` | `bool` | Passed to token-prefix comparison logic. | +| `capacity_bytes` | `int` | Maximum configured cache capacity in bytes, inherited from `BaseLlamaCache`. 
| + +--- + +## Properties + +### `cache_size` + +```python +@property +def cache_size(self): + return self.cache.volume() +``` + +Returns the current disk cache volume in bytes using `diskcache.Cache.volume()`. + +--- + +## Core Methods + +### `_find_longest_prefix_key` + +```python +def _find_longest_prefix_key( + self, + key: Tuple[int, ...], +) -> Optional[Tuple[int, ...]]: + ... +``` + +Finds the cached key with the longest token-prefix match. + +Behavior: + +1. Returns `None` immediately if the disk cache is empty. +2. Iterates over `self.cache.iterkeys()`. +3. Uses `llama_core.Llama.longest_token_prefix(k, key, self.verbose)` to compare each cached key. +4. Stops early if a perfect match is found. + +### Complexity + +| Operation | Complexity | +| ---------------------- | ------------------------------------: | +| Prefix lookup | `O(N * K)` | +| Disk iteration | Depends on `diskcache` and filesystem | +| Exact-match early exit | Supported | + +--- + +### `__getitem__` + +```python +def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": + ... +``` + +Retrieves the cached state associated with the longest matching prefix. + +Behavior: + +1. Prints `"LlamaDiskCache.__getitem__: called"` to `stderr`. +2. Raises `KeyError("Cache is empty")` if no entries exist. +3. Converts `key` to a tuple. +4. Finds the longest prefix key. +5. Raises `KeyError("Key not found")` if no match exists. +6. Reads and returns the cached `LlamaState`. + +The implementation notes that this read is non-destructive and automatically updates access time for LRU behavior through `diskcache`. + +--- + +### `__contains__` + +```python +def __contains__(self, key: Sequence[int]) -> bool: + ... +``` + +Returns whether the cache has any longest-prefix match for the given token sequence. + +--- + +### `__setitem__` + +```python +def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): + ... +``` + +Stores a `LlamaState` in the disk cache. + +Behavior: + +1. 
Prints `"LlamaDiskCache.__setitem__: called"` to `stderr`. +2. Converts `key` to a tuple. +3. Assigns the value to `self.cache[tuple(key)]`. + +`diskcache` handles capacity checks and eviction. + +--- + +## Example + +```python +from llama_cpp import Llama +from llama_cpp.llama_cache import LlamaDiskCache + +cache = LlamaDiskCache( + cache_dir=".cache/llama_cache", + capacity_bytes=2 << 30, +) + +llm = Llama( + model_path="./models/model.gguf", + cache=cache, +) + +response = llm("Q: What is llama.cpp?\nA:", max_tokens=64) + +print(response["choices"][0]["text"]) +``` + +--- + +## Best Practices + +* Use `LlamaDiskCache` when cache persistence is useful. +* Place `cache_dir` on a fast local SSD when possible. +* Avoid using slow network filesystems for high-throughput inference. +* Consider `LlamaTrieCache` for workloads where many prefix lookups happen within a single process. + +--- + +## Common Pitfalls + +* Disk-backed caching can be slower than RAM caching. +* The cache depends on the third-party `diskcache` package. +* Prefix lookup still scans cached keys linearly, even though storage and eviction are handled by `diskcache`. +* The implementation prints debug messages to `stderr` on get and set operations. + +--- + +# `TrieNode` + +## Overview + +`TrieNode` is an internal helper class used by `LlamaTrieCache`. + +Each node represents one position in a token-prefix tree. + +Defined in: `llama_cpp/llama_cache.py` + +--- + +## Role in the API + +`TrieNode` is not intended to be used directly by users. + +It stores: + +* Child nodes keyed by token ID. +* An optional `LlamaState` when the node marks the end of a cached token sequence. + +--- + +## Constructor: `__init__` + +```python +def __init__(self): + ... +``` + +The constructor takes no parameters. 
+ +--- + +## Instance Variables + +| Name | Type | Description | +| ---------- | --------------------------------- | ----------------------------------------------------------------------------------------- | +| `children` | `Dict[int, TrieNode]` | Child trie nodes keyed by token ID. | +| `state` | `Optional[llama_core.LlamaState]` | Cached state stored at this node if the node represents a complete cached token sequence. | + +--- + +# `LlamaTrieCache` + +## Overview + +`LlamaTrieCache` is a trie-based cache implementation for `llama_core.LlamaState` objects. + +It optimizes longest-prefix lookup by storing token sequences in a prefix tree rather than scanning all cached keys. + +Defined in: `llama_cpp/llama_cache.py` + +--- + +## Role in the API + +`LlamaTrieCache` is the preferred cache implementation for efficient prefix lookup. + +It combines: + +* A trie for `O(K)` longest-prefix lookup. +* An `OrderedDict` for `O(1)` LRU tracking. +* Explicit byte-size tracking through `_current_size`. + +The compatibility alias `LlamaCache` points to this class: + +```python +LlamaCache = LlamaTrieCache +``` + +--- + +## Constructor: `__init__` + +```python +def __init__(self, capacity_bytes: int = (2 << 30)): + ... +``` + +| Parameter | Type | Default | Required | Description | +| ---------------- | ----- | --------: | -------: | -------------------------------------------------------------------------------------------- | +| `capacity_bytes` | `int` | `2 << 30` | No | Maximum cache size in bytes. Entries are evicted when tracked state size exceeds this value. | + +--- + +## Instance Variables + +| Name | Type | Description | +| ---------------- | ---------------------------------------- | --------------------------------------------------------------------------------- | +| `root` | `TrieNode` | Root node of the token-prefix trie. | +| `_current_size` | `int` | Current total size of cached states in bytes. 
| +| `lru_tracker` | `OrderedDict[Tuple[int, ...], TrieNode]` | Tracks cached keys by recency. The value is the terminal `TrieNode` for that key. | +| `capacity_bytes` | `int` | Maximum cache capacity in bytes, inherited from `BaseLlamaCache`. | + +--- + +## Properties + +### `cache_size` + +```python +@property +def cache_size(self) -> int: + return self._current_size +``` + +Returns the current total size of cached states in bytes. + +This is an `O(1)` operation. + +--- + +## Core Methods + +### `_find_longest_prefix_node` + +```python +def _find_longest_prefix_node( + self, + key: Tuple[int, ...] +) -> Tuple[Optional[TrieNode], Optional[Tuple[int, ...]]]: + ... +``` + +Finds the trie node containing the longest cached prefix for the given token sequence. + +Returns: + +```python +Tuple[Optional[TrieNode], Optional[Tuple[int, ...]]] +``` + +The first item is the matching trie node. + +The second item is the matching cached key. + +### Behavior + +1. Starts at the root node. +2. Checks whether the empty prefix has a cached state. +3. Walks one token at a time through the trie. +4. Updates the best match each time it reaches a node with a stored state. +5. Stops when the token path no longer exists. + +### Complexity + +| Operation | Complexity | +| ------------- | ---------: | +| Prefix lookup | `O(K)` | + +Where `K` is the length of the requested token sequence. + +--- + +### `__getitem__` + +```python +def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState": + ... +``` + +Retrieves the `LlamaState` for the longest matching cached prefix. + +Behavior: + +1. Converts `key` to a tuple. +2. Finds the longest matching trie node. +3. Raises `KeyError` if no prefix match exists. +4. Moves the matched key to the end of `lru_tracker`. +5. Returns the stored `LlamaState`. + +--- + +### `__contains__` + +```python +def __contains__(self, key: Sequence[int]) -> bool: + ... +``` + +Returns `True` if any prefix of `key` is cached. + +This lookup is `O(K)`. 
+ +--- + +### `_prune` + +```python +def _prune(self, key: Tuple[int, ...]): + ... +``` + +Removes a cached key from the trie and prunes empty parent nodes. + +This is an internal helper used during LRU eviction. + +Behavior: + +1. Walks the trie path for the given key. +2. Returns immediately if the key does not exist. +3. Removes the stored state from the terminal node. +4. Subtracts the state size from `_current_size`. +5. Walks backward through the path and removes empty trie nodes. + +--- + +### `__setitem__` + +```python +def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): + ... +``` + +Stores a `LlamaState` in the trie cache. + +Behavior: + +1. Converts `key` to a tuple. +2. Creates trie nodes for each token if needed. +3. If the terminal node already has a state, subtracts the old state size. +4. Stores the new state. +5. Adds `value.llama_state_size` to `_current_size`. +6. Updates `lru_tracker`. +7. Evicts least-recently-used items while `_current_size > capacity_bytes`. + +--- + +## Example + +```python +from llama_cpp import Llama +from llama_cpp.llama_cache import LlamaCache + +llm = Llama( + model_path="./models/model.gguf", + cache=LlamaCache(capacity_bytes=2 << 30), +) + +response = llm("Q: What is llama.cpp?\nA:", max_tokens=64) + +print(response["choices"][0]["text"]) +``` + +Because `LlamaCache` is an alias for `LlamaTrieCache`, this example uses the trie-based cache. + +--- + +## Performance Characteristics + +| Cache | Prefix Lookup | LRU Tracking | Storage | +| ---------------- | ------------: | -----------------------: | ------- | +| `LlamaRAMCache` | `O(N * K)` | `O(1)` | RAM | +| `LlamaDiskCache` | `O(N * K)` | Delegated to `diskcache` | Disk | +| `LlamaTrieCache` | `O(K)` | `O(1)` | RAM | + +Where: + +* `N` is the number of cached entries. +* `K` is the token sequence length. + +--- + +## Best Practices + +* Prefer `LlamaCache` for general use, because it currently aliases `LlamaTrieCache`. 
+* Use `LlamaTrieCache` directly when you want explicit control over the cache implementation. +* Use a realistic `capacity_bytes` value based on available RAM. +* Use this cache when many prompts share prefixes. + +--- + +## Common Pitfalls + +* The cache still stores full `LlamaState` objects, which may be large. +* `capacity_bytes` is based on `value.llama_state_size`; this assumes each stored state reports its size accurately. +* `TrieNode` is internal and should not be manipulated directly. +* Eviction removes entries from both `lru_tracker` and the trie. + +--- + +# `LlamaCache` + +## Overview + +`LlamaCache` is a backward-compatible alias for `LlamaTrieCache`. + +```python +LlamaCache = LlamaTrieCache +``` + +This means users can import `LlamaCache` and receive the trie-based implementation. + +--- + +## Example + +```python +from llama_cpp import Llama +from llama_cpp.llama_cache import LlamaCache + +cache = LlamaCache(capacity_bytes=2 << 30) + +llm = Llama( + model_path="./models/model.gguf", + cache=cache, +) +``` + +--- + +## Migration Notes + +Older code may expect `LlamaCache` to refer to another cache implementation. + +In the current source, `LlamaCache` resolves to `LlamaTrieCache`. + +When documenting or debugging cache behavior, treat `LlamaCache` as equivalent to: + +```python +from llama_cpp.llama_cache import LlamaTrieCache as LlamaCache +``` + +--- + +# `HybridCheckpoint` + +## Overview + +`HybridCheckpoint` is a dataclass representing one saved snapshot of a Hybrid or recurrent model's hidden state. + +It is used by `HybridCheckpointCache`. + +Defined in: `llama_cpp/llama_cache.py` + +--- + +## Role in the API + +Hybrid or recurrent models may require hidden-state rollback rather than standard KV-cache truncation. + +`HybridCheckpoint` stores enough metadata to verify and restore a specific recurrent state snapshot. 
+ +--- + +## Dataclass Definition + +```python +@dataclass +class HybridCheckpoint: + pos: int + data: bytes + hash_val: str + size: int + seq_id: int +``` + +--- + +## Fields + +| Field | Type | Description | +| ---------- | ------- | --------------------------------------------------------------- | +| `pos` | `int` | Token position where this checkpoint was taken. | +| `data` | `bytes` | Raw binary RNN or Hybrid model state data. | +| `hash_val` | `str` | SHA-256 hash prefix used to verify exact token-prefix matching. | +| `size` | `int` | Size of the state data in bytes. | +| `seq_id` | `int` | Sequence ID associated with this checkpoint. | + +--- + +## Notes + +`HybridCheckpoint` objects are normally created by `HybridCheckpointCache.save_checkpoint`. + +Users usually do not need to instantiate this dataclass manually. + +--- + +# `HybridCheckpointCache` + +## Overview + +`HybridCheckpointCache` manages RNN or Hybrid model hidden-state checkpoints. + +It is designed for models that cannot physically truncate KV cache in the same way as standard transformer-only models. + +Instead of implementing dictionary-style cache operations, it provides explicit checkpoint operations: + +* `save_checkpoint` +* `find_best_checkpoint` +* `restore_checkpoint` +* `clear` +* `close` + +Defined in: `llama_cpp/llama_cache.py` + +--- + +## Role in the API + +`HybridCheckpointCache` is a specialized cache manager for Hybrid/Recurrent model rollback. + +It stores raw state snapshots extracted from the llama.cpp backend through low-level C API functions: + +* `llama_state_seq_get_size_ext` +* `llama_state_seq_get_data_ext` +* `llama_state_seq_set_data_ext` + +It is not a drop-in replacement for `LlamaRAMCache`, `LlamaDiskCache`, or `LlamaTrieCache`. + +--- + +## Constructor: `__init__` + +```python +def __init__( + self, + ctx: llama_cpp_lib.llama_context_p, + max_checkpoints: int = 16, + verbose: bool = False +): + ... 
+``` + +| Parameter | Type | Default | Required | Description | +| ----------------- | ------------------------------- | ------: | -------: | ------------------------------------------------------------------------------------------- | +| `ctx` | `llama_cpp_lib.llama_context_p` | — | Yes | Low-level llama.cpp context pointer. Required for extracting and restoring sequence state. | +| `max_checkpoints` | `int` | `16` | No | Maximum number of checkpoints to retain. If set to `0` or below, checkpointing is disabled. | +| `verbose` | `bool` | `False` | No | Enables diagnostic messages printed to `stderr`. | + +--- + +## Constructor Behavior + +The constructor raises `ValueError` if `ctx` is `None`. + +```python +if ctx is None: + raise ValueError( + "HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with model context" + ) +``` + +If `max_checkpoints <= 0`, checkpointing is disabled. In verbose mode, the cache reports that rollback capabilities are turned off. + +This mode is intended to avoid expensive state extraction for single-turn workflows. + +--- + +## Instance Variables + +| Name | Type | Description | +| ----------------- | ------------------------------- | ------------------------------------------------------------------------------------------------ | +| `_ctx` | `llama_cpp_lib.llama_context_p` | Low-level llama.cpp context pointer used for state extraction and restoration. | +| `max_checkpoints` | `int` | Maximum number of checkpoints retained. Values less than or equal to zero disable checkpointing. | +| `checkpoints` | `list[HybridCheckpoint]` | Stored checkpoint objects. | +| `_current_size` | `int` | Total memory used by all stored checkpoints in bytes. | +| `_get_size_ext` | Callable | Cached reference to `llama_state_seq_get_size_ext`. | +| `_get_data_ext` | Callable | Cached reference to `llama_state_seq_get_data_ext`. | +| `_set_data_ext` | Callable | Cached reference to `llama_state_seq_set_data_ext`. 
| +| `_flag_partial` | int | Cached value of `LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY`. | +| `verbose` | `bool` | Enables debug output. | + +--- + +## Properties + +### `cache_size` + +```python +@property +def cache_size(self) -> int: + return self._current_size +``` + +Returns the total memory used by stored checkpoints in bytes. + +--- + +## Core Methods + +### `clear` + +```python +def clear(self): + ... +``` + +Clears all stored checkpoints and resets `_current_size` to `0`. + +If the checkpoint list is already empty, it returns immediately. + +In verbose mode, it prints: + +```text +HybridCheckpointCache: cleared +``` + +--- + +### `close` + +```python +def close(self): + ... +``` + +Releases references held by the cache. + +Behavior: + +* Sets `checkpoints` to `None`. +* Sets `_ctx` to `None`. +* Sets cached C API function references to `None`. + +This method is also called by `__del__`. + +--- + +### `__del__` + +```python +def __del__(self) -> None: + self.close() +``` + +Finalizer that calls `close`. + +--- + +### `_hash_prefix` + +```python +def _hash_prefix(self, tokens: List[int], length: int) -> str: + ... +``` + +Computes a SHA-256 hash for the token prefix up to `length`. + +Behavior: + +1. Returns `"empty"` if `length <= 0`. +2. Clamps `length` to the actual token list length. +3. Converts the selected token prefix into an `array.array('i')`. +4. Hashes the bytes with SHA-256. +5. Returns the first 32 hex characters. + +This hash is used to ensure checkpoints are restored only when the token prefix exactly matches. + +--- + +### `find_best_checkpoint` + +```python +def find_best_checkpoint( + self, + tokens: List[int], + seq_id: int = 0 +) -> Optional[HybridCheckpoint]: + ... +``` + +Finds the longest valid checkpoint matching the given token prefix and sequence ID. + +Returns `None` if: + +* Checkpointing is disabled. +* There are no checkpoints. +* No checkpoint matches the requested sequence ID and token prefix. + +Behavior: + +1. 
Skips checkpoints whose `seq_id` differs. +2. Skips checkpoints whose `pos` is greater than the current token length. +3. Verifies token-prefix integrity using `_hash_prefix`. +4. Returns the checkpoint with the largest matching `pos`. + +--- + +### `save_checkpoint` + +```python +def save_checkpoint( + self, + current_pos: int, + tokens: List[int], + seq_id: int = 0 +) -> bool: + ... +``` + +Extracts the current recurrent model state from the C++ backend and stores it as a `HybridCheckpoint`. + +Returns `True` if the checkpoint was saved successfully. + +Returns `False` if: + +* Checkpointing is disabled. +* The backend reports state size `0`. +* State extraction writes an unexpected number of bytes. + +### Behavior + +1. Returns immediately if `max_checkpoints <= 0`. +2. Calls `_get_size_ext` to query the required state buffer size. +3. Allocates a `ctypes.c_uint8` buffer. +4. Calls `_get_data_ext` to extract state data. +5. Copies the state bytes into a Python `bytes` object. +6. Computes a hash of the token prefix. +7. Appends a new `HybridCheckpoint`. +8. Increments `_current_size`. +9. Evicts old checkpoints using FIFO order if the number of checkpoints exceeds `max_checkpoints`. + +### Important Performance Note + +The implementation intentionally bypasses checkpoint extraction when `max_checkpoints <= 0`. + +This avoids potentially large synchronous VRAM-to-RAM transfers for single-turn workflows. + +--- + +### `restore_checkpoint` + +```python +def restore_checkpoint( + self, + cp: HybridCheckpoint, + seq_id: int = 0 +) -> bool: + ... +``` + +Restores a previously saved checkpoint into the C++ backend. + +Returns `True` if restoration succeeds. + +Returns `False` if: + +* The checkpoint sequence ID does not match the requested `seq_id`. +* The current backend state size differs from the checkpoint size. +* The backend does not report the expected number of restored bytes. + +### Behavior + +1. Verifies `cp.seq_id == seq_id`. +2. 
Queries current expected state size from the backend. +3. Verifies it matches `cp.size`. +4. Copies checkpoint bytes into a ctypes buffer. +5. Calls `_set_data_ext` to restore the state. +6. Returns whether the number of restored bytes equals `cp.size`. + +--- + +## Disabled Dictionary Interface + +`HybridCheckpointCache` inherits from `BaseLlamaCache`, but it intentionally disables the dictionary-style methods. + +### `__getitem__` + +```python +def __getitem__(self, key): + raise NotImplementedError( + "HybridCheckpointCache: pls use save_checkpoint or restore_checkpoint method" + ) +``` + +### `__setitem__` + +```python +def __setitem__(self, key, value): + raise NotImplementedError( + "HybridCheckpointCache: pls use save_checkpoint or restore_checkpoint method" + ) +``` + +### `__contains__` + +```python +def __contains__(self, key): + raise NotImplementedError( + "HybridCheckpointCache: pls use save_checkpoint or restore_checkpoint method" + ) +``` + +Users should use checkpoint-specific methods instead. + +--- + +## Example + +```python +from llama_cpp.llama_cache import HybridCheckpointCache + +# `ctx` must be a valid llama.cpp context pointer. +checkpoint_cache = HybridCheckpointCache( + ctx=ctx, + max_checkpoints=16, + verbose=True, +) + +tokens = [1, 2, 3, 4] +current_pos = len(tokens) + +saved = checkpoint_cache.save_checkpoint( + current_pos=current_pos, + tokens=tokens, + seq_id=0, +) + +if saved: + checkpoint = checkpoint_cache.find_best_checkpoint(tokens, seq_id=0) + + if checkpoint is not None: + restored = checkpoint_cache.restore_checkpoint(checkpoint, seq_id=0) + print("Restored:", restored) +``` + +> Note: This example assumes `ctx` is already available from lower-level llama.cpp runtime code. Most high-level users do not manually create this cache. + +--- + +## Best Practices + +* Use `HybridCheckpointCache` only for Hybrid or recurrent model workflows that require hidden-state rollback. 
+* Set `max_checkpoints=0` for single-turn workflows where rollback is not needed. +* Keep `max_checkpoints` small if checkpoint states are large. +* Use `find_best_checkpoint` before calling `restore_checkpoint`. +* Do not use dictionary-style cache access with this class. + +--- + +## Common Pitfalls + +* Passing `ctx=None` raises `ValueError`. +* `max_checkpoints <= 0` disables checkpointing. +* Restoring a checkpoint with the wrong `seq_id` fails. +* Restore fails if the current backend state size no longer matches the checkpoint size. +* `close()` sets internal references to `None`; the object should not be reused afterward. +* This class is not equivalent to `LlamaCache`. + +--- + +# Module Variables and Constants + +## `LlamaCache` + +```python +LlamaCache = LlamaTrieCache +``` + +Backward-compatible alias for `LlamaTrieCache`. + +Users can import either: + +```python +from llama_cpp.llama_cache import LlamaCache +``` + +or: + +```python +from llama_cpp.llama_cache import LlamaTrieCache +``` + +Both refer to the trie-based cache implementation in the current source. + +--- + +# How the Cache Implementations Compare + +| Class | Storage | Prefix Lookup | Eviction | Persistence | Best For | +| ----------------------- | ------- | ------------------------------: | ------------------------ | ----------: | -------------------------------------------- | +| `LlamaRAMCache` | RAM | `O(N * K)` | LRU | No | Small in-memory caches. | +| `LlamaDiskCache` | Disk | `O(N * K)` | Delegated to `diskcache` | Yes | Persistent cache across runs. | +| `LlamaTrieCache` | RAM | `O(K)` | LRU | No | Fast prefix lookup with many cached entries. | +| `HybridCheckpointCache` | RAM | Hash-verified checkpoint search | FIFO by checkpoint count | No | Hybrid/Recurrent model rollback. | + +--- + +# Recommended Entry Points + +For most users: + +```python +from llama_cpp.llama_cache import LlamaCache +``` + +This currently gives the trie-based implementation. 
+ +For explicit cache selection: + +```python +from llama_cpp.llama_cache import LlamaRAMCache +from llama_cpp.llama_cache import LlamaDiskCache +from llama_cpp.llama_cache import LlamaTrieCache +``` + +For Hybrid/Recurrent models: + +```python +from llama_cpp.llama_cache import HybridCheckpointCache +``` + +--- + +# Related Links + +* [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] diff --git a/docs/wiki/core/LlamaChatFormat.md b/docs/wiki/modules/LlamaChatFormat.md similarity index 100% rename from docs/wiki/core/LlamaChatFormat.md rename to docs/wiki/modules/LlamaChatFormat.md diff --git a/docs/wiki/core/LlamaCppBindings.md b/docs/wiki/modules/LlamaCppBindings.md similarity index 100% rename from docs/wiki/core/LlamaCppBindings.md rename to docs/wiki/modules/LlamaCppBindings.md diff --git a/docs/wiki/core/LlamaEmbedding.md b/docs/wiki/modules/LlamaEmbedding.md similarity index 98% rename from docs/wiki/core/LlamaEmbedding.md rename to docs/wiki/modules/LlamaEmbedding.md index fd74862d99..57c0788e3f 100644 --- a/docs/wiki/core/LlamaEmbedding.md +++ b/docs/wiki/modules/LlamaEmbedding.md @@ -1,11 +1,13 @@ --- -title: LlamaEmbedding +title: Llama Embedding +module_name: llama_cpp.llama_embedding +source_file: llama_cpp/llama_embedding.py class_name: LlamaEmbedding last_updated: 2026-05-01 version_target: "latest" --- -# LlamaEmbedding +# Llama Embedding ## Overview diff --git a/docs/wiki/core/MTMDCppBindings.md b/docs/wiki/modules/MTMDCppBindings.md similarity index 100% rename from docs/wiki/core/MTMDCppBindings.md rename to docs/wiki/modules/MTMDCppBindings.md From 4e58a6363d7e233c2992560aa3abd866ca6305f4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 2 May 2026 11:25:14 +0800 Subject: [PATCH 367/518] docs: update LLM wiki schema to v0.3 - Add schema metadata, documentation language rules, expanded page templates, attribute/state documentation guidance, and clearer update rules for LLM-maintained llama-cpp-python 
wiki pages. Signed-off-by: JamePeng --- docs/wiki/SCHEMA.md | 72 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/docs/wiki/SCHEMA.md b/docs/wiki/SCHEMA.md index f5676442cb..b96ec964c7 100644 --- a/docs/wiki/SCHEMA.md +++ b/docs/wiki/SCHEMA.md @@ -1,45 +1,103 @@ # LLM Wiki Schema – llama-cpp-python -**Purpose**: Maintain a living, always-up-to-date, structured documentation wiki for the llama-cpp-python library using LLMs as the primary maintainer. +**Schema Metadata**: +- **Author**: JamePeng +- **Maintainer**: LLM-assisted documentation workflow +- **Project**: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) wiki +- **Last Modified**: 2026-05-02 +- **Version Target**: latest source code +- **Schema Version**: 0.3 + +**Purpose**: +- Maintain a living, always-up-to-date, structured documentation wiki for the `llama-cpp-python` library, with LLMs acting as the primary documentation maintainer. +- The wiki must help users understand the latest public API, core classes, modules, configuration options, examples, and migration paths based on the current source code. +- The wiki should explain not only *how to call an API*, but also *what role the class/module plays in the library*, *how its state is configured*, and *how users should choose between related APIs*. **Core Principles**: -- The source of truth is the latest code in `llama_cpp/` (especially `llama.py`, `llama_chat_format.py`, `llama_cpp.py`, `llama_types.py`, `llama_embedding.py`, `mtmd_cpp.py`, `_internals.py`, `_ggml.py`). +- The source of truth is the latest code in `llama_cpp/`, especially: + - `llama.py` + - `_internals.py` + - `llama_chat_format.py` + - `llama_cache.py` + - `llama_embedding.py` + - `llama_types.py` + - `llama_cpp.py` + - `mtmd_cpp.py` + - `_ggml.py` - Never invent parameters or behavior. Always read the current source code before writing/updating a page. +- Prefer documenting public and user-facing APIs first. 
Internal implementation details may be documented only when they help users understand behavior, extension points, debugging, or advanced usage. - All examples must be complete, runnable with the latest API, and include necessary imports. -- Clearly mark any deprecated/old usage with a warning and show the modern replacement. +- Clearly mark deprecated, legacy, or changed usage with a warning and show the modern replacement. - Use internal wiki links (e.g. [[Llama]], [[Qwen35ChatHandler]]) for cross-referencing. - Keep pages concise, professional, and user-friendly. +**Documentation Language**: +- The default documentation language is **English**. +- All generated wiki pages, examples, explanations, titles, tables, and warnings should be written in English unless the user explicitly requests another language. +- Code comments inside examples should also be in English by default. +- If the source code contains Chinese comments or non-English notes, translate them into clear English while preserving the original meaning. + **Page Types and Templates**: -1. **Class / Module Page** (e.g. core/Llama.md) +1. **Class / Module Page** (e.g. core/Llama.md, modules/LlamaEmbedding.md) - Frontmatter (YAML): ```yaml --- title: Llama Class class_name: Llama + source_file: llama_cpp/llama.py last_updated: YYYY-MM-DD version_target: "latest" --- ``` - Sections (in order): - Overview + - Role in the Library - Constructor (`__init__`) – full parameter table with types, defaults, and explanations - - Core Methods (with signatures and examples) + - Important Attributes / State + - Core Methods (with signatures and usage examples) - Best Practices & Common Patterns - Deprecated / Changed APIs (with migration notes) - Related Links + - The **Overview** should briefly explain: + - What the class or module is. + - What problem it solves. + - Whether it is a high-level public API, extension point, helper, or internal implementation detail. + - When users should use it. 
+ + - The **Role in the Library** should explain how the class or module relates to nearby APIs. For example, whether it wraps low-level bindings, handles chat formatting, manages cache state, provides embeddings, or connects to multimodal behavior. + + - Constructor parameter tables should use: + + | Parameter | Type | Default | Description | + |---|---|---|---| + + - Important attributes or state should use: + + | Attribute | Type | Source | Description | + |---|---|---|---| + + - Only document attributes that affect user understanding, configuration, lifecycle, inference behavior, caching, chat formatting, embeddings, or debugging. Do not document every trivial private variable. + 2. **Feature Page** (features/xxx.md) - - Overview, When to use, Code examples, Limitations, Related features + - Overview, When to use, Related APIs, Code examples, Configuration Notes, Limitations, Related features + - Feature pages should explain workflows across multiple classes or modules. 3. **Example Page** (examples/xxx.md) - Goal, Prerequisites, Complete runnable code block, Expected output, Tips + - Rules: + * Use the latest API. + * Include all necessary imports. + * Avoid pseudo-code. + * Keep examples focused. + * Mention required model assumptions when needed, such as GGUF file path or chat format. **Update Rules**: - Before updating any page, the LLM must read the relevant source files. - Update the `last_updated` date. -- If a new feature (e.g. new ChatHandler, new sampler) appears in code, create or expand the corresponding page. +- If a new feature appears, such as a new chat handler, sampler, cache type, embedding API, multimodal API, or backend option, create or expand the corresponding page. +- If behavior is inferred from implementation rather than explicitly documented in code, mark the explanation as implementation-based. - Maintain a high standard of readability and accuracy. This schema is the contract. All generated content must follow it.
\ No newline at end of file From 50aafd47383656acda8edd514298205281d85ba9 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 2 May 2026 11:51:47 +0800 Subject: [PATCH 368/518] Upload /docs/wiki/modules/LlamaSpeculative.md for llama_speculative.py Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 5 +- docs/wiki/modules/LlamaSpeculative.md | 237 ++++++++++++++++++++++++++ 2 files changed, 240 insertions(+), 2 deletions(-) create mode 100644 docs/wiki/modules/LlamaSpeculative.md diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index 299d546e57..f010dd0c67 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -313,6 +313,7 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn --- ## Related Links -* [[LlamaEmbedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] - Dedicated class for text embeddings and reranking. -* [[LlamaCache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] - Implementing disk or RAM-based prompt caching (LlamaRAMCache, **TrieCache**, **HybridCheckpointCache**). +* [[Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] - Implementing disk or RAM-based prompt caching (LlamaRAMCache, **TrieCache**, **HybridCheckpointCache**). +* [[Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] - Dedicated class for text embeddings and reranking. +* [[Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] - Provides draft model interfaces and prompt-based speculative decoding helpers. * [[ChatHandlers]] - Customizing `LlamaChatCompletionHandler` for function calling and vision/omni models (e.g., `[[Gemma4ChatHandler]]`, `[[Qwen35ChatHandler]]`). 
diff --git a/docs/wiki/modules/LlamaSpeculative.md b/docs/wiki/modules/LlamaSpeculative.md new file mode 100644 index 0000000000..d1463883b9 --- /dev/null +++ b/docs/wiki/modules/LlamaSpeculative.md @@ -0,0 +1,237 @@ +--- +title: Llama Speculative Decoding +module_name: llama_cpp.llama_speculative +source_file: llama_cpp/llama_speculative.py +last_updated: 2026-05-02 +version_target: "latest" +--- + +# Llama Speculative Decoding + +## Overview + +`llama_speculative.py` provides draft model interfaces and prompt-based speculative decoding helpers for `llama-cpp-python`. + +Speculative decoding uses a lightweight draft model to propose candidate tokens before the main model verifies them. In this module, the draft model does not need to be a neural model. It can also be a prompt lookup decoder that predicts future tokens by finding repeated token patterns in the existing context. + +This module currently defines: + +| Class | Status | Description | +|---|---|---| +| `LlamaDraftModel` | public interface | Abstract base class for draft models used by speculative decoding. | +| `LlamaNGramMapDecoding` | public | Fast stateful n-gram map based speculative decoder. | +| `LlamaPromptLookupDecoding` | legacy public | Stateless NumPy sliding-window prompt lookup decoder. | + +## Role in the Library + +This module defines the draft-model side of speculative decoding. + +A draft model receives the current token sequence and returns predicted draft tokens. These draft tokens can then be verified by the main `Llama` model during generation. + +The module provides two prompt-based implementations: + +- `LlamaNGramMapDecoding`: optimized, stateful, hash-map based lookup. +- `LlamaPromptLookupDecoding`: older stateless NumPy sliding-window implementation. + +For new usage, prefer `LlamaNGramMapDecoding` because it incrementally maintains an n-gram index instead of scanning the full token history on every call. 
+ +## Classes + +## `LlamaDraftModel` + +```python +class LlamaDraftModel(abc.ABC) +``` + +Abstract base class for speculative draft models. + +A draft model must implement `__call__` and return an array of predicted token IDs. + +### Method + +```python +def __call__( + self, + input_ids: npt.NDArray[np.intc], + /, + **kwargs: Any, +) -> npt.NDArray[np.intc] +``` + +| Parameter | Type | Description | +| ----------- | ---------------------- | ----------------------------------------------------------------- | +| `input_ids` | `npt.NDArray[np.intc]` | Current token sequence. | +| `**kwargs` | `Any` | Additional generation arguments. Implementations may ignore them. | + +Returns: + +| Type | Description | +| ---------------------- | -------------------------------------------- | +| `npt.NDArray[np.intc]` | Draft token IDs proposed by the draft model. | + +## `LlamaNGramMapDecoding` + +```python +class LlamaNGramMapDecoding(LlamaDraftModel) +``` + +Fast speculative decoder based on an n-gram hash map. + +This decoder maintains an internal inverted index from historical n-grams to their positions. When called with the current token sequence, it looks up the final n-gram in the history and returns the following tokens from the most recent matching context. + +### Constructor + +```python +def __init__( + self, + ngram_size: int = 3, + num_pred_tokens: int = 10, +) +``` + +| Parameter | Type | Default | Description | +| ----------------- | ----- | ------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `ngram_size` | `int` | `3` | Length of the token sequence used as the lookup key. Larger values require stricter context matches but may produce fewer hits. | +| `num_pred_tokens` | `int` | `10` | Maximum number of draft tokens to return after a matching n-gram is found.
| + +### Important Attributes / State + +| Attribute | Type | Source | Description | +| ----------------- | ---------------------------------- | -------------- | -------------------------------------------------------------------------------- | +| `ngram_size` | `int` | constructor | Number of tokens used as the n-gram lookup key. | +| `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | +| `_ngram_map` | `Dict[Tuple[int, ...], List[int]]` | internal cache | Internal inverted index mapping n-gram tuples to positions in the token history. | +| `_history` | `List[int]` | internal cache | Internal token history used to maintain the n-gram map. | + +`_ngram_map` and `_history` are internal state and should not be modified directly. + +### Behavior + +When called, `LlamaNGramMapDecoding`: + +1. Synchronizes its internal history with the provided `input_ids`. +2. Incrementally updates the n-gram map when tokens are appended. +3. Rebuilds the map if the input sequence is no longer a simple continuation, such as after rollback or a new prompt. +4. Uses the last `ngram_size` tokens as the search key. +5. Returns up to `num_pred_tokens` tokens following the most recent historical match. +6. Returns an empty NumPy array if no match is found. + +### Example + +```python +import numpy as np +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +draft_model = LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=5, +) + +input_ids = np.array([1, 2, 3, 4, 1, 2, 3], dtype=np.intc) + +draft_tokens = draft_model(input_ids) + +print(draft_tokens) +``` + +## `LlamaPromptLookupDecoding` + +```python +class LlamaPromptLookupDecoding(LlamaDraftModel) +``` + +Legacy speculative decoder based on NumPy sliding-window lookup. + +This implementation is stateless. Each call scans the input token sequence to find previous occurrences of the current n-gram and returns the following tokens as draft predictions. 
+ +> Warning: This implementation may have high computational overhead for long contexts. Prefer `LlamaNGramMapDecoding` for new usage. + +### Constructor + +```python +def __init__( + self, + max_ngram_size: int = 3, + num_pred_tokens: int = 10, +) +``` + +| Parameter | Type | Default | Description | +| ----------------- | ----- | ------- | -------------------------------------------------------------------------- | +| `max_ngram_size` | `int` | `3` | Maximum n-gram size to search for. The decoder tries larger n-grams first. | +| `num_pred_tokens` | `int` | `10` | Maximum number of draft tokens to return. | + +### Important Attributes / State + +| Attribute | Type | Source | Description | +| ----------------- | ----- | ----------- | --------------------------------------------------- | +| `max_ngram_size` | `int` | constructor | Maximum n-gram window size used during lookup. | +| `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | + +### Static Method + +```python +@staticmethod +def find_candidate_pred_tokens( + input_ids: npt.NDArray[np.intc], + max_ngram_size: int, + num_pred_tokens: int, +) +``` + +Linearly scans `input_ids` using NumPy sliding windows to find matching n-grams. + +| Parameter | Type | Description | +| ----------------- | ---------------------- | ----------------------------------------- | +| `input_ids` | `npt.NDArray[np.intc]` | Complete token sequence. | +| `max_ngram_size` | `int` | Maximum n-gram size to search for. | +| `num_pred_tokens` | `int` | Maximum number of draft tokens to return. | + +Returns: + +| Type | Description | +| ---------------------- | --------------------------------------------------------------- | +| `npt.NDArray[np.intc]` | Candidate draft tokens, or an empty array if no match is found. 
| + +### Example + +```python +from llama_cpp import Llama +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +llama = Llama( + model_path="path/to/qwen-3.6-27b.gguf", + n_ctx=4096, + n_gpu_layers=-1, + draft_model=LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10 + ) +) + +response = llama.create_chat_completion( + messages=[{"role": "user", "content": """ + Write a Python script using `sqlite3` to define CRUD (Create, Read, Update, Delete) operations for an e-commerce database. +You need to create 5 separate classes for the following entities: `User`, `Product`, `Order`, `Review`, and `Category`. +Each class MUST have exactly the same internal structure and method names (create, get, update, delete). Do not add extra logic, just the standard boilerplate. + """}] +) +``` + +## Best Practices & Common Patterns + +* Prefer `LlamaNGramMapDecoding` for new usage. +* Use `LlamaPromptLookupDecoding` only when compatibility with the older stateless prompt lookup behavior is needed. +* Increase `ngram_size` or `max_ngram_size` for stricter context matching. +* Increase `num_pred_tokens` when you want longer draft proposals, but keep in mind that speculative decoding still depends on later verification by the main model. +* Do not mutate `_ngram_map` or `_history` directly. +* If input token history rolls back or changes unexpectedly, `LlamaNGramMapDecoding` automatically rebuilds its internal cache. + +## Deprecated / Changed APIs + +`LlamaPromptLookupDecoding` is marked as a legacy NumPy sliding-window implementation in the source code. It is still available, but `LlamaNGramMapDecoding` is the preferred implementation for faster repeated calls over long contexts. 
+ +## Related Links + +* [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] + From cbf15da62226ef86b3bc9cc4309f5189298379da Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 2 May 2026 12:09:59 +0800 Subject: [PATCH 369/518] Update /docs/wiki/contributing-to-wiki.md --- docs/wiki/contributing-to-wiki.md | 196 ++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/docs/wiki/contributing-to-wiki.md b/docs/wiki/contributing-to-wiki.md index e69de29bb2..ec8d6e0a27 100644 --- a/docs/wiki/contributing-to-wiki.md +++ b/docs/wiki/contributing-to-wiki.md @@ -0,0 +1,196 @@ +# Contributing to the LLM Wiki + +Thank you for helping improve the `llama-cpp-python` LLM Wiki. + +This wiki is maintained with the help of LLMs, but all documentation must stay grounded in the latest source code. The goal is to keep the wiki accurate, practical, and easy to use for both humans and LLM-based documentation workflows. + +## Documentation Source of Truth + +The source of truth is always the current code in `llama_cpp/`. + +Before creating or updating a wiki page, read the relevant source files first. Do not rely only on memory, old examples, outdated documentation, or external summaries. + +Important source files include: + +- `llama.py` +- `_internals.py` +- `llama_chat_format.py` +- `llama_cache.py` +- `llama_embedding.py` +- `llama_types.py` +- `llama_cpp.py` +- `llama_speculative.py` +- `mtmd_cpp.py` +- `_ggml.py` + +## General Rules + +When contributing documentation: + +- Use English by default. +- Keep pages concise, clear, and practical. +- Do not invent parameters, defaults, return values, or behavior. +- Include complete runnable examples when adding code samples. +- Prefer modern APIs over deprecated or legacy usage. +- Clearly mark deprecated or changed APIs with migration notes. +- Use internal wiki links such as `[[Llama]]`, `[[Chat Completion]]`, or `[[LlamaNGramMapDecoding]]`. 
+- Update the `last_updated` field in page frontmatter. + +## Page Structure + +Follow the project wiki schema defined in [`SCHEMA.md`](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md). + +Most pages should be one of the following types: + +### Class / Module Page + +Use this for important classes or source modules. + +Typical sections: + +- Overview +- Role in the Library +- Constructor (`__init__`) +- Important Attributes / State +- Core Methods +- Best Practices & Common Patterns +- Deprecated / Changed APIs +- Related Links + +### Feature Page + +Use this for workflows that involve multiple APIs. + +Examples: + +- Chat completion +- Text completion +- Embeddings +- Caching +- Speculative decoding +- Multimodal usage + +Typical sections: + +- Overview +- When to use +- Related APIs +- Code examples +- Configuration notes +- Limitations +- Related features + +### Example Page + +Use this for runnable examples. + +Typical sections: + +- Goal +- Prerequisites +- Complete code +- Expected output +- Tips +- Related links + +## Handling Files with Multiple Classes + +If a source file contains multiple related classes, create one module overview page first. + +Create separate class pages only when a class is: + +- Public or commonly imported by users +- Configuration-heavy +- Behavior-heavy +- A major extension point +- Likely to be searched by name + +Small helper classes, internal classes, and simple data containers can usually stay documented only on the module page. + +Avoid duplicating full documentation across module pages and class pages. 
+ +## Documenting Parameters and Attributes + +Constructor parameters should use this format: + +| Parameter | Type | Default | Description | +|---|---|---|---| + +Important attributes or state should use this format: + +| Attribute | Type | Source | Description | +|---|---|---|---| + +Only document attributes that help users understand configuration, lifecycle, inference behavior, caching, chat formatting, embeddings, debugging, or extension points. + +Do not document every private variable. Private attributes may be mentioned only when they are useful for explaining behavior or debugging, and they should be marked as internal. + +## Code Examples + +All code examples should: + +- Include required imports. +- Be runnable with the latest API. +- Avoid pseudo-code unless explicitly marked as conceptual. +- Use clear model path placeholders such as `./model.gguf`. +- Mention assumptions such as chat format, embedding mode, GPU configuration, or required model type when relevant. + +Example: + +```python +from llama_cpp import Llama + +llm = Llama(model_path="./model.gguf") + +output = llm.create_completion("Hello,") +print(output["choices"][0]["text"]) +``` + +## Accuracy Requirements + +Do not guess. + +If behavior is not clearly documented but can be inferred from implementation, say: + +> Based on the current implementation, ... + +If an API appears internal, say: + +> This appears to be an internal implementation detail and should not be treated as a stable public API. + +If you cannot verify something from the current source code, do not include it as fact. + +## Pull Request Checklist + +Before submitting a documentation change, check that: + +* [ ] The relevant source files were reviewed. +* [ ] The page follows `SCHEMA.md`. +* [ ] Frontmatter is present and `last_updated` is updated. +* [ ] Parameters, defaults, and signatures match the source code. +* [ ] Examples are complete and runnable. +* [ ] Deprecated or legacy APIs are clearly marked.
+* [ ] Internal APIs are not presented as stable public APIs. +* [ ] Related pages are linked with internal wiki links. +* [ ] The page is concise and avoids unnecessary duplication. + +## Commit Message Style + +Use simple documentation-focused commit messages. + +Examples: + +```bash +docs: add speculative decoding wiki page +docs: update Llama constructor parameters +docs: expand chat handler documentation +docs: clarify cache API usage +docs: update wiki schema to v0.3 +``` + +## Final Note + +The wiki should help users understand the latest `llama-cpp-python` API from the source code itself. + +Accuracy is more important than completeness. When in doubt, verify the code first. + From 28f842c3c0ec66eaaa30a905eebe8f291a8d92a3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 2 May 2026 12:49:47 +0800 Subject: [PATCH 370/518] docs: Update /docs/wiki/index.md Signed-off-by: JamePeng --- docs/wiki/index.md | 111 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/docs/wiki/index.md b/docs/wiki/index.md index e69de29bb2..aadfd249e5 100644 --- a/docs/wiki/index.md +++ b/docs/wiki/index.md @@ -0,0 +1,111 @@ +# llama-cpp-python Wiki + +Welcome to the `llama-cpp-python` wiki :) + +This wiki provides structured, source-code-aligned documentation for the public APIs, core classes, modules, examples, and development notes of `llama-cpp-python`. + +The documentation is maintained with the help of LLMs, but the source of truth is always the latest code in `llama_cpp/`. + +--- + +## Quick Navigation + +### Core API + +Start here if you are using `llama-cpp-python` directly. + +| Page | Description | +|---|---| +| [core/Llama\|Llama] | Main high-level interface for loading GGUF models, running completions, chat completions, tokenization, embeddings, and model configuration. | + +--- + +### Modules + +These pages document major source modules and related classes. 
+ +| Page | Description | +|---|---| +| [modules/LlamaCache\|Llama Cache] | Cache interfaces and implementations for reusing model state across repeated prompts. | +| [modules/LlamaEmbedding\|Llama Embedding] | Embedding-related APIs and usage patterns. | +| [modules/LlamaSpeculative\|Llama Speculative Decoding] | Draft model interfaces and prompt-based speculative decoding helpers. | + +--- + +### Wiki Maintenance + +These pages define how the wiki should be written, updated, and reviewed. + +| Page | Description | +|---|---| +| [SCHEMA\|Wiki Schema] | Documentation schema and rules for LLM-maintained wiki pages. | +| [contributing-to-wiki\|Contributing to the Wiki] | Contribution guide for writing and updating wiki documentation. | + +--- + +## Recommended Reading Order + +If you are new to this wiki, read the pages in this order: + +1. [[core/Llama|Llama](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +2. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] +3. [[modules/LlamaCache|Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] +4. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] + +If you are contributing documentation, start with: + +1. [[SCHEMA|Wiki Schema](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md)] +2. [[contributing-to-wiki|Contributing to the Wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md)] + +--- + +## Documentation Status + +The wiki is still being expanded. + +Currently available pages: + +- `core/Llama.md` +- `modules/LlamaCache.md` +- `modules/LlamaEmbedding.md` +- `modules/LlamaSpeculative.md` +- `SCHEMA.md` +- `contributing-to-wiki.md` + +Some planned pages may already exist as empty placeholder files. 
Empty pages are intentionally not linked from this index until they are completed. + +--- + +## Planned Areas + +Future documentation may cover: + +- Installation and build options +- Chat formats and chat handlers +- Low-level ctypes bindings +- Multimodal APIs +- Type definitions and structured return values +- Troubleshooting +- Runnable examples +- Development notes + +--- + +## Documentation Principles + +This wiki follows a few core rules: + +- Source code is the source of truth. +- Parameters, defaults, and behavior must match the latest implementation. +- Examples should be complete and runnable. +- Deprecated or legacy APIs should be clearly marked. +- Internal implementation details should not be presented as stable public APIs. +- Pages should be concise, practical, and easy to navigate. + +--- + +## Project Links + +- GitHub: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) +- Wiki schema: [SCHEMA](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md) +- Contribution guide: [contributing-to-wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md) \ No newline at end of file From 374c0d00aab924f03c18a2a53ab65c2fa20ce66c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 2 May 2026 14:18:54 +0800 Subject: [PATCH 371/518] Update Submodule vendor/llama.cpp b97ebdc..63d93d1 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b97ebdc98f..63d93d1733 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b97ebdc98f6053604a19d861c08d8087601b96e0 +Subproject commit 63d93d17336e41e4cc73a64451e5b1d2477abdb1 From fe38cbfac424973ccd9cfaa199a64a02502af4d1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 2 May 2026 14:50:46 +0800 Subject: [PATCH 372/518] Bump version to 0.3.37 Signed-off-by: JamePeng --- CHANGELOG.md | 76 +++++++++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py 
| 2 +- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 480b3c24cc..d144fe2267 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,82 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.37] MoE CPU Offloading, O(1) Speculative Decoding, Thread-Safe Abort & New LLM Wiki + +- docs: A basic new documentation system for the LLM-Wiki has been initially established. + - Based on the continuously optimized `SCHEMA.md`, I am attempting to enable AI to automatically learn code files and write and update corresponding Markdown documents. + - Currently, the documentation under the path `/docs/wiki/` is complete: + - `core/Llama.md` + - `modules/LlamaCache.md` + - `modules/LlamaEmbedding.md` + - `modules/LlamaSpeculative.md` + - `SCHEMA.md` + - `contributing-to-wiki.md` + - `index.md` + - The Github Wiki is now also synchronized with `/docs/wiki/index.md` + - Note: The LLM-wiki is still being expanded. + +- docs: update LLM wiki schema to v0.3 + - Add schema metadata, documentation language rules, expanded page templates, attribute/state documentation guidance, and clearer update rules for LLM-maintained llama-cpp-python wiki pages. + +- feat(llama): add fine-grained MoE CPU offloading controls + - Introduce `cpu_moe` (bool) and `n_cpu_moe` (int) parameters to `Llama.__init__` for precise Mixture of Experts (MoE) weight offloading. + - `cpu_moe=True` forces all MoE expert weights to the CPU memory, regardless of `n_gpu_layers`. + - `n_cpu_moe=N` offloads the expert weights of the first N layers to the CPU, while keeping attention and router weights on the GPU. + - Enhance `n_gpu_layers` to accept string literals "auto" (equivalent to -1) and "all" (equivalent to -2) alongside exact integers, improving configuration readability. + - Update internal module aliases (e.g., `llama_cpp` to `llama_cpp_lib`) to avoid naming conflicts with the underlying C library. 
+ - Integrate `ggml_backend_cpu_buffer_type` to map specific tensor overrides (via regex) directly to CPU buffers during model load. + +- feat(_ggml): implement ggml-backend API bindings and fix type hints + - Introduces extensive ctypes bindings for `ggml-backend.h` (devices, buffers, registries, and CPU buffer types) to support advanced memory routing like MoE CPU offloading. Also fixes various static typing warnings by adding `# type: ignore` to pointer annotations. + - *Note: Synchronize ggml's ctypes calls as needed, but won't fully implement it, because most of it is called at the lower level in the upstream llama.cpp.* + +- feat(handler): Support `add_generation_prompt` parameter pass to `MTMDChatHandler` + - supports disabling assistant part injection, used to support the multimodal `assistant_prefill` functionality. + +- feat(core): implement thread-safe generation abort mechanism + - Add `AbortCriteria` class and a thread-safe `Llama.abort()` method to allow graceful interruption of ongoing text generation from external threads (e.g., UI or async environments). + - Automatically inject `AbortCriteria` into the stopping criteria sequence at the start of `_create_completion`. + - Ensure that when an abort is triggered, the partially generated `completion_tokens` are correctly detokenized and preserved. + - Set `finish_reason` to `"abort"` when generation is interrupted, allowing downstream streaming clients to correctly identify manual cancellations. + - Simplify and optimize the stopping criteria evaluation logic within the core `generate` loop. + - Reorganize and sort module imports for better readability. + - Update /docs/wiki/core/Llama.md for `abort()` and example code + +- feat(speculative): introduce O(1) hash-based N-Gram speculative decoding + - Add `LlamaNGramMapDecoding` to `llama_speculative.py`, implementing an ultra-fast speculative decoder based on a hash inverted index and incremental updates. 
+ - Achieve O(1) time complexity for draft token generation, completely eliminating the CPU bottleneck present in the legacy Numpy sliding window approach. + - Update `README.md` and `docs/wiki/core/Llama.md` to recommend `LlamaNGramMapDecoding` as the default and fastest speculative decoding method, along with updated initialization examples. + - Add docs comment to the speculative decoding classes for better developer experience. + - Add warnings to the legacy `LlamaPromptLookupDecoding` class regarding its high computational overhead for long contexts. + +- docs: Update README.md + +- feat(types): introduce MCP definitions and align with latest OpenAI spec + - Add comprehensive Model Context Protocol (MCP) type definitions, including `MCPTool`, `MCPToolCall`, `MCPListTools`, connector IDs, and approval filters to support remote server tool calling. + - Add `ServiceTier` literal ("auto", "default", etc.) and include the `service_tier` field in `CreateChatCompletionResponse`. + - Restrict `finish_reason` in completion responses to strict standard literals (`stop`, `length`, `tool_calls`, `content_filter`, `function_call`). + - Introduce `ChatCompletionMessageCustomToolCall` to support custom tool calls generated by the model. + - Update `ChatCompletionRequestAssistantMessage` to include the `name` field and add descriptive docstrings to message types. + +- docs: initialize LLM Wiki structure for better documentation maintenance + - Create docs/wiki/ directory with full folder structure + - Add SCHEMA.md, index.md and contributing guidelines + - Set up core/, features/, modules/, examples/, types/ and subdirectories + - Prepare for LLM-powered living documentation (Llama class, multi-modal chat handlers, vision/audio examples, etc.) + - Include .gitkeep files to preserve empty directories + + This lays the foundation for a modern, maintainable wiki that will replace outdated static docs. 
+ Future commits will populate pages with up-to-date content generated from latest source code. + +- chore(ci): upgrade astral-sh/setup-uv@v7 and Jimver/cuda-toolkit@v0.2.35 (Node 24 runtime) + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/63d93d17336e41e4cc73a64451e5b1d2477abdb1](https://github.com/ggml-org/llama.cpp/commit/63d93d17336e41e4cc73a64451e5b1d2477abdb1) + +- feat: Sync llama.cpp llama/mtmd API Binding 20260421 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/b97cb637cd6124fc47f569721b1716014bd856a8...374c0d00aab924f03c18a2a53ab65c2fa20ce66c + ## [0.3.36] Gemma-4 Omni-Multimodal and ToolCall Improved, Qwen3.6 / Step3-VL Support, Compilation workflow optimization - feat: enhance `Qwen35ChatHandler` with preserve_thinking and `Qwen3.6` Support diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index a02ec5af51..7fd2b4c492 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.36" +__version__ = "0.3.37" From 5336947364f8d2b068c46a77609e0406541067b7 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 3 May 2026 15:26:48 +0800 Subject: [PATCH 373/518] feat(grammar): sync JSON schema to GBNF converter with upstream - Allow `LlamaGrammar.from_json_schema` and `json_schema_to_gbnf` to accept both string and dict schema inputs. - Expose `allow_fetch`, `dotall`, and `raw_pattern` arguments to the public API to match the upstream script. - Fix missing handling for empty/unconstrained schema objects (e.g. `{"description": "..."}`) which now correctly default to accepting any value. - Fix bug where `has_min and has_max` evaluated incorrectly when variables were zero. Replaced `!= None` with `is not None` in `_generate_min_max_int`. - Update internal constants and regex patterns (`INVALID_RULE_CHARS_RE`, `GRAMMAR_LITERAL_ESCAPE_RE`, `GRAMMAR_RANGE_LITERAL_ESCAPE_RE`) to resolve character escaping issues. 
- Add type hint `| None` for dependencies in `BuiltinRule`. - Update reference link to point to the new `ggml-org` organization. Signed-off-by: JamePeng --- llama_cpp/llama_grammar.py | 66 ++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 21bb688dee..3c431fc3d8 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -1,4 +1,4 @@ -"""Python implementation of llama grammar parser directly translated from C++ source file in vendor/llama.cpp/common/grammar-parser.cpp.""" +"""Python implementation of llama grammar parser. Reference: vendor/llama.cpp/examples/json_schema_to_grammar.py""" # flake8: noqa from pathlib import Path @@ -51,8 +51,11 @@ def from_file(cls, file: Union[str, Path], verbose: bool = True) -> "LlamaGramma @classmethod def from_json_schema( cls, - json_schema: str, + json_schema: Union[str, dict], prop_order: Optional[List[str]] = None, + allow_fetch: bool = False, + dotall: bool = False, + raw_pattern: bool = False, verbose: bool = True ) -> "LlamaGrammar": """ @@ -63,7 +66,13 @@ def from_json_schema( verbose: Whether to log. """ try: - gbnf_grammar_str = json_schema_to_gbnf(json_schema, prop_order=prop_order) + gbnf_grammar_str = json_schema_to_gbnf( + json_schema, + prop_order=prop_order, + allow_fetch=allow_fetch, + dotall=dotall, + raw_pattern=raw_pattern, + ) return cls.from_string(gbnf_grammar_str, verbose=verbose) except Exception as e: raise ValueError(f"{cls.__name__}.from_json_schema: conversion failed: {e}") @@ -285,9 +294,6 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): return f'({result})?' 
if min_items == 0 else result def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): - has_min = min_value != None - has_max = max_value != None - def digit_range(from_char: str, to_char: str): out.append("[") if from_char == to_char: @@ -363,7 +369,7 @@ def uniform_range(from_str: str, to_str: str): out.append(to_str[i]) out.append("]") - if has_min and has_max: + if min_value is not None and max_value is not None: if min_value < 0 and max_value < 0: out.append("\"-\" (") _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) @@ -390,7 +396,7 @@ def uniform_range(from_str: str, to_str: str): less_decimals = max(decimals_left - 1, 1) - if has_min: + if min_value is not None: if min_value < 0: out.append("\"-\" (") _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) @@ -434,7 +440,7 @@ def uniform_range(from_str: str, to_str: str): more_digits(length - 1, less_decimals) return - if has_max: + if max_value is not None: if max_value >= 0: if top_level: out.append("\"-\" [1-9] ") @@ -488,13 +494,13 @@ def __init__(self, content: str, deps: list = None): RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()]) -INVALID_RULE_CHARS_RE = re.compile(r"[^a-zA-Z0-9-]+") -GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n\"\\\\]') -GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n\"\\]\\-\\\\]') -GRAMMAR_LITERAL_ESCAPES = {"\r": "\\r", "\n": "\\n", '"': '\\"', "-": "\\-", "]": "\\]", "\\": "\\\\"} +INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+') +GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\\]') +GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]') +GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]', '\\': '\\\\'} -NON_LITERAL_SET = set("|.()[]{}*+?") -ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set("^$.[]()|{}*+?") +NON_LITERAL_SET = set('|.()[]{}*+?') 
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?') class SchemaConverter: def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): @@ -659,7 +665,7 @@ def _visit_pattern(self, pattern, name): Transforms a regular expression pattern into a GBNF rule. Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions - Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md + Output: https://github.com/ggml-org/llama.cpp/blob/master/grammars/README.md Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers. @@ -946,6 +952,11 @@ def add_component(comp_schema, is_required): elif (schema_type == 'object') or (len(schema) == 0): return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object'])) + elif schema_type is None and isinstance(schema, dict): + # No type constraint and no recognized structural keywords (e.g. {"description": "..."}). + # Per JSON Schema semantics this is equivalent to {} and accepts any value. 
+ return self._add_rule(rule_name, self._add_primitive('value', PRIMITIVE_RULES['value'])) + else: assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}' # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero @@ -1031,12 +1042,27 @@ def format_grammar(self): ) -def json_schema_to_gbnf(schema: str, prop_order: Optional[List[str]] = None): +def json_schema_to_gbnf( + schema: Union[str, dict], + prop_order: Optional[List[str]] = None, + allow_fetch: bool = False, + dotall: bool = False, + raw_pattern: bool = False, +): prop_order = prop_order or [] - schema = json.loads(schema) - prop_order = {name: idx for idx, name in enumerate(prop_order)} + + if isinstance(schema, str): + schema = json.loads(schema) + elif isinstance(schema, dict): + schema = dict(schema) + else: + raise TypeError("schema must be a JSON string or dictionary") + converter = SchemaConverter( - prop_order=prop_order, allow_fetch=False, dotall=False, raw_pattern=False + prop_order={name: idx for idx, name in enumerate(prop_order)}, + allow_fetch=allow_fetch, + dotall=dotall, + raw_pattern=raw_pattern, ) schema = converter.resolve_refs(schema, "stdin") converter.visit(schema, "") From d7ed1895baa1149cd376bc9ed494d67acb18b898 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 3 May 2026 16:07:59 +0800 Subject: [PATCH 374/518] docs: add `LlamaGrammar` wiki page Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 2 + docs/wiki/modules/LlamaCache.md | 1 + docs/wiki/modules/LlamaChatFormat.md | 0 docs/wiki/modules/LlamaEmbedding.md | 1 + docs/wiki/modules/LlamaGrammar.md | 461 ++++++++++++++++++++++++++ docs/wiki/modules/LlamaSpeculative.md | 1 + 6 files changed, 466 insertions(+) delete mode 100644 docs/wiki/modules/LlamaChatFormat.md create mode 100644 docs/wiki/modules/LlamaGrammar.md diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index f010dd0c67..7a9b7bd6ad 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -313,6 
+313,8 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn --- ## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] * [[Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] - Implementing disk or RAM-based prompt caching (LlamaRAMCache, **TrieCache**, **HybridCheckpointCache**). * [[Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] - Dedicated class for text embeddings and reranking. * [[Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] - Provides draft model interfaces and prompt-based speculative decoding helpers. diff --git a/docs/wiki/modules/LlamaCache.md b/docs/wiki/modules/LlamaCache.md index ef02fd40e4..64e6bbb5f8 100644 --- a/docs/wiki/modules/LlamaCache.md +++ b/docs/wiki/modules/LlamaCache.md @@ -1389,4 +1389,5 @@ from llama_cpp.llama_cache import HybridCheckpointCache # Related Links +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] * [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] diff --git a/docs/wiki/modules/LlamaChatFormat.md b/docs/wiki/modules/LlamaChatFormat.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/wiki/modules/LlamaEmbedding.md b/docs/wiki/modules/LlamaEmbedding.md index 57c0788e3f..1279db5cab 100644 --- a/docs/wiki/modules/LlamaEmbedding.md +++ b/docs/wiki/modules/LlamaEmbedding.md @@ -266,4 +266,5 @@ embeddings_raw = llm.embed(["search query", "document text"], normalize=NORM_MOD ## Related Links +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] * [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] diff --git a/docs/wiki/modules/LlamaGrammar.md 
b/docs/wiki/modules/LlamaGrammar.md new file mode 100644 index 0000000000..8c67633baa --- /dev/null +++ b/docs/wiki/modules/LlamaGrammar.md @@ -0,0 +1,461 @@ +--- +title: Llama Grammar +module_name: llama_cpp.llama_grammar +source_file: llama_cpp/llama_grammar.py +last_updated: 2026-05-03 +version_target: "latest" +--- + +# Llama Grammar + +## Overview + +`llama_grammar.py` provides grammar utilities for constrained generation in `llama-cpp-python`. + +The module defines the `LlamaGrammar` class, a collection of built-in GBNF grammar strings, and a JSON Schema to GBNF converter based on the upstream `llama.cpp` grammar tooling. + +Use this module when you need to guide model output toward a specific grammar, such as JSON, JSON arrays, lists, arithmetic expressions, or custom GBNF rules. + +## Role in the Library + +`LlamaGrammar` acts as a lightweight wrapper around a GBNF grammar string. + +The module also includes helper logic for converting JSON Schema definitions into GBNF grammar text. This allows users to define structured output constraints using JSON Schema-like input and convert it into a grammar format usable by llama.cpp-style constrained generation. + +## Important Classes + +| Class | Status | Description | +|---|---|---| +| `LlamaGrammar` | public | Main wrapper class for grammar strings. Supports creation from raw strings, files, and JSON Schema. | +| `BuiltinRule` | internal helper | Small container used by the JSON Schema converter to store built-in grammar rule content and dependencies. | +| `SchemaConverter` | internal implementation | Converts JSON Schema structures into GBNF grammar rules. Used by `json_schema_to_gbnf`. | + +## Constants + +### Default Root + +| Constant | Type | Value | Description | +|---|---|---|---| +| `LLAMA_GRAMMAR_DEFAULT_ROOT` | `str` | `"root"` | Default root rule name used by `LlamaGrammar`. | + +### Built-in GBNF Grammars + +The module includes several built-in GBNF grammar strings. 
+ +| Constant | Description | +|---|---| +| `ARITHMETIC_GBNF` | Grammar for simple arithmetic-like expressions. | +| `C_GBNF` | Example grammar for a subset of C-like declarations and statements. | +| `CHESS_GBNF` | JSON-like grammar currently defined similarly to object/array/value grammar. | +| `ENGLISH_GBNF` | Simple English-character grammar. The source notes that it may be incomplete and mostly serves as an example. | +| `JAPANESE_GBNF` | JSON-like grammar currently defined similarly to object/array/value grammar. | +| `JSON_ARR_GBNF` | Grammar for generating JSON arrays. | +| `JSON_GBNF` | Grammar for JSON objects and values. | +| `LIST_GBNF` | Grammar for newline-separated Markdown-style list items. | + +### JSON Schema Conversion Rules + +The module also defines internal constants used by `SchemaConverter`. + +| Constant | Description | +|---|---| +| `SPACE_RULE` | Shared grammar rule for constrained whitespace. | +| `PRIMITIVE_RULES` | Built-in grammar rules for primitive schema types such as boolean, number, integer, object, array, string, and null. | +| `STRING_FORMAT_RULES` | Built-in grammar rules for selected string formats such as date, time, and date-time. | +| `RESERVED_NAMES` | Rule names reserved by the converter. | +| `DOTALL` | Pattern rule matching any Unicode code point. | +| `DOT` | Pattern rule matching any character except line breaks. | + +## `LlamaGrammar` + +```python +class LlamaGrammar +```` + +Main wrapper for GBNF grammar text. + +### Constructor + +```python +def __init__(self, *args, _grammar: str, **kwargs) +``` + +| Parameter | Type | Default | Description | +| ---------- | -------- | -------- | -------------------------------------------------------------------------------- | +| `*args` | variadic | none | Accepted by the constructor but not used directly in the current implementation. | +| `_grammar` | `str` | required | Grammar string stored by the instance. 
| +| `**kwargs` | variadic | none | Accepted by the constructor but not used directly in the current implementation. | + +### Important Attributes / State + +| Attribute | Type | Source | Description | +| ---------- | -------------- | ------------------------------- | ------------------------------------------------------- | +| `_grammar` | `str` | `_grammar` constructor argument | Internal grammar string stored by the instance. | +| `_root` | `str` | `LLAMA_GRAMMAR_DEFAULT_ROOT` | Internal root rule name. Defaults to `"root"`. | +| `grammar` | `str` property | `_grammar` | Read-only property returning the stored grammar string. | + +## Class Methods + +### `from_string` + +```python +@classmethod +def from_string( + cls, + grammar: str, + verbose: bool = True, +) -> "LlamaGrammar" +``` + +Creates a `LlamaGrammar` instance from a raw GBNF grammar string. + +| Parameter | Type | Default | Description | +| --------- | ------ | -------- | ------------------------------------------------------------------------------------------------- | +| `grammar` | `str` | required | Raw GBNF grammar string. | +| `verbose` | `bool` | `True` | Accepted by the method. The current implementation forwards no logging behavior from this method. | + +Returns: + +| Type | Description | +| -------------- | -------------------------------------------------------- | +| `LlamaGrammar` | Grammar instance containing the provided grammar string. | + +#### Example + +```python +from llama_cpp.llama_grammar import LlamaGrammar, JSON_GBNF + +grammar = LlamaGrammar.from_string(JSON_GBNF) + +print(grammar.grammar) +``` + +### `from_file` + +```python +@classmethod +def from_file( + cls, + file: Union[str, Path], + verbose: bool = True, +) -> "LlamaGrammar" +``` + +Creates a `LlamaGrammar` instance from a UTF-8 grammar file. 
+ +| Parameter | Type | Default | Description | +| --------- | ------------------ | -------- | ------------------------ | +| `file` | `Union[str, Path]` | required | Path to a grammar file. | +| `verbose` | `bool` | `True` | Passed to `from_string`. | + +Behavior based on the current implementation: + +* Raises `FileNotFoundError` if the file does not exist. +* Raises `IOError` if reading the file fails. +* Raises `ValueError` if the grammar file is empty. +* Reads the file using UTF-8 encoding. + +#### Example + +```python +from llama_cpp.llama_grammar import LlamaGrammar + +grammar = LlamaGrammar.from_file("./json.gbnf") + +print(grammar.grammar) +``` + +### `from_json_schema` + +```python +@classmethod +def from_json_schema( + cls, + json_schema: Union[str, dict], + prop_order: Optional[List[str]] = None, + allow_fetch: bool = False, + dotall: bool = False, + raw_pattern: bool = False, + verbose: bool = True, +) -> "LlamaGrammar" +``` + +Creates a `LlamaGrammar` instance by converting a JSON Schema string or dictionary into GBNF grammar. + +| Parameter | Type | Default | Description | +| ------------- | --------------------- | -------- | -------------------------------------------------------------------------------------------------------- | +| `json_schema` | `Union[str, dict]` | required | JSON Schema input as a JSON string or Python dictionary. | +| `prop_order` | `Optional[List[str]]` | `None` | Optional property order. The source comment notes this can help improve stability for small models. | +| `allow_fetch` | `bool` | `False` | Allows remote schema fetching for HTTPS `$ref` values when enabled. | +| `dotall` | `bool` | `False` | Controls whether pattern `.` should match all Unicode code points during regex-to-grammar conversion. | +| `raw_pattern` | `bool` | `False` | Controls whether regex patterns are converted as raw grammar patterns instead of quoted string patterns. | +| `verbose` | `bool` | `True` | Passed to `from_string`. 
| + +Returns: + +| Type | Description | +| -------------- | -------------------------------------------------------------- | +| `LlamaGrammar` | Grammar instance containing the generated GBNF grammar string. | + +If conversion fails, the method raises `ValueError`. + +#### Example + +```python +from llama_cpp.llama_grammar import LlamaGrammar + +schema = { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + }, + "required": ["name"], +} + +grammar = LlamaGrammar.from_json_schema(schema) + +print(grammar.grammar) +``` + +## `json_schema_to_gbnf` + +```python +def json_schema_to_gbnf( + schema: Union[str, dict], + prop_order: Optional[List[str]] = None, + allow_fetch: bool = False, + dotall: bool = False, + raw_pattern: bool = False, +) +``` + +Converts a JSON Schema string or dictionary into a GBNF grammar string. + +| Parameter | Type | Default | Description | +| ------------- | --------------------- | -------- | --------------------------------------------------------------------------------------------------- | +| `schema` | `Union[str, dict]` | required | JSON Schema input. Strings are parsed with `json.loads`; dictionaries are copied before conversion. | +| `prop_order` | `Optional[List[str]]` | `None` | Optional property ordering used by object rule generation. | +| `allow_fetch` | `bool` | `False` | Allows remote HTTPS `$ref` fetching when enabled. | +| `dotall` | `bool` | `False` | Controls regex dot behavior during pattern conversion. | +| `raw_pattern` | `bool` | `False` | Controls how regex pattern rules are emitted. | + +Returns: + +| Type | Description | +| ----- | ------------------------------ | +| `str` | Generated GBNF grammar string. | + +The function raises `TypeError` if `schema` is neither a JSON string nor a dictionary. 
+ +### Example + +```python +from llama_cpp.llama_grammar import json_schema_to_gbnf + +schema = { + "type": "array", + "items": {"type": "string"}, + "minItems": 1, + "maxItems": 3, +} + +gbnf = json_schema_to_gbnf(schema) + +print(gbnf) +``` + +## `SchemaConverter` + +```python +class SchemaConverter +``` + +Internal implementation class used by `json_schema_to_gbnf`. + +`SchemaConverter` walks a JSON Schema dictionary, resolves references, builds grammar rules, and formats them into GBNF. + +This class is useful for understanding how conversion works, but most users should use `LlamaGrammar.from_json_schema` or `json_schema_to_gbnf` instead. + +> Warning: `SchemaConverter` appears to be an implementation detail. It should not be treated as the primary public API unless the project explicitly documents it as stable. + +### Constructor + +```python +def __init__( + self, + *, + prop_order, + allow_fetch, + dotall, + raw_pattern, +) +``` + +| Parameter | Type | Description | +| ------------- | ------------ | ----------------------------------------------------------------------- | +| `prop_order` | mapping-like | Property ordering map used when generating object rules. | +| `allow_fetch` | `bool` | Enables or disables remote schema fetching for supported `$ref` values. | +| `dotall` | `bool` | Controls regex dot behavior. | +| `raw_pattern` | `bool` | Controls raw pattern handling. | + +### Important Internal State + +| Attribute | Type | Description | +| ---------------------- | ------------ | ------------------------------------------------ | +| `_prop_order` | mapping-like | Stores property ordering preferences. | +| `_allow_fetch` | `bool` | Stores whether remote references may be fetched. | +| `_dotall` | `bool` | Stores regex dot behavior. | +| `_raw_pattern` | `bool` | Stores raw pattern handling behavior. | +| `_rules` | `dict` | Accumulates generated grammar rules. | +| `_refs` | `dict` | Stores resolved JSON Schema references. 
| +| `_refs_being_resolved` | `set` | Tracks references currently being resolved. | + +### Key Methods + +| Method | Description | +| -------------------- | ---------------------------------------------------------------------------------------- | +| `resolve_refs` | Resolves local and supported HTTPS `$ref` references in a schema. | +| `visit` | Main schema visitor that generates grammar rules based on schema structure. | +| `format_grammar` | Formats generated rules into a GBNF grammar string. | +| `_build_object_rule` | Builds object grammar rules from properties, required fields, and additional properties. | +| `_visit_pattern` | Converts supported regex patterns into GBNF rules. | +| `_add_rule` | Adds or reuses a grammar rule name. | +| `_add_primitive` | Adds primitive rules and their dependencies. | + +## Supported JSON Schema Features + +Based on the current implementation, the converter includes handling for: + +* `type` +* `properties` +* `required` +* `additionalProperties` +* `$ref` +* `oneOf` +* `anyOf` +* `allOf` +* `const` +* `enum` +* `items` +* `prefixItems` +* `minItems` +* `maxItems` +* `pattern` +* `format` +* `minLength` +* `maxLength` +* integer bounds: + + * `minimum` + * `exclusiveMinimum` + * `maximum` + * `exclusiveMaximum` + +String formats handled by built-in rules include: + +* `date` +* `time` +* `date-time` +* UUID-like formats matching `uuid`, `uuid1`, `uuid2`, `uuid3`, `uuid4`, or `uuid5` + +The source includes a TODO comment for unsupported string formats such as `uri` and `email`. + +## Error Handling + +| API | Error | Condition | +| ------------------------------- | ------------------- | ----------------------------------------- | +| `LlamaGrammar.from_file` | `FileNotFoundError` | Grammar file path does not exist. | +| `LlamaGrammar.from_file` | `IOError` | Grammar file cannot be read. | +| `LlamaGrammar.from_file` | `ValueError` | Grammar file is empty. 
| +| `LlamaGrammar.from_json_schema` | `ValueError` | JSON Schema to GBNF conversion fails. | +| `json_schema_to_gbnf` | `TypeError` | Schema input is neither `str` nor `dict`. | + +## Common Usage + +### Use a Built-in Grammar + +```python +from llama_cpp.llama_grammar import LlamaGrammar, JSON_GBNF + +grammar = LlamaGrammar.from_string(JSON_GBNF) + +print(grammar.grammar) +``` + +### Load Grammar from a File + +```python +from llama_cpp.llama_grammar import LlamaGrammar + +grammar = LlamaGrammar.from_file("./grammar.gbnf") + +print(grammar.grammar) +``` + +### Convert JSON Schema to Grammar + +```python +from llama_cpp.llama_grammar import LlamaGrammar + +schema = { + "type": "object", + "properties": { + "answer": {"type": "string"}, + "confidence": {"type": "number"}, + }, + "required": ["answer"], +} + +grammar = LlamaGrammar.from_json_schema( + schema, + prop_order=["answer", "confidence"], +) + +print(grammar.grammar) +``` + +### Convert JSON Schema Directly to GBNF + +```python +from llama_cpp.llama_grammar import json_schema_to_gbnf + +schema = { + "type": "object", + "properties": { + "items": { + "type": "array", + "items": {"type": "string"}, + } + }, +} + +gbnf = json_schema_to_gbnf(schema) + +print(gbnf) +``` + +## Best Practices & Common Patterns + +* Use `LlamaGrammar.from_string` when you already have a GBNF grammar string. +* Use `LlamaGrammar.from_file` when storing grammar definitions in `.gbnf` files. +* Use `LlamaGrammar.from_json_schema` when generating grammars from JSON Schema input. +* Use `json_schema_to_gbnf` directly when you only need the generated grammar string. +* Keep JSON Schemas small and explicit when targeting constrained generation. +* Use `prop_order` when output field order matters for stability. +* Keep `allow_fetch=False` unless remote `$ref` fetching is explicitly needed. +* Prefer public helpers over using `SchemaConverter` directly. +* Do not rely on internal converter methods as stable public APIs. 
+ +## Limitations + +* `SchemaConverter` is implementation-oriented and may change. +* Remote `$ref` fetching is only attempted for HTTPS references and requires `allow_fetch=True`. +* The source includes TODO notes for unsupported string formats such as `uri` and `email`. +* Regex pattern conversion explicitly rejects unsupported pattern syntax such as lookaheads and non-greedy modifiers. +* The exact runtime integration between `LlamaGrammar` and model generation should be verified from the relevant generation APIs before documenting end-to-end constrained generation behavior. + +## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] +* [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] diff --git a/docs/wiki/modules/LlamaSpeculative.md b/docs/wiki/modules/LlamaSpeculative.md index d1463883b9..0c0ad099fb 100644 --- a/docs/wiki/modules/LlamaSpeculative.md +++ b/docs/wiki/modules/LlamaSpeculative.md @@ -233,5 +233,6 @@ Each class MUST have exactly the same internal structure and method names (creat ## Related Links +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] * [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] From de5a1e8dd9156966758e522d2b99d8739a56fc96 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 3 May 2026 16:12:43 +0800 Subject: [PATCH 375/518] Update /docs/wiki/index.md Signed-off-by: JamePeng --- docs/wiki/index.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/wiki/index.md b/docs/wiki/index.md index aadfd249e5..02f2dd5b9a 100644 --- a/docs/wiki/index.md +++ b/docs/wiki/index.md @@ -28,6 +28,7 @@ These pages document major source modules and related classes. |---|---| | [modules/LlamaCache\|Llama Cache] | Cache interfaces and implementations for reusing model state across repeated prompts. 
| | [modules/LlamaEmbedding\|Llama Embedding] | Embedding-related APIs and usage patterns. | +| [modules/LlamaGrammar\|Llama Grammar] | Provides grammar utilities for constrained generation. | | [modules/LlamaSpeculative\|Llama Speculative Decoding] | Draft model interfaces and prompt-based speculative decoding helpers. | --- @@ -48,9 +49,10 @@ These pages define how the wiki should be written, updated, and reviewed. If you are new to this wiki, read the pages in this order: 1. [[core/Llama|Llama](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] -2. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] -3. [[modules/LlamaCache|Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] -4. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] +2. [[modules/LlamaCache|Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] +3. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] +4. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] +5. 
[[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] If you are contributing documentation, start with: @@ -68,6 +70,7 @@ Currently available pages: - `core/Llama.md` - `modules/LlamaCache.md` - `modules/LlamaEmbedding.md` +- `modules/LlamaGrammar.md` - `modules/LlamaSpeculative.md` - `SCHEMA.md` - `contributing-to-wiki.md` From a63f60e8956342a90078421b439a626cd4d1cb17 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 4 May 2026 09:01:03 +0800 Subject: [PATCH 376/518] perf: Optimize detokenize buffer sizing for CJK-heavy outputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Increase the initial detokenization buffer estimate from 1x tokens to `5x + 32` bytes. Performance analysis revealed that CJK-heavy outputs often require around 4.0x–5.04x bytes per token, with small-token edge cases reaching about 6.0x, so the previous estimate frequently forced a failed `llama_detokenize` call followed by a resize and retry. Previously, this resulted in almost twice as many calls to `llama_detokenize`. - This reduces avoidable detokenization retries and cuts call overhead in CJK-heavy cases, resulting in an observed ~3–5% inference performance improvement. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 27dd4f80d3..b4ba1f4b21 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -298,7 +298,11 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: tokens_array = (llama_cpp.llama_token * n_tokens)(*tokens) # Initial buffer size estimation - buffer_size = max(n_tokens, 64) + # Note(JamePeng): + # Observed CJK heavy outputs are about 4.0x - 5.04x, + # with extreme values ​​even reaching 6.0x, so this avoids most retry cases.
+ # In CJK use cases, the call overhead will be reduced by 50% compared to the previous detokenize method. + buffer_size = max(64, n_tokens * 5 + 32) buffer = (ctypes.c_char * buffer_size)() n_chars = llama_cpp.llama_detokenize( From 22a7fdbfe0c0e16a336ef9463b6b4df282b7928f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 4 May 2026 10:40:30 +0800 Subject: [PATCH 377/518] patch(logger): filter out verbose noisy CUDA Graph debug logs Add a temporary patch in `ggml_log_callback` to suppress the noisy `CUDA Graph id %zu reused` messages generated by the underlying C++ backend. A complete logger refactoring is planned for better log control in the future. Signed-off-by: JamePeng --- llama_cpp/_logger.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 022ece22bf..015cec9faa 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -32,6 +32,12 @@ def ggml_log_callback( text: bytes, user_data: ctypes.c_void_p, ): + # Note(JamePeng): A temporary patch is used to filter out garbage debug information + # output from the underlying C++ `CUDA Graph id %zu reused`. + # The logger is planned to be refactored to meet control requirements. 
+ if text: + if b"CUDA Graph" in text or b"CUDA graph" in text: + return # TODO: Correctly implement continue previous log global _last_log_level log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level From fe8657adf71856967320bfb932461d65ecf77b19 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 4 May 2026 11:00:32 +0800 Subject: [PATCH 378/518] Update Submodule vendor/llama.cpp 63d93d1..e48034d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 63d93d1733..e48034dfc9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 63d93d17336e41e4cc73a64451e5b1d2477abdb1 +Subproject commit e48034dfc9e5705248fd39dc437ca887dc55a528 From ef27f333f367fdc53dc1a729ad8bb6c3c9362514 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 4 May 2026 11:15:14 +0800 Subject: [PATCH 379/518] Bump version to 0.3.38 Signed-off-by: JamePeng --- CHANGELOG.md | 24 ++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d144fe2267..253b2ae4cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.38] Optimized CJK Detokenization, Sync Grammar Parser, and Patched CUDA Graph Logs + +- perf: Optimize detokenize buffer sizing for CJK-heavy outputs + - Increase the initial detokenization buffer estimate from 1x tokens to `5x + 32` bytes. Performance analysis revealed that CJK-heavy outputs often require around 4.0x–5.04x bytes per token, with small-token edge cases reaching about 6.0x, so the previous estimate frequently forced a failed `llama_detokenize` call followed by a resize and retry. Previously, this resulted in almost twice as many calls to llama_detokenize. 
+ - This reduces avoidable detokenization retries and cuts call overhead in CJK-heavy cases, resulting in an observed ~3–5% inference performance improvement. + +- patch(logger): filter out verbose noisy CUDA Graph debug logs + - Add a temporary patch in `ggml_log_callback` to suppress the noisy `CUDA Graph id %zu reused` messages generated by the underlying C++ backend. A complete logger refactoring is planned for better log control in the future. + +- docs: add LlamaGrammar wiki page and Update /docs/wiki/index.md + - Now Github Wiki Online: https://github.com/JamePeng/llama-cpp-python/wiki + +- feat(grammar): sync JSON schema to GBNF converter with upstream + - Allow `LlamaGrammar.from_json_schema` and `json_schema_to_gbnf` to accept both string and dict schema inputs. + - Expose `allow_fetch`, `dotall`, and `raw_pattern` arguments to the public API to match the upstream script. + - Fix missing handling for empty/unconstrained schema objects (e.g. `{"description": "..."}`) which now correctly default to accepting any value. + - Fix bug where `has_min and has_max` evaluated incorrectly when variables were zero. Replaced `!= None` with `is not None` in `_generate_min_max_int`. + - Update internal constants and regex patterns (`INVALID_RULE_CHARS_RE`, `GRAMMAR_LITERAL_ESCAPE_RE`, `GRAMMAR_RANGE_LITERAL_ESCAPE_RE`) to resolve character escaping issues. + - Update reference link to point to the new `ggml-org` organization. + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/e48034dfc9e5705248fd39dc437ca887dc55a528](https://github.com/ggml-org/llama.cpp/commit/e48034dfc9e5705248fd39dc437ca887dc55a528) + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/fe38cbfac424973ccd9cfaa199a64a02502af4d1...fe8657adf71856967320bfb932461d65ecf77b19 + ## [0.3.37] MoE CPU Offloading, O(1) Speculative Decoding, Thread-Safe Abort & New LLM Wiki - docs: A basic new documentation system for the LLM-Wiki has been initially established. 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 7fd2b4c492..438bf08b58 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.37" +__version__ = "0.3.38" From 1f5226b4882542545bec204d81f04171059566ee Mon Sep 17 00:00:00 2001 From: Alcoft Date: Mon, 4 May 2026 20:58:58 +0200 Subject: [PATCH 380/518] Implemented generic multimodal chat handler. --- llama_cpp/llama.py | 12 +++++++++ llama_cpp/llama_chat_format.py | 49 +++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1241f81e26..848706a90d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -85,6 +85,7 @@ class Llama: def __init__( self, model_path: str, + clip_model_path: Optional[str] = None, *, # Model Params n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", @@ -608,6 +609,17 @@ def __init__( if self.verbose: print(f"Model metadata: {self.metadata}", file=sys.stderr) + + if clip_model_path is not None: + if self.chat_handler is not None and self.verbose: + print("Warning: Both `chat_handler` and `clip_model_path` are not null. 
Chat handler will be overwritten.", flush = True) + + self.chat_handler = llama_chat_format.GenericMTMDChatHandler( + gguf_metadata = self.metadata, + clip_model_path = clip_model_path, + model_arch = None, + verbose = self.verbose + ) eos_token_id = self.token_eos() bos_token_id = self.token_bos() diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a0d8d25db4..468a73c077 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2887,10 +2887,14 @@ def __init__( raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") # Pre-compile Jinja template + if not hasattr(self, "chat_format") or self.chat_format is None: + self.chat_format = self.CHAT_FORMAT + + self._chat_format_parser_tags = [] self.chat_template = ImmutableSandboxedEnvironment( trim_blocks=True, lstrip_blocks=True, - ).from_string(self.CHAT_FORMAT) + ).from_string(self.chat_format) self._exit_stack = ExitStack() @@ -3116,6 +3120,13 @@ def _process_mtmd_prompt( tool_choice=tool_choice, **getattr(self, 'extra_template_arguments', {}) ) + + for tag in self._chat_format_parser_tags: + if tag not in text: + continue + + text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):] + # Replace image_url by media_marker in text for item in media_items: text = text.replace(item["url"], media_marker) @@ -3827,6 +3838,42 @@ def from_pretrained( **kwargs, ) +class GenericMTMDChatHandler(MTMDChatHandler): + def __init__( + self, + gguf_metadata: Dict[str, Any], + clip_model_path: str, + model_arch: Optional[str] = None, + verbose: bool = True, + **kwargs + ) -> None: + self.model_metadata = gguf_metadata + + self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) + self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch + + if verbose: + print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) + + if 
self.arch is None: + if verbose: + print("Unknown model architecture. Will use general/most-common tags.") + + self.arch = "unknown" + + if self.chat_format is None: + raise ValueError("Failed to get model chat template automatically.") + + super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) + + if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]: + self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"] + elif self.arch in ["gemma4"]: + self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"] + elif self.arch in ["mistral3", "mistral4", "deepseek2"]: + self._chat_format_parser_tags += ["[IMG]"] + elif verbose: + print("Warning: Could not determine chat format parser tags.", flush = True) class Llava15ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From a8d19d3bbd18890693576b1f5ed6cd0b2d487eab Mon Sep 17 00:00:00 2001 From: Alcoft Date: Mon, 4 May 2026 21:19:20 +0200 Subject: [PATCH 381/518] Used text.replace() --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 468a73c077..ab5e438d3e 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3125,7 +3125,7 @@ def _process_mtmd_prompt( if tag not in text: continue - text = text[:text.index(tag)] + media_marker + text[text.index(tag) + len(tag):] + text = text.replace(tag, media_marker) # Replace image_url by media_marker in text for item in media_items: From 3e031d5de16d5bd81dd35ef2cc3b8e2d49fac063 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Tue, 5 May 2026 17:46:08 +0200 Subject: [PATCH 382/518] Fixed some bugs. 
--- llama_cpp/llama_chat_format.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ab5e438d3e..40491968a9 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3874,6 +3874,18 @@ def __init__( self._chat_format_parser_tags += ["[IMG]"] elif verbose: print("Warning: Could not determine chat format parser tags.", flush = True) + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) class Llava15ChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From 389d0d97babca3edcf6fb74f476e306a21183b5f Mon Sep 17 00:00:00 2001 From: Alcoft Date: Tue, 5 May 2026 18:49:21 +0200 Subject: [PATCH 383/518] Implemented 'chat_handler_kwargs'. --- llama_cpp/llama.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 848706a90d..6dab44602d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -152,6 +152,7 @@ def __init__( spm_infill: bool = False, verbose: bool = True, # Extra Params + chat_handler_kwargs: Dict[str, Any] = {}, **kwargs, # type: ignore ): """Load a llama.cpp model from `model_path`. 
@@ -618,7 +619,8 @@ def __init__( gguf_metadata = self.metadata, clip_model_path = clip_model_path, model_arch = None, - verbose = self.verbose + verbose = self.verbose, + **chat_handler_kwargs ) eos_token_id = self.token_eos() From 867a579ef9440f02f9bf0849ff37e30b7fd4deda Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 02:06:52 +0800 Subject: [PATCH 384/518] Update Submodule vendor/llama.cpp e48034d..bbeb89d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e48034dfc9..bbeb89d76c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e48034dfc9e5705248fd39dc437ca887dc55a528 +Subproject commit bbeb89d76c41bc250f16e4a6fefcc9b530d6e3f3 From 3796562b0cb4397bc13b295a8d8e8433f4919005 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 02:46:42 +0800 Subject: [PATCH 385/518] Sync llama : add option to save memory in device buffers Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 416e8b9357..1efd645150 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2770,6 +2770,9 @@ def llama_state_seq_load_file( # // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba) LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 1 +# // keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load) +LLAMA_STATE_SEQ_FLAGS_ON_DEVICE = 2 + llama_state_seq_flags = ctypes.c_uint32 # LLAMA_API size_t llama_state_seq_get_size_ext( From 8eafd9edacb20fbc461081ae16b5afc8b5cd883c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 05:07:26 +0800 Subject: [PATCH 386/518] feat(HybridCheckpointCache): add `on-device` hybrid checkpoint support - Add dual-mode HybridCheckpointCache behavior Host mode keeps the existing Python-owned full checkpoint path. 
Device mode forwards checkpoint tensor payloads to llama.cpp-owned device buffers. - Add on_device support for llama.cpp sequence state APIs The cache now forwards LLAMA_STATE_SEQ_FLAGS_ON_DEVICE when requested. This aligns Python checkpoint behavior with upstream llama.cpp device-backed state storage. - Keep host checkpoint mode as the default on_device remains disabled by default for compatibility. Existing multi-checkpoint rollback behavior stays unchanged unless explicitly enabled. - Preserve multi-checkpoint history in host mode Host-backed checkpoints still store full serialized payloads in Python bytes. This keeps historical rollback safe for multi-turn reuse. - Add safe per-seq behavior for device mode Device-backed tensor payloads are owned by llama_context and keyed by seq_id. The cache now replaces old checkpoint metadata for the same seq_id before saving a new one. - Guard against stale on-device checkpoint restores Old Python checkpoint objects may outlive the current device payload. Restore now refuses stale on-device checkpoints to avoid mixing old metadata with newer device tensors. - Add shared FIFO eviction for checkpoint entries Checkpoint eviction is now handled through a common helper. This keeps max_checkpoints respected for both host metadata and device-mode metadata. - Clarify HybridCheckpoint data ownership semantics The dataclass docs now distinguish full host-side payloads from host-visible device-mode metadata. This makes it clear that Python does not own VRAM checkpoint tensors. - Improve cache_size documentation cache_size now describes host-visible memory usage. In device mode, it intentionally excludes llama_context-owned device tensor storage. - Expand save and restore diagnostics Verbose logs now include checkpoint mode, seq_id, position, count, and tracked memory usage. This should make hybrid rollback debugging much easier. 
- Rename internal state flags from _flag_partial to _flags The name now reflects that multiple sequence state flags may be combined. This is clearer now that PARTIAL_ONLY and ON_DEVICE can both be active. - Add checkpoint_on_device to Llama.__init__ Users can now enable device-backed hybrid checkpoints from the high-level Llama wrapper. The option is passed directly into HybridCheckpointCache as on_device. - Reduce default ctx_checkpoints from 32 to 16 This lowers default checkpoint memory pressure. Host mode can still be tuned higher when deeper rollback history is needed. - Document checkpoint_on_device in Llama init args The new argument explains that tensor payloads are stored in llama_context-owned device buffers. It also clarifies the tradeoff between lower device-to-host copy overhead and one active checkpoint per seq_id. - Improve hybrid cache initialization logs Llama.__init__ now prints ctx_checkpoints, checkpoint_interval, and on_device when hybrid checkpointing is enabled. This makes runtime configuration easier to verify from stderr. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 27 +++- llama_cpp/llama_cache.py | 293 ++++++++++++++++++++++++++++++++------- 2 files changed, 267 insertions(+), 53 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1241f81e26..e50e3b9a3b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -131,8 +131,9 @@ def __init__( swa_full: Optional[bool] = None, kv_unified: Optional[bool] = None, # HybridCheckpointCache Params - ctx_checkpoints: int = 32, + ctx_checkpoints: int = 16, checkpoint_interval: int = 4096, + checkpoint_on_device: bool = False, # Sampling Params last_n_tokens_size: int = 64, # Backend Params @@ -227,6 +228,7 @@ def __init__( kv_unified: use single unified KV buffer for the KV cache of all sequences ctx_checkpoints: max number of context checkpoints to create per slot (default: 16)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293) checkpoint_interval: Hybrid model checkpoint token intervals, and archiving of text with interval sizes along the way. + checkpoint_on_device: Store hybrid/recurrent checkpoint tensor payloads in llama_context-owned device buffers via LLAMA_STATE_SEQ_FLAGS_ON_DEVICE. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. numa: numa policy chat_format: String specifying the chat format to use when calling create_chat_completion. @@ -541,6 +543,7 @@ def __init__( _is_recurrent = self._model.is_recurrent() _is_hybrid = self._model.is_hybrid() _n_swa = self._model.n_swa() + # Sync llama.cpp upstream (#20291): warn swa-full is not supported for non-SWA models. if _n_swa == 0: if (self.context_params.swa_full): @@ -555,13 +558,25 @@ def __init__( if self.is_hybrid: if self.verbose: - print(f"Llama.__init__: Hybrid/Recurrent model detected." - f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, n_swa: {_n_swa}, swa_full: {self.context_params.swa_full}). 
" - f" Enabling HybridCheckpointCache(ctx_checkpoints={ctx_checkpoints}, checkpoint_interval={checkpoint_interval}).", - file=sys.stderr) + print( + f"Llama.__init__: Hybrid/Recurrent model detected. " + f"(is_recurrent: {_is_recurrent}, is_hybrid: {_is_hybrid}, " + f"n_swa: {_n_swa}, swa_full: {self.context_params.swa_full}). " + f"Enabling HybridCheckpointCache(" + f"ctx_checkpoints={ctx_checkpoints}, " + f"checkpoint_interval={checkpoint_interval}, " + f"on_device={checkpoint_on_device}).", + file=sys.stderr, + ) self.ctx_checkpoints = ctx_checkpoints self.checkpoint_interval = checkpoint_interval - self._hybrid_cache_mgr = HybridCheckpointCache(self._ctx.ctx, max_checkpoints=self.ctx_checkpoints, verbose=self.verbose) + self.checkpoint_on_device = checkpoint_on_device + self._hybrid_cache_mgr = HybridCheckpointCache( + self._ctx.ctx, + max_checkpoints=self.ctx_checkpoints, + on_device=self.checkpoint_on_device, + verbose=self.verbose, + ) else: self._hybrid_cache_mgr = None diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index dc1dd20d7c..ee37df1200 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -352,58 +352,169 @@ def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"): @dataclass class HybridCheckpoint: - """Represents a single snapshot of the RNN/Hybrid model's hidden state.""" - pos: int # The token position (cursor) where this snapshot was taken - data: bytes # The raw binary RNN state data - hash_val: str # SHA-256 hash of the token prefix to ensure exact sequence matching - size: int # Size of the state data in bytes - seq_id: int # Sequence ID this checkpoint belongs to + """ + Represents a single snapshot of the Hybrid/Recurrent model state. + + Notes: + - When on_device=False, `data` contains the full host-side serialized state. + - When on_device=True, `data` contains only the host-visible portion of the + serialized state. 
The tensor payload is stored in llama_context-owned + device buffers by llama.cpp, keyed by seq_id. + """ + pos: int # The token position (cursor) where this snapshot was taken. + data: bytes # The raw binary RNN state data. + hash_val: str # SHA-256 hash of the token prefix to ensure exact sequence matching. + size: int # Number of bytes written by llama_state_seq_get_data_ext(). + seq_id: int # Sequence id used by llama.cpp state APIs. class HybridCheckpointCache(BaseLlamaCache): """ - Manager for RNN state snapshots (Checkpoints) tailored for Hybrid/Recurrent models. - Provides rollback capabilities for models that cannot physically truncate KV cache. + Checkpoint manager for Hybrid/Recurrent model states. + + This cache is designed for models whose memory cannot be safely truncated like + a regular Transformer KV cache. For recurrent/hybrid architectures, rollback is + implemented by saving and restoring sequence state snapshots. + + Two operating modes are supported: + + 1. Host mode: on_device=False + - Full checkpoint payload is materialized as Python bytes. + - Multiple checkpoints per seq_id are safe. + - This mode is suitable for multi-turn rollback and longer conversation reuse. + + 2. Device mode: on_device=True + - LLAMA_STATE_SEQ_FLAGS_ON_DEVICE is forwarded to llama.cpp. + - Tensor payloads are stored in llama_context-owned device buffers. + - The device buffers are created per seq_id in llama.cpp. + - Therefore only one active checkpoint per seq_id is safe. + - This mode is suitable for fast speculative / branch rollback where avoiding + device-to-host tensor copies is more important than keeping many historical + checkpoints. + + Important: + Do not treat on_device=True as "Python owns a VRAM checkpoint". Python only + owns the host-visible serialized portion. The tensor payload lives inside the + llama_context and is keyed by seq_id. 
""" - def __init__(self, ctx: llama_cpp_lib.llama_context_p, max_checkpoints: int = 16, verbose: bool = False): + def __init__( + self, + ctx: llama_cpp_lib.llama_context_p, + max_checkpoints: int = 16, + on_device: bool = False, + verbose: bool = False + ): + """ + Args: + ctx (llama_context_p): + Borrowed llama.cpp context pointer used by the state sequence APIs. + This cache does not own the context and must not free it. + + max_checkpoints(int): Maximum number of Python-side checkpoint entries to keep. + - Host mode: This is the maximum number of historical checkpoints across all seq_ids. + - Device mode: This is still a global upper bound for Python-side metadata entries, + but this class also enforces at most one active checkpoint per seq_id, + because llama.cpp stores device tensor payloads per seq_id. + + on_device(bool): Whether to request llama.cpp to keep tensor checkpoint payloads in + context-owned device buffers via LLAMA_STATE_SEQ_FLAGS_ON_DEVICE. + + verbose(bool): Enables diagnostic logging to stderr for checkpoint save/restore/eviction. + """ if ctx is None: - raise ValueError("HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with model context") + raise ValueError("HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with a null model context") self._ctx = ctx + self.on_device = on_device + self.verbose = verbose + + # In host mode, max_checkpoints means "maximum number of Python-owned + # checkpoints across all seq_ids". + # + # In device mode, llama.cpp stores tensor payloads in device buffers keyed + # by seq_id. Multiple Python checkpoint metadata entries for the same seq_id + # would point to the same mutable device-side slot, so only one checkpoint + # per seq_id is safe. self.max_checkpoints = max_checkpoints + + # Python-side checkpoint registry. + # + # Host mode: + # Each HybridCheckpoint owns a full serialized checkpoint payload. 
+ # + # Device mode: + # Each HybridCheckpoint owns only the host-visible serialized portion. + # The corresponding tensor payload is owned by llama_context. self.checkpoints: list[HybridCheckpoint] = [] + + # Total Python-tracked checkpoint size in bytes. + # + # Host mode: + # Roughly equals the total serialized checkpoint payload size. + # + # Device mode: + # Tracks only the host-visible part returned by llama.cpp, not the + # context-owned device tensor storage. self._current_size = 0 - # Cache C-type API function pointers for performance + # Cache C API function pointers for faster repeated calls. self._get_size_ext = llama_cpp_lib.llama_state_seq_get_size_ext self._get_data_ext = llama_cpp_lib.llama_state_seq_get_data_ext self._set_data_ext = llama_cpp_lib.llama_state_seq_set_data_ext - self._flag_partial = llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY - self.verbose = verbose - - if self.max_checkpoints <= 0: - if self.verbose: - import sys - print("HybridCheckpointCache(__init__): Cache is DISABLED (max_checkpoints <= 0). " - "Rollback capabilities are turned off. This is optimal for single-turn workflows.", - file=sys.stderr) + # State serialization flags forwarded to llama.cpp. + # + # LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY: + # Save only the sequence-specific / partial state needed for recurrent + # rollback instead of a full context state. + # + # LLAMA_STATE_SEQ_FLAGS_ON_DEVICE: + # Ask llama.cpp to store tensor payloads in context-owned device buffers. + self._flags = llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY + if on_device: + self._flags |= llama_cpp_lib.LLAMA_STATE_SEQ_FLAGS_ON_DEVICE + + if self.max_checkpoints <= 0 and self.verbose: + print("HybridCheckpointCache(__init__): Cache is DISABLED (max_checkpoints <= 0). " + "Rollback capabilities are turned off. 
This is optimal for single-turn workflows.", + file=sys.stderr) + + if self.on_device and self.max_checkpoints > 1 and self.verbose: + print( + "HybridCheckpointCache(__init__): on_device=True stores tensor payloads " + "in llama_context-owned device buffers keyed by seq_id. Multiple " + "historical checkpoints for the same seq_id are unsafe, so this cache " + "will keep only one checkpoint per seq_id.", + file=sys.stderr, + ) @property def cache_size(self) -> int: - """Returns the total memory used by all stored checkpoints in bytes.""" + """ + Returns the host-visible checkpoint size tracked by Python. + + In host mode, this is close to the full serialized checkpoint payload size. + In device mode, this is only the host-visible metadata/payload size returned + by llama.cpp. Device-side tensor storage is owned by llama_context and is not + fully represented by this number. + """ return self._current_size def clear(self): - """Clears all stored checkpoints and resets memory tracking.""" + """ + Clears Python-side checkpoint metadata. + + This does not explicitly release llama_context-owned device buffers. The + device buffers are managed by llama.cpp and are associated with the context. + """ if not self.checkpoints: # Empty Checkpoint: Return immediately, no need to clear. 
return self.checkpoints.clear() self._current_size = 0 if self.verbose: - print("HybridCheckpointCache: cleared") + print("HybridCheckpointCache(clear): cleared", file=sys.stderr) def close(self): - self.checkpoints = None + self.clear() self._ctx = None self._get_size_ext = None self._get_data_ext = None @@ -421,23 +532,72 @@ def _hash_prefix(self, tokens: List[int], length: int) -> str: """ if length <= 0: return "empty" - tokens_size = len(tokens) - if length > tokens_size: - length = tokens_size + length = min(length, len(tokens)) data = array.array('i', tokens[:length]).tobytes() return hashlib.sha256(data).hexdigest()[:32] + def _replace_checkpoint_for_seq_id(self, seq_id: int) -> None: + """ + Removes all Python-side checkpoints for one seq_id. + + Required for on_device=True because llama.cpp stores the device tensor + payload per seq_id, not per Python checkpoint object. + """ + kept: list[HybridCheckpoint] = [] + removed_size = 0 + + for cp in self.checkpoints: + if cp.seq_id == seq_id: + removed_size += cp.size + else: + kept.append(cp) + + self.checkpoints = kept + self._current_size -= removed_size + if self._current_size < 0: + self._current_size = 0 + + def _evict_checkpoints_if_needed(self) -> None: + """ + Evicts old checkpoints if needed + + Host mode: + This evicts full Python-owned checkpoint payloads, so FIFO historical + checkpoints are safe and useful. + + Device mode: + This evicts Python-side metadata only. The device tensor payload is owned + by llama_context and is keyed by seq_id. 
+ """ + while len(self.checkpoints) > self.max_checkpoints: + old_cp = self.checkpoints.pop(0) + self._current_size -= old_cp.size + if self._current_size < 0: + self._current_size = 0 + + if self.verbose: + print( + f"HybridCheckpointCache: evicted checkpoint " + f"seq_id={old_cp.seq_id}, pos={old_cp.pos}", + file=sys.stderr, + ) + def find_best_checkpoint(self, tokens: List[int], seq_id: int = 0) -> Optional[HybridCheckpoint]: """ Finds the longest valid checkpoint that perfectly matches the provided token prefix. + + The hash check prevents restoring a checkpoint that has the same length but + belongs to a different prompt/history. + Returns None if no matching checkpoint is found. """ # Empty Checkpoint: Instant return, no hash calculation needed. if self.max_checkpoints <= 0 or len(self.checkpoints) == 0: return None - best_cp = None + best_cp: Optional[HybridCheckpoint] = None best_pos = -1 + for cp in self.checkpoints: if cp.seq_id != seq_id or cp.pos > len(tokens): # Skip if sequence ID mismatches or checkpoint is longer than the current prompt @@ -475,9 +635,17 @@ def save_checkpoint( file=sys.stderr) return False - flags = self._flag_partial + # In on-device mode, remove old Python metadata for this seq_id before saving + # the new checkpoint. The underlying llama.cpp device buffer for this seq_id + # will be overwritten by the get_data_ext() call. + if self.on_device: + self._replace_checkpoint_for_seq_id(seq_id) + + flags = self._flags - # 1. Query the required buffer size from the underlying C++ context + # 1. Query the required host-visible buffer size. + # In on_device mode this may exclude the large tensor payload + # that stays in device memory. size = self._get_size_ext(self._ctx, seq_id, flags) if size == 0: if self.verbose: @@ -487,9 +655,14 @@ def save_checkpoint( # 2. 
Allocate buffer and extract raw state data buffer = (ctypes.c_uint8 * size)() n_written = self._get_data_ext(self._ctx, buffer, size, seq_id, flags) + if n_written != size: if self.verbose: - print(f"HybridCheckpointCache(save_checkpoint): get failed {n_written}/{size}") + print( + f"HybridCheckpointCache(save_checkpoint): get_data_ext failed " + f"({n_written}/{size})", + file=sys.stderr, + ) return False # Note: This deep copy isolates the state from subsequent C++ backend mutations @@ -506,19 +679,18 @@ def save_checkpoint( ) self._current_size += n_written - # 4. Enforce capacity limits (FIFO eviction) - while len(self.checkpoints) > self.max_checkpoints: - if not self.checkpoints: - break - old_cp = self.checkpoints.pop(0) - self._current_size -= old_cp.size - if self.verbose: - print(f"HybridCheckpointCache(save_checkpoint): evicted pos={old_cp.pos}") + # 4. Evicts old checkpoints if needed + self._evict_checkpoints_if_needed() if self.verbose: - print(f"HybridCheckpointCache(save_checkpoint): Saved checkpoint at pos {current_pos} ({size / 1024 / 1024:.2f} MiB) " - f"total={len(self.checkpoints)} used={self._current_size / 1024 / 1024:.2f} MiB", - file=sys.stderr) + mode = "device" if self.on_device else "host" + print( + f"HybridCheckpointCache(save_checkpoint): saved {mode} checkpoint " + f"seq_id={seq_id}, pos={current_pos}, size={size / 1024 / 1024:.2f} MiB, " + f"hcc_count={len(self.checkpoints)}, " + f"hcc_mem_used={self._current_size / 1024 / 1024:.2f} MiB", + file=sys.stderr, + ) return True @@ -531,17 +703,38 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: if self.verbose: print(f"HybridCheckpointCache(restore_checkpoint): [Error] Sequence ID mismatch: checkpoint has {cp.seq_id}, requested {seq_id}", file=sys.stderr) return False - flags = self._flag_partial - # 2. Verify the underlying C++ context still expects the exact same state size. + # 2. Guard against stale on-device checkpoint objects. 
+ # + # In on_device mode, Python does not own the full checkpoint tensor payload. + # llama.cpp keeps the large tensor payload in llama_context-owned device + # buffers keyed by seq_id. Saving a newer checkpoint for the same seq_id may + # overwrite that device-side payload while an old HybridCheckpoint object can + # still exist outside this cache. + # + # Only checkpoint objects still tracked by this cache are considered valid. + # This avoids restoring old Python metadata together with newer device tensors. + if self.on_device and cp not in self.checkpoints: + if self.verbose: + print( + "HybridCheckpointCache(restore_checkpoint): stale on-device checkpoint; " + "refusing restore because device payload may have been overwritten.", + file=sys.stderr, + ) + return False + + flags = self._flags + + # 3. Verify the underlying C++ context still expects the exact same state size. # This prevents buffer overflows if the backend context was unexpectedly altered or reallocated. current_size = self._get_size_ext(self._ctx, seq_id, flags) if current_size != cp.size: if self.verbose: - print(f"HybridCheckpointCache(restore_checkpoint): [Warning] State size mismatch before restore: expected {cp.size}, got {current_size} -> possible invalidation") + print(f"HybridCheckpointCache(restore_checkpoint): [Warning] State size mismatch before restore: " + f"expected checkpoint size={cp.size}, got current size={current_size} -> possible invalidation") return False - # 3. Copy data back to a ctypes buffer and push to the C++ backend + # 4. 
Copy data back to a ctypes buffer and push to the C++ backend buffer = (ctypes.c_uint8 * cp.size).from_buffer_copy(cp.data) ret = self._set_data_ext( self._ctx, buffer, cp.size, seq_id, flags @@ -549,7 +742,13 @@ def restore_checkpoint(self, cp: HybridCheckpoint, seq_id: int = 0) -> bool: success = (ret == cp.size) if self.verbose: - print(f"HybridCheckpointCache(restore_checkpoint): restore {'OK' if success else 'FAIL'} pos={cp.pos}") + mode = "device" if self.on_device else "host" + print( + f"HybridCheckpointCache(restore_checkpoint): restore " + f"{'OK' if success else 'FAIL'} " + f"mode={mode}, seq_id={seq_id}, pos={cp.pos}", + file=sys.stderr, + ) return success # Disable BaseLlamaCache Dictionary Interfaces From 54115b4e86c5ffedc2e84237ff03b2473570d756 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 05:08:14 +0800 Subject: [PATCH 387/518] Update /docs/wiki/core/Llama.md for `on_device` option Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 52 ++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index 7a9b7bd6ad..0354d86150 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -4,13 +4,13 @@ title: Llama Class module_name: llama_cpp.llama source_file: llama_cpp/llama.py class_name: Llama -last_updated: 2026-05-01 +last_updated: 2026-05-06 version_target: "latest" --- ``` ## Overview -The `Llama` class is the core, high-level Python wrapper for a `llama.cpp` model. It handles model loading, memory management (KV cache), tokenization, and generation (both base text completion and chat formatting). It includes advanced features like dynamic LoRA routing, hybrid model checkpointing, speculative decoding, and context shifting. +The `Llama` class is the core, high-level Python wrapper for a `llama.cpp` model. 
It handles model loading, memory management (KV cache), tokenization, and generation (both base text completion and chat formatting). It includes advanced features like dynamic LoRA routing, dual-mode hybrid/recurrent checkpointing, speculative decoding, and context shifting. ## Constructor (`__init__`) @@ -51,8 +51,9 @@ Initialize the model and context. Note that model loading will immediately alloc | `chat_format` | `str` | `None` | String specifying the chat template (e.g., `"llama-2"`, `"chatml"`). Guessed from GGUF if None. | | `chat_handler` | `LlamaChatCompletionHandler` | `None` | Optional custom handler. See [[ChatHandlers]]. | | `draft_model` | `LlamaDraftModel` | `None` | Optional draft model for speculative decoding. | -| `ctx_checkpoints` | `int` | `32` | Max context checkpoints per slot (Hybrid/SWA models). | -| `checkpoint_interval`| `int`| `4096` | Token interval for saving Hybrid model checkpoints. | +| `ctx_checkpoints` | `int` | `16` | Max hybrid/recurrent context checkpoints to keep. Set to `0` to disable checkpointing for single-turn fast paths. | +| `checkpoint_interval` | `int` | `4096` | Token interval for saving periodic Hybrid/Recurrent checkpoints during long prompt evaluation. | +| `checkpoint_on_device` | `bool` | `False` | Store Hybrid/Recurrent checkpoint tensor payloads in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. Reduces device-to-host copy overhead, but only one active checkpoint per `seq_id` is safe. | *(Note: There are numerous additional RoPE/YaRN scaling parameters available for specialized context extension. Refer to the source code for the full list).* @@ -189,18 +190,41 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn 5. **Hybrid & Recurrent Architectures**: - The class natively detects Hybrid/Recurrent models (like LFM2VL/LFM2.5VL, Qwen3.5/3.6, Mamba or specialized SWA models(Gemma3/4)) and automatically enables the `HybridCheckpointCache`. 
This creates periodic save-states during large context pre-filling, allowing the model to roll back seamlessly if a generation is rejected (e.g., speculative decoding mismatches) without corrupting the recurrent state. + The class natively detects Hybrid/Recurrent models (for example LFM2VL/LFM2.5VL, Qwen3.5/3.6, Mamba, RWKV, or specialized SWA models such as Gemma3/4) and automatically enables the `HybridCheckpointCache`. - * Tips: If you are using hybrid multimodal model for building ComfyUI nodes or running single-turn API wrappers where you do not need multi-turn state rollbacks, simply initialize your Llama instance with `ctx_checkpoints=0`: + Unlike regular Transformer KV caches, Hybrid/Recurrent model memory cannot always be safely truncated token-by-token. The wrapper therefore saves periodic sequence-state checkpoints during long context prefill, allowing rollback to a verified prefix without corrupting recurrent state. + + `HybridCheckpointCache` supports two checkpoint storage modes: + + - **Host checkpoint mode** (`checkpoint_on_device=False`, default): checkpoint payloads are serialized into Python-owned bytes. This supports multiple historical checkpoints per `seq_id`, which is useful for multi-turn reuse and deeper rollback history. + - **Device checkpoint mode** (`checkpoint_on_device=True`): checkpoint tensor payloads are stored in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. Python only keeps the host-visible serialized portion. This reduces device-to-host tensor copy overhead, but only one active checkpoint per `seq_id` is safe because device payloads are keyed by `seq_id`. 
+ + *Tips*: If you are using a hybrid multimodal model for ComfyUI nodes or single-turn API wrappers where you do not need multi-turn state rollback, initialize your `Llama` instance with `ctx_checkpoints=0`: + + ```python + llm = Llama( + model_path="./Qwen3.5-VL-9B.gguf", + chat_handler=MTMDChatHandler(clip_model_path="./mmproj.gguf"), + n_ctx=4096, + ctx_checkpoints=0 # Disable checkpoints for zero-latency single-turn fast paths + ) + ``` + + For long prompts on GPU-backed Hybrid/Recurrent models, you can enable device-backed checkpoints to reduce device-to-host copy overhead: + + ```python + llm = Llama( + model_path="./Qwen3.6-27B.gguf", + n_ctx=32768, + n_gpu_layers=-1, + ctx_checkpoints=16, + checkpoint_interval=4096, + checkpoint_on_device=True + ) + ``` + + Use `checkpoint_on_device=False` if you need multiple historical checkpoints for the same `seq_id`. Use `checkpoint_on_device=True` when fast rollback/checkpointing is more important than keeping many historical checkpoint payloads. - ```python - llm = Llama( - model_path="./Qwen3.5-VL-9B.gguf", - chat_handler=MTMDChatHandler(clip_model_path="./mmproj.gguf"), - n_ctx=4096, - ctx_checkpoints=0 # <-- SET THIS TO 0 TO ENABLE ZERO-LATENCY FAST PATH - ) - ``` 6. **Assistant Prefill**: `llama-cpp-python` supports native **Assistant Prefill** for seamless message continuation. You can now simply use the `assistant_prefill=True` parameter in the `create_chat_completion` function. 
From f8d88b014889f7592d6fc3caaf23cdb9e3b088f2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 6 May 2026 05:22:36 +0800 Subject: [PATCH 388/518] Update /docs/wiki/modules/LlamaCache.md for `on_device` option Signed-off-by: JamePeng --- docs/wiki/modules/LlamaCache.md | 285 +++++++++++++++++++++++--------- 1 file changed, 205 insertions(+), 80 deletions(-) diff --git a/docs/wiki/modules/LlamaCache.md b/docs/wiki/modules/LlamaCache.md index 64e6bbb5f8..d1db0a2097 100644 --- a/docs/wiki/modules/LlamaCache.md +++ b/docs/wiki/modules/LlamaCache.md @@ -2,7 +2,7 @@ title: Llama Cache module_name: llama_cpp.llama_cache source_file: llama_cpp/llama_cache.py -last_updated: 2026-05-02 +last_updated: 2026-05-06 version_target: "latest" --- @@ -21,10 +21,10 @@ It defines several cache classes: | `BaseLlamaCache` | Abstract base class for llama.cpp state caches. | | `LlamaRAMCache` | In-memory LRU cache for `LlamaState` objects. | | `LlamaDiskCache` | Disk-backed cache using the `diskcache` library. | -| `LlamaTrieCache` | Trie-based cache optimized for fast longest-prefix lookup. | -| `HybridCheckpointCache` | Checkpoint manager for RNN/Hybrid model hidden states. | -| `HybridCheckpoint` | Dataclass representing one saved hybrid model checkpoint. | | `TrieNode` | Internal trie node used by `LlamaTrieCache`. | +| `LlamaTrieCache` | Trie-based cache optimized for fast longest-prefix lookup. | +| `HybridCheckpoint` | Dataclass representing one saved Hybrid/Recurrent checkpoint and its host-visible payload. | +| `HybridCheckpointCache` | Checkpoint manager for Hybrid/Recurrent model state snapshots, with host and device-backed modes. | The public compatibility alias is: @@ -910,7 +910,7 @@ from llama_cpp.llama_cache import LlamaTrieCache as LlamaCache ## Overview -`HybridCheckpoint` is a dataclass representing one saved snapshot of a Hybrid or recurrent model's hidden state. +`HybridCheckpoint` is a dataclass representing one saved snapshot of a Hybrid or Recurrent model state. 
It is used by `HybridCheckpointCache`. @@ -920,9 +920,14 @@ Defined in: `llama_cpp/llama_cache.py` ## Role in the API -Hybrid or recurrent models may require hidden-state rollback rather than standard KV-cache truncation. +Hybrid or recurrent models may require sequence-state rollback rather than standard KV-cache truncation. + +`HybridCheckpoint` stores the checkpoint position, prefix verification hash, sequence id, and the serialized checkpoint payload visible to Python. -`HybridCheckpoint` stores enough metadata to verify and restore a specific recurrent state snapshot. +Its `data` field has different ownership semantics depending on the cache mode: + +* In host mode (`on_device=False`), `data` contains the full host-side serialized checkpoint state. +* In device mode (`on_device=True`), `data` contains only the host-visible serialized portion. The large tensor payload is stored in `llama_context`-owned device buffers by llama.cpp, keyed by `seq_id`. --- @@ -936,19 +941,19 @@ class HybridCheckpoint: hash_val: str size: int seq_id: int -``` +```` --- ## Fields -| Field | Type | Description | -| ---------- | ------- | --------------------------------------------------------------- | -| `pos` | `int` | Token position where this checkpoint was taken. | -| `data` | `bytes` | Raw binary RNN or Hybrid model state data. | -| `hash_val` | `str` | SHA-256 hash prefix used to verify exact token-prefix matching. | -| `size` | `int` | Size of the state data in bytes. | -| `seq_id` | `int` | Sequence ID associated with this checkpoint. | +| Field | Type | Description | +| ---------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `pos` | `int` | Token position where this checkpoint was taken. | +| `data` | `bytes` | Serialized checkpoint payload visible to Python. In host mode this is the full state; in device mode this is only the host-visible portion. 
| +| `hash_val` | `str` | SHA-256 hash prefix used to verify exact token-prefix matching. | +| `size` | `int` | Number of bytes written by `llama_state_seq_get_data_ext`. | +| `seq_id` | `int` | Sequence id used by llama.cpp sequence-state APIs. | --- @@ -958,23 +963,33 @@ class HybridCheckpoint: Users usually do not need to instantiate this dataclass manually. +In device mode, old `HybridCheckpoint` Python objects may become stale if a newer checkpoint is saved for the same `seq_id`, because the device-side tensor payload is keyed by `seq_id` and may be overwritten. + --- # `HybridCheckpointCache` ## Overview -`HybridCheckpointCache` manages RNN or Hybrid model hidden-state checkpoints. +`HybridCheckpointCache` manages Hybrid/Recurrent model state checkpoints. + +It is designed for models whose memory cannot always be safely truncated like a regular Transformer KV cache. Instead, rollback is implemented by saving and restoring sequence-state snapshots through llama.cpp state APIs. + +The cache supports two operating modes: -It is designed for models that cannot physically truncate KV cache in the same way as standard transformer-only models. +1. **Host mode** (`on_device=False`) -Instead of implementing dictionary-style cache operations, it provides explicit checkpoint operations: + * Full checkpoint payloads are materialized as Python-owned `bytes`. + * Multiple historical checkpoints per `seq_id` are safe. + * This is the default mode and is useful for multi-turn rollback or deeper prefix reuse. -* `save_checkpoint` -* `find_best_checkpoint` -* `restore_checkpoint` -* `clear` -* `close` +2. **Device mode** (`on_device=True`) + + * `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` is forwarded to llama.cpp. + * Tensor payloads are stored in `llama_context`-owned device buffers. + * Python keeps only the host-visible serialized portion. + * Only one active checkpoint per `seq_id` is safe because device payloads are keyed by `seq_id`. 
+ * This mode can reduce device-to-host copy overhead during checkpoint save/restore. Defined in: `llama_cpp/llama_cache.py` @@ -984,12 +999,14 @@ Defined in: `llama_cpp/llama_cache.py` `HybridCheckpointCache` is a specialized cache manager for Hybrid/Recurrent model rollback. -It stores raw state snapshots extracted from the llama.cpp backend through low-level C API functions: +It stores host-visible checkpoint data extracted from the llama.cpp backend through low-level C API functions: * `llama_state_seq_get_size_ext` * `llama_state_seq_get_data_ext` * `llama_state_seq_set_data_ext` +When `on_device=True`, tensor payloads are not treated as Python-owned bytes. They are stored by llama.cpp in `llama_context`-owned device buffers, while Python keeps the host-visible serialized portion and checkpoint metadata. + It is not a drop-in replacement for `LlamaRAMCache`, `LlamaDiskCache`, or `LlamaTrieCache`. --- @@ -1001,16 +1018,18 @@ def __init__( self, ctx: llama_cpp_lib.llama_context_p, max_checkpoints: int = 16, + on_device: bool = False, verbose: bool = False ): ... ``` -| Parameter | Type | Default | Required | Description | -| ----------------- | ------------------------------- | ------: | -------: | ------------------------------------------------------------------------------------------- | -| `ctx` | `llama_cpp_lib.llama_context_p` | — | Yes | Low-level llama.cpp context pointer. Required for extracting and restoring sequence state. | -| `max_checkpoints` | `int` | `16` | No | Maximum number of checkpoints to retain. If set to `0` or below, checkpointing is disabled. | -| `verbose` | `bool` | `False` | No | Enables diagnostic messages printed to `stderr`. 
| +| Parameter | Type | Default | Required | Description | +| ----------------- | ------------------------------- | ------: | -------: | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `ctx` | `llama_cpp_lib.llama_context_p` | — | Yes | Borrowed low-level llama.cpp context pointer used for sequence-state save/restore. The cache does not own or free this context. | +| `max_checkpoints` | `int` | `16` | No | Maximum number of Python-side checkpoint entries to retain. If set to `0` or below, checkpointing is disabled. | +| `on_device` | `bool` | `False` | No | Whether to request llama.cpp to store checkpoint tensor payloads in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. | +| `verbose` | `bool` | `False` | No | Enables diagnostic messages printed to `stderr`. | --- @@ -1018,32 +1037,26 @@ def __init__( The constructor raises `ValueError` if `ctx` is `None`. -```python -if ctx is None: - raise ValueError( - "HybridCheckpointCache(__init__): Failed to create HybridCheckpointCache with model context" - ) -``` +If `max_checkpoints <= 0`, checkpointing is disabled. In verbose mode, the cache reports that rollback capabilities are turned off. This mode is intended to avoid expensive state extraction for single-turn workflows. -If `max_checkpoints <= 0`, checkpointing is disabled. In verbose mode, the cache reports that rollback capabilities are turned off. - -This mode is intended to avoid expensive state extraction for single-turn workflows. +When `on_device=True`, the cache forwards `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` to llama.cpp. In this mode, the cache keeps only one active checkpoint per `seq_id` by replacing old Python-side checkpoint metadata before saving a new checkpoint for the same `seq_id`. 
--- ## Instance Variables -| Name | Type | Description | -| ----------------- | ------------------------------- | ------------------------------------------------------------------------------------------------ | -| `_ctx` | `llama_cpp_lib.llama_context_p` | Low-level llama.cpp context pointer used for state extraction and restoration. | -| `max_checkpoints` | `int` | Maximum number of checkpoints retained. Values less than or equal to zero disable checkpointing. | -| `checkpoints` | `list[HybridCheckpoint]` | Stored checkpoint objects. | -| `_current_size` | `int` | Total memory used by all stored checkpoints in bytes. | -| `_get_size_ext` | Callable | Cached reference to `llama_state_seq_get_size_ext`. | -| `_get_data_ext` | Callable | Cached reference to `llama_state_seq_get_data_ext`. | -| `_set_data_ext` | Callable | Cached reference to `llama_state_seq_set_data_ext`. | -| `_flag_partial` | int | Cached value of `LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY`. | -| `verbose` | `bool` | Enables debug output. | +| Name | Type | Description | +| ----------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `_ctx` | `llama_cpp_lib.llama_context_p` | Borrowed llama.cpp context pointer used for state extraction and restoration. | +| `on_device` | `bool` | Whether `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` is forwarded to llama.cpp state APIs. | +| `verbose` | `bool` | Enables debug output. | +| `max_checkpoints` | `int` | Maximum number of Python-side checkpoint entries retained. Values less than or equal to zero disable checkpointing. | +| `checkpoints` | `list[HybridCheckpoint]` | Python-side checkpoint registry. In host mode, entries own full checkpoint payloads. In device mode, entries own only host-visible metadata/payload portions. | +| `_current_size` | `int` | Python-tracked host-visible checkpoint size in bytes. 
In device mode, this does not include `llama_context`-owned device tensor storage. | +| `_get_size_ext` | Callable | Cached reference to `llama_state_seq_get_size_ext`. | +| `_get_data_ext` | Callable | Cached reference to `llama_state_seq_get_data_ext`. | +| `_set_data_ext` | Callable | Cached reference to `llama_state_seq_set_data_ext`. | +| `_flags` | `int` | Combined llama.cpp sequence-state flags, always including `LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY` and optionally `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. | --- @@ -1057,7 +1070,11 @@ def cache_size(self) -> int: return self._current_size ``` -Returns the total memory used by stored checkpoints in bytes. +Returns the Python-tracked host-visible checkpoint size in bytes. + +In host mode, this is close to the full serialized checkpoint payload size. + +In device mode, this reports only the host-visible portion returned by llama.cpp. It does not include `llama_context`-owned device tensor storage. --- @@ -1070,14 +1087,16 @@ def clear(self): ... ``` -Clears all stored checkpoints and resets `_current_size` to `0`. +Clears Python-side checkpoint metadata and resets `_current_size` to `0`. If the checkpoint list is already empty, it returns immediately. +In device mode, this does not explicitly release `llama_context`-owned device buffers. Those buffers are managed by llama.cpp and are associated with the context. + In verbose mode, it prints: ```text -HybridCheckpointCache: cleared +HybridCheckpointCache(clear): cleared ``` --- @@ -1089,15 +1108,15 @@ def close(self): ... ``` -Releases references held by the cache. +Releases Python-side checkpoint metadata and detaches cached references held by the cache. Behavior: -* Sets `checkpoints` to `None`. +* Calls `clear()`. * Sets `_ctx` to `None`. * Sets cached C API function references to `None`. -This method is also called by `__del__`. +This method does not free the llama.cpp context itself, because the context is borrowed rather than owned by the cache. 
--- @@ -1133,6 +1152,50 @@ This hash is used to ensure checkpoints are restored only when the token prefix --- +### `_replace_checkpoint_for_seq_id` + +```python +def _replace_checkpoint_for_seq_id(self, seq_id: int) -> None: + ... +``` + +Removes all Python-side checkpoint entries for one `seq_id`. + +This is required in device mode because llama.cpp stores the device tensor payload per `seq_id`, not per Python checkpoint object. Keeping multiple checkpoint metadata entries for the same `seq_id` would be unsafe. + +Behavior: + +1. Iterates over all checkpoint entries. +2. Removes entries whose `seq_id` matches the requested `seq_id`. +3. Preserves entries for other sequence ids. +4. Subtracts removed checkpoint sizes from `_current_size`. +5. Clamps `_current_size` to `0` if needed. + +--- + +### `_evict_checkpoints_if_needed` + +```python +def _evict_checkpoints_if_needed(self) -> None: + ... +``` + +Evicts old checkpoint entries using FIFO order until `len(checkpoints) <= max_checkpoints`. + +In host mode, this evicts full Python-owned checkpoint payloads. + +In device mode, this evicts Python-side checkpoint metadata only. Device tensor payloads are owned by `llama_context`. + +Behavior: + +1. Checks whether the number of checkpoints exceeds `max_checkpoints`. +2. Pops the oldest checkpoint entry from the front of the list. +3. Subtracts its size from `_current_size`. +4. Clamps `_current_size` to `0` if needed. +5. Prints an eviction message in verbose mode. + +--- + ### `find_best_checkpoint` ```python @@ -1144,20 +1207,23 @@ def find_best_checkpoint( ... ``` -Finds the longest valid checkpoint matching the given token prefix and sequence ID. +Finds the longest valid checkpoint matching the given token prefix and sequence id. + +The hash check prevents restoring a checkpoint that has the same length but belongs to a different prompt/history. Returns `None` if: * Checkpointing is disabled. * There are no checkpoints. 
-* No checkpoint matches the requested sequence ID and token prefix. +* No checkpoint matches the requested sequence id and token prefix. Behavior: -1. Skips checkpoints whose `seq_id` differs. -2. Skips checkpoints whose `pos` is greater than the current token length. -3. Verifies token-prefix integrity using `_hash_prefix`. -4. Returns the checkpoint with the largest matching `pos`. +1. Returns immediately if `max_checkpoints <= 0` or no checkpoints exist. +2. Skips checkpoints whose `seq_id` differs from the requested `seq_id`. +3. Skips checkpoints whose `pos` is greater than the current token length. +4. Verifies token-prefix integrity using `_hash_prefix`. +5. Returns the checkpoint with the largest matching `pos`. --- @@ -1173,7 +1239,7 @@ def save_checkpoint( ... ``` -Extracts the current recurrent model state from the C++ backend and stores it as a `HybridCheckpoint`. +Extracts the current Hybrid/Recurrent model state from the C++ backend and stores it as a `HybridCheckpoint`. Returns `True` if the checkpoint was saved successfully. @@ -1186,20 +1252,24 @@ Returns `False` if: ### Behavior 1. Returns immediately if `max_checkpoints <= 0`. -2. Calls `_get_size_ext` to query the required state buffer size. -3. Allocates a `ctypes.c_uint8` buffer. -4. Calls `_get_data_ext` to extract state data. -5. Copies the state bytes into a Python `bytes` object. -6. Computes a hash of the token prefix. -7. Appends a new `HybridCheckpoint`. -8. Increments `_current_size`. -9. Evicts old checkpoints using FIFO order if the number of checkpoints exceeds `max_checkpoints`. +2. In device mode, removes old Python-side checkpoint metadata for the same `seq_id`. +3. Uses `_flags` to select partial-only state serialization, optionally with `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. +4. Calls `_get_size_ext` to query the required host-visible buffer size. +5. Allocates a `ctypes.c_uint8` buffer. +6. Calls `_get_data_ext` to extract the host-visible checkpoint data. +7. 
Copies the data into a Python `bytes` object. +8. Computes a hash of the token prefix. +9. Appends a new `HybridCheckpoint`. +10. Increments `_current_size`. +11. Evicts old checkpoint entries using FIFO order if the number of entries exceeds `max_checkpoints`. ### Important Performance Note The implementation intentionally bypasses checkpoint extraction when `max_checkpoints <= 0`. -This avoids potentially large synchronous VRAM-to-RAM transfers for single-turn workflows. +This avoids potentially large synchronous checkpoint extraction costs for single-turn workflows. + +When `on_device=True`, llama.cpp may keep large tensor payloads in context-owned device buffers instead of materializing them as Python-owned bytes. This can reduce device-to-host tensor copy overhead, but only one active checkpoint per `seq_id` is safe. --- @@ -1220,18 +1290,28 @@ Returns `True` if restoration succeeds. Returns `False` if: -* The checkpoint sequence ID does not match the requested `seq_id`. +* The checkpoint sequence id does not match the requested `seq_id`. +* `on_device=True` and the checkpoint object is no longer tracked by this cache. * The current backend state size differs from the checkpoint size. * The backend does not report the expected number of restored bytes. ### Behavior 1. Verifies `cp.seq_id == seq_id`. -2. Queries current expected state size from the backend. -3. Verifies it matches `cp.size`. -4. Copies checkpoint bytes into a ctypes buffer. -5. Calls `_set_data_ext` to restore the state. -6. Returns whether the number of restored bytes equals `cp.size`. +2. In device mode, rejects stale checkpoint objects that are no longer tracked by this cache. +3. Queries current expected host-visible state size from the backend. +4. Verifies it matches `cp.size`. +5. Copies checkpoint bytes into a ctypes buffer. +6. Calls `_set_data_ext` to restore the state. +7. Returns whether the number of restored bytes equals `cp.size`. 
+ +### Stale Checkpoint Guard + +In device mode, Python does not own the full checkpoint tensor payload. The large tensor payload is stored inside `llama_context` device buffers keyed by `seq_id`. + +If a newer checkpoint is saved for the same `seq_id`, an older `HybridCheckpoint` Python object may still exist outside the cache, but its device-side tensor payload may have been overwritten. + +For this reason, `restore_checkpoint` refuses on-device checkpoint objects that are no longer tracked by the cache. This avoids restoring old Python metadata together with newer device tensors. --- @@ -1270,7 +1350,7 @@ Users should use checkpoint-specific methods instead. --- -## Example +## Example: Host-backed Checkpoints ```python from llama_cpp.llama_cache import HybridCheckpointCache @@ -1279,6 +1359,7 @@ from llama_cpp.llama_cache import HybridCheckpointCache checkpoint_cache = HybridCheckpointCache( ctx=ctx, max_checkpoints=16, + on_device=False, verbose=True, ) @@ -1299,16 +1380,57 @@ if saved: print("Restored:", restored) ``` -> Note: This example assumes `ctx` is already available from lower-level llama.cpp runtime code. Most high-level users do not manually create this cache. +Host mode stores full serialized checkpoint payloads in Python-owned `bytes`. Multiple historical checkpoints per `seq_id` are safe. + +--- + +## Example: Device-backed Checkpoints + +```python +from llama_cpp.llama_cache import HybridCheckpointCache + +# `ctx` must be a valid llama.cpp context pointer. 
+checkpoint_cache = HybridCheckpointCache( + ctx=ctx, + max_checkpoints=16, + on_device=True, + verbose=True, +) + +tokens = [1, 2, 3, 4] +current_pos = len(tokens) + +saved = checkpoint_cache.save_checkpoint( + current_pos=current_pos, + tokens=tokens, + seq_id=0, +) + +if saved: + checkpoint = checkpoint_cache.find_best_checkpoint(tokens, seq_id=0) + + if checkpoint is not None: + restored = checkpoint_cache.restore_checkpoint(checkpoint, seq_id=0) + print("Restored:", restored) +``` + +In device mode, llama.cpp owns the large tensor payload in context-owned device buffers. Python keeps only the host-visible checkpoint data and metadata. + +Only one active checkpoint per `seq_id` is safe. + +> Note: These examples assume `ctx` is already available from lower-level llama.cpp runtime code. Most high-level users do not manually create this cache. Instead, they configure it through the `Llama` constructor using `ctx_checkpoints`, `checkpoint_interval`, and `checkpoint_on_device`. --- ## Best Practices * Use `HybridCheckpointCache` only for Hybrid or recurrent model workflows that require hidden-state rollback. +* Keep `on_device=False` when you need multiple historical checkpoints for the same `seq_id`. +* Use `on_device=True` when reducing device-to-host checkpoint copy overhead is more important than keeping many historical checkpoint payloads. In this mode, Python keeps only checkpoint metadata (such as `seq_id` and `pos`) plus the host-visible payload portion; the tensor payload stays in `llama_context`-owned device buffers. * Set `max_checkpoints=0` for single-turn workflows where rollback is not needed. * Keep `max_checkpoints` small if checkpoint states are large. * Use `find_best_checkpoint` before calling `restore_checkpoint`. +* Do not hold and restore old on-device `HybridCheckpoint` objects after newer checkpoints have been saved for the same `seq_id`. * Do not use dictionary-style cache access with this class. --- @@ -1319,7 +1441,10 @@ if saved: * `max_checkpoints <= 0` disables checkpointing. * Restoring a checkpoint with the wrong `seq_id` fails.
* Restore fails if the current backend state size no longer matches the checkpoint size. -* `close()` sets internal references to `None`; the object should not be reused afterward. +* In device mode, old `HybridCheckpoint` objects can become stale after a newer checkpoint is saved for the same `seq_id`. +* In device mode, `cache_size` does not include `llama_context`-owned device tensor storage. +* `clear()` removes Python-side checkpoint metadata but does not explicitly free llama.cpp-owned device buffers. +* `close()` detaches internal references; the object should not be reused afterward. * This class is not equivalent to `LlamaCache`. --- From 156226b00ebb6482cd83e26c917ff66aec65d104 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 12 May 2026 03:46:26 +0800 Subject: [PATCH 389/518] Update Submodule vendor/llama.cpp bbeb89d..a9883db --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index bbeb89d76c..a9883db8ee 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit bbeb89d76c41bc250f16e4a6fefcc9b530d6e3f3 +Subproject commit a9883db8ee021cf16783016a60996d41820b5195 From 89e90a74ec4823efb53baadec20197a6de08db2b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 13 May 2026 08:15:03 +0800 Subject: [PATCH 390/518] Sync llama.cpp API 20260513 Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 2 ++ llama_cpp/mtmd_cpp.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 1efd645150..cc900c0648 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2763,6 +2763,8 @@ def llama_state_seq_load_file( ) -> int: ... 
+# define LLAMA_STATE_SEQ_FLAGS_NONE 0 +LLAMA_STATE_SEQ_FLAGS_NONE = 0 # // for backwards-compat LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1 diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 574d90e2bf..839c718ccd 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -723,6 +723,34 @@ def mtmd_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # type: ... +# // EXPERIMENTAL API to get mmproj's capabilities without initializing the full context +# // This is only intended to be used by llama-server, breaking changes is expected +# struct mtmd_caps { +# bool inp_vision; +# bool inp_audio; +# }; +class mtmd_caps(Structure): + _fields_ = [ + ("inp_vision", c_bool), + ("inp_audio", c_bool), + ] + + if TYPE_CHECKING: + inp_vision: c_bool + inp_audio: c_bool + + +# MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); +@ctypes_function_mtmd( + "mtmd_get_cap_from_file", [c_char_p], mtmd_caps) +def mtmd_get_cap_from_file(mmproj_fname: c_char_p) -> mtmd_caps: + """ + EXPERIMENTAL API to get mmproj's capabilities without initializing the full context. + This is only intended to be used by llama-server; breaking changes are expected. + """ + ... 
+ + # // test function, to be used in test-mtmd-c-api.c # MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); @ctypes_function_mtmd( From e67169dfd59a67ec500c38b42d0cfc41475f1051 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 14 May 2026 00:40:40 +0800 Subject: [PATCH 391/518] Implement `MiniCPMV46ChatHandler` for `MiniCPM-V-4.6` Signed-off-by: JamePeng --- README.md | 1 + llama_cpp/llama_chat_format.py | 204 +++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) diff --git a/README.md b/README.md index c9aba7d42d..cd03d217d7 100644 --- a/README.md +++ b/README.md @@ -835,6 +835,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` | | [minicpm-v-4.5](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) | `MiniCPMv45ChatHandler` | `minicpm-v-4.5` | +| [minicpm-v-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6-gguf) | `MiniCPMV46ChatHandler` | `minicpm-v-4.6` | | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` | | [gemma4](https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` | | [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` | diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a0d8d25db4..2ab627c89b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4280,6 +4280,210 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class MiniCPMV46ChatHandler(MTMDChatHandler): + """ + Handler for MiniCPM-V-4.6 models. + + Features: + - Aligned with official tokenizer_config.json special tokens. 
+ - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens. + - Integrated MTMD-style URL and Base64 injection for visual content. + - Specialized `` and `` block generation. + - Autonomously folds previous reasoning paths using `last_query_index`. + - Toggles `` block generation via `enable_thinking` (Defaults to False). + """ + + # Core tokens + MINICPM_BOS_TOKEN = "<|im_start|>" + MINICPM_EOS_TOKEN = "<|im_end|>" + MINICPM_PAD_TOKEN = "<|endoftext|>" + + # Vision tokens + MINICPM_VISION_BOS_TOKEN = "<|vision_start|>" + MINICPM_VISION_EOS_TOKEN = "<|vision_end|>" + MINICPM_IMAGE_TOKEN = "<|image_pad|>" + MINICPM_VIDEO_TOKEN = "<|video_pad|>" + + CHAT_FORMAT = ( + "{%- if enable_thinking is not defined -%}\n" + " {%- set enable_thinking = false -%}\n" + "{%- endif -%}\n" + "{%- macro render_content(content, is_system_content=false) -%}\n" + " {%- if content is string -%}\n" + " {{- content -}}\n" + " {%- elif content is iterable and content is not mapping -%}\n" + " {%- set ns = namespace(parts=[]) -%}\n" + " {%- for item in content -%}\n" + " {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n" + " {%- if is_system_content -%}\n" + " {{- raise_exception('System message cannot contain images.') -}}\n" + " {%- endif -%}\n" + " {%- set url_val = '' -%}\n" + " {%- if item.type == 'image_url' -%}\n" + " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" + " {%- endif -%}\n" + " {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n" + # " {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n" + # " {%- if is_system_content -%}\n" + # " {{- raise_exception('System message cannot contain videos.') -}}\n" + # " {%- endif -%}\n" + # " {%- set url_val = '' -%}\n" + # " {%- if item.type == 'video_url' -%}\n" + # " {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n" + # " {%- endif -%}\n" + # " {%- set ns.parts = ns.parts + 
['<|video_pad|>' + url_val] -%}\n" + " {%- elif 'text' in item -%}\n" + " {%- set ns.parts = ns.parts + [item.text] -%}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected item type in content.') -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- ns.parts | join('\\n') -}}\n" + " {%- elif content is none or content is undefined -%}\n" + " {{- '' -}}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected content type.') -}}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "{%- if not messages %}\n" + " {{- raise_exception('No messages provided.') }}\n" + "{%- endif %}\n" + "{%- if tools and tools is iterable and tools is not mapping %}\n" + " {{- '<|im_start|>system\\n' }}\n" + " {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n' }}\n" + " {%- for tool in tools %}\n" + " {{- '\\n' }}\n" + " {{- tool | tojson }}\n" + " {%- endfor %}\n" + " {{- '\\n' }}\n" + " {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n\\n\\n\\nvalue_1\\n\\n\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n\\n\\n\\n\\n\\nReminder:\\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n' }}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {%- if content %}\n" + " {{- '\\n\\n' + content }}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {{- '<|im_end|>\\n' }}\n" + "{%- else %}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {{- '<|im_start|>system\\n' + content + 
'<|im_end|>\\n' }}\n" + " {%- endif %}\n" + "{%- endif %}\n" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n" + "{%- for message in messages[::-1] %}\n" + " {%- set index = (messages|length - 1) - loop.index0 %}\n" + " {%- if ns.multi_step_tool and message.role == 'user' %}\n" + " {%- set content = render_content(message.content)|trim %}\n" + " {%- if not(content.startswith('') and content.endswith('')) %}\n" + " {%- set ns.multi_step_tool = false %}\n" + " {%- set ns.last_query_index = index %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + "{%- endfor %}\n" + "{%- if ns.multi_step_tool %}\n" + " {{- raise_exception('No user query found in messages.') }}\n" + "{%- endif %}\n" + "{%- for message in messages %}\n" + " {%- set content = render_content(message.content)|trim %}\n" + " {%- if message.role == 'system' %}\n" + " {%- if not loop.first %}\n" + " {{- raise_exception('System message must be at the beginning.') }}\n" + " {%- endif %}\n" + " {%- elif message.role == 'user' %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n" + " {%- elif message.role == 'assistant' %}\n" + " {%- set reasoning_content = '' %}\n" + " {%- if message.reasoning_content is string %}\n" + " {%- set reasoning_content = message.reasoning_content %}\n" + " {%- else %}\n" + " {%- if '' in content %}\n" + " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n" + " {%- set content = content.split('')[-1].lstrip('\\n') %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {%- set reasoning_content = reasoning_content|trim %}\n" + " {%- if loop.index0 > ns.last_query_index %}\n" + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n\\n' + content }}\n" + " {%- else %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content }}\n" + " {%- endif %}\n" + " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping 
%}\n" + " {%- for tool_call in message.tool_calls %}\n" + " {%- if tool_call.function is defined %}\n" + " {%- set tool_call = tool_call.function %}\n" + " {%- endif %}\n" + " {%- if loop.first %}\n" + " {%- if content|trim %}\n" + " {{- '\\n\\n\\n\\n' }}\n" + " {%- else %}\n" + " {{- '\\n\\n' }}\n" + " {%- endif %}\n" + " {%- else %}\n" + " {{- '\\n\\n\\n' }}\n" + " {%- endif %}\n" + " {%- if tool_call.arguments is defined %}\n" + " {%- for args_name, args_value in tool_call.arguments|items %}\n" + " {{- '\\n' }}\n" + " {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n" + " {{- args_value }}\n" + " {{- '\\n\\n' }}\n" + " {%- endfor %}\n" + " {%- endif %}\n" + " {{- '\\n' }}\n" + " {%- endfor %}\n" + " {%- endif %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- elif message.role == 'tool' %}\n" + " {%- if loop.previtem and loop.previtem.role != 'tool' %}\n" + " {{- '<|im_start|>user' }}\n" + " {%- endif %}\n" + " {{- '\\n\\n' }}\n" + " {{- content }}\n" + " {{- '\\n' }}\n" + " {%- if not loop.last and loop.nextitem.role != 'tool' %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- elif loop.last %}\n" + " {{- '<|im_end|>\\n' }}\n" + " {%- endif %}\n" + " {%- else %}\n" + " {{- raise_exception('Unexpected message role.') }}\n" + " {%- endif %}\n" + "{%- endfor %}\n" + "{%- if add_generation_prompt %}\n" + " {{- '<|im_start|>assistant\\n' }}\n" + " {%- if enable_thinking is defined and enable_thinking is false %}\n" + " {{- '\\n\\n\\n\\n' }}\n" + " {%- else %}\n" + " {{- '\\n' }}\n" + " {%- endif %}\n" + "{%- endif %}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the MiniCPM-V-4.6 Handler. + + Args: + enable_thinking (bool): Controls whether to open a `` block for reasoning. + Defaults to False as per the standard template logic. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Inject the thinking variable into the Jinja environment + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # MiniCPM uses standard <|im_end|> ChatML stop formatting + kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) + + class Gemma3ChatHandler(MTMDChatHandler): GEMMA3_BOI_TOKEN = "" From 99543936f58145314cde6c7cfbf88ab119a664b5 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 14 May 2026 07:17:58 +0800 Subject: [PATCH 392/518] fix(MTMDChatHandler): correct audio_url content type check and improve variable handling - Changed condition from `content == "audio_url"` to `content_type == "audio_url"` for proper type-based dispatching. - Extracted `audio_url` variable for better readability. - Converted `else` to `elif content_type == "input_audio"` to make the control flow explicit and safer. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 2ab627c89b..1c41beb40f 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2997,11 +2997,12 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") # Case A: Handle custom/forward-compatible audio_url format - if content == "audio_url": - url = content["audio_url"] if isinstance(content["audio_url"], str) else content["audio_url"]["url"] + if content_type == "audio_url": + audio_url = content["audio_url"] + url = audio_url if isinstance(audio_url, str) else audio_url["url"] media_items.append({"url": url, "type": "audio"}) # Case B: Handle OpenAI standard input_audio format - else: + elif content_type == "input_audio": input_audio = content.get("input_audio", {}) if isinstance(input_audio, dict) and "data" in input_audio: # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic From 0295d0c62fd5173e0704d33a8ced4d2c8e590d9c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 15 May 2026 04:21:06 +0800 Subject: [PATCH 393/518] Update Submodule vendor/llama.cpp a9883db..834a243 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a9883db8ee..834a243664 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a9883db8ee021cf16783016a60996d41820b5195 +Subproject commit 834a243664114487f99520370a7a7b00fc7a486f From 1fb6a6665726e6abce52959bc38f162e6a0cb2dc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 15 May 2026 07:22:11 +0800 Subject: [PATCH 394/518] feat(logger): refactor and enhance ggml logging configuration system - Introduce a `LoggerConfig` dataclass to provide 
fine-grained control over native ggml/llama.cpp runtime logging. - Align verbosity levels (0 to 5) with upstream `llama.cpp` conventions (`common/log.h`). - Implement a dynamic, configurable substring filtering system, replacing the hardcoded "CUDA Graph" patch with `DEFAULT_LOG_FILTERS`. - Add comprehensive public APIs for log management: `configure_logging`, `set_verbosity`, `set_quiet`, `set_silent`, `set_log_filters`, and `add_log_filters`. - Maintain backwards compatibility for the existing `set_verbose(bool)` function. - Improve the `ggml_log_callback` to correctly handle `GGML_LOG_LEVEL_CONT` by inheriting the verbosity of the preceding log message. - Route `GGML_LOG_LEVEL_NONE` to `stdout` and all other diagnostic logs to `stderr` by default. Signed-off-by: JamePeng --- llama_cpp/_logger.py | 406 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 383 insertions(+), 23 deletions(-) diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 015cec9faa..7669e2a722 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -1,6 +1,9 @@ import sys import ctypes import logging +from dataclasses import dataclass, field +from typing import Iterable, Optional, TextIO, Union + import llama_cpp._ggml as _ggml import llama_cpp.llama_cpp as llama_cpp_lib @@ -12,42 +15,399 @@ # GGML_LOG_LEVEL_DEBUG = 4, # GGML_LOG_LEVEL_CONT = 5, // continue previous log # }; -GGML_LOG_LEVEL_TO_LOGGING_LEVEL = { - 0: logging.CRITICAL, - 1: logging.INFO, - 2: logging.WARNING, - 3: logging.ERROR, - 4: logging.DEBUG, - 5: logging.DEBUG, +GGML_LOG_LEVEL_NONE = 0 +GGML_LOG_LEVEL_INFO = 1 +GGML_LOG_LEVEL_WARN = 2 +GGML_LOG_LEVEL_ERROR = 3 +GGML_LOG_LEVEL_DEBUG = 4 +GGML_LOG_LEVEL_CONT = 5 + +# common/log.h model: +# +# LOG_LEVEL_OUTPUT = 0 +# LOG_LEVEL_ERROR = 1 +# LOG_LEVEL_WARN = 2 +# LOG_LEVEL_INFO = 3 +# LOG_LEVEL_TRACE = 4 +# LOG_LEVEL_DEBUG = 5 +# +# Rule: +# +# event_verbosity <= verbosity_threshold => print +# +# Larger threshold means more verbose output. 
+# +LOG_LEVEL_OUTPUT = 0 +LOG_LEVEL_ERROR = 1 +LOG_LEVEL_WARN = 2 +LOG_LEVEL_INFO = 3 +LOG_LEVEL_TRACE = 4 +LOG_LEVEL_DEBUG = 5 + +LOG_DEFAULT_LLAMA = LOG_LEVEL_INFO +LOG_DEFAULT_DEBUG = LOG_LEVEL_DEBUG + +# Match the updated common_log_default_callback behavior: +# INFO -> TRACE +# CONT -> TRACE +# +# This is slightly more conservative for verbosity=3: +# if the backend emits INFO through ggml_log_callback, Python will hide it unless +# verbosity >= 4. This mirrors the current upstream default callback behavior. +GGML_LEVEL_TO_VERBOSITY = { + GGML_LOG_LEVEL_NONE: LOG_LEVEL_OUTPUT, + GGML_LOG_LEVEL_ERROR: LOG_LEVEL_ERROR, + GGML_LOG_LEVEL_WARN: LOG_LEVEL_WARN, + GGML_LOG_LEVEL_INFO: LOG_LEVEL_TRACE, + GGML_LOG_LEVEL_DEBUG: LOG_LEVEL_DEBUG, + GGML_LOG_LEVEL_CONT: LOG_LEVEL_TRACE, # fallback only; CONT inherits previous +} + +GGML_LEVEL_TO_PYTHON_LEVEL = { + GGML_LOG_LEVEL_NONE: logging.INFO, + GGML_LOG_LEVEL_ERROR: logging.ERROR, + GGML_LOG_LEVEL_WARN: logging.WARNING, + GGML_LOG_LEVEL_INFO: logging.INFO, + GGML_LOG_LEVEL_DEBUG: logging.DEBUG, + GGML_LOG_LEVEL_CONT: logging.INFO, # fallback only; CONT inherits previous } + +# Default substring filters. +# +# These are intentionally simple substring filters instead of hard-coded +# special branches. Users can replace or clear them with set_log_filters(). +DEFAULT_LOG_FILTERS = [ + "CUDA Graph", + "CUDA graph" +] + + +VerbosityLike = Union[bool, int, str, None] + logger = logging.getLogger("llama-cpp-python") -_last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0] -# typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); +@dataclass +class LoggerConfig: + # 0=output, 1=error, 2=warn, 3=info, 4=trace, 5=debug + verbosity: int = LOG_DEFAULT_LLAMA + + show_output: bool = True + + stdout: TextIO = sys.stdout + stderr: TextIO = sys.stderr + + # If any substring is contained in a log message, the message is dropped. 
+ log_filters: list[str] = field(default_factory=lambda: list(DEFAULT_LOG_FILTERS)) + log_filters_case_sensitive: bool = True + + +_config = LoggerConfig() +_last_verbosity = LOG_LEVEL_INFO + + +def _normalize_verbosity( + value: VerbosityLike, + *, + default: int = LOG_DEFAULT_LLAMA, +) -> int: + """ + Convert user input to llama.cpp-style verbosity 0..5. + + Compatibility: + verbose=False -> ERROR (1) + verbose=True -> DEBUG (5) + + Numeric levels: + 0 = output + 1 = error + 2 = warn + 3 = info + 4 = trace + 5 = debug + """ + if value is None: + return default + + if isinstance(value, bool): + return LOG_LEVEL_DEBUG if value else LOG_LEVEL_ERROR + + if isinstance(value, int): + return max(LOG_LEVEL_OUTPUT, min(LOG_LEVEL_DEBUG, value)) + + if isinstance(value, str): + key = value.strip().lower() + aliases = { + "0": LOG_LEVEL_OUTPUT, + "output": LOG_LEVEL_OUTPUT, + "none": LOG_LEVEL_OUTPUT, + + "1": LOG_LEVEL_ERROR, + "error": LOG_LEVEL_ERROR, + "err": LOG_LEVEL_ERROR, + "silent": LOG_LEVEL_ERROR, + + "2": LOG_LEVEL_WARN, + "warn": LOG_LEVEL_WARN, + "warning": LOG_LEVEL_WARN, + "quiet": LOG_LEVEL_WARN, + + "3": LOG_LEVEL_INFO, + "info": LOG_LEVEL_INFO, + "default": LOG_DEFAULT_LLAMA, + "normal": LOG_DEFAULT_LLAMA, + + "4": LOG_LEVEL_TRACE, + "trace": LOG_LEVEL_TRACE, + "trc": LOG_LEVEL_TRACE, + + "5": LOG_LEVEL_DEBUG, + "debug": LOG_LEVEL_DEBUG, + "verbose": LOG_LEVEL_DEBUG, + } + + if key in aliases: + return aliases[key] + + try: + parsed = int(key) + except ValueError as exc: + raise ValueError( + "_logger._normalize_verbosity: " + "verbosity must be one of 0..5, bool, None, or " + "'silent'/'quiet'/'info'/'trace'/'debug'" + ) from exc + + return max(LOG_LEVEL_OUTPUT, min(LOG_LEVEL_DEBUG, parsed)) + + raise TypeError(f"_logger._normalize_verbosity: unsupported verbosity type: {type(value)!r}") + + +def _verbosity_to_python_level(verbosity: int) -> int: + if verbosity >= LOG_LEVEL_DEBUG: + return logging.DEBUG + if verbosity >= LOG_LEVEL_INFO: + return 
logging.INFO + if verbosity >= LOG_LEVEL_WARN: + return logging.WARNING + return logging.ERROR + + +def _get_verbosity(level: int) -> int: + """ + Map ggml log level to Python-side verbosity. + + GGML_LOG_LEVEL_INFO maps to LOG_LEVEL_INFO so that verbosity=3 remains + useful as the default info level. + """ + if level == GGML_LOG_LEVEL_NONE: + return LOG_LEVEL_OUTPUT + if level == GGML_LOG_LEVEL_ERROR: + return LOG_LEVEL_ERROR + if level == GGML_LOG_LEVEL_WARN: + return LOG_LEVEL_WARN + if level == GGML_LOG_LEVEL_INFO: + return LOG_LEVEL_INFO + if level == GGML_LOG_LEVEL_DEBUG: + return LOG_LEVEL_DEBUG + if level == GGML_LOG_LEVEL_CONT: + return LOG_LEVEL_INFO + return LOG_LEVEL_DEBUG + + +def _decode_log_text(text: bytes) -> str: + return text.decode("utf-8", errors="replace") + + +def _matches_log_filter(msg: str) -> bool: + filters = _config.log_filters + if not filters: + return False + + if _config.log_filters_case_sensitive: + return any(item and item in msg for item in filters) + + msg_lower = msg.lower() + return any(item and item.lower() in msg_lower for item in filters) + + +def _should_drop(level: int, verbosity: int, msg: str) -> bool: + if verbosity > _config.verbosity: + return True + + if level == GGML_LOG_LEVEL_NONE and not _config.show_output: + return True + + if _matches_log_filter(msg): + return True + + return False + + @_ggml.ggml_log_callback def ggml_log_callback( level: int, text: bytes, user_data: ctypes.c_void_p, ): - # Note(JamePeng): A temporary patch is used to filter out garbage debug information - # output from the underlying C++ `CUDA Graph id %zu reused`. - # The logger is planned to be refactored to meet control requirements. 
- if text: - if b"CUDA Graph" in text or b"CUDA graph" in text: - return - # TODO: Correctly implement continue previous log - global _last_log_level - log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level - if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]: - print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) - _last_log_level = log_level + global _last_verbosity + + msg = _decode_log_text(text) + + if level == GGML_LOG_LEVEL_CONT: + verbosity = _last_verbosity + else: + verbosity = _get_verbosity(level) + _last_verbosity = verbosity + if _should_drop(level, verbosity, msg): + return -llama_cpp_lib.llama_log_set(ggml_log_callback, ctypes.c_void_p(0)) + out = _config.stdout if level == GGML_LOG_LEVEL_NONE else _config.stderr + print(msg, end="", flush=True, file=out) + + +# Keep a global reference to avoid ctypes callback being garbage-collected. +_ggml_log_callback_ref = ggml_log_callback + +llama_cpp_lib.llama_log_set(_ggml_log_callback_ref, ctypes.c_void_p(0)) + + +def configure_logging( + *, + verbosity: VerbosityLike = None, + verbose: Optional[bool] = None, + quiet: Optional[bool] = None, + silent: Optional[bool] = None, + show_output: Optional[bool] = None, + log_filters: Optional[Iterable[str]] = None, + append_log_filters: Optional[Iterable[str]] = None, + log_filters_case_sensitive: Optional[bool] = None, +): + """ + Configure native ggml/llama.cpp runtime logging. 
+ + Priority: + silent > quiet > verbosity > verbose > current config + + Compatibility: + verbose=False -> ERROR + verbose=True -> DEBUG + + Numeric levels: + 0 = output + 1 = error + 2 = warn + 3 = info + 4 = trace + 5 = debug + """ + if silent is True: + v = LOG_LEVEL_ERROR + elif quiet is True: + v = LOG_LEVEL_WARN + elif verbosity is not None: + v = _normalize_verbosity(verbosity) + elif verbose is not None: + v = _normalize_verbosity(verbose) + else: + v = _config.verbosity + + _config.verbosity = v + logger.setLevel(_verbosity_to_python_level(v)) + + if show_output is not None: + _config.show_output = show_output + + if log_filters is not None: + _config.log_filters = [s for s in log_filters if s] + + if append_log_filters is not None: + _config.log_filters.extend(s for s in append_log_filters if s) + + if log_filters_case_sensitive is not None: + _config.log_filters_case_sensitive = log_filters_case_sensitive def set_verbose(verbose: bool): - logger.setLevel(logging.DEBUG if verbose else logging.ERROR) + """ + Backward-compatible bool API. + + False -> ERROR + True -> DEBUG + """ + configure_logging(verbose=verbose) + + +def set_verbosity(verbosity: VerbosityLike): + configure_logging(verbosity=verbosity) + + +def get_verbosity() -> int: + return _config.verbosity + + +def set_quiet(quiet: bool = True): + configure_logging(quiet=quiet) + + +def set_silent(silent: bool = True): + configure_logging(silent=silent) + + +def set_log_filters( + filters: Iterable[str], + *, + case_sensitive: bool = True, +): + """ + Replace all substring log filters. + + Example: + set_log_filters(["CUDA Graph id", "clip_model_loader: tensor"]) + """ + configure_logging( + log_filters=filters, + log_filters_case_sensitive=case_sensitive, + ) + + +def get_log_filters() -> list[str]: + return list(_config.log_filters) + + +def add_log_filters(filters: Iterable[str]): + """ + Append substring log filters. 
+ """ + configure_logging(append_log_filters=filters) + + +def clear_log_filters(): + """ + Clear all substring log filters, including default filters. + """ + _config.log_filters.clear() + + +def reset_log_filters(): + """ + Restore default substring log filters. + """ + _config.log_filters = list(DEFAULT_LOG_FILTERS) + + +def get_log_filters_case_sensitive() -> bool: + return _config.log_filters_case_sensitive + + +def reset_logging(): + """ + Reset logging to default llama.cpp-style INFO verbosity and default filters. + """ + _config.verbosity = LOG_DEFAULT_LLAMA + _config.show_output = True + _config.log_filters = list(DEFAULT_LOG_FILTERS) + _config.log_filters_case_sensitive = True + logger.setLevel(_verbosity_to_python_level(_config.verbosity)) From f64320de4fedcad1f28cb46526803ff68784c546 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 15 May 2026 07:30:34 +0800 Subject: [PATCH 395/518] feat(core): integrate fine-grained logging API into Llama class This commit exposes the newly refactored `_logger` configuration system directly through the `Llama` class, providing users with robust, programmatic control over native `llama.cpp` backend logs. Key changes: - Expand `Llama.__init__` with `verbosity`, `log_filters`, and `log_filters_case_sensitive` parameters. - Add instance methods for runtime log management (`set_verbosity`, `get_verbosity`, `set_log_filters`, `add_log_filters`, `clear_log_filters`, etc.). - Add comprehensive docstrings explaining the 0-5 verbosity scale and explicitly noting the process-global nature of the native backend logger. Advantages over the legacy implementation: - Granular Control: Replaces the restrictive binary `verbose=True/False` flag (which only toggled between ERROR and DEBUG) with a granular 6-tier scale (output, error, warn, info, trace, debug). - Dynamic Filtering: Empowers users to actively suppress specific noisy C++ logs using custom substring filters, removing the need for hardcoded internal patches. 
- Better Discoverability: Attaches logging controls directly to the `Llama` object, making log management much more accessible and intuitive without requiring users to import internal logger modules. Signed-off-by: JamePeng --- llama_cpp/llama.py | 112 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index e50e3b9a3b..19ede6bcfd 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -58,7 +58,16 @@ from ._ggml import ( ggml_backend_cpu_buffer_type, ) -from ._logger import set_verbose +from ._logger import ( + configure_logging, + get_verbosity, + set_verbosity, + get_log_filters, + set_log_filters, + add_log_filters, + clear_log_filters, + reset_log_filters, +) from ._utils import suppress_stdout_stderr @@ -150,7 +159,11 @@ def __init__( type_v: Optional[int] = None, # Misc spm_infill: bool = False, + # Log verbose: bool = True, + verbosity: Optional[Union[int, str, bool]] = None, + log_filters: Optional[Sequence[str]] = None, + log_filters_case_sensitive: bool = True, # Extra Params **kwargs, # type: ignore ): @@ -235,11 +248,31 @@ def __init__( chat_handler: Optional chat handler to use when calling create_chat_completion. draft_model: Optional draft model to use for speculative decoding. tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. - verbose: Print verbose output to stderr. type_k: KV cache data type for K (default: f16) type_v: KV cache data type for V (default: f16) spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. - + verbose: Backward-compatible boolean switch for native llama.cpp / ggml runtime logs. + False keeps only error-level native logs; True enables debug-level native logs. + If `verbosity` is provided, `verbosity` takes precedence over `verbose`. + verbosity: Fine-grained llama.cpp-style native runtime log verbosity. 
+ Accepts 0-5, bool, or string aliases. + Numeric levels: + 0 = output only + 1 = error + 2 = warning + 3 = info + 4 = trace + 5 = debug + Use `verbosity=3` for llama.cpp-style default info logs. + `verbose=False` remains equivalent to error-only logging, while + `verbose=True` remains equivalent to debug logging. + log_filters: Optional substring filters for native runtime logs. + If any provided substring appears in a decoded backend log message, + that message is suppressed. By default, the logger may include built-in + filters for noisy low-level logs such as CUDA Graph reuse spam messages. + Pass an empty list to disable all substring filtering for this instance. + log_filters_case_sensitive: Whether `log_filters` should match case-sensitively. + Defaults to True for predictable low-level backend log filtering. Raises: ValueError: If the model path does not exist. @@ -247,9 +280,15 @@ def __init__( A Llama instance. """ self.verbose = verbose + self.verbosity = verbosity self._stack = contextlib.ExitStack() - set_verbose(verbose) + configure_logging( + verbose=verbose, + verbosity=verbosity, + log_filters=log_filters, + log_filters_case_sensitive=log_filters_case_sensitive, + ) if not Llama.__backend_initialized: with suppress_stdout_stderr(disable=verbose): @@ -795,6 +834,71 @@ def eval_logits(self) -> Deque[List[float]]: maxlen=self._n_ctx if self._logits_all else 1, ) + # Logger API + + def set_verbosity(self, verbosity: Union[int, str, bool, None]) -> None: + """Set native llama.cpp / ggml runtime log verbosity for this process. + + Levels: + 0 = output only + 1 = error + 2 = warning + 3 = info + 4 = trace + 5 = debug + + Note: + Native backend logging is process-global because llama.cpp / ggml use + a global log callback. Changing this affects all Llama instances in + the current Python process. 
+ """ + set_verbosity(verbosity) + self.verbosity = get_verbosity() + self.verbose = self.verbosity >= 5 + + + def get_verbosity(self) -> int: + """Return the current native runtime log verbosity.""" + return get_verbosity() + + + def set_log_filters( + self, + filters: Sequence[str], + *, + case_sensitive: bool = True, + ) -> None: + """Replace substring filters for native runtime logs. + + Any backend log message containing one of these substrings will be + suppressed. Pass an empty list to disable all substring filtering. + + Note: + Native backend logging is process-global, so this affects all Llama + instances in the current Python process. + """ + set_log_filters(filters, case_sensitive=case_sensitive) + + + def add_log_filters(self, filters: Sequence[str]) -> None: + """Append substring filters for native runtime logs.""" + add_log_filters(filters) + + + def get_log_filters(self) -> List[str]: + """Return the current substring filters for native runtime logs.""" + return get_log_filters() + + + def clear_log_filters(self) -> None: + """Clear all substring filters, including default filters.""" + clear_log_filters() + + + def reset_log_filters(self) -> None: + """Restore default substring filters for native runtime logs.""" + reset_log_filters() + # LoRA / Adapter Management API def load_lora(self, name: str, path: str): From d89aa5a7d7ea3629c8767b332d692e9f3b9a9e5f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 15 May 2026 07:43:27 +0800 Subject: [PATCH 396/518] docs(wiki): document runtime verbosity and log filters for Llama Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 106 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index 0354d86150..a061861ece 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -4,7 +4,7 @@ title: Llama Class module_name: llama_cpp.llama source_file: llama_cpp/llama.py class_name: Llama -last_updated: 
2026-05-06 +last_updated: 2026-05-15 version_target: "latest" --- ``` @@ -55,6 +55,15 @@ Initialize the model and context. Note that model loading will immediately alloc | `checkpoint_interval` | `int` | `4096` | Token interval for saving periodic Hybrid/Recurrent checkpoints during long prompt evaluation. | | `checkpoint_on_device` | `bool` | `False` | Store Hybrid/Recurrent checkpoint tensor payloads in `llama_context`-owned device buffers via `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE`. Reduces device-to-host copy overhead, but only one active checkpoint per `seq_id` is safe. | +### Runtime Logging Parameters + +| Parameter | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `verbose` | `bool` | `True` | Backward-compatible boolean native logging switch. `False` keeps only error-level llama.cpp / ggml logs; `True` enables debug-level native logs. If `verbosity` is provided, `verbosity` takes precedence over `verbose`. | +| `verbosity` | `Optional[Union[int, str, bool]]` | `None` | Fine-grained llama.cpp-style native runtime log verbosity. Numeric levels: `0=output`, `1=error`, `2=warning`, `3=info`, `4=trace`, `5=debug`. Use `verbosity=3` for llama.cpp-style default info logs. String aliases such as `"silent"`, `"quiet"`, `"info"`, `"trace"`, and `"debug"` are also accepted. | +| `log_filters` | `Optional[Sequence[str]]` | `None` | Optional substring filters for native runtime logs. If any provided substring appears in a decoded backend log message, that message is suppressed. The default logger may include built-in filters for noisy low-level logs such as `CUDA Graph id %d reuse` messages. Pass an empty list `[]` to disable default substring filtering. | +| `log_filters_case_sensitive` | `bool` | `True` | Whether `log_filters` should match case-sensitively. Defaults to `True` for predictable low-level backend log filtering. | + *(Note: There are numerous additional RoPE/YaRN scaling parameters available for specialized context extension. 
Refer to the source code for the full list).* --- @@ -112,6 +121,42 @@ model.eval(tokens=[1, 453, 234, 987], active_loras=[{"name": "coding_adapter", " Immediately halts an active generation loop safely. * **Usage**: Typically called from a separate monitoring thread (like a timer). When triggered, the running stream will exit and the final chunk will contain `"finish_reason": "abort"`. +### Runtime Logging Control + +The `Llama` class exposes lightweight runtime helpers for adjusting native llama.cpp / ggml logging after initialization. + +> **Note:** Native backend logging is process-global because llama.cpp / ggml use a global log callback. Changing verbosity or log filters affects all `Llama` instances in the current Python process. + +* `set_verbosity(verbosity: Union[int, str, bool, None])`: Set native runtime log verbosity. +* `get_verbosity() -> int`: Return the current native runtime log verbosity. +* `set_log_filters(filters: Sequence[str], case_sensitive: bool = True)`: Replace substring filters for native runtime logs. +* `add_log_filters(filters: Sequence[str])`: Append substring filters. +* `get_log_filters() -> List[str]`: Return the current substring filters. +* `clear_log_filters()`: Clear all substring filters, including default filters. +* `reset_log_filters()`: Restore default substring filters. + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="models/qwen3.gguf", + verbosity=3, # llama.cpp-style info logs +) + +# Temporarily enable debug-level native logs. +llm.set_verbosity(5) + +# Suppress noisy backend messages by substring. +llm.add_log_filters([ + "CUDA Graph", + "CUDA graph", + "clip_model_loader: tensor", +]) + +# Return to quiet error-only logging. +llm.set_verbosity(1) +``` + ### Dynamic LoRA Management The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dynamically per-generation or per-eval. * `load_lora(name: str, path: str)`: Loads an adapter into VRAM (does not apply it yet). 
@@ -185,7 +230,7 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn llm.create_completion("Once upon a time", active_loras=[{"name": "story", "scale": 0.9}]) # Use sql adapter - llm.create_completion("SELECT *", active_loras=[{"name": "sql_expert", "scale": 0.8}])v + llm.create_completion("SELECT *", active_loras=[{"name": "sql_expert", "scale": 0.8}]) ``` 5. **Hybrid & Recurrent Architectures**: @@ -321,6 +366,63 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn run_controlled_generation("Explain quantum mechanics in a way that relates to bugs in code.", timeout_seconds=8) ``` +8. **Runtime Logging & Backend Noise Filtering**: + + `Llama` supports fine-grained native llama.cpp / ggml logging through `verbosity`. This is more precise than the legacy `verbose` boolean flag. + + ```python + from llama_cpp import Llama + + # Legacy behavior: + # verbose=False -> error-only logs + llm_quiet = Llama( + model_path="models/qwen3.gguf", + verbose=False, + ) + + # Recommended precise logging: + # 0 = output, 1 = error, 2 = warning, 3 = info, 4 = trace, 5 = debug + llm = Llama( + model_path="models/qwen3.gguf", + verbosity=3, # llama.cpp-style default info logs + ) + ``` + + For low-level debugging, use `verbosity=5`. By default, the logger may suppress known noisy backend messages such as CUDA Graph reuse logs. Pass `log_filters=[]` to disable all substring filtering. 
+ + ```python + llm = Llama( + model_path="models/qwen3.gguf", + verbosity=5, + log_filters=[], # show all debug logs, including normally filtered ones + ) + ``` + + To suppress additional noisy messages, pass substring filters: + + ```python + llm = Llama( + model_path="models/qwen3.gguf", + verbosity=5, + log_filters=[ + "CUDA Graph id", + "clip_model_loader: tensor", + "ggml_cuda_graph_update_required", + ], + ) + ``` + + You can also adjust logging at runtime: + + ```python + llm.set_verbosity(5) + llm.add_log_filters(["llama_perf_context_print"]) + + # Later, return to warning-level logs. + llm.set_verbosity(2) + ``` + + **Important:** native backend logging is process-global. Runtime changes affect all `Llama` instances in the same Python process. --- From c14d769a4408e516bb03bd13f68e838ab6edbe4a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 07:30:10 +0800 Subject: [PATCH 397/518] Update Submodule vendor/llama.cpp 834a243..49d1701 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 834a243664..49d1701bd2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 834a243664114487f99520370a7a7b00fc7a486f +Subproject commit 49d1701bd24e4cedf6dfec9e50e185111203946b From 4d3e320b321db7c505721c92c7d3d26641e95623 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 09:01:49 +0800 Subject: [PATCH 398/518] Implement `Qwen3ASRChatHandler` for `Qwen3-ASR` models. - Integrate MTMD multimodal logic to extract and inject `audio_url` and base64 `input_audio` data directly into the `<|audio_start|><|audio_pad|>[DATA]<|audio_end|>` sequence. - Define a default multilingual transcription system prompt and configure model-specific stop tokens. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 78 ++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 1c41beb40f..0365d8f871 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5473,6 +5473,84 @@ def __call__(self, **kwargs): # Use parent implementation return super().__call__(**kwargs) +class Qwen3ASRChatHandler(MTMDChatHandler): + """ + Handler for Qwen 3 ASR (Automatic Speech Recognition) models. + + Features: + - Highly specialized for Speech-to-Text tasks. + - Aggregates all system text into a single cohesive system block. + - Drops user text entirely, extracting ONLY audio data into a unified user turn. + - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>. + - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url. + """ + + DEFAULT_SYSTEM_MESSAGE = """ + You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language. + You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization. 
+ """ + + QWEN3_ASR_BOS_TOKEN = "<|im_start|>" + QWEN3_ASR_PAD_TOKEN = "<|endoftext|>" + QWEN3_ASR_EOS_TOKEN = "<|im_end|>" + + + QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>" + QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>" + QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>" + + CHAT_FORMAT = ( + "{%- set ns = namespace(system_text='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.role == 'system' -%}\n" + " {%- if m.content is string -%}\n" + " {%- set ns.system_text = ns.system_text + m.content -%}\n" + " {%- else -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'text' and (c.text is defined) -%}\n" + " {%- set ns.system_text = ns.system_text + c.text -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- set ns2 = namespace(audio_tokens='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.content is not string -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n" + " {#- MTMD Audio Injection -#}\n" + " {%- set audio_val = '' -%}\n" + " {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n" + " {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n" + " {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n" + " {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n" + " {%- endif -%}\n" + " {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n" + "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n' -}}\n" + "{%- endif 
-%}\n" + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token + kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") + + return super().__call__(**kwargs) class Qwen3VLChatHandler(MTMDChatHandler): CHAT_FORMAT = ( From ad67e0e979620e3dc18c91460bd7a96a3dfc1934 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 09:05:21 +0800 Subject: [PATCH 399/518] docs(README.md): add Qwen3-ASR documentation and usage example - Update the supported multi-modal models table to include `qwen3-asr` and the `Qwen3ASRChatHandler`. - Add a new dedicated section for Speech-to-Text inference with a complete, collapsible Python script. - Provide a `build_media_payload` helper function to demonstrate proper Base64 encoding of local `.wav` and `.mp3` files into OpenAI-compatible `input_audio` schemas. - Include a critical warning advising users to use BF16 quantization for the multimodal projector (`mmproj`) to prevent audio degradation. - Clarify usage mechanics, specifically that all instructions must be placed in the `system` role due to the ASR template's text-dropping behavior. 
Signed-off-by: JamePeng --- README.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/README.md b/README.md index cd03d217d7..26aa11a55f 100644 --- a/README.md +++ b/README.md @@ -845,6 +845,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [lfm2.5-vl](https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF) | `LFM25VLChatHandler` | `lfm2.5-vl` | | [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | +| [qwen3-asr](https://huggingface.co/JamePeng2023/Qwen3-ASR-1.7B-GGUF) | `Qwen3ASRChatHandler` | `qwen3-asr` | | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` | | [qwen3.5](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) | `Qwen35ChatHandler` | `qwen3.5` | | [qwen3.6](https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF) | `Qwen35ChatHandler` | `qwen3.6` | @@ -1072,6 +1073,111 @@ print(res["choices"][0]["message"]["content"])
+## Speech Recognition With Qwen3-ASR (Speech-to-Text) + +The `Qwen3ASRChatHandler` is specifically designed for the Qwen3 Automatic Speech Recognition (ASR) models. Unlike standard multimodal models, this handler aggregates system prompts for instructions and automatically extracts audio data from the user's message, ignoring any user text. + +> **⚠️ Important Note on Quantization:** For Qwen3-ASR models, it is highly recommended to use the **BF16** version of the multimodal projector (`mmproj`). Other quantizations are known to cause severe audio degradation. + +**Example Code**:
+ +```python +from llama_cpp import Llama +from llama_cpp.llama_chat_format import Qwen3ASRChatHandler +import base64 +import os + +# 1. Define paths to the model and the BF16 multimodal projector +MODEL_PATH = r"./Qwen3-ASR-1.7B-BF16.gguf" +MMPROJ_PATH = r"./mmproj-Qwen3-ASR-1.7b-BF16.gguf" + +# 2. Initialize the Llama model with the dedicated ASR handler +llm = Llama( + model_path=MODEL_PATH, + chat_handler=Qwen3ASRChatHandler( + clip_model_path=MMPROJ_PATH, + verbose=False, + ), + n_gpu_layers=-1, + n_ctx=10240, + verbose=False, + verbosity=0 +) + +# 3. Helper function to encode audio files into OpenAI-compatible payloads +_MEDIA_MIME_TYPES = { + '.wav': ('audio', 'wav'), + '.mp3': ('audio', 'mp3'), +} + +def build_media_payload(file_path: str) -> dict: + """Reads a local audio file and converts it into the LLM input structure.""" + if not os.path.isfile(file_path): + raise FileNotFoundError(f"Media file not found: {file_path}") + + extension = os.path.splitext(file_path)[1].lower() + media_category, mime_or_format = _MEDIA_MIME_TYPES.get(extension, ('unknown', 'application/octet-stream')) + + if media_category == 'unknown': + print(f"Warning: Unknown extension '{extension}'.") + + # Read and Base64 encode the file + with open(file_path, "rb") as f: + encoded_data = base64.b64encode(f.read()).decode("utf-8") + + if media_category == 'audio': + return { + "type": "input_audio", + "input_audio": { + "data": encoded_data, + "format": mime_or_format + } + } + else: + return {"type": "text", "text": f"[Attached unsupported file: {file_path}]"} + + +# ======================== +# Main Inference Section +# ======================== + +media_paths = ["./audio/test.wav"] +user_content = [build_media_payload(path) for path in media_paths] + +# 4. Generate the transcription +response = llm.create_chat_completion( + messages=[ + { + "role": "system", + "content": ( + "You are an advanced multilingual Speech-to-Text model. 
" + "Accurately transcribe the audio into text in its original spoken language. " + "You should ignore background noise, filler words, and stutters where possible, " + "and format the final output with correct grammar and capitalization." + ) + }, + { + "role": "user", + "content": user_content + } + ], + temperature=1.0, + top_p=0.95, + top_k=64, + max_tokens=10240, +) + +print(f"Transcribe: {response['choices'][0]['message']['content']}") + +``` + +#### How it works: + +* **`input_audio` Schema:** The script reads the local `.wav` or `.mp3` file, encodes it in Base64, and wraps it in an OpenAI-compatible `"type": "input_audio"` dictionary. +* **System Prompt:** Because the Qwen3-ASR template strips out user text, all instructions (like translation requests or formatting rules) **must** be placed in the `"system"` role. + +
+ ## Comprehensive Omni MultiModal Example: Gemma-4 (Vision + Audio + Text) Below is a complete, production-ready example demonstrating how to dynamically route and process both image and audio files. It includes a universal media processor that automatically converts local files into the correct payload structure (Data URIs for images, and `input_audio` for audio files). From 43b85f38ed9b55ab1cff4646e41796f93b4b1129 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 09:12:34 +0800 Subject: [PATCH 400/518] docs(README.md): Update the jump link for Qwen3-ASR in the top directory. Signed-off-by: JamePeng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 26aa11a55f..caec7e32e0 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ This package provides: - [Multi-modal Models Support](https://github.com/JamePeng/llama-cpp-python#multi-modal-models) - Support Models Lists - [Loading a Local Image With Qwen3VL(Thinking/Instruct)](https://github.com/JamePeng/llama-cpp-python#loading-a-local-image-with-qwen3vlthinkinginstruct) + - [Speech Recognition With Qwen3-ASR (Speech-to-Text)](https://github.com/JamePeng/llama-cpp-python#speech-recognition-with-qwen3-asr-speech-to-text) - [Comprehensive Omni MultiModal Example: Gemma-4 (Vision + Audio + Text)](https://github.com/JamePeng/llama-cpp-python#comprehensive-omni-multimodal-example-gemma-4-vision--audio--text) - [Embeddings & Reranking (GGUF)](https://github.com/JamePeng/llama-cpp-python#embeddings--reranking-gguf) - [1. 
Text Embeddings (Vector Search)](https://github.com/JamePeng/llama-cpp-python#1-text-embeddings-vector-search) From 4fb074682bf18c0c8097e9b9e4800940eb49bc07 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 09:57:46 +0800 Subject: [PATCH 401/518] Update SCHEMA.md --- docs/wiki/SCHEMA.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/wiki/SCHEMA.md b/docs/wiki/SCHEMA.md index b96ec964c7..23954a156e 100644 --- a/docs/wiki/SCHEMA.md +++ b/docs/wiki/SCHEMA.md @@ -4,7 +4,7 @@ - **Author**: JamePeng - **Maintainer**: LLM-assisted documentation workflow - **Project**: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) wiki -- **Last Modified**: 2026-05-02 +- **Last Modified**: 2026-05-16 - **Version Target**: latest source code - **Schema Version**: 0.3 @@ -24,6 +24,7 @@ - `llama_cpp.py` - `mtmd_cpp.py` - `_ggml.py` + - `_logger.py` - Never invent parameters or behavior. Always read the current source code before writing/updating a page. - Prefer documenting public and user-facing APIs first. Internal implementation details may be documented only when they help users understand behavior, extension points, debugging, or advanced usage. - All examples must be complete, runnable with the latest API, and include necessary imports. From 1064cf17361c394f38f6f3d89fc68367d45d8720 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 10:00:53 +0800 Subject: [PATCH 402/518] docs(Llama.md): update `verbose=False` vs. 
`verbosity=0` note Signed-off-by: JamePeng --- docs/wiki/core/Llama.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/wiki/core/Llama.md b/docs/wiki/core/Llama.md index a061861ece..1f7cce206b 100644 --- a/docs/wiki/core/Llama.md +++ b/docs/wiki/core/Llama.md @@ -4,7 +4,7 @@ title: Llama Class module_name: llama_cpp.llama source_file: llama_cpp/llama.py class_name: Llama -last_updated: 2026-05-15 +last_updated: 2026-05-16 version_target: "latest" --- ``` @@ -424,6 +424,10 @@ The `Llama` class allows you to load multiple LoRAs into VRAM and apply them dyn **Important:** native backend logging is process-global. Runtime changes affect all `Llama` instances in the same Python process. + **verbose=False** vs. **verbosity=0**: These have distinct behaviors. + - `verbose=False` silences Python wrapper prints (code guarded by `if self.verbose: print(...)`). On its own it also limits native logs to error level, but an explicit `verbosity` takes precedence for backend diagnostics. + - `verbosity=0` silences all backend non-error output. + --- ## Deprecated / Changed APIs From 7eab8d3ad3f178d9cae6bebdd6013213176415d8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 10:04:43 +0800 Subject: [PATCH 403/518] docs(Logger.md): Upload Logger documentation Signed-off-by: JamePeng --- docs/wiki/modules/Logger.md | 216 ++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 docs/wiki/modules/Logger.md diff --git a/docs/wiki/modules/Logger.md b/docs/wiki/modules/Logger.md new file mode 100644 index 0000000000..f24f7f43a8 --- /dev/null +++ b/docs/wiki/modules/Logger.md @@ -0,0 +1,216 @@ +--- +title: Logger +class_name: Logger (module) +module_name: llama_cpp._logger +source_file: llama_cpp/_logger.py +last_updated: 2026-05-16 +version_target: latest +--- + +## Overview + +The `Logger` module provides configuration for runtime logging in `llama-cpp-python`, wrapping the native `ggml`/`llama.cpp` logging infrastructure.
It controls verbosity levels, output streams, substring filtering, and callback integration, allowing fine-grained control over diagnostic and informational output from the underlying bindings. + +## Role in the Library + +- **Wraps low-level logging**: It intercepts and transforms log events from the C/C++ backend (`ggml_log_callback`). +- **Connects to Python logging**: Maps `ggml` verbosity levels (0–5) to `logging` levels (ERROR, WARNING, INFO, DEBUG), and routes output to `stdout`/`stderr` based on severity. +- **Provides filtering**: Substring-based message filtering to suppress specific log categories (e.g., CUDA Graph output). +- **Extends the API surface**: Offers both explicit configuration functions and convenient shorthand setters (`set_verbose`, `set_quiet`), while preserving full control through `configure_logging`. + +## Core Methods + +### `configure_logging(*, verbosity=None, verbose=None, quiet=None, silent=None, show_output=None, log_filters=None, append_log_filters=None, log_filters_case_sensitive=None)` + +The primary configuration function. Combines multiple parameters into a unified verbosity level. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `verbosity` | int \| bool \| None | None | Numeric level (0–5). `False` maps to `ERROR` (1), `True` to `DEBUG` (5). | +| `verbose` | bool | None | Shorthand: `True` → `DEBUG`, `False` → `ERROR`. | +| `quiet` | bool | None | Shorthand: `True` → `WARN` (2). | +| `silent` | bool | None | Shorthand: `True` → `ERROR` (1). | +| `show_output` | bool | None | Whether `GGML_LOG_LEVEL_NONE` (output) should be shown. | +| `log_filters` | Iterable[str] | None | List of substring patterns to filter out. | +| `append_log_filters` | Iterable[str] | None | Append additional filter patterns. | +| `log_filters_case_sensitive` | bool | None | Whether filters are case-sensitive. | + +### `set_verbose(verbose: bool)` + +Shorthand setter. 
`verbose=True` sets `verbosity=DEBUG`, `verbose=False` sets `verbosity=ERROR`. + +### `set_verbosity(verbosity: VerbosityLike)` + +Sets verbosity to any value accepted by `configure_logging`. + +### `get_verbosity() -> int` + +Returns current configured verbosity level (0–5). + +### `set_quiet(quiet: bool = True)` + +Sets `verbosity=WARN` (`2`). + +### `set_silent(silent: bool = True)` + +Sets `verbosity=ERROR` (`1`). + +### `set_log_filters(filters: Iterable[str], *, case_sensitive: bool = True)` + +Replaces all substring log filters. + +### `get_log_filters() -> list[str]` + +Returns current filter list. + +### `add_log_filters(filters: Iterable[str])` + +Appends filters to the current list. + +### `clear_log_filters()` + +Removes all substring filters, including the default filters. + +### `reset_log_filters()` + +Restores the default filter list: `["CUDA Graph", "CUDA graph"]`. + +### `reset_logging()` + +Resets to default: `verbosity=INFO` (`3`), `show_output=True`, default filters. + +## Important Attributes / State + +| Attribute | Type | Source | Description | +|-----------|------|--------|-------------| +| `_config` | LoggerConfig | Internal | Holds the current configuration: verbosity, output streams, filters. | +| `_last_verbosity` | int | Internal | Tracks the last verbosity level set by `ggml_log_callback`. | + +## Best Practices & Common Patterns + +### 1. Default Behavior +Use `reset_logging()` to start with `INFO` verbosity, which shows warnings and errors but hides internal debug output. + +```python +from llama_cpp import Llama +from llama_cpp import reset_logging + +reset_logging() # Default verbosity=3 (INFO), show warnings and errors +llm = Llama(model_path="models/qwen3.gguf") +llm("Explain quantum physics.") +``` + +### 2. Precise Logging via `verbosity` +Replace the legacy `verbose` boolean with the precise `verbosity` parameter. `verbose=False` maps to `ERROR` (1), `verbose=True` to `DEBUG` (5).
+ +```python +from llama_cpp import Llama + +# Legacy (coarse control): +llm_quiet = Llama(model_path="models/qwen3.gguf", verbose=False) +llm_quiet("What is a neural network?") + +# Modern (fine-grained control): +llm = Llama(model_path="models/qwen3.gguf", verbosity=3) +llm("What is a neural network?") +``` + +### 3. Low-Level Debugging +For deep backend debugging, set `verbosity=5` (DEBUG) and optionally disable substring filters to see all diagnostic output. + +```python +from llama_cpp import Llama + +# Debug-level logs, showing all backend diagnostics +llm = Llama(model_path="models/qwen3.gguf", verbosity=5) + +# If you want to see normally filtered CUDA Graph messages: +llm = Llama( + model_path="models/qwen3.gguf", + verbosity=5, + log_filters=[], # Disable all substring filters +) +``` + +### 4. Substring-Based Backend Noise Filtering +Suppress known noisy backend messages by passing substring filters. This prevents "CUDA Graph" and model loading chatter from flooding the console. + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="models/qwen3.gguf", + verbosity=3, # INFO level + log_filters=[ + "CUDA Graph id", + "clip_model_loader: tensor", + "ggml_cuda_graph_update_required", + "llama_perf_context_print", + ], +) +llm("What is a transformer?") +``` + +### 5. Runtime Logging Adjustments +Since logging is process-global, you can adjust verbosity or filters at runtime — changes apply to all `Llama` instances in the same process. 
+ +```python +from llama_cpp import Llama + +llm = Llama(model_path="models/qwen3.gguf", verbosity=2) # QUIET: only show warnings and errors +llm("Quick answer: What is machine learning?") + +# Temporarily increase verbosity for diagnostics +llm.set_verbosity(5) +llm("Show me the full debug log for this prompt") +llm.set_verbosity(2) # Return to QUIET + +# Add a specific filter without resetting everything +llm.add_log_filters(["llama_perf_context_print"]) +llm("Final answer: What is machine learning?") +``` + +### 6. Complete Diagnostic Session +For a full diagnostic session, combine precise verbosity, custom filters, and runtime control: + +```python +from llama_cpp import Llama + +# 1. Start with info-level verbosity +llm = Llama(model_path="models/qwen3.gguf", verbosity=3) + +# 2. Suppress backend noise +llm.set_log_filters([ + "CUDA Graph", + "CUDA graph", + "clip_model_loader: tensor", + "ggml_cuda_graph_update_required", +]) + +# 3. Run inference +llm("Explain the llama.cpp inference pipeline") + +# 4. Temporarily increase verbosity for a specific call +llm.set_verbosity(5) +llm("Show debug output for cache hit details") +llm.set_verbosity(2) # Return to normal + +# 5. Remove filters after session +llm.clear_log_filters() +``` + +## Key Considerations + +- **Process-global**: Logging configuration affects all `Llama` instances in the same process. Use `add_log_filters` or `set_log_filters` carefully when multiple instances run concurrently. +- **Flushed immediately**: Every log call flushes to `stdout`/`stderr`, so output appears immediately. +- **Shorthand vs. precise**: Prefer `verbosity`/`set_verbosity` over `verbose`/`set_verbose`/`set_quiet`/`set_silent` for precision, though the shorthands remain for backward compatibility. +- **verbose=False** vs. **verbosity=0**: These have distinct behaviors — `verbose=False` silences Python wrapper prints but not backend diagnostics; `verbosity=0` silences all backend non-error output. 
+ +## Deprecated / Changed APIs + +None documented. + +## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] +* [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] From 43a96e2e098ca2845d3e4c6acd92643b80579240 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 10:08:29 +0800 Subject: [PATCH 404/518] docs(index): Append Logger.md info and link Signed-off-by: JamePeng --- docs/wiki/index.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/wiki/index.md b/docs/wiki/index.md index 02f2dd5b9a..143d6e629b 100644 --- a/docs/wiki/index.md +++ b/docs/wiki/index.md @@ -30,6 +30,7 @@ These pages document major source modules and related classes. | [modules/LlamaEmbedding\|Llama Embedding] | Embedding-related APIs and usage patterns. | | [modules/LlamaGrammar\|Llama Grammar] | Provides grammar utilities for constrained generation. | | [modules/LlamaSpeculative\|Llama Speculative Decoding] | Draft model interfaces and prompt-based speculative decoding helpers. | +| [modules/Logger\|Logger] | Provides configuration for runtime logging in `llama-cpp-python`, wrapping the native `ggml`/`llama.cpp` logging infrastructure. It controls verbosity levels, output streams, substring filtering, and callback integration, allowing fine-grained control over diagnostic and informational output from the underlying bindings. | --- @@ -53,6 +54,7 @@ If you are new to this wiki, read the pages in this order: 3. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] 4. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] 5. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] +6.
[[modules/Logger\|Logger](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/Logger.md)] If you are contributing documentation, start with: @@ -72,6 +74,7 @@ Currently available pages: - `modules/LlamaEmbedding.md` - `modules/LlamaGrammar.md` - `modules/LlamaSpeculative.md` +- `modules/Logger.md` - `SCHEMA.md` - `contributing-to-wiki.md` From 9187910e35e6f4d063f33364a10812727a05e58d Mon Sep 17 00:00:00 2001 From: Alcoft Date: Sat, 16 May 2026 06:41:17 +0200 Subject: [PATCH 405/518] fix --- llama_cpp/llama.py | 1 - llama_cpp/llama_chat_format.py | 33 +++++++++++---------------------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6dab44602d..7666b822a8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -618,7 +618,6 @@ def __init__( self.chat_handler = llama_chat_format.GenericMTMDChatHandler( gguf_metadata = self.metadata, clip_model_path = clip_model_path, - model_arch = None, verbose = self.verbose, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 40491968a9..0be38a19d3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3839,47 +3839,36 @@ def from_pretrained( ) class GenericMTMDChatHandler(MTMDChatHandler): + KNOWN_MEDIA_TAGS = [ + "<|image_pad|>", + "<|audio_pad|>", + "<|video_pad|>", + "<|image|>", + "<|audio|>", + "<|video|>", + "[IMG]" + ] + def __init__( self, gguf_metadata: Dict[str, Any], clip_model_path: str, - model_arch: Optional[str] = None, verbose: bool = True, **kwargs ) -> None: self.model_metadata = gguf_metadata - self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) - self.arch = self.model_metadata.get("general.architecture", None) if model_arch is None else model_arch if verbose: print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) - - if self.arch is None: - if verbose: - print("Unknown model 
architecture. Will use general/most-common tags.") - - self.arch = "unknown" if self.chat_format is None: raise ValueError("Failed to get model chat template automatically.") super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) - - if self.arch in ["unknown", "qwen3vl", "qwen35moe", "qwen35"]: - self._chat_format_parser_tags += ["<|image_pad|>", "<|audio_pad|>", "<|video_pad|>"] - elif self.arch in ["gemma4"]: - self._chat_format_parser_tags += ["<|image|>", "<|audio|>", "<|video|>"] - elif self.arch in ["mistral3", "mistral4", "deepseek2"]: - self._chat_format_parser_tags += ["[IMG]"] - elif verbose: - print("Warning: Could not determine chat format parser tags.", flush = True) def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) + self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] if self.verbose: print(f"{self.log_prefix} - Start processing") From 2dad6dc407c6b56af281a29bbe2f7a3e15fb712f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 12:46:38 +0800 Subject: [PATCH 406/518] build(cmake): refactor install target lists for new GGML backend layout - Categorize build targets into logical groups (`LLAMA_CPP_TARGETS`, `GGML_CORE_TARGETS`, `GGML_CPU_VARIANT_TARGETS`, and `GGML_BACKEND_TARGETS`) to improve maintainability and keep the Python package installation in sync with the updated upstream GGML backend layout. - Add missing targets such as `llama-common` and the separated `ggml-cpu-*` CPU variant backends. - Ensure all grouped targets are passed through `llama_cpp_python_install_target`. 
Signed-off-by: JamePeng --- CMakeLists.txt | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04d3ec1fff..c42bbe95f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,14 +117,38 @@ if (LLAMA_BUILD) set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() - # Define list of GGML targets to install - set(GGML_TARGETS + # Define list of LLAMA_CPP/GGML targets to install + set(LLAMA_CPP_TARGETS llama + llama-common + ) + set(GGML_CORE_TARGETS ggml ggml-base ggml-blas - ggml-cann ggml-cpu + ggml-rpc + ) + + set(GGML_CPU_VARIANT_TARGETS + ggml-cpu-x64 + ggml-cpu-sse42 + ggml-cpu-sandybridge + ggml-cpu-ivybridge + ggml-cpu-piledriver + ggml-cpu-haswell + ggml-cpu-skylakex + ggml-cpu-cannonlake + ggml-cpu-cascadelake + ggml-cpu-cooperlake + ggml-cpu-icelake + ggml-cpu-alderlake + ggml-cpu-sapphirerapids + ggml-cpu-zen4 + ) + + set(GGML_BACKEND_TARGETS + ggml-cann ggml-cuda ggml-hexagon ggml-hip @@ -132,7 +156,6 @@ if (LLAMA_BUILD) ggml-musa ggml-opencl ggml-openvino - ggml-rpc ggml-sycl ggml-virtgpu ggml-vulkan @@ -141,8 +164,12 @@ if (LLAMA_BUILD) ggml-zendnn ) - # Loop through targets to avoid repetitive function calls - foreach(TARGET_NAME ${GGML_TARGETS}) + foreach(TARGET_NAME + ${LLAMA_CPP_TARGETS} + ${GGML_CORE_TARGETS} + ${GGML_CPU_VARIANT_TARGETS} + ${GGML_BACKEND_TARGETS} + ) llama_cpp_python_install_target(${TARGET_NAME}) endforeach() From 24b1dc859cba8b2dce7fb2463c78faacc1955997 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 13:07:25 +0800 Subject: [PATCH 407/518] build(cmake): sync llama build options and disable server UI - Update llama build option descriptions to match the current upstream naming style. - Explicitly disable `LLAMA_BUILD_SERVER` to avoid building the server target for Python package wheels. 
- Explicitly disable `LLAMA_BUILD_UI` and `LLAMA_USE_PREBUILT_UI` because the embedded server Web UI is not needed for wheel builds. - Keep examples, tests, and curl support disabled for minimal wheel artifacts. Signed-off-by: JamePeng --- CMakeLists.txt | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c42bbe95f0..ee72ae9582 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,16 +72,23 @@ if (LLAMA_BUILD) set(CMAKE_SKIP_RPATH FALSE) # Enable building of the common library - set(LLAMA_BUILD_COMMON ON CACHE BOOL "llama.cpp: build common utils library" FORCE) + set(LLAMA_BUILD_COMMON ON CACHE BOOL "llama: build common utils library" FORCE) # Enable build and link OpenSSL - set(LLAMA_OPENSSL ON CACHE BOOL "llama.cpp: build and link OpenSSL" FORCE) + set(LLAMA_OPENSSL ON CACHE BOOL "llama: use openssl to support HTTPS" FORCE) # Disable building of examples - set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama.cpp: build examples" FORCE) + set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama: build examples" FORCE) # Disable building of tests - set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama.cpp: build tests" FORCE) + set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama: build tests" FORCE) + + # Disable building of server + set(LLAMA_BUILD_SERVER OFF CACHE BOOL "llama: build server example" FORCE) + + # Disable building of the embedded Web UI for the server + set(LLAMA_BUILD_UI OFF CACHE BOOL "llama: build the embedded Web UI for server" FORCE) + set(LLAMA_USE_PREBUILT_UI OFF CACHE BOOL "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" FORCE) # Disable building curl support set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: use libcurl to download model from an URL" FORCE) From 4c4e3d007a649e65e8f38a6ce387807299310bdf Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 13:55:17 +0800 Subject: [PATCH 408/518] build(cmake): clean up dev files and import libs from Windows wheels - Remove `ARCHIVE 
DESTINATION` for Windows targets to avoid installing `.lib` files. - Add a cleanup function to strip `cmake`, `pkgconfig`, and import libraries from the python wheel runtime directories. - Ensures Windows builds only package the required runtime DLLs. Signed-off-by: JamePeng --- CMakeLists.txt | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee72ae9582..8e5d583d90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,13 +11,12 @@ set(CMAKE_INSTALL_LIBDIR llama_cpp/lib CACHE PATH "" FORCE) set(CMAKE_INSTALL_INCLUDEDIR llama_cpp/include CACHE PATH "" FORCE) -# Helper function to install targets to Python package directories +# Install a built target into the Python package runtime directory. function(llama_cpp_python_install_target target) if(NOT TARGET ${target}) return() endif() - # Define install destinations to avoid code duplication set(INSTALL_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" @@ -33,6 +32,9 @@ function(llama_cpp_python_install_target target) RESOURCE DESTINATION ${DIR} ) + # Copy runtime DLL dependencies of this target when available. + # This does not replace explicit installation of dynamic backend + # targets such as ggml-cpu-*; those are installed as targets below. # Automatically handle Windows DLL installation for each target if (WIN32) install( @@ -57,6 +59,40 @@ function(llama_cpp_python_install_target target) endif() endfunction() + +# Remove development-only artifacts from Python wheel runtime directories. +# +# Upstream install rules may place CMake package files, pkg-config files, and +# Windows import libraries under llama_cpp/lib because CMAKE_INSTALL_LIBDIR is +# redirected there for wheel builds. They are not needed at runtime. 
+function(llama_cpp_python_cleanup_dev_files) + if(NOT WIN32) + return() + endif() + + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" + ) + + foreach(DIR ${INSTALL_DIRS}) + install(CODE " + if(EXISTS \"${DIR}\") + file(GLOB LLAMA_CPP_IMPORT_LIBS \"${DIR}/*.lib\") + if(LLAMA_CPP_IMPORT_LIBS) + file(REMOVE \${LLAMA_CPP_IMPORT_LIBS}) + endif() + + file(REMOVE_RECURSE + \"${DIR}/cmake\" + \"${DIR}/pkgconfig\" + ) + endif() + ") + endforeach() +endfunction() + + if (LLAMA_BUILD) set(BUILD_SHARED_LIBS "On") @@ -204,4 +240,8 @@ if (LLAMA_BUILD) llama_cpp_python_install_target(mtmd) endif() + + # Run after all runtime targets are installed, including mtmd. + llama_cpp_python_cleanup_dev_files() + endif() From 6af3cd7df808cb9725b2da0273c23098111c25ea Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 15:18:32 +0800 Subject: [PATCH 409/518] fix(_ggml): correct `ggml_backend_unload` function name --- llama_cpp/_ggml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index 8f4cb1187f..c4ae7c94bf 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -1295,8 +1295,8 @@ def ggml_backend_load(path: ctypes.c_char_p) -> ggml_backend_reg_t: # // Unload a backend if loaded dynamically and unregister it # GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); -@ggml_function("ggml_backend_load_all", [ctypes.c_void_p], None) -def ggml_backend_load_all(reg: ggml_backend_reg_t): +@ggml_function("ggml_backend_unload", [ctypes.c_void_p], None) +def ggml_backend_unload(reg: ggml_backend_reg_t): """ Unload a backend if loaded dynamically and unregister it """ From 038a953079126fd4a81e574c9c06680e36b0a10e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 16:13:53 +0800 Subject: [PATCH 410/518] feat(core): support loading GGML_BACKEND_DL dynamic backend libraries from wheel lib - Import `ggml_backend_load_all_from_path` and 
`ggml_backend_reg_count` from `_ggml`. - Load dynamic ggml backend libraries from the packaged `llama_cpp/lib` directory after `llama_backend_init()`. - Support wheels built with `GGML_BACKEND_DL`, where CPU variants and accelerator backends such as `ggml-cpu-*` and `ggml-cuda` are shipped as separate runtime libraries. - Print the registered backend count in verbose mode to help diagnose backend discovery issues. Signed-off-by: JamePeng --- llama_cpp/llama.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 19ede6bcfd..8b1070be4f 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -57,6 +57,8 @@ ) from ._ggml import ( ggml_backend_cpu_buffer_type, + ggml_backend_load_all_from_path, + ggml_backend_reg_count ) from ._logger import ( configure_logging, @@ -290,9 +292,40 @@ def __init__( log_filters_case_sensitive=log_filters_case_sensitive, ) + # llama.cpp / ggml backend initialization is process-global. + # Run it once before loading any model. if not Llama.__backend_initialized: with suppress_stdout_stderr(disable=verbose): llama_cpp_lib.llama_backend_init() + + # Wheels built with `GGML_BACKEND_DL` ship ggml backends as separate + # dynamic libraries under llama_cpp/lib, for example: + # + # ggml-cpu-x64.dll + # ggml-cpu-haswell.dll + # ggml-cpu-alderlake.dll + # ggml-cuda.dll + # + # With the dynamic backend layout, llama_backend_init() initializes + # the global backend system but does not necessarily register every + # packaged backend. Loading the package lib directory ensures ggml can + # discover CPU variants and optional accelerator backends before model + # loading. + lib_dir = Path(llama_cpp_lib.__file__).resolve().parent / "lib" + + if not lib_dir.exists(): + raise FileNotFoundError(f"Llama.__init__: llama_cpp lib directory not found: {lib_dir}") + + # Load all dynamic ggml backend plugins from the packaged lib directory. 
+ ggml_backend_load_all_from_path( + ctypes.c_char_p(str(lib_dir).encode("utf-8")) + ) + + # Print the number of backend registrations to confirm whether the DLL is loaded. + if self.verbose: + count = ggml_backend_reg_count() + print(f"Llama.__init__: Loaded ggml backend registry count: {count}", file=sys.stderr) + Llama.__backend_initialized = True if isinstance(numa, bool): From a8f928c9b134b61d5f6370d1e02de29efdd95227 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 17:14:15 +0800 Subject: [PATCH 411/518] Bump version to 0.3.39-preview Signed-off-by: JamePeng --- llama_cpp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 438bf08b58..b32fbfd36e 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.38" +__version__ = "0.3.39-preview" From 628373c1af97935a8c00e5273c5e9dd90dcd6b4c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 16 May 2026 17:29:56 +0800 Subject: [PATCH 412/518] ci(cu131+windows): build CU131 wheels with GGML dynamic backends for windows - Replace the old CPU/AVX release tag matrix with a single CU131 backend wheel layout. - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Windows wheels ship runtime-loadable GGML backend DLLs and CPU variant backends. - Use the Windows LLVM toolchain and disable non-wheel targets such as examples, tests, tools, server, embedded UI, and curl. - Remove the `.basic` style local version suffix and publish wheels as `+cu131`. - Update CUDA architectures to CUDA 13.1 and simplify CMake argument handling. 
Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu130-win.yml | 134 ------------- .github/workflows/build-wheels-cu131-win.yml | 191 +++++++++++++++++++ 2 files changed, 191 insertions(+), 134 deletions(-) delete mode 100644 .github/workflows/build-wheels-cu130-win.yml create mode 100644 .github/workflows/build-wheels-cu131-win.yml diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml deleted file mode 100644 index d6187d7bf4..0000000000 --- a/.github/workflows/build-wheels-cu130-win.yml +++ /dev/null @@ -1,134 +0,0 @@ -name: Build Wheels (CU130) for Windows - -on: - workflow_dispatch: - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: ['windows-2022'] - pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] - cuda: ["13.0.2"] - releasetag: ["Basic"] - cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] - defaults: - run: - shell: pwsh - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. 
"all" "89" "90" "100" "120" - MAX_JOBS: 8 - - steps: - - name: Add MSBuild to PATH - if: runner.os == 'Windows' - uses: microsoft/setup-msbuild@v3 - with: - msbuild-architecture: x64 - - - uses: actions/checkout@v6 - with: - submodules: "recursive" - - # from kingbri1/flash-attention build-wheels.yml - - name: Install CUDA ${{ matrix.cuda }} - uses: Jimver/cuda-toolkit@v0.2.35 - id: cuda-toolkit - with: - cuda: "${{ matrix.cuda }}" - use-github-cache: false - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v7 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - name: Install Dependencies - run: | - git config --system core.longpaths true - uv pip install --upgrade build setuptools wheel packaging - - - name: Build Wheel - run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') - $env:CUDA_HOME = $env:CUDA_PATH - $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH - $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' 
-DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' - } - python -m build --wheel - - # Check if wheel was built - if (!(Test-Path '.\dist\*.whl')) { - Write-Error "No wheel built in dist/ directory" - exit 1 - } - - $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - - # Split file name: name-ver-py-abi-plat.whl - $parts = $wheelFile.Name.Split('-') - $distName = $parts[0] - $version = $parts[1] - $pyTag = $parts[2] - $abiTag = $parts[3] - $platTag = $parts[4] - - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - - $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" - - # Rename wheel file - Rename-Item -Path $wheelFile.FullName -NewName $newName - Write-Output "Renamed wheel to: $newName" - - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV - Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - - name: Get Current Date - id: get-date - run: | - $currentDate = Get-Date -UFormat "%Y%m%d" - Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - - name: Create Release - if: always() && env.TAG_VERSION != '' - uses: softprops/action-gh-release@v3 - with: - files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml new file mode 100644 index 0000000000..14bea65d19 --- /dev/null +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -0,0 +1,191 @@ +name: Build Wheels (CU131) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu131 + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + 
matrix: + os: ["windows-2022"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] + cuda: ["13.1.1"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + + defaults: + run: + shell: pwsh + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Add MSBuild to PATH + uses: microsoft/setup-msbuild@v3 + with: + msbuild-architecture: x64 + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Install CUDA ${{ matrix.cuda }} + uses: Jimver/cuda-toolkit@v0.2.35 + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda }} + use-github-cache: false + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. 
+ $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 + } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. + + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake' + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel 
built in dist/ directory" + exit 1 + } + + $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 + + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl + $parts = $wheelFile.Name.Split('-') + $distName = $parts[0] + $version = $parts[1] + $pyTag = $parts[2] + $abiTag = $parts[3] + $platTag = $parts[4] + + # CPU all-variants is now an internal runtime layout detail. + $newVersion = "$version+cu$cudaVersion" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + + # Write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV + + - name: Get current date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From df07219dee25ae2cc95f842bd1a7c81e6bfb599f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 07:57:05 +0800 Subject: [PATCH 413/518] ci(cu12+windows): build CU124-128 wheels with GGML dynamic backends for windows - Replace the old CPU/AVX release tag matrix with a single CU124-128 backend wheel layout. - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Windows wheels ship runtime-loadable GGML backend DLLs and CPU variant backends. - Use the Windows LLVM toolchain and disable non-wheel targets such as examples, tests, tools, server, embedded UI, and curl. - Remove the `.basic` style local version suffix and publish wheels as `+cu124/cu126/cu128`. 
- Update CUDA architectures to CUDA 13.1 and simplify CMake argument handling. Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu124-win.yml | 145 ++++++++++++------ .github/workflows/build-wheels-cu126-win.yml | 145 ++++++++++++------ .github/workflows/build-wheels-cu128-win.yml | 147 +++++++++++++------ 3 files changed, 304 insertions(+), 133 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-win.yml b/.github/workflows/build-wheels-cu124-win.yml index 01bd48e7de..e856533410 100644 --- a/.github/workflows/build-wheels-cu124-win.yml +++ b/.github/workflows/build-wheels-cu124-win.yml @@ -8,85 +8,141 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu124 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.4.1"] - releasetag: ["Basic"] cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real;90-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. 
"all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on 
-DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake' + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +153,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split wheel filename: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +162,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu126-win.yml b/.github/workflows/build-wheels-cu126-win.yml index 9330cb130b..b77b17917f 100644 --- a/.github/workflows/build-wheels-cu126-win.yml +++ b/.github/workflows/build-wheels-cu126-win.yml @@ -8,85 +8,141 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu126 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.6.3"] - releasetag: ["Basic"] cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real;90-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. "all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - $env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if 
($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake' + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +153,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split wheel filename: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +162,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu128-win.yml b/.github/workflows/build-wheels-cu128-win.yml index 98ebbc4127..223473dde6 100644 --- a/.github/workflows/build-wheels-cu128-win.yml +++ b/.github/workflows/build-wheels-cu128-win.yml @@ -8,85 +8,141 @@ permissions: jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu128 runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - os: ['windows-2022'] + os: ["windows-2022"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] cuda: ["12.8.1"] - releasetag: ["Basic"] - cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + defaults: run: shell: pwsh + env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} 
- # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html - # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list - # e.g. "all" "89" "90" "100" "120" - MAX_JOBS: 8 + MAX_JOBS: 12 steps: - name: Add MSBuild to PATH - if: runner.os == 'Windows' uses: microsoft/setup-msbuild@v3 with: msbuild-architecture: x64 - - uses: actions/checkout@v6 + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from kingbri1/flash-attention build-wheels.yml - name: Install CUDA ${{ matrix.cuda }} uses: Jimver/cuda-toolkit@v0.2.35 id: cuda-toolkit with: - cuda: "${{ matrix.cuda }}" + cuda: ${{ matrix.cuda }} use-github-cache: false - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - name: Install Dependencies + - name: Install dependencies run: | git config --system core.longpaths true uv pip install --upgrade build setuptools wheel packaging - - name: Build Wheel + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + $env:CUDA_HOME = $env:CUDA_PATH $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=' + $env:CUDAARCHVER + ' -DCMAKE_BUILD_PARALLEL_LEVEL=' + $env:MAX_JOBS - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=on -DCUDA_SEPARABLE_COMPILATION=on $env:CMAKE_ARGS" - 
$env:CMAKE_ARGS = "-DENABLE_CCACHE=on -DLLAMA_CURL=off $env:CMAKE_ARGS" - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' - } - if ($env:AVXVER -eq 'AVX2') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off' - } - if ($env:AVXVER -eq 'AVXVNNI') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on' - } - # if ($env:AVXVER -eq 'AVX512') { - # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' - # } - # Basic options for compiling without AVX instructions - if ($env:AVXVER -eq 'Basic') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off' + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + '-DCMAKE_TOOLCHAIN_FILE=vendor/llama.cpp/cmake/x64-windows-llvm.cmake' + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + python -m build --wheel # Check if wheel was built @@ -97,7 +153,8 @@ jobs: $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 - # Split file name: name-ver-py-abi-plat.whl + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl $parts = $wheelFile.Name.Split('-') $distName = $parts[0] $version = $parts[1] @@ -105,30 +162,30 @@ jobs: $abiTag = $parts[3] $platTag = $parts[4] - $newVersion = "$version+cu$cudaVersion.$($env:AVXVER.ToLower())" - + # CPU all-variants is now an internal runtime layout detail. 
+ $newVersion = "$version+cu$cudaVersion" $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" # Rename wheel file Rename-Item -Path $wheelFile.FullName -NewName $newName Write-Output "Renamed wheel to: $newName" - # write the build tag to the output + # Write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV - - name: Get Current Date + - name: Get current date id: get-date run: | $currentDate = Get-Date -UFormat "%Y%m%d" Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV - - name: Create Release + - name: Create release if: always() && env.TAG_VERSION != '' uses: softprops/action-gh-release@v3 with: files: dist/* - # Set tag_name to -cu--win- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-win-${{ env.BUILD_DATE }} + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From e47843f591f1b879e461bcce3d1877ff943c3f71 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 07:58:19 +0800 Subject: [PATCH 414/518] Update Submodule vendor/llama.cpp 49d1701..b64739e --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 49d1701bd2..b64739ea39 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 49d1701bd24e4cedf6dfec9e50e185111203946b +Subproject commit b64739ea393b3c9d07cc9907e0a611f707838051 From 39785d0efb7a45490fdc45ac340ff2ec1a2eae8c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 08:08:45 +0800 Subject: [PATCH 415/518] fix(_internals): Remove unnecessary free operations; models should not be released within the context. 
Signed-off-by: JamePeng --- llama_cpp/_internals.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b4ba1f4b21..277d22aebf 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -494,7 +494,6 @@ def __init__( ctx = llama_cpp.llama_init_from_model(self.model.model, self.params) if ctx is None: - llama_cpp.llama_model_free(self.model.model) raise ValueError("Failed to create context with model") self.ctx = ctx From 127881293e712ffdc3cae43af969d2a46e52c80e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 08:47:19 +0800 Subject: [PATCH 416/518] Sync llama.cpp API 20260517 - llama + spec: MTP Support Signed-off-by: JamePeng --- llama_cpp/_internals.py | 3 +++ llama_cpp/llama.py | 8 ++++++++ llama_cpp/llama_cpp.py | 41 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 277d22aebf..a8dd56083b 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -533,6 +533,9 @@ def n_ubatch(self) -> int: def n_seq_max(self) -> int: return llama_cpp.llama_n_seq_max(self.ctx) + def n_rs_seq(self) -> int: + return llama_cpp.llama_n_rs_seq(self.ctx) + def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8b1070be4f..734485802e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -119,8 +119,12 @@ def __init__( n_batch: int = 2048, n_ubatch: int = 512, n_seq_max: int = 1, + n_rs_seq: int = 0, n_threads: Optional[int] = None, n_threads_batch: Optional[int] = None, + ctx_type: Optional[ + int + ] = llama_cpp_lib.llama_context_type.LLAMA_CONTEXT_TYPE_DEFAULT, rope_scaling_type: Optional[ int ] = llama_cpp_lib.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, @@ -474,6 +478,7 @@ def __init__( self.n_batch = min(n_ctx, n_batch) # ??? 
self.n_keep = n_keep if n_keep > 0 else 256 self.n_seq_max = n_seq_max + self.n_rs_seq = n_rs_seq self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() @@ -486,8 +491,11 @@ def __init__( self.context_params.n_batch = self.n_batch self.context_params.n_ubatch = min(self.n_batch, n_ubatch) self.context_params.n_seq_max = self.n_seq_max + self.context_params.n_rs_seq = self.n_rs_seq self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch + + self.context_params.ctx_type = ctx_type self.context_params.rope_scaling_type = ( rope_scaling_type if rope_scaling_type is not None diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index cc900c0648..ec2b665a16 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -471,6 +471,14 @@ class llama_split_mode(enum.IntEnum): LLAMA_SPLIT_MODE_ROW = 2 LLAMA_SPLIT_MODE_TENSOR = 3 +# enum llama_context_type { +# LLAMA_CONTEXT_TYPE_DEFAULT = 0, +# LLAMA_CONTEXT_TYPE_MTP = 1, +# }; +class llama_context_type(enum.IntEnum): + LLAMA_CONTEXT_TYPE_DEFAULT = 0 + LLAMA_CONTEXT_TYPE_MTP = 1 + # typedef struct llama_token_data { # llama_token id; // token id # float logit; // log-odds of the token @@ -827,9 +835,11 @@ class llama_sampler_seq_config(ctypes.Structure): # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode # uint32_t n_ubatch; // physical maximum batch size # uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) +# uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] # int32_t n_threads; // number of threads to use for generation # int32_t n_threads_batch; // number of threads to use for batch processing +# enum llama_context_type ctx_type; // set the context type (e.g. 
MTP) # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings @@ -843,13 +853,14 @@ class llama_sampler_seq_config(ctypes.Structure): # float yarn_beta_fast; // YaRN low correction dim # float yarn_beta_slow; // YaRN high correction dim # uint32_t yarn_orig_ctx; // YaRN original context size -# float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, < 0 disabled (default) +# float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default) # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; # enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] # enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] + # // Abort callback # // if it returns true, execution of llama_decode() will be aborted # // currently works only with CPU execution @@ -862,11 +873,12 @@ class llama_sampler_seq_config(ctypes.Structure): # bool no_perf; // measure performance timings # bool op_offload; // offload host tensor operations to device # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) -# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some casesAdd commentMore actions -# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 +# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases +# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 # bool kv_unified; // use a unified buffer across the input sequences when computing the attention -# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix -# // ref: 
https://github.com/ggml-org/llama.cpp/pull/14363 +# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix +# // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + # // [EXPERIMENTAL] # // backend sampler chain configuration (make sure the caller keeps the sampler chains alive) # // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) @@ -881,12 +893,16 @@ class llama_context_params(ctypes.Structure): n_batch (int): logical maximum batch size that can be submitted to llama_decode n_ubatch (int): physical maximum batch size n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models) + n_rs_seq (int): number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing + + ctx_type (int): set the context type (e.g. MTP) rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) attention_type (int): attention type to use for embeddings flash_attn_type (int): when to enable Flash Attention + rope_freq_base (float): RoPE base frequency, 0 = from model rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model @@ -895,18 +911,23 @@ class llama_context_params(ctypes.Structure): yarn_beta_slow (float): YaRN high correction dim yarn_orig_ctx (int): YaRN original context size defrag_thold (float): [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default) + cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval + type_k (int): data type for K cache type_v (int): data type for V cache + abort_callback 
(ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback + embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU no_perf (bool): whether to measure performance timings op_offload(bool): whether to offload host tensor operations to device swa_full(bool): whether to use full-size SWA cache kv_unified(bool): use a unified buffer across the input sequences when computing the attention + samplers(llama_sampler_seq_config *): the samplers must be sampler chains (i.e. use llama_sampler_chain_init) n_samplers(size_t): numbers of sampler chains """ @@ -916,8 +937,10 @@ class llama_context_params(ctypes.Structure): n_batch: int n_ubatch: int n_seq_max: int + n_rs_seq: int n_threads: int n_threads_batch: int + ctx_type: int rope_scaling_type: int pooling_type: int attention_type: int @@ -950,8 +973,10 @@ class llama_context_params(ctypes.Structure): ("n_batch", ctypes.c_uint32), ("n_ubatch", ctypes.c_uint32), ("n_seq_max", ctypes.c_uint32), + ("n_rs_seq", ctypes.c_uint32), ("n_threads", ctypes.c_int32), ("n_threads_batch", ctypes.c_int32), + ("ctx_type", ctypes.c_int), ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), @@ -1602,6 +1627,12 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx); +@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_rs_seq(ctx: llama_context_p, /) -> int: + ... 
+ + # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_ctx_train(model: llama_model_p, /) -> int: From 50627bfa9b16a5061df699b04006029070935c62 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 09:06:03 +0800 Subject: [PATCH 417/518] fix(context): prevent operations on uninitialized or closed contexts - Introduce an internal `_assert_ctx()` method to verify that the underlying C context (`self.ctx`) is valid before invoking dependent `llama.cpp` operations. - Apply `_assert_ctx()` to critical methods (`encode`, `decode`, `get_logits*`, `get_embeddings*`) to prevent hard crashes (segfaults) caused by passing null pointers to the C API. - Upgrade the context initialization failure exception from a generic `ValueError` to a detailed `RuntimeError`, providing developers with actionable hints about potentially out-of-sync `llama_context_params`. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index a8dd56083b..c026440a2d 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -493,8 +493,13 @@ def __init__( ctx = llama_cpp.llama_init_from_model(self.model.model, self.params) - if ctx is None: - raise ValueError("Failed to create context with model") + if not ctx: + raise RuntimeError( + "Failed to create llama context with model. " + "This may indicate that llama_context_params is out of sync with " + "the bundled llama.cpp version, or that required context parameters " + "were not initialized correctly." + ) self.ctx = ctx @@ -518,6 +523,13 @@ def close(self): def __del__(self): self.close() + def _assert_ctx(self): + if not getattr(self, "ctx", None): + raise RuntimeError( + "LlamaContext is not initialized or has already been closed. 
" + "Context-dependent llama.cpp operations cannot continue." + ) + def n_ctx(self) -> int: return llama_cpp.llama_n_ctx(self.ctx) @@ -654,6 +666,7 @@ def set_state_seq_data_ext( # // Decoding API def encode(self, batch: LlamaBatch): + self._assert_ctx() return_code = llama_cpp.llama_encode( self.ctx, batch.batch, @@ -678,6 +691,7 @@ def decode(self, batch: 'LlamaBatch') -> int: RuntimeError: If a fatal, non-recoverable error occurs during decoding (e.g., negative error codes or invalid batch structures). """ + self._assert_ctx() return_code = llama_cpp.llama_decode(self.ctx, batch.batch) if return_code == 0: @@ -741,21 +755,27 @@ def synchronize(self): llama_cpp.llama_synchronize(self.ctx) def get_logits(self): + self._assert_ctx() return llama_cpp.llama_get_logits(self.ctx) def get_logits_ith(self, i: int): + self._assert_ctx() return llama_cpp.llama_get_logits_ith(self.ctx, i) def set_embeddings(self, embeddings: bool): + self._assert_ctx() llama_cpp.llama_set_embeddings(self.ctx, embeddings) def get_embeddings(self): + self._assert_ctx() return llama_cpp.llama_get_embeddings(self.ctx) def get_embeddings_ith(self, i: int): + self._assert_ctx() return llama_cpp.llama_get_embeddings_ith(self.ctx, i) def get_embeddings_seq(self, seq_id: int): + self._assert_ctx() return llama_cpp.llama_get_embeddings_seq(self.ctx, seq_id) def reset_timings(self): From a4c8d77d1817bcf7255b8e93aa4733e5378d9211 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 12:29:48 +0800 Subject: [PATCH 418/518] ci(cu131+linux): build CU131 wheels with GGML dynamic backends for linux - Replace the old CPU/AVX release tag matrix with a single CU131 backend wheel layout. - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Linux wheels ship runtime-loadable GGML backend DLLs and CPU variant backends. - Disable non-wheel targets such as examples, tests, tools, server, embedded UI, and curl. - Remove the `.basic` style local version suffix and publish wheels as `+cu131`. 
- Update CUDA architectures to CUDA 13.1 and simplify CMake argument handling. Signed-off-by: JamePeng --- .../workflows/build-wheels-cu130-linux.yml | 132 --------------- .../workflows/build-wheels-cu131-linux.yml | 156 ++++++++++++++++++ 2 files changed, 156 insertions(+), 132 deletions(-) delete mode 100644 .github/workflows/build-wheels-cu130-linux.yml create mode 100644 .github/workflows/build-wheels-cu131-linux.yml diff --git a/.github/workflows/build-wheels-cu130-linux.yml b/.github/workflows/build-wheels-cu130-linux.yml deleted file mode 100644 index 4f4305ad3e..0000000000 --- a/.github/workflows/build-wheels-cu130-linux.yml +++ /dev/null @@ -1,132 +0,0 @@ -name: Build Wheels(CU130) for Linux - -on: - workflow_dispatch: # Manual trigger - -permissions: - contents: write - -jobs: - build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} - runs-on: ubuntu-22.04 - container: nvidia/cuda:13.0.2-cudnn-devel-ubuntu22.04 - strategy: - matrix: # Define the build matrix directly here - os: ["ubuntu-22.04"] - pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions - cuda: ["13.0.2"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc - - defaults: - run: - shell: bash - - env: - CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} - CUDAARCHVER: ${{ matrix.cudaarch }} - - steps: - - name: Install dependencies - run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code - with: - submodules: "recursive" - - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version - uses: astral-sh/setup-uv@v7 - with: - python-version: ${{ matrix.pyver }} - activate-environment: true - enable-cache: true - - - run: nvcc -V - - - name: 
Build Wheel With Cmake # Main build step: configures and builds the wheel - env: - LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR - run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel - - # --- Post-build steps to get info 
for rename wheel file and release tag --- - - cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') - - wheel_path=$(ls dist/*.whl | head -n 1) - filename=$(basename "$wheel_path") - - # Split wheel filename - IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" - new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - - # Rename wheel file - mv "$wheel_path" "dist/$new_filename" - echo "Renamed wheel to: $new_filename" - - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step - - - name: Get Current Date # Step to get current date for the release tag - id: get-date - run: | - # Get date in YYYYMMDD format using bash date command - currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV - - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release - with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
- env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file diff --git a/.github/workflows/build-wheels-cu131-linux.yml b/.github/workflows/build-wheels-cu131-linux.yml new file mode 100644 index 0000000000..d70f8a01c8 --- /dev/null +++ b/.github/workflows/build-wheels-cu131-linux.yml @@ -0,0 +1,156 @@ +name: Build Wheels (CU131) for Linux + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu131 + runs-on: ubuntu-22.04 + container: nvidia/cuda:13.1.2-cudnn-devel-ubuntu22.04 + + strategy: + fail-fast: false + matrix: + os: ["ubuntu-22.04"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions + cuda: ["13.1.2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real;121-real"] + + defaults: + run: + shell: bash + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Install dependencies + run: | + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Show CUDA version + run: nvcc -V + + - name: Build wheel + env: + LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" + run: | + set -euo pipefail + + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true + + cuda_ver_short=$(echo "${CUDAVER}" | cut 
-d'.' -f 1,2 | sed 's/\.//g') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. + "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi + + wheel_path=$(ls dist/*.whl | head -n 1) + filename=$(basename "$wheel_path") + + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl + IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" + + # CPU all-variants is now an internal runtime layout detail. 
+ new_version="${version}+cu${cuda_ver_short}" + new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" + + mv "$wheel_path" "dist/$new_filename" + echo "Renamed wheel to: $new_filename" + + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" + + - name: Get current date + id: get-date + run: | + currentDate=$(date +%Y%m%d) + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 42891efb70bfd8ac6dd438e99f6c2a1b4119299d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 19:39:57 +0800 Subject: [PATCH 419/518] ci(cu12+linux): build CU124/126/128 wheels with GGML dynamic backends for linux - Replace the old CPU/AVX release tag matrix with a single CU124/126/128 backend wheel layout. - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Linux wheels ship runtime-loadable GGML backend DLLs and CPU variant backends. - Disable non-wheel targets such as examples, tests, tools, server, embedded UI, and curl. - Remove the `.basic` style local version suffix and publish wheels as `+cu124/cu126/cu128`. 
Signed-off-by: JamePeng --- .../workflows/build-wheels-cu124-linux.yml | 170 ++++++++++-------- .../workflows/build-wheels-cu126-linux.yml | 170 ++++++++++-------- .../workflows/build-wheels-cu128-linux.yml | 170 ++++++++++-------- 3 files changed, 291 insertions(+), 219 deletions(-) diff --git a/.github/workflows/build-wheels-cu124-linux.yml b/.github/workflows/build-wheels-cu124-linux.yml index 889a1679a4..d7a3a90d81 100644 --- a/.github/workflows/build-wheels-cu124-linux.yml +++ b/.github/workflows/build-wheels-cu124-linux.yml @@ -1,23 +1,24 @@ -name: Build Wheels(CU124) for Linux # Workflow name +name: Build Wheels (CU124) for Linux on: - workflow_dispatch: # Manual trigger + workflow_dispatch: permissions: contents: write jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu124 runs-on: ubuntu-22.04 container: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + strategy: - matrix: # Define the build matrix directly here + fail-fast: false + matrix: os: ["ubuntu-22.04"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.4.1"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc + cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real"] defaults: run: @@ -25,108 +26,131 @@ jobs: env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 steps: - name: Install dependencies run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: 
actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - run: nvcc -V + - name: Show CUDA version + run: nvcc -V - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + - name: Build wheel env: LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} 
-DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + set -euo pipefail - # --- Post-build steps to get info for rename wheel file and release tag --- + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. 
+ "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi wheel_path=$(ls dist/*.whl | head -n 1) filename=$(basename "$wheel_path") - # Split wheel filename + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + # CPU all-variants is now an internal runtime layout detail. + new_version="${version}+cu${cuda_ver_short}" new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - # Rename wheel file mv "$wheel_path" "dist/$new_filename" echo "Renamed wheel to: $new_filename" - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" - - name: Get Current Date # Step to get current date for the release tag + - name: Get current date id: get-date run: | - # Get date in YYYYMMDD format using bash date command currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ 
env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu126-linux.yml b/.github/workflows/build-wheels-cu126-linux.yml index 568824c642..9f28a57ca2 100644 --- a/.github/workflows/build-wheels-cu126-linux.yml +++ b/.github/workflows/build-wheels-cu126-linux.yml @@ -1,23 +1,24 @@ -name: Build Wheels(CU126) for Linux # Workflow name +name: Build Wheels (CU126) for Linux on: - workflow_dispatch: # Manual trigger + workflow_dispatch: permissions: contents: write jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu126 runs-on: ubuntu-22.04 container: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 + strategy: - matrix: # Define the build matrix directly here + fail-fast: false + matrix: os: ["ubuntu-22.04"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.6.3"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc + cudaarch: ["70-real;75-real;80-real;86-real;87-real;89-real"] defaults: run: @@ -25,108 +26,131 @@ jobs: env: CUDAVER: ${{ matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 steps: - name: Install dependencies run: | - apt update - apt install -y build-essential ccache cmake 
curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - run: nvcc -V + - name: Show CUDA version + run: nvcc -V - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + - name: Build wheel env: LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='70-real;75-real;80-real;86-real;87-real;89-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on 
-DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + set -euo pipefail - # --- Post-build steps to get info for rename wheel file and release tag --- + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. + "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. 
+ "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. + "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi wheel_path=$(ls dist/*.whl | head -n 1) filename=$(basename "$wheel_path") - # Split wheel filename + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + # CPU all-variants is now an internal runtime layout detail. + new_version="${version}+cu${cuda_ver_short}" new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - # Rename wheel file mv "$wheel_path" "dist/$new_filename" echo "Renamed wheel to: $new_filename" - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" - - name: Get Current Date # Step to get current date for the release tag + - name: Get current date id: get-date run: | - # Get date in YYYYMMDD format using bash date command currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 with: - files: 
dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. + files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-cu128-linux.yml b/.github/workflows/build-wheels-cu128-linux.yml index d1c387c52a..c6b255c9f9 100644 --- a/.github/workflows/build-wheels-cu128-linux.yml +++ b/.github/workflows/build-wheels-cu128-linux.yml @@ -1,23 +1,24 @@ -name: Build Wheels(CU128) for Linux # Workflow name +name: Build Wheels (CU128) for Linux on: - workflow_dispatch: # Manual trigger + workflow_dispatch: permissions: contents: write jobs: build_wheels: - name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu128 runs-on: ubuntu-22.04 container: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + strategy: - matrix: # Define the build matrix directly here + fail-fast: false + matrix: os: ["ubuntu-22.04"] pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] # Python versions cuda: ["12.8.1"] - releasetag: ["Basic"] # Controls CMAKE_ARGS for CPU features (even in CUDA build) - cudaarch: ["all"] # Controls target CUDA architectures for nvcc + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] defaults: run: @@ -25,108 +26,131 @@ jobs: env: CUDAVER: ${{ 
matrix.cuda }} - AVXVER: ${{ matrix.releasetag }} CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 steps: - name: Install dependencies run: | - apt update - apt install -y build-essential ccache cmake curl git libgomp1 libjpeg-dev libssl-dev - - - uses: actions/checkout@v6 # Checkout code + apt update + apt install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libgomp1 \ + libjpeg-dev \ + libssl-dev \ + ninja-build + + - name: Checkout + uses: actions/checkout@v6 with: - submodules: "recursive" + submodules: recursive - # from astral-sh/setup-uv - - name: Install the latest version of uv and set the python version + - name: Install uv and Python ${{ matrix.pyver }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.pyver }} activate-environment: true enable-cache: true - - run: nvcc -V + - name: Show CUDA version + run: nvcc -V - - name: Build Wheel With Cmake # Main build step: configures and builds the wheel + - name: Build wheel env: LD_LIBRARY_PATH: "/usr/local/cuda/lib64:/usr/local/cuda/compat:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - VERBOSE: 1 # Enable verbose build output - CUDA_HOME: "/usr/local/cuda/" # Set CUDA_HOME - CUDA_PATH: "${PATH}" - CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda/" # Set CUDA_TOOLKIT_ROOT_DIR + VERBOSE: "1" + CUDA_HOME: "/usr/local/cuda" + CUDA_PATH: "/usr/local/cuda" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" run: | - echo "VERBOSE=1" >> $GITHUB_ENV # Enable verbose build output for troubleshooting - find /usr/ -name 'libcuda.so.*' - find /usr/ -name 'libcudart.so.*' - echo $LD_LIBRARY_PATH - - # Add project-specific and feature flags - CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES='75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;120-real'" - CMAKE_ARGS="-DGGML_CUDA_FORCE_MMQ=on ${CMAKE_ARGS}" - CMAKE_ARGS="${CMAKE_ARGS} -DLLAMA_CURL=off -DLLAMA_OPENSSL=on" - - if [ "${AVXVER}" = "AVX" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off 
-DGGML_FMA=off -DGGML_F16C=off" - fi - if [ "${AVXVER}" = "AVX2" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off" - fi - if [ "${AVXVER}" = "AVXVNNI" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX_VNNI=on" - fi - # if [ "${AVXVER}" = "AVX512" ]; then - # CMAKE_ARGS="${CMAKE_ARGS} -DGGML_AVX512=on" - # fi - # Basic options for compiling without AVX instructions - if [ "${AVXVER}" = "Basic" ]; then - CMAKE_ARGS="${CMAKE_ARGS} -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX_VNNI=off -DGGML_AVX512=off -DGGML_AVX512_VBMI=off -DGGML_AVX512_VNNI=off -DGGML_AVX512_BF16=off -DGGML_FMA=off -DGGML_F16C=off" - fi - - # Export CMAKE_ARGS environment variable so the python -m build command can use it - echo ${CMAKE_ARGS} - echo "CMAKE_ARGS=${CMAKE_ARGS}" >> $GITHUB_ENV - - # Run the Python build command to generate the wheel - uv pip install build setuptools wheel packaging - CMAKE_ARGS=${CMAKE_ARGS} uv build --wheel + set -euo pipefail - # --- Post-build steps to get info for rename wheel file and release tag --- + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + find /usr/ -name 'libcuda.so.*' || true + find /usr/ -name 'libcudart.so.*' || true cuda_ver_short=$(echo "${CUDAVER}" | cut -d'.' -f 1,2 | sed 's/\.//g') - avx_ver=$(echo "${AVXVER}" | tr '[:upper:]' '[:lower:]') + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend shared libraries. + # - GGML_CPU_ALL_VARIANTS builds CPU variant backends when supported. + # - GGML_NATIVE=OFF avoids binding the wheel to the CI runner CPU. + CMAKE_ARGS_ARRAY=( + "-G Ninja" + + # Disable non-wheel targets. + "-DLLAMA_BUILD_EXAMPLES=OFF" + "-DLLAMA_BUILD_TESTS=OFF" + "-DLLAMA_BUILD_TOOLS=OFF" + "-DLLAMA_BUILD_SERVER=OFF" + "-DLLAMA_BUILD_UI=OFF" + "-DLLAMA_USE_PREBUILT_UI=OFF" + "-DLLAMA_CURL=OFF" + "-DLLAMA_OPENSSL=ON" + + # GGML dynamic backend layout. 
+ "-DGGML_CPU=ON" + "-DGGML_CUDA=ON" + "-DGGML_NATIVE=OFF" + "-DGGML_BACKEND_DL=ON" + "-DGGML_CPU_ALL_VARIANTS=ON" + "-DGGML_OPENMP=ON" + + # CUDA backend. + "-DCMAKE_CUDA_ARCHITECTURES=${CUDAARCHVER}" + "-DGGML_CUDA_FORCE_MMQ=ON" + "-DCUDA_SEPARABLE_COMPILATION=ON" + "-DCMAKE_CUDA_FLAGS=--diag-suppress=177,221,550" + + # Build behavior. + "-DCMAKE_BUILD_PARALLEL_LEVEL=${MAX_JOBS}" + "-DGGML_CCACHE=ON" + "-DENABLE_CCACHE=ON" + ) + + CMAKE_ARGS="${CMAKE_ARGS_ARRAY[*]}" + echo "CMAKE_ARGS=${CMAKE_ARGS}" + + uv pip install --upgrade build setuptools wheel packaging + CMAKE_ARGS="${CMAKE_ARGS}" uv build --wheel + + if ! ls dist/*.whl >/dev/null 2>&1; then + echo "No wheel built in dist/ directory" + exit 1 + fi wheel_path=$(ls dist/*.whl | head -n 1) filename=$(basename "$wheel_path") - # Split wheel filename + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl IFS='-' read -r dist_name version py_tag abi_tag plat_tag <<< "$filename" - new_version="${version}+cu${cuda_ver_short}.${avx_ver}" + # CPU all-variants is now an internal runtime layout detail. 
+ new_version="${version}+cu${cuda_ver_short}" new_filename="${dist_name}-${new_version}-${py_tag}-${abi_tag}-${plat_tag}" - # Rename wheel file mv "$wheel_path" "dist/$new_filename" echo "Renamed wheel to: $new_filename" - echo "CUDA_VERSION=$cuda_ver_short" >> $GITHUB_ENV # Store short CUDA version in env - echo "TAG_VERSION=$version" >> $GITHUB_ENV # Store version in env for release step + echo "CUDA_VERSION=$cuda_ver_short" >> "$GITHUB_ENV" + echo "TAG_VERSION=$version" >> "$GITHUB_ENV" - - name: Get Current Date # Step to get current date for the release tag + - name: Get current date id: get-date run: | - # Get date in YYYYMMDD format using bash date command currentDate=$(date +%Y%m%d) - # Store the date in environment variable for the release step - echo "BUILD_DATE=$currentDate" >> $GITHUB_ENV + echo "BUILD_DATE=$currentDate" >> "$GITHUB_ENV" - - uses: softprops/action-gh-release@v3 # Action to create a GitHub Release + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 with: - files: dist/* # Upload the generated wheel files from the dist directory - # Define the release tag name using the collected environment variables - # Format: v-cu--linux- - tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-${{ env.AVXVER }}-linux-${{ env.BUILD_DATE }} # Release tag format for Linux - # Note: This action will create a new release tag if it doesn't exist, - # or upload assets to an existing tag. Be mindful of potential tag name conflicts. 
+ files: dist/* + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-linux-${{ env.BUILD_DATE }} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the secret provided by GitHub Actions for authentication \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 6e32ef0171b075605fa527181b924a816a237b77 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 19:42:56 +0800 Subject: [PATCH 420/518] ci: Remove outdated workflow Signed-off-by: JamePeng --- .github/workflows/build-and-release.yaml | 145 ----------------------- .github/workflows/build-docker.yaml | 50 -------- 2 files changed, 195 deletions(-) delete mode 100644 .github/workflows/build-and-release.yaml delete mode 100644 .github/workflows/build-docker.yaml diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml deleted file mode 100644 index 7eaf017fbc..0000000000 --- a/.github/workflows/build-and-release.yaml +++ /dev/null @@ -1,145 +0,0 @@ -name: Build Release - -on: workflow_dispatch - -permissions: - contents: write - -jobs: - build_wheels: - name: Build wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-22.04, windows-2022, macos-14, macos-15] - - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - # Used to host cibuildwheel - - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - - name: Install dependencies (Linux/MacOS) - if: runner.os != 'Windows' - run: | - python -m pip install --upgrade pip - python -m pip install uv - RUST_LOG=trace python -m uv pip install -e .[all] --verbose - shell: bash - - - name: Install dependencies (Windows) - if: runner.os == 'Windows' - env: - RUST_LOG: trace - run: | - python -m pip install --upgrade pip - python -m pip install uv - python -m uv pip install -e .[all] --verbose - shell: cmd - - - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 - env: - # disable repair - CIBW_REPAIR_WHEEL_COMMAND: "" - 
with: - package-dir: . - output-dir: wheelhouse - - - uses: actions/upload-artifact@v4 - with: - name: wheels-${{ matrix.os }} - path: ./wheelhouse/*.whl - - build_wheels_arm64: - name: Build arm64 wheels - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 - env: - CIBW_SKIP: "*musllinux* pp*" - CIBW_REPAIR_WHEEL_COMMAND: "" - CIBW_ARCHS: "aarch64" - CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DCMAKE_CROSSCOMPILING=ON" - CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" - with: - output-dir: wheelhouse - - - name: Upload wheels as artifacts - uses: actions/upload-artifact@v4 - with: - name: wheels_arm64 - path: ./wheelhouse/*.whl - - build_sdist: - name: Build source distribution - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - - name: Install dependencies (Linux/MacOS) - if: runner.os != 'Windows' - run: | - python -m pip install --upgrade pip - python -m pip install uv - RUST_LOG=trace python -m uv pip install -e .[all] --verbose - python -m uv pip install build - shell: bash - - - name: Install dependencies (Windows) - if: runner.os == 'Windows' - env: - RUST_LOG: trace - run: | - python -m pip install --upgrade pip - python -m pip install uv - python -m uv pip install -e .[all] --verbose - python -m uv pip install build - shell: cmd - - - name: Build source distribution - run: | - python -m build --sdist - - - uses: actions/upload-artifact@v4 - with: - name: sdist - path: ./dist/*.tar.gz - - release: - name: Release - needs: [build_wheels, build_wheels_arm64, build_sdist] - runs-on: ubuntu-latest - - steps: - - uses: actions/download-artifact@v4 - with: - merge-multiple: true - path: dist 
- - - uses: softprops/action-gh-release@v2 - with: - files: dist/* - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml deleted file mode 100644 index b290f6273f..0000000000 --- a/.github/workflows/build-docker.yaml +++ /dev/null @@ -1,50 +0,0 @@ -name: Build Docker - -on: workflow_dispatch - -permissions: - contents: write - packages: write - -jobs: - docker: - name: Build and push Docker image - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: "recursive" - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push - id: docker_build - uses: docker/build-push-action@v6 - with: - context: . 
- file: "docker/simple/Dockerfile" - push: ${{ startsWith(github.ref, 'refs/tags/') }} - pull: true - platforms: linux/amd64,linux/arm64 - tags: | - ghcr.io/abetlen/llama-cpp-python:latest - ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }} - build-args: | - BUILDKIT_INLINE_CACHE=1 - - - name: Publish to GitHub Tag - if: steps.docker_build.outputs.digest && startsWith(github.ref, 'refs/tags/') - run: | - echo "Docker image published for tag: ${{ github.ref_name }}" From b8d69f29e71b74973a7fd295e32d9e9d86908f5d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 22:11:42 +0800 Subject: [PATCH 421/518] Update .gitmodules submodule git addr Signed-off-by: JamePeng --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 7edf0975dc..f56cca32df 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git + url = https://github.com/ggml-org/llama.cpp.git From ec580cb1c3b0ef946523979dca63df5c2d0483cc Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 22:23:49 +0800 Subject: [PATCH 422/518] chore(docs): remove outdated mkdocs workflow - Transition documentation focus to the repository wiki and /docs/wiki. - Clean up and remove all unnecessary mkdocs-related configuration files. 
Signed-off-by: JamePeng --- .readthedocs.yaml | 24 ------------ docs/api-reference.md | 88 ------------------------------------------- docs/changelog.md | 1 - docs/index.md | 5 --- docs/requirements.txt | 3 -- mkdocs.yml | 74 ------------------------------------ pyproject.toml | 14 ++----- 7 files changed, 3 insertions(+), 206 deletions(-) delete mode 100644 .readthedocs.yaml delete mode 100644 docs/api-reference.md delete mode 100644 docs/changelog.md delete mode 100644 docs/index.md delete mode 100644 docs/requirements.txt delete mode 100644 mkdocs.yml diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index ff3e950cd1..0000000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Read the Docs configuration file for MkDocs projects -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-22.04 - tools: - python: "3.11" - -mkdocs: - configuration: mkdocs.yml - -python: - install: - - method: pip - path: . - - requirements: docs/requirements.txt - -submodules: - include: all - recursive: true \ No newline at end of file diff --git a/docs/api-reference.md b/docs/api-reference.md deleted file mode 100644 index ab51ef754e..0000000000 --- a/docs/api-reference.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -title: API Reference ---- - -## High Level API - -High-level Python bindings for llama.cpp. 
- -::: llama_cpp.Llama - options: - members: - - __init__ - - tokenize - - detokenize - - reset - - eval - - sample - - generate - - create_embedding - - embed - - create_completion - - __call__ - - create_chat_completion - - create_chat_completion_openai_v1 - - set_cache - - save_state - - load_state - - token_bos - - token_eos - - from_pretrained - show_root_heading: true - -::: llama_cpp.LlamaGrammar - options: - members: - - from_string - - from_json_schema - -::: llama_cpp.LlamaCache - options: - show_root_heading: true - -::: llama_cpp.LlamaState - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessor - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessorList - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteria - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteriaList - options: - show_root_heading: true - -## Low Level API - -Low-level Python bindings for llama.cpp using Python's ctypes library. - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true - # filter only members starting with `llama_` - filters: - - "^llama_" - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - # filter only members starting with `LLAMA_` - filters: - - "^LLAMA_" - -## Misc - -::: llama_cpp.llama_types - options: - show_if_no_docstring: true \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 100644 index 047bc14424..0000000000 --- a/docs/changelog.md +++ /dev/null @@ -1 +0,0 @@ --8<- "CHANGELOG.md" \ No newline at end of file diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 60bc7aef42..0000000000 --- a/docs/index.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Getting Started ---- - --8<- "README.md" \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 199bd4ffbf..0000000000 --- 
a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mkdocs -mkdocs-material -mkdocstrings[python] \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index 79a9e67a1a..0000000000 --- a/mkdocs.yml +++ /dev/null @@ -1,74 +0,0 @@ -site_name: llama-cpp-python -repo_url: https://github.com/abetlen/llama-cpp-python - -theme: - name: material - palette: - - # Palette toggle for light mode - - scheme: default - primary: indigo - toggle: - icon: material/brightness-7 - name: Switch to dark mode - - # Palette toggle for dark mode - - scheme: slate - primary: indigo - toggle: - icon: material/brightness-4 - name: Switch to light mode - -plugins: - - search - - mkdocstrings: - handlers: - python: - options: - members_order: source - group_by_category: false - signature_crossrefs: true - show_signature: true - docstring_section_style: list - show_root_heading: true - heading_level: 3 - preload_modules: - - typing - - typing_extensions - - ctypes - import: - - https://docs.python.org/3/objects.inv - - https://numpy.org/doc/stable/objects.inv - -watch: - - llama_cpp - - README.md - -nav: - - "Getting Started": "index.md" - - "Installation Guides": - - "macOS (Metal)": "install/macos.md" - - "API Reference": "api-reference.md" - - "OpenAI Compatible Web Server": "server.md" - - "Changelog": "changelog.md" - -markdown_extensions: - - attr_list - - pymdownx.emoji: - emoji_index: !!python/name:materialx.emoji.twemoji - emoji_generator: !!python/name:materialx.emoji.to_svg - - pymdownx.highlight: - anchor_linenums: true - line_spans: __span - pygments_lang_class: true - - pymdownx.inlinehilite - - pymdownx.magiclink: - repo_url_shorthand: true - user: abetlen - repo: llama-cpp-python - - pymdownx.snippets - - pymdownx.superfences - - pymdownx.tabbed: - alternate_style: true - - pymdownx.tilde - - tables diff --git a/pyproject.toml b/pyproject.toml index 2e439c0685..eb4b879dd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,17 +49,9 @@ 
test = [ "pydantic-settings>=2.0.1", "huggingface-hub>=0.23.0" ] -dev = [ - "black>=23.3.0", - "twine>=4.0.2", - "mkdocs>=1.4.3", - "mkdocstrings[python]>=0.22.0", - "mkdocs-material>=9.1.18", - "pytest>=7.4.0", - "httpx>=0.24.1", -] + all = [ - "llama_cpp_python[server,test,dev]", + "llama_cpp_python[server,test]", ] [tool.scikit-build] @@ -76,7 +68,7 @@ input = "llama_cpp/__init__.py" [project.urls] Homepage = "https://github.com/JamePeng/llama-cpp-python" Issues = "https://github.com/JamePeng/llama-cpp-python/issues" -Documentation = "https://llama-cpp-python.readthedocs.io/en/latest/" +Documentation = "https://github.com/JamePeng/llama-cpp-python/wiki" Changelog = "https://github.com/JamePeng/llama-cpp-python/blob/main/CHANGELOG.md" FAQ = "https://github.com/JamePeng/llama-cpp-python?tab=readme-ov-file#faq" From d33d98806f1e55219a9a9bfde9557b06e8f16b01 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 22:28:23 +0800 Subject: [PATCH 423/518] Update Submodule vendor/llama.cpp b64739e..39cf5d6 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b64739ea39..39cf5d6191 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b64739ea393b3c9d07cc9907e0a611f707838051 +Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee From e87041e4ee6a89798abe9f36315f60f3fb06c5cb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 22:57:56 +0800 Subject: [PATCH 424/518] docs(readme): update wheel requirements and dynamic CPU backend info - Update supported CUDA versions to include 12.8 and 13.1, while outlining the supported compute architectures (SM70 up to SM120a). - Document the transition to `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` starting in `0.3.39-preview`. - Clarify that dynamic CPU backend loading eliminates the need for separate `Basic` and `AVX2` wheel distributions. 
- Add a technical note in the FAQ recommending LLVM/Clang over MSVC for achieving full x64 CPU variant coverage on Windows. Signed-off-by: JamePeng --- README.md | 54 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index caec7e32e0..4a1550a85c 100644 --- a/README.md +++ b/README.md @@ -162,12 +162,41 @@ pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python **Pre-built Wheel (New)** -It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements: +It is also possible to install a pre-built wheel with CUDA support. Make sure your system meets the following requirements: -- CUDA Version is 12.4, 12.6, 12.8 or 13.0 -- Python Version is 3.10, 3.11, 3.12, 3.13 or 3.14 -- Basic version(Default): A version compiled without using AVX instructions (for compatibility with CPU platforms lacking AVX instructions or with AVX instruction compatibility issues). -- AVX2 version: A version compiled using AVX2 instructions. +- CUDA version: 12.4, 12.6, 12.8, or 13.1 +- Python version: 3.10, 3.11, 3.12, 3.13, or 3.14 +- Starting with `0.3.39-preview`, Windows and Linux x64 wheels are built with `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS`. + +This means CPU backends are shipped as dynamically loaded runtime libraries under: + +```text +site-packages/llama_cpp/lib +```` + +Supported CPU backend variants may include: + +* `ggml-cpu-x64` +* `ggml-cpu-sse42` +* `ggml-cpu-sandybridge` +* `ggml-cpu-ivybridge` +* `ggml-cpu-piledriver` +* `ggml-cpu-haswell` +* `ggml-cpu-skylakex` +* `ggml-cpu-cannonlake` +* `ggml-cpu-cascadelake` +* `ggml-cpu-cooperlake` +* `ggml-cpu-icelake` +* `ggml-cpu-alderlake` +* `ggml-cpu-sapphirerapids` +* `ggml-cpu-zen4` + +The old `Basic` and `AVX2` wheel variants are no longer required for the new dynamic-backend wheels. 
GGML can load the compatible CPU backend at runtime, which improves CPU instruction-set compatibility across different x64 machines. + +Before `0.3.39-preview`: + +* `Basic`: compiled without AVX instructions for maximum compatibility. +* `AVX2`: compiled with AVX2 instructions for newer CPUs. Check the releases page: https://github.com/JamePeng/llama-cpp-python/releases @@ -1695,17 +1724,20 @@ This error is primarily caused by the following reasons: 3. **CUDA Version Mismatch:** Regarding `ggml-cuda.dll`, the CUDA version of the pre-compiled library does not match your local CUDA Toolkit version (e.g., a mismatch between CUDA 12.X and CUDA 13.X). It is recommended to fully configure your local CUDA Toolkit environment (ensuring the PATH for dynamic libraries is set and the nvcc compiler is recognized). Then, clone the code and compile it locally. -### Why are libraries compiled by other authors only around 100MB, while your pre-compiled versions range from 300MB to 900MB? +### Why are libraries compiled by other authors only around 100MB, while your pre-compiled versions are 300MB or larger? -My GitHub Actions script is configured to compile against **all supported CUDA compute architectures** for each specific CUDA version I maintain. +My GitHub Actions workflow is configured to compile against multiple supported CUDA compute architectures for each CUDA version I maintain. For example: -* **CUDA 13.0.2:** Currently supports architectures from SM75 (Turing) up to SM120a (Blackwell). -* **CUDA 12.4.1 and 12.6.3:** Support older architectures as well, such as SM70. -* *(Note: The Windows versions are built to support every architecture compatible with the respective CUDA version).* +- **CUDA 13.1 and CUDA 12.8:** currently target architectures from SM75 (Turing) up to SM120a / SM121a (Blackwell generation, depending on CUDA support). +- **CUDA 12.4 and CUDA 12.6:** currently target architectures from SM70 (Volta) up to SM90 (Hopper). 
+ +Libraries from other authors are often smaller because they may only compile for a single architecture, such as RTX 30 series (`SM86`) or RTX 40 series (`SM89`). To maximize compatibility, these wheels include CUDA kernels for a wider range of GPUs. You only need to choose the wheel that matches your installed CUDA version. + + - **Updated 2026-05-16 / 2026-05-17:** Starting with `0.3.39-preview`, Windows wheels support the `GGML_BACKEND_DL` + `GGML_CPU_ALL_VARIANTS` runtime layout. CPU backend libraries such as `ggml-cpu-*.dll` are packaged under `site-packages/llama_cpp/lib` and loaded dynamically at runtime. This allows GGML to select a compatible CPU backend automatically, reducing the need for separate `Basic` / `AVX2` wheel variants. -The reason libraries from other authors are smaller is that they often **only compile for a single architecture** (e.g., targeting only the RTX 30 series [SM86] or the RTX 40 series [SM89]). To maximize convenience, I provide an **integrated compilation** covering a wide range of hardware; you simply need to select the CUDA version that matches your environment to load and run it. + - Note: for full x64 CPU variant coverage on Windows, LLVM/Clang builds are preferred. MSVC may skip some variants such as `zen4`, `cooperlake`, or `sapphirerapids` due to compiler intrinsic support limitations. 
### Quick tips for develop/user (continuously updated): From a778c57d73ec7d4f43e2518a513e7d4cf68a0df8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 17 May 2026 23:35:57 +0800 Subject: [PATCH 425/518] Bump version to 0.3.39 Signed-off-by: JamePeng --- CHANGELOG.md | 116 ++++++++++++++++++++++++++++++++++++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 117 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 253b2ae4cc..e4c6e4c976 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,122 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.39] Dynamic GGML Backends, Qwen3-ASR/MiniCPM-V-4.6, On-Device Hybrid Checkpoint, and Granular Logging + +- **ci(cu131/128/126/124): build wheels with GGML dynamic backends for windows/Linux** + - Replace the old CPU/AVX release tag matrix with a single backend + wheel layout. + - Enable `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` so Windows wheels ship + runtime-loadable GGML backend DLLs and CPU variant backends. + - Use the Windows LLVM toolchain and disable non-wheel targets such as examples, + tests, tools, server, embedded UI, and curl. + - Remove the `.basic` style local version suffix and publish wheels + as `+cu131`. + - Update CUDA architectures to CUDA 13.1 and simplify CMake argument handling. + - Note: for full x64 CPU variant coverage on Windows, LLVM/Clang builds are preferred. MSVC may skip some variants such as zen4, cooperlake, or sapphirerapids due to compiler intrinsic support limitations. + +- **feat(core): support loading GGML_BACKEND_DL dynamic backend libraries from wheel lib** + - Import `ggml_backend_load_all_from_path` and `ggml_backend_reg_count` + from `_ggml`. + - Load dynamic ggml backend libraries from the packaged `llama_cpp/lib` + directory after `llama_backend_init()`. 
+ - Support wheels built with `GGML_BACKEND_DL`, where CPU variants and + accelerator backends such as `ggml-cpu-*` and `ggml-cuda` are shipped as + separate runtime libraries. + - Print the registered backend count in verbose mode to help diagnose backend + discovery issues. + +- **build(cmake): refactor install target lists for new GGML backend layout** + - Categorize build targets into logical groups (`LLAMA_CPP_TARGETS`, + `GGML_CORE_TARGETS`, `GGML_CPU_VARIANT_TARGETS`, and `GGML_BACKEND_TARGETS`) + to improve maintainability and keep the Python package installation in sync + with the updated upstream GGML backend layout. + - Add missing targets such as `llama-common` and the separated + `ggml-cpu-*` CPU variant backends. + - Ensure all grouped targets are passed through `llama_cpp_python_install_target`. + - Update llama build option descriptions to match the current upstream naming style. + - Explicitly disable `LLAMA_BUILD_SERVER` to avoid building the server target for Python package wheels. + - Explicitly disable `LLAMA_BUILD_UI` and `LLAMA_USE_PREBUILT_UI` because the + embedded server Web UI is not needed for wheel builds. + - Keep examples, tests, and curl support disabled for minimal wheel artifacts. + - Add a cleanup function to strip `cmake`, `pkgconfig`, and import libraries from the python wheel runtime directories. + - Ensures Windows builds only package the required runtime DLLs. + +- **Implement Qwen3ASRChatHandler for Qwen3-ASR models.** + - Integrate MTMD multimodal logic to extract and inject `audio_url` and base64 `input_audio` data directly into the `<|audio_start|><|audio_pad|>[DATA]<|audio_end|>` sequence. + - Define a default multilingual transcription system prompt and configure model-specific stop tokens. + - docs(README.md): add Qwen3-ASR documentation and usage example + - Update the supported multi-modal models table to include `qwen3-asr` and the `Qwen3ASRChatHandler`. 
+ - Add a new dedicated section for Speech-to-Text inference with a complete, collapsible Python script. + - Provide a `build_media_payload` helper function to demonstrate proper Base64 encoding of local `.wav` and `.mp3` files into OpenAI-compatible `input_audio` schemas. + - Include a critical warning advising users to use BF16 quantization for the multimodal projector (`mmproj`) to prevent audio degradation. + - Clarify usage mechanics, specifically that all instructions must be placed in the `system` role due to the ASR template's text-dropping behavior. + +- **Implement MiniCPMV46ChatHandler for MiniCPM-V-4.6** + +- **feat(core): integrate fine-grained logging API into Llama class** + - This commit exposes the newly refactored `_logger` configuration system directly through the `Llama` class, providing users with robust, programmatic control over native `llama.cpp` backend logs. + - docs(wiki): document runtime verbosity and log filters for Llama + - docs(Llama.md): update verbose=False vs. verbosity=0 note + - Key changes: + - Expand `Llama.__init__` with `verbosity`, `log_filters`, and `log_filters_case_sensitive` parameters. + - Add instance methods for runtime log management (`set_verbosity`, `get_verbosity`, `set_log_filters`, `add_log_filters`, `clear_log_filters`, etc.). + - Add comprehensive docstrings explaining the 0-5 verbosity scale and explicitly noting the process-global nature of the native backend logger. + - Advantages over the legacy implementation: + - Granular Control: Replaces the restrictive binary `verbose=True/False` flag (which only toggled between ERROR and DEBUG) with a granular 6-tier scale (output, error, warn, info, trace, debug). + - Dynamic Filtering: Empowers users to actively suppress specific noisy C++ logs using custom substring filters, removing the need for hardcoded internal patches. 
+ - Better Discoverability: Attaches logging controls directly to the `Llama` object, making log management much more accessible and intuitive without requiring users to import internal logger modules. + +- **feat(logger): refactor and enhance ggml logging configuration system** + - Introduce a `LoggerConfig` dataclass to provide fine-grained control over native ggml/llama.cpp runtime logging. + - Align `verbosity` levels (0 to 5) with upstream `llama.cpp` conventions (`common/log.h`). + - Implement a dynamic, configurable substring filtering system, replacing the hardcoded "CUDA Graph" patch with `DEFAULT_LOG_FILTERS`. + - Add comprehensive public APIs for log management: `configure_logging`, `set_verbosity`, `set_quiet`, `set_silent`, `set_log_filters`, and `add_log_filters`. + - Maintain backwards compatibility for the existing `set_verbose(bool)` function. + - Improve the `ggml_log_callback` to correctly handle `GGML_LOG_LEVEL_CONT` by inheriting the verbosity of the preceding log message. + - Route `GGML_LOG_LEVEL_NONE` to `stdout` and all other diagnostic logs to `stderr` by default. + - docs(Logger.md): Upload Logger documentation + +- fix(MTMDChatHandler): correct audio_url content type check and improve variable handling + - Changed condition from `content == "audio_url"` to `content_type == "audio_url"` for proper type-based dispatching. + - Extracted `audio_url` variable for better readability. + - Converted `else` to `elif content_type == "input_audio"` to make the control flow explicit and safer. + +- fix(_internals): Remove unnecessary free operations; models should not be released within the context. + +- **feat(cache): add on-device hybrid checkpoint support** + - Introduce `HybridCheckpointCache` with dual-mode behavior (Host/On-Device). + - Device mode utilizes `LLAMA_STATE_SEQ_FLAGS_ON_DEVICE` to keep tensor + payloads in `llama_context` VRAM, reducing host-device copy overhead. 
+ - Host mode remains the default, preserving full Python-owned rollback history. + - Implement safety guards against stale on-device checkpoint restores and + enforce one active device checkpoint per `seq_id`. + - Unify checkpoint management with shared FIFO eviction. + - Expose `checkpoint_on_device` in `Llama.__init__` and reduce default + `ctx_checkpoints` from 32 to 16. + - Enhance verbose logging and docs to clarify host vs. VRAM ownership + semantics and track memory usage accurately. + - Rename internal `_flag_partial` to `_flags` to support multiple state flags. + - Update /docs/wiki/core/Llama.md for on_device option + - Update /docs/wiki/modules/LlamaCache.md for on_device option + +- docs: Update /docs/wiki and README.md file and remove outdated mkdocs workflow + - docs(readme): update wheel requirements and dynamic CPU backend info + - Update supported CUDA versions to include 12.8 and 13.1, while outlining + the supported compute architectures (SM70 up to SM120a). + - Document the transition to `GGML_BACKEND_DL` and `GGML_CPU_ALL_VARIANTS` + starting in `0.3.39-preview`. + - Clarify that dynamic CPU backend loading eliminates the need for separate + `Basic` and `AVX2` wheel distributions. + - Add a technical note in the FAQ recommending LLVM/Clang over MSVC for + achieving full x64 CPU variant coverage on Windows. 
+ +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/39cf5d61915769124b7efbbfa69c46f19a6363ee](https://github.com/ggml-org/llama.cpp/commit/39cf5d61915769124b7efbbfa69c46f19a6363ee) + +- feat: Sync llama.cpp llama/mtmd/ggml API Binding 20260517 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/ef27f333f367fdc53dc1a729ad8bb6c3c9362514...e87041e4ee6a89798abe9f36315f60f3fb06c5cb + ## [0.3.38] Optimized CJK Detokenization, Sync Grammar Parser, and Patched CUDA Graph Logs - perf: Optimize detokenize buffer sizing for CJK-heavy outputs diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index b32fbfd36e..ec28faae66 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.39-preview" +__version__ = "0.3.39" From a96f2807c3be057650b7bc34173274e1cda68128 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 18 May 2026 21:41:47 +0800 Subject: [PATCH 426/518] ci(metal): upgrade actions/download-artifact@v6 ->v7 - actions/download-artifact@v7 now runs on Node.js 24 Signed-off-by: JamePeng --- .github/workflows/build-wheels-metal.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 40675b4c26..a809909720 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -75,7 +75,7 @@ jobs: uses: actions/checkout@v6 - name: Download artifacts - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v7 with: merge-multiple: true path: dist2 From b48d57a2b4019bbd248c848eefa1442c9e7890cb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 18 May 2026 21:43:37 +0800 Subject: [PATCH 427/518] Update Submodule vendor/llama.cpp 39cf5d6..6db1304 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 
39cf5d6191..6db130445d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee +Subproject commit 6db130445d29b243ee2171efb8cd61b84a1c5322 From f309265b0df3ab2477682db3a959656dcb6d06e6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 19:36:28 +0800 Subject: [PATCH 428/518] build(ci+cu131): bundle LLVM OpenMP runtime for Windows CPU backends - Add a PowerShell step to the Windows CI workflow to locate and copy `libomp140.x86_64.dll` from the Visual Studio redistributables. - Place the runtime DLL into the `llama_cpp\lib` package directory. This ensures that the dynamically loaded `ggml-cpu-*.dll` variants (which are built with LLVM OpenMP on Windows) have their required dependencies packaged in the wheel. Without this, `ggml_backend_load_all_from_path()` can silently fail to load the CPU backends at runtime on end-user machines. Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu131-win.yml | 25 ++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml index 14bea65d19..5f77003a5f 100644 --- a/.github/workflows/build-wheels-cu131-win.yml +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -67,6 +67,31 @@ jobs: echo LIB=%LIB%>>%GITHUB_ENV% echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% + - name: Copy LLVM OpenMP runtime + shell: pwsh + run: | + # GGML CPU all-variant backends are built with LLVM OpenMP on Windows. + # The dynamically loaded ggml-cpu-*.dll files depend on this runtime. + # If it is missing from the wheel, ggml_backend_load_all_from_path() + # may fail to load CPU backend DLLs at runtime. 
+ $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib" + New-Item -ItemType Directory -Force $packageLibDir | Out-Null + + $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" ` + -Recurse ` + -Filter "libomp140.x86_64.dll" ` + -ErrorAction SilentlyContinue | + Where-Object { $_.FullName -match "OpenMP\.LLVM" } | + Select-Object -First 1 + + if (!$omp) { + Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables." + exit 1 + } + + Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force + Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)" + - name: Build wheel run: | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') From dd61687fc6cbabb0885e45d708cc9562d1bd2d53 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 18 May 2026 21:43:37 +0800 Subject: [PATCH 429/518] Update Submodule vendor/llama.cpp 39cf5d6..6db1304 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 39cf5d6191..d14ce3dab4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 39cf5d61915769124b7efbbfa69c46f19a6363ee +Subproject commit d14ce3dab4de197adec5166faa54ac5db8262f26 From 2bc3cdded9285b591454e11e50a4b1524afa32ff Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 22:54:36 +0800 Subject: [PATCH 430/518] build(cmake): package LLVM OpenMP runtime DLL for Windows wheels Dynamically loaded GGML CPU backends compiled with LLVM/Clang and OpenMP require `libomp140.x86_64.dll` at runtime. - Add `llama_cpp_python_install_windows_runtime_file` to handle installing arbitrary extra DLLs with proper CMake path normalization. 
- Add `llama_cpp_python_install_windows_openmp_runtime` to automatically locate the OpenMP DLL in common Visual Studio 2022 directories, with an override available via `LLAMA_CPP_OPENMP_RUNTIME_DLL`. - Execute the OpenMP runtime installation before the dev-file cleanup step to ensure the DLL is correctly packaged in the final wheel. Signed-off-by: JamePeng --- CMakeLists.txt | 105 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e5d583d90..f6dfb7c136 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,106 @@ function(llama_cpp_python_install_target target) endfunction() +# Install an extra Windows runtime DLL into the Python package runtime directory. +# +# Some dynamically loaded backend libraries depend on runtime DLLs that are not +# always discoverable through $. One important example +# is libomp140.x86_64.dll, required by LLVM OpenMP CPU backend variants. +function(llama_cpp_python_install_windows_runtime_file runtime_file) + if(NOT WIN32) + return() + endif() + + if(NOT runtime_file) + return() + endif() + + if(NOT EXISTS "${runtime_file}") + message(WARNING "Windows runtime file does not exist and will not be installed: ${runtime_file}") + return() + endif() + + # Normalize Windows paths for generated cmake_install.cmake. + # Without this, paths like C:\Program Files (...) may produce invalid + # CMake escape sequences such as \P during install. + file(TO_CMAKE_PATH "${runtime_file}" runtime_file_cmake) + + set(INSTALL_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib" + "${SKBUILD_PLATLIB_DIR}/llama_cpp/lib" + ) + + foreach(DIR ${INSTALL_DIRS}) + file(TO_CMAKE_PATH "${DIR}" DIR_CMAKE) + + install( + FILES "${runtime_file_cmake}" + DESTINATION "${DIR_CMAKE}" + ) + endforeach() +endfunction() + + +# Locate and install the Windows LLVM OpenMP runtime when available. 
+# +# GGML CPU all-variant backends built with LLVM/Clang + OpenMP depend on +# libomp140.x86_64.dll. Since ggml-cpu-*.dll files are loaded dynamically via +# ggml_backend_load_all_from_path(), the OpenMP runtime must be packaged next to +# them under llama_cpp/lib. +# +# CI may pass LLAMA_CPP_OPENMP_RUNTIME_DLL explicitly. Local builds can rely on +# fallback search paths for Visual Studio Enterprise / BuildTools. +function(llama_cpp_python_install_windows_openmp_runtime) + if(NOT WIN32) + return() + endif() + + set(OPENMP_RUNTIME_DLL "") + + if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL AND EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + else() + file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) + file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) + + set(VS_OPENMP_SEARCH_ROOTS + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" + ) + + foreach(ROOT ${VS_OPENMP_SEARCH_ROOTS}) + if(EXISTS "${ROOT}") + file( + GLOB_RECURSE FOUND_OPENMP_DLLS + "${ROOT}/*/debug_nonredist/x64/Microsoft.VC*.OpenMP.LLVM/libomp140.x86_64.dll" + "${ROOT}/**/libomp140.x86_64.dll" + ) + + if(FOUND_OPENMP_DLLS) + list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) + break() + endif() + endif() + endforeach() + endif() + + if(OPENMP_RUNTIME_DLL) + message(STATUS "Installing Windows LLVM OpenMP runtime: ${OPENMP_RUNTIME_DLL}") + llama_cpp_python_install_windows_runtime_file("${OPENMP_RUNTIME_DLL}") + else() + message(WARNING + "Could not find libomp140.x86_64.dll. " + "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " + "the packaged ggml-cpu-*.dll files may fail to load at runtime. 
" + "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " + "to package it explicitly." + ) + endif() +endfunction() + + # Remove development-only artifacts from Python wheel runtime directories. # # Upstream install rules may place CMake package files, pkg-config files, and @@ -241,6 +341,11 @@ if (LLAMA_BUILD) llama_cpp_python_install_target(mtmd) endif() + # Install Windows LLVM OpenMP runtime when available. + # This must run before cleanup so the final wheel keeps runtime DLLs but + # removes development-only files such as .lib, cmake/, and pkgconfig/. + llama_cpp_python_install_windows_openmp_runtime() + # Run after all runtime targets are installed, including mtmd. llama_cpp_python_cleanup_dev_files() From fa36f70421815f3e050f1538e37f40feb5a7005a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 23:00:09 +0800 Subject: [PATCH 431/518] Update CHANGELOG.md upstream version link Signed-off-by: JamePeng --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4c6e4c976..e8ebb5cd3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -117,7 +117,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add a technical note in the FAQ recommending LLVM/Clang over MSVC for achieving full x64 CPU variant coverage on Windows. 
-- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/39cf5d61915769124b7efbbfa69c46f19a6363ee](https://github.com/ggml-org/llama.cpp/commit/39cf5d61915769124b7efbbfa69c46f19a6363ee) +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/d14ce3dab4de197adec5166faa54ac5db8262f26](https://github.com/ggml-org/llama.cpp/commit/d14ce3dab4de197adec5166faa54ac5db8262f26) - feat: Sync llama.cpp llama/mtmd/ggml API Binding 20260517 From d37951799450b6461ac73160630371c9d1d36065 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 19 May 2026 23:07:12 +0800 Subject: [PATCH 432/518] ci: pin windows runner and streamline python test matrix - Pin the Windows CI runner from `windows-latest` to `windows-2022` to ensure build environment stability and prevent unexpected breakages from runner updates. - Remove Python 3.13 from the test matrix to reduce CI runtime and resource consumption. - Retain Python 3.9 (oldest supported) and 3.14 (latest) to ensure compatibility boundaries are still properly tested across Ubuntu, Windows, and macOS (Metal / Non-Metal). 
Signed-off-by: JamePeng --- .github/workflows/test.yaml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 335b0f0ac3..420c5e9495 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,18 +24,14 @@ jobs: # Don't cancel other jobs in the matrix if one fails fail-fast: false matrix: - os: [ubuntu-latest, windows-latest] - python-version: ["3.9", "3.13", "3.14"] + os: [ubuntu-latest, windows-2022] + python-version: ["3.9", "3.14"] include: # macOS Non-Metal - os: macos-14 python-version: "3.9" cmake_args: "-DLLAMA_METAL=off" metal_status: "(No Metal)" - - os: macos-14 - python-version: "3.13" - cmake_args: "-DLLAMA_METAL=off" - metal_status: "(No Metal)" - os: macos-14 python-version: "3.14" cmake_args: "-DLLAMA_METAL=off" @@ -46,10 +42,6 @@ jobs: python-version: "3.9" cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" metal_status: "(Metal)" - - os: macos-14 - python-version: "3.13" - cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" - metal_status: "(Metal)" - os: macos-14 python-version: "3.14" cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" From c7668b150fb1f8f36ecca0100f294829dfef7e66 Mon Sep 17 00:00:00 2001 From: DELUXA Date: Wed, 20 May 2026 15:39:41 +0300 Subject: [PATCH 433/518] Add Windows ROCm build instructions --- README.md | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a1550a85c..14148562bf 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,9 @@ https://github.com/JamePeng/llama-cpp-python/releases
HIP (ROCm) +
+Linux ROCm + This provides GPU acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). @@ -303,6 +306,40 @@ More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/bu
+
+Windows ROCm + +> **Note:** Install TheRock ROCm, activate your venv, then run in PowerShell. Replace `gfx1200` with your GPU architecture. + +```powershell +cmd /c '"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" >nul 2>&1 && set' | ForEach-Object { if ($_ -match '^([^=]+)=(.*)$') { [System.Environment]::SetEnvironmentVariable($matches[1], $matches[2], 'Process') } } + +rocm-sdk init + +$ROCM_DEVEL = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_devel" +$ROCM_CORE = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_core" +$ROCM_GFX = (Get-Item "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_libraries_gfx*").FullName + +$env:HIP_PATH = $ROCM_DEVEL +$env:ROCM_PATH = $ROCM_DEVEL +$env:HIP_DEVICE_LIB_PATH = "$ROCM_CORE\lib\llvm\amdgcn\bitcode" +$env:PATH = "$ROCM_DEVEL\bin;$ROCM_DEVEL\lib\llvm\bin;$ROCM_GFX\bin;$env:PATH" +$env:CMAKE_GENERATOR = "Ninja" +$env:HIP_PLATFORM = "amd" +$env:CC = "$ROCM_DEVEL\lib\llvm\bin\clang.exe" +$env:CXX = "$ROCM_DEVEL\lib\llvm\bin\clang++.exe" +$env:HIP_CLANG_PATH = "$ROCM_DEVEL\lib\llvm\bin" + +$R = $ROCM_DEVEL -replace '\\', '/' +$env:CMAKE_ARGS = "-DGGML_HIP=ON -DGGML_HIPBLAS=on -DGPU_TARGETS=gfx1200 -DCMAKE_HIP_ARCHITECTURES=gfx1200 -DCMAKE_C_COMPILER=`"$R/lib/llvm/bin/clang.exe`" -DCMAKE_CXX_COMPILER=`"$R/lib/llvm/bin/clang++.exe`" -DHIP_LIBRARIES=`"$R/lib/amdhip64.lib`" -DCMAKE_PREFIX_PATH=`"$R`"" + +pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" --no-cache-dir +``` + +
+ +
+
Vulkan @@ -1743,7 +1780,7 @@ Libraries from other authors are often smaller because they may only compile for * 1. I've determined that `llama_cpp.server` is currently in a semi-deprecated state (meaning it won't be maintained unless absolutely necessary, and I might even consider deleting or separating it to reduce the library size). I highly recommend using the `llama-server` program maintained by the upstream `llama.cpp` project, which offers a lower-level implementation, more frequent maintenance and optimization, and more reliable API calls. -* 2. Regarding AMD and Intel graphics cards, AMD can certainly use ROCm as the primary backend (but the drawback is that it's basically only stable on Linux platforms), and Intel's Sycl will also encounter some compilation difficulties. I consistently recommend using the Vulkan backend for these two types of graphics cards for greater efficiency and stability, because the upstream `llama.cpp` Vulkan backend is actively maintained by many developers, generally allowing you to enjoy new feature optimizations and bug fixes earlier and faster. +* 2. Regarding AMD and Intel graphics cards, AMD can use ROCm as the primary backend, while Intel's Sycl will encounter some compilation difficulties. I consistently recommend using the Vulkan backend for these two types of graphics cards for greater efficiency and stability, because the upstream `llama.cpp` Vulkan backend is actively maintained by many developers, generally allowing you to enjoy new feature optimizations and bug fixes earlier and faster. * 3. If you are using hybrid multimodal model for building ComfyUI nodes or running single-turn API wrappers where you do not need multi-turn state rollbacks, simply initialize your Llama instance with `ctx_checkpoints=0`: From a4080a4a7e1f4550fd534fa3f798d39946089c7a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 20 May 2026 22:00:36 +0800 Subject: [PATCH 434/518] docs: Optimize the formatting of the ROCm section in README.md. 
Signed-off-by: JamePeng --- README.md | 68 +++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 14148562bf..07d7635cbc 100644 --- a/README.md +++ b/README.md @@ -288,55 +288,55 @@ https://github.com/JamePeng/llama-cpp-python/releases
HIP (ROCm) -
-Linux ROCm + -
+ Linux ROCm -This provides GPU acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. + This provides GPU acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. -You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). + You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). -To install with HIP / ROCm support for AMD cards, set the `GGML_HIP=ON` environment variable before installing: + To install with HIP / ROCm support for AMD cards, set the `GGML_HIP=ON` environment variable before installing: -```bash -CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" -``` -Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system. + ```bash + CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" + ``` + Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system. -More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip + More details see here: https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip -
+
-
-Windows ROCm + -
+ Windows ROCm -> **Note:** Install TheRock ROCm, activate your venv, then run in PowerShell. Replace `gfx1200` with your GPU architecture. + > **Note:** Install TheRock ROCm, activate your venv, then run in PowerShell. Replace `gfx1200` with your GPU architecture. -```powershell -cmd /c '"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" >nul 2>&1 && set' | ForEach-Object { if ($_ -match '^([^=]+)=(.*)$') { [System.Environment]::SetEnvironmentVariable($matches[1], $matches[2], 'Process') } } + ```powershell + cmd /c '"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" >nul 2>&1 && set' | ForEach-Object { if ($_ -match '^([^=]+)=(.*)$') { [System.Environment]::SetEnvironmentVariable($matches[1], $matches[2], 'Process') } } -rocm-sdk init + rocm-sdk init -$ROCM_DEVEL = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_devel" -$ROCM_CORE = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_core" -$ROCM_GFX = (Get-Item "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_libraries_gfx*").FullName + $ROCM_DEVEL = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_devel" + $ROCM_CORE = "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_core" + $ROCM_GFX = (Get-Item "$env:VIRTUAL_ENV\Lib\site-packages\_rocm_sdk_libraries_gfx*").FullName -$env:HIP_PATH = $ROCM_DEVEL -$env:ROCM_PATH = $ROCM_DEVEL -$env:HIP_DEVICE_LIB_PATH = "$ROCM_CORE\lib\llvm\amdgcn\bitcode" -$env:PATH = "$ROCM_DEVEL\bin;$ROCM_DEVEL\lib\llvm\bin;$ROCM_GFX\bin;$env:PATH" -$env:CMAKE_GENERATOR = "Ninja" -$env:HIP_PLATFORM = "amd" -$env:CC = "$ROCM_DEVEL\lib\llvm\bin\clang.exe" -$env:CXX = "$ROCM_DEVEL\lib\llvm\bin\clang++.exe" -$env:HIP_CLANG_PATH = "$ROCM_DEVEL\lib\llvm\bin" + $env:HIP_PATH = $ROCM_DEVEL + $env:ROCM_PATH = $ROCM_DEVEL + $env:HIP_DEVICE_LIB_PATH = "$ROCM_CORE\lib\llvm\amdgcn\bitcode" + $env:PATH = "$ROCM_DEVEL\bin;$ROCM_DEVEL\lib\llvm\bin;$ROCM_GFX\bin;$env:PATH" + $env:CMAKE_GENERATOR = "Ninja" + $env:HIP_PLATFORM = "amd" + $env:CC = 
"$ROCM_DEVEL\lib\llvm\bin\clang.exe" + $env:CXX = "$ROCM_DEVEL\lib\llvm\bin\clang++.exe" + $env:HIP_CLANG_PATH = "$ROCM_DEVEL\lib\llvm\bin" -$R = $ROCM_DEVEL -replace '\\', '/' -$env:CMAKE_ARGS = "-DGGML_HIP=ON -DGGML_HIPBLAS=on -DGPU_TARGETS=gfx1200 -DCMAKE_HIP_ARCHITECTURES=gfx1200 -DCMAKE_C_COMPILER=`"$R/lib/llvm/bin/clang.exe`" -DCMAKE_CXX_COMPILER=`"$R/lib/llvm/bin/clang++.exe`" -DHIP_LIBRARIES=`"$R/lib/amdhip64.lib`" -DCMAKE_PREFIX_PATH=`"$R`"" + $R = $ROCM_DEVEL -replace '\\', '/' + $env:CMAKE_ARGS = "-DGGML_HIP=ON -DGGML_HIPBLAS=on -DGPU_TARGETS=gfx1200 -DCMAKE_HIP_ARCHITECTURES=gfx1200 -DCMAKE_C_COMPILER=`"$R/lib/llvm/bin/clang.exe`" -DCMAKE_CXX_COMPILER=`"$R/lib/llvm/bin/clang++.exe`" -DHIP_LIBRARIES=`"$R/lib/amdhip64.lib`" -DCMAKE_PREFIX_PATH=`"$R`"" -pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" --no-cache-dir -``` + pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" --no-cache-dir + ``` -
+
From 89927d4633e3ff6dde3aa903c3cc84d454e040ad Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 20 May 2026 22:00:49 +0800 Subject: [PATCH 435/518] Update Submodule vendor/llama.cpp d14ce3d..e947228 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d14ce3dab4..e947228222 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d14ce3dab4de197adec5166faa54ac5db8262f26 +Subproject commit e947228222147356bc7e64154d3439e142481632 From 023780091755724b9e41d62d3df9f9ffcbafda09 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 21 May 2026 08:27:48 +0800 Subject: [PATCH 436/518] docs: Removed outdated macOS installation guides and added the latest installation notes. Signed-off-by: JamePeng --- README.md | 84 +++++++++++++++++++++++++++++++------------ docs/install/macos.md | 59 ------------------------------ 2 files changed, 62 insertions(+), 81 deletions(-) delete mode 100644 docs/install/macos.md diff --git a/README.md b/README.md index 07d7635cbc..6c56c034c3 100644 --- a/README.md +++ b/README.md @@ -269,6 +269,8 @@ On MacOS, Metal is enabled by default(`GGML_METAL=ON`). Using Metal makes the co To disable the Metal build at compile time use the `CMAKE_ARGS="-DGGML_METAL=OFF"` cmake option. +When built with Metal support, you can explicitly disable GPU inference with the `n_gpu_layers=0` parameter. + ```bash pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` @@ -277,6 +279,7 @@ pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python It is also possible to install a pre-built wheel with Metal support. 
As long as your system meets some requirements: +- CPU Arch: arm64 - MacOS Version is 11.0 or later - Python Version is 3.10, 3.11, 3.12, 3.13 or 3.14 @@ -415,46 +418,83 @@ CMAKE_ARGS="-DGGML_RPC=on" pip install "llama-cpp-python @ git+https://github.co
-### Windows Notes - +### Install Notes
-Error: Can't find 'nmake' or 'CMAKE_C_COMPILER' + Optimization Options (Optional) -If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install: +> **💡 Tip:** To save compilation time, you can skip building llama.cpp's standalone examples, tools, tests, and server by adding the following flags to `CMAKE_ARGS`; none of them are required for the Python bindings: -```ps -$env:CMAKE_GENERATOR = "MinGW Makefiles" -$env:CMAKE_ARGS = "-DGGML_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe" +```bash +-DLLAMA_BUILD_EXAMPLES=OFF \ +-DLLAMA_BUILD_TOOLS=OFF \ +-DLLAMA_BUILD_TESTS=OFF \ +-DLLAMA_BUILD_SERVER=OFF ``` - -See the above instructions and set `CMAKE_ARGS` to the BLAS backend you want to use.
-### MacOS Notes +
+ CUDA compiler warning suppression is optional +CUDA nvcc compiler may print many template-related warnings from ggml-cuda, such as: -Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/) +```bash +warning #177-D +warning #221-D +warning #550-D +``` -
-M1 Mac Performance Issue +These warnings are usually just noisy diagnostics rather than build blockers, but they flood the build logs and waste CPU time on printing. -Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: +For cleaner CI/local logs, you can pass: ```bash -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh +-DCMAKE_CUDA_FLAGS="--diag-suppress=177 --diag-suppress=221 --diag-suppress=550" ``` - -Otherwise, while installing it will build the llama.cpp x86 version which will be 10x slower on Apple Silicon (M1) Mac.
-M Series Mac Error: `(mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))` + Notes for `GGML_BACKEND_DL` + `GGML_CPU_ALL_VARIANTS` builds +When building wheels with `GGML_BACKEND_DL=ON` and `GGML_CPU_ALL_VARIANTS=ON`, +GGML CPU backends are built as separate dynamic libraries, such as: -Try installing with +```text +ggml-cpu-x64.dll +ggml-cpu-haswell.dll +ggml-cpu-alderlake.dll +ggml-cpu-zen4.dll +``` +These backend libraries must be packaged together under: -```bash -CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +```text +site-packages/llama_cpp/lib ``` + +The runtime must also explicitly load them with: + +```text +ggml_backend_load_all_from_path() +``` + +### Windows notes + +For full x64 CPU variant coverage, `LLVM/Clang` is recommended. `MSVC` may skip some variants such as `zen4`, `cooperlake`, or `sapphirerapids`. + +If `GGML_OPENMP=ON` is used, the LLVM OpenMP runtime must also be packaged next to the backend DLLs: + +```text +libomp140.x86_64.dll +``` + +Without this file, `ggml-cpu-*.dll` may fail to load dynamically at runtime. + +### Wheel packaging checklist + +* Enable `GGML_BACKEND_DL=ON` +* Enable `GGML_CPU_ALL_VARIANTS=ON` +* Use `GGML_NATIVE=OFF` for portable wheels +* Install all `ggml-cpu-*` backend libraries into `llama_cpp/lib` +* Package required runtime dependencies such as `libomp140.x86_64.dll` +* Remove development-only files such as `.lib`, `cmake/`, and `pkgconfig/` +
### Upgrading and Reinstalling diff --git a/docs/install/macos.md b/docs/install/macos.md deleted file mode 100644 index e006fc0a3c..0000000000 --- a/docs/install/macos.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: MacOS Install with Metal GPU ---- - -**(1) Make sure you have xcode installed... at least the command line parts** -``` -# check the path of your xcode install -xcode-select -p - -# xcode installed returns -# /Applications/Xcode-beta.app/Contents/Developer - -# if xcode is missing then install it... it takes ages; -xcode-select --install -``` - -**(2) Install the conda version for MacOS that supports Metal GPU** -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` - -**(3) Make a conda environment** -``` -conda create -n llama python=3.9.16 -conda activate llama -``` - -**(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** - *(you needed xcode installed in order pip to build/compile the C++ code)* -``` -pip uninstall llama-cpp-python -y -CMAKE_ARGS="-DGGML_METAL=on" pip install -U llama-cpp-python --no-cache-dir -pip install 'llama-cpp-python[server]' - -# you should now have llama-cpp-python v0.1.62 or higher installed -llama-cpp-python         0.1.68 - -``` - -**(5) Download a v3 gguf v2 model** - - **ggufv2** - - file name ends with **Q4_0.gguf** - indicating it is 4bit quantized, with quantisation method 0 - -https://huggingface.co/TheBloke/CodeLlama-7B-GGUF - - -**(6) run the llama-cpp-python API server with MacOS Metal GPU support** -``` -# config your ggml model path -# make sure it is gguf v2 -# make sure it is q4_0 -export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]Q4_0.gguf -python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 -``` - -***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* - - From 14b98ae81802d8c89a55609ec2bf64349aac58f6 Mon Sep 17 00:00:00 2001 
From: JamePeng Date: Thu, 21 May 2026 20:33:15 +0800 Subject: [PATCH 437/518] Update Submodule vendor/llama.cpp e947228..40d5358 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e947228222..40d5358d3c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e947228222147356bc7e64154d3439e142481632 +Subproject commit 40d5358d3c730b81729ba81cd5c44ed596d02510 From b2f09bb42c0242ae9fcc8a24f0456365891b28de Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 07:14:34 +0800 Subject: [PATCH 438/518] Update Submodule vendor/llama.cpp 40d5358..1acee6b Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 40d5358d3c..1acee6bf89 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 40d5358d3c730b81729ba81cd5c44ed596d02510 +Subproject commit 1acee6bf8939948f9bcbf4b14034e4b475f06069 From 78fa55bd5f8129ebbbf11a4fd6f7fef046707b85 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 08:00:23 +0800 Subject: [PATCH 439/518] feat(speculative): upgrade ngram map decoder with k/k4v modes Enhance `LlamaNGramMapDecoding` to align with the upstream llama.cpp ngram-map algorithm, offering better memory management and draft quality. - Introduce `mode` selection ("k" and "k4v"): "k" stores only historical positions for memory efficiency, while "k4v" caches continuation values directly for faster lookups. - Add `min_hits` threshold to filter out low-confidence drafts. - Implement `max_entries_per_key` to cap dictionary growth and prevent memory bloat during long-context generations. - Improve state synchronization (`_sync_and_index`) using `sync_check_tokens` to safely verify incremental history appends. 
- Add explicit lifecycle management methods (`clear`, `close`, `accept`) for better API symmetry and resource cleanup. Signed-off-by: JamePeng --- llama_cpp/llama_speculative.py | 313 ++++++++++++++++++++++++++------- 1 file changed, 252 insertions(+), 61 deletions(-) diff --git a/llama_cpp/llama_speculative.py b/llama_cpp/llama_speculative.py index c3814aaf42..c4289d0797 100644 --- a/llama_cpp/llama_speculative.py +++ b/llama_cpp/llama_speculative.py @@ -1,7 +1,7 @@ import abc import collections -from typing import Any, Dict, List, Tuple +from typing import Any, DefaultDict, Dict, List, Literal, Optional, Tuple import numpy as np import numpy.typing as npt @@ -17,102 +17,293 @@ def __call__( class LlamaNGramMapDecoding(LlamaDraftModel): """ - Ultra-fast speculative decoder based on hash inverted index and incremental updates. - O(1) time complexity, aligned with llama.cpp's underlying ngram-map algorithm. + Fast model-free speculative decoder based on prompt n-gram lookup. + + It supports two modes: + + - "k": + Key-only mode. Stores n-gram key -> history positions. + This is memory-efficient and similar to llama.cpp's ngram-map-k behavior. + + - "k4v": + Key-to-value mode. Stores n-gram key -> continuation tokens. + This uses more memory, but can return cached continuations directly. + + This class does not use a draft model. It only speculates from already verified + token history. Therefore, rejected tokens are handled naturally when the next + `input_ids` is passed in. + + Aligned with llama.cpp's underlying ngram-map k/k4v algorithm. """ - def __init__(self, ngram_size: int = 3, num_pred_tokens: int = 10): + def __init__( + self, + ngram_size: int = 3, + num_pred_tokens: int = 10, + mode: Literal["k", "k4v"] = "k", + min_hits: int = 2, + max_entries_per_key: Optional[int] = None, + sync_check_tokens: int = 16, + ) -> None: """ - Initializes the N-Gram Map speculative decoder. - Args: - ngram_size (int): The length of the token sequence used as the search key. 
- Larger values provide strictly accurate context matching but may result - in fewer cache hits. Defaults to 3. - num_pred_tokens (int): The maximum number of future tokens to draft (predict) - and return once a match is found in the history. Defaults to 10. + ngram_size: + Number of tokens used as the lookup key. + + num_pred_tokens: + Maximum number of draft tokens to return. + + mode: + "k" stores only matched positions. + "k4v" stores matched continuation values directly. + + min_hits: + Minimum number of historical matches required before returning a draft. + Use 1 for maximum recall. Use >1 to reduce low-confidence drafts. + + max_entries_per_key: + Optional memory cap per n-gram key. + When set, only the most recent entries are kept. + For k4v mode, setting max_entries_per_key is strongly recommended. + + sync_check_tokens: + Number of trailing tokens used to verify whether the new input is an + incremental append of the previous input. This avoids expensive full + prefix comparison while still detecting most rollback/prompt-switch cases. 
""" - self.ngram_size = ngram_size - self.num_pred_tokens = num_pred_tokens + if ngram_size <= 0: + raise ValueError("ngram_size must be greater than 0") + if num_pred_tokens <= 0: + raise ValueError("num_pred_tokens must be greater than 0") + if min_hits <= 0: + raise ValueError("min_hits must be greater than 0") + if max_entries_per_key is not None and max_entries_per_key <= 0: + raise ValueError("max_entries_per_key must be None or greater than 0") + if sync_check_tokens <= 0: + raise ValueError("sync_check_tokens must be greater than 0") + + mode = mode.lower() + if mode not in ("k", "k4v"): + raise ValueError("mode must be either 'k' or 'k4v'") + + self.ngram_size = int(ngram_size) + self.num_pred_tokens = int(num_pred_tokens) + self.mode = mode + self.min_hits = int(min_hits) + self.sync_check_tokens = int(sync_check_tokens) + + if mode == "k4v" and max_entries_per_key is None: + max_entries_per_key = 8 + self.max_entries_per_key = max_entries_per_key - # Core state cache - # Mapping format: (token_1, ..., token_N) -> [index_1, index_2, ...] - self._ngram_map: Dict[Tuple[int, ...], List[int]] = collections.defaultdict(list) self._history: List[int] = [] - def _update_cache(self, input_ids: npt.NDArray[np.intc]) -> None: + # In "k" mode: + # key -> [position, position, ...] + self._map_k: DefaultDict[Tuple[int, ...], List[int]] = collections.defaultdict(list) + + # In "k4v" mode: + # key -> {position: continuation} + # + # A dict is used so that recent entries can be refreshed when more continuation + # tokens become available. + self._map_k4v: DefaultDict[ + Tuple[int, ...], Dict[int, Tuple[int, ...]] + ] = collections.defaultdict(dict) + + self._closed = False + self._last_draft_len = 0 + + def clear(self) -> None: """ - Smart state synchronization and incremental build (Extreme O(1) optimization). + Clear token history and indexes. - Args: - input_ids (npt.NDArray[np.intc]): The complete sequence of current token IDs - generated or processed so far. 
+ Use this when starting a completely unrelated generation while keeping the + decoder instance reusable. + """ + self._history.clear() + self._map_k.clear() + self._map_k4v.clear() + self._last_draft_len = 0 + + def close(self) -> None: + """ + Release internal memory. + + This class does not own native memory, but clearing large Python containers + explicitly is still useful for long-running applications. + """ + self.clear() + self._closed = True + + def __del__(self) -> None: + # Best-effort cleanup. Program correctness must not depend on __del__. + try: + self.close() + except Exception: + pass + + def accept(self, n_accepted: int) -> None: """ - new_len = len(input_ids) + Notify how many draft tokens were accepted by the target model. + + This implementation does not need to update internal state here, because the + next call receives the verified token history through `input_ids`. + + The method is kept for API symmetry and future extensions, such as acceptance + statistics, adaptive reset, or low-acceptance fallback. + """ + return + + def _sync_and_index(self, input_ids: npt.NDArray[np.intc]) -> None: + """ + Synchronize internal history with input_ids and update the n-gram index. + + The index intentionally stores only n-grams that have at least one continuation + token. This prevents the current tail n-gram from matching itself and returning + an empty draft. + """ + if self._closed: + raise RuntimeError("LlamaNGramMapDecoding is closed") + + tokens = np.asarray(input_ids, dtype=np.intc).reshape(-1).tolist() + old_len = len(self._history) + new_len = len(tokens) + + if new_len == 0: + self.clear() + return + + # Fast path: identical input, no update needed. + if new_len == old_len: + if self._history == tokens: + return + + # Incremental append path. 
+ is_append = False + if old_len > 0 and new_len > old_len: + check_len = min(old_len, max(self.ngram_size, self.sync_check_tokens)) + is_append = self._history[old_len - check_len : old_len] == tokens[ + old_len - check_len : old_len + ] + + if is_append: + # Append only new tokens. + self._history.extend(tokens[old_len:]) + + if self.mode == "k": + # Only newly-valid keys need to be added. + start = max(0, old_len - self.ngram_size) + else: + # K4V must also refresh recent keys because their continuation values + # can grow as new tokens are appended. + start = max(0, old_len - self.ngram_size - self.num_pred_tokens + 1) + else: + # Rollback, prompt switch, truncation, or unsafe mutation. + self.clear() + self._history.extend(tokens) + start = 0 + + # Only index keys that have at least one token after the key. + # Valid pos satisfies: + # pos + ngram_size < len(history) + end = max(0, len(self._history) - self.ngram_size) + + if start >= end: + return + + if self.mode == "k": + for pos in range(start, end): + key = tuple(self._history[pos : pos + self.ngram_size]) + bucket = self._map_k[key] + + if not bucket or bucket[-1] != pos: + bucket.append(pos) + + if ( + self.max_entries_per_key is not None + and len(bucket) > self.max_entries_per_key + ): + del bucket[: len(bucket) - self.max_entries_per_key] - # Check if it's a perfect incremental append (verify if the previous token matches) - is_incremental = False - if new_len > old_len and old_len > 0: - if self._history[-1] == input_ids[old_len - 1]: - is_incremental = True - - if is_incremental: - # Only extract, convert, and append new tokens. - # Never copy or touch the entire historical array! - new_tokens = input_ids[old_len:].tolist() - self._history.extend(new_tokens) - start_idx = max(0, old_len - self.ngram_size) else: - # Rollback occurred (wrong prediction) or a completely new Prompt. Trigger full rebuild. 
- self._ngram_map.clear() - self._history = input_ids.tolist() - start_idx = 0 + for pos in range(start, end): + key_start = pos + value_start = pos + self.ngram_size + value_end = min(value_start + self.num_pred_tokens, len(self._history)) + + if value_start >= value_end: + continue + + key = tuple(self._history[key_start:value_start]) + value = tuple(self._history[value_start:value_end]) - # Build/update the hash inverted index - for i in range(start_idx, new_len - self.ngram_size): - key = tuple(self._history[i : i + self.ngram_size]) - self._ngram_map[key].append(i) + bucket = self._map_k4v[key] + bucket[pos] = value + + if ( + self.max_entries_per_key is not None + and len(bucket) > self.max_entries_per_key + ): + # Keep the most recent positions. + for old_pos in sorted(bucket)[: len(bucket) - self.max_entries_per_key]: + del bucket[old_pos] def __call__( self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any ) -> npt.NDArray[np.intc]: """ - Generates draft tokens based on historical N-Gram frequency. + Generate draft tokens from verified token history. Args: - input_ids (npt.NDArray[np.intc]): The current sequence of token IDs. - **kwargs: Additional generation arguments (ignored in this implementation). + input_ids: + Complete verified token sequence so far. Returns: - npt.NDArray[np.intc]: An array of predicted draft tokens. Returns an empty - array if no matching context is found. + np.ndarray[np.intc]: + Predicted draft tokens. Empty array means no reliable match was found. """ - # 1. Ultra-fast state synchronization - self._update_cache(input_ids) + _ = kwargs + + self._sync_and_index(input_ids) + self._last_draft_len = 0 - # 2. Cannot speculate if the history is too short if len(self._history) < self.ngram_size: return np.array([], dtype=np.intc) - # 3. Extract the Search Key (the last N tokens) - search_key = tuple(self._history[-self.ngram_size:]) + search_key = tuple(self._history[-self.ngram_size :]) - # 4. 
O(1) instant lookup - match_indices = self._ngram_map.get(search_key) + if self.mode == "k": + positions = self._map_k.get(search_key) + if not positions or len(positions) < self.min_hits: + return np.array([], dtype=np.intc) - if not match_indices: - return np.array([], dtype=np.intc) + # Use the latest valid match with an available continuation. + draft: List[int] = [] + for pos in reversed(positions): + start = pos + self.ngram_size + if start < len(self._history): + end = min(start + self.num_pred_tokens, len(self._history)) + draft = self._history[start:end] + break + + else: + values = self._map_k4v.get(search_key) + if not values or len(values) < self.min_hits: + return np.array([], dtype=np.intc) - # 5. Get the context of the last match and extract draft tokens - best_match_idx = match_indices[-1] - draft_start = best_match_idx + self.ngram_size - draft_end = min(draft_start + self.num_pred_tokens, len(self._history)) + # Use the continuation from the latest historical position. + latest_pos = max(values) + draft = list(values[latest_pos]) - return np.array(self._history[draft_start:draft_end], dtype=np.intc) + self._last_draft_len = len(draft) + return np.asarray(draft, dtype=np.intc) # Legacy Numpy sliding window implementation +# Fast in some cases, but may degrade output quality. +# Not recommended for production. class LlamaPromptLookupDecoding(LlamaDraftModel): """ Stateless speculative decoding based on Numpy sliding window From 91627a0c6b713858ce5a102253d9c694e36c511b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 08:11:26 +0800 Subject: [PATCH 440/518] examples: add benchmark script for speculative decoding - Add `benchmark_speculative.py` to the `examples/benchmark` directory. - Test `LlamaPromptLookupDecoding` and `LlamaNGramMapDecoding` (k/k4v). - Include diverse test scenarios (code, JSON logs, tables, essays) to measure tokens-per-second (TPS) speedup compared to baseline generation. 
Signed-off-by: JamePeng --- examples/benchmark/benchmark_speculative.py | 466 ++++++++++++++++++++ 1 file changed, 466 insertions(+) create mode 100644 examples/benchmark/benchmark_speculative.py diff --git a/examples/benchmark/benchmark_speculative.py b/examples/benchmark/benchmark_speculative.py new file mode 100644 index 0000000000..73e7c203a2 --- /dev/null +++ b/examples/benchmark/benchmark_speculative.py @@ -0,0 +1,466 @@ +import csv +import gc +import random +import statistics +import time +from dataclasses import dataclass +from typing import Callable, Dict, List, Optional + +from llama_cpp import Llama +from llama_cpp.llama_speculative import ( + LlamaPromptLookupDecoding, + LlamaNGramMapDecoding, +) + + +# ============================================================ +# Model Configuration +# ============================================================ + +MODEL_PATH = r"/path/to/your/model.GGUF" + +N_CTX = 4096 +MAX_TOKENS = 1024 +REPEATS = 2 +CSV_OUTPUT = "speculative_benchmark_results.csv" + +RANDOMIZE_ENGINE_ORDER = False + + +# ============================================================ +# Benchmark Scenario Definition +# ============================================================ + +@dataclass(frozen=True) +class Scenario: + name: str + category: str + prompt: str + expected_behavior: str + + +TEST_SCENARIOS: List[Scenario] = [ + Scenario( + name="A1. Medium-High Repetition - CRUD Boilerplate Code", + category="code_boilerplate", + expected_behavior="Should benefit from n-gram lookup because class and method structures repeat.", + prompt="""<|im_start|>system +You are a senior backend developer. Write highly structured and consistent boilerplate code.<|im_end|> +<|im_start|>user +Write a Python script using `sqlite3` to define CRUD operations for a core banking system database. 
+ +Create 6 separate classes: +- Account +- Transaction +- Customer +- Loan +- Portfolio +- AuditLog + +Each class MUST use the same internal method structure: +- create +- get +- update +- delete +- list_all + +Do not add extra explanations. Output only code.<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="A2. Extreme Repetition - JSONL Trading Logs", + category="structured_logs", + expected_behavior="Should strongly favor n-gram methods, especially K/K4V.", + prompt="""<|im_start|>system +You are a deterministic data generation script. Output only raw JSON lines.<|im_end|> +<|im_start|>user +Continue this algorithmic trading execution log for 40 more lines. +Only change timestamp seconds, symbol, quantity, price, and execution_time_ms. + +{"timestamp":"2026-05-23T09:30:01Z","level":"INFO","module":"exec_engine","event":"trade_filled","symbol":"AAPL","side":"BUY","quantity":100,"price":175.50,"execution_time_ms":12} +{"timestamp":"2026-05-23T09:30:02Z","level":"INFO","module":"exec_engine","event":"trade_filled","symbol":"MSFT","side":"SELL","quantity":50,"price":410.25,"execution_time_ms":15} +{"timestamp":"2026-05-23T09:30:03Z","level":"INFO","module":"exec_engine","event":"trade_filled","symbol":"TSLA","side":"BUY","quantity":200,"price":180.10,"execution_time_ms":11}<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="A3. Markdown Table - Repetitive Course Catalog", + category="markdown_table", + expected_behavior="Repeated table columns and row structure should benefit from speculative lookup.", + prompt="""<|im_start|>system +You generate clean Markdown tables with consistent formatting.<|im_end|> +<|im_start|>user +Create a Markdown comparison table for 30 university postgraduate courses. + +Columns: +| Course ID | Course Title | Department | Credits | Prerequisites | Grading Basis | Core Objective | + +The row format must stay consistent. +Use concise but realistic academic descriptions. 
+Do not add explanation outside the table.<|im_end|> +<|im_start|>assistant +| Course ID | Course Title | Department | Credits | Prerequisites | Grading Basis | Core Objective | +|---:|---|---|---:|---|---|---| +""", + ), + Scenario( + name="A4. Structured Financial Market Report", + category="structured_report", + expected_behavior="Heading and bullet patterns repeat; n-gram lookup should help moderately.", + prompt="""<|im_start|>system +You are a quantitative macroeconomic analyst. Output structured, clear, and professional financial reports.<|im_end|> +<|im_start|>user +Write a Q3 Macroeconomic & Equity Strategy Outlook Report for institutional investors. + +Requirements: +1. Divide the report into exactly 8 sections. +2. Each section MUST contain exactly one heading and 3 bullet points. +3. Repeatedly emphasize the following themes across the sections: interest rate trajectory, inflation stickiness, equity market volatility, supply chain realignment, and fixed-income duration strategies. +4. Keep the tone highly professional and analytical.<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="B1. Low Repetition - Macroeconomic Historical Essay", + category="low_repetition_creative", + expected_behavior="Should show limited or no speedup; useful as a negative control.", + prompt="""<|im_start|>system +You are an academic historian of economics. Write with varied sentence structures, rich vocabulary, and analytical depth.<|im_end|> +<|im_start|>user +Write a comprehensive essay exploring the psychological and sociological impacts of hyperinflation on institutional trust during the Weimar Republic in the 1920s. + +Requirements: +- Use highly academic and varied language. +- Do NOT use repetitive paragraph structures. +- Do NOT use bullet points or lists. +- Avoid parallel phrasing; favor complex, flowing narrative analysis. 
+- Make it a long, continuous essay.<|im_end|> +<|im_start|>assistant +The catastrophic devaluation of the Papiermark in the early 1920s fundamentally fractured the psychological bedrock of the Weimar Republic. """, + ), + Scenario( + name="B2. Reasoning-Like Explanation - Quantitative Finance", + category="reasoning_explanation", + expected_behavior="May show smaller speedup because content is less template-like.", + prompt="""<|im_start|>system +You are a careful technical explainer. Avoid repetitive phrasing.<|im_end|> +<|im_start|>user +Explain the foundational assumptions and inherent limitations of the Black-Scholes option pricing model. + +Discuss the following concepts contextually: +- Log-normal distribution of asset prices +- The assumption of constant volatility and risk-free rates +- Frictionless markets (no transaction costs or taxes) +- The difference in applicability between European and American options + +Write in clear, academic paragraphs. Do not use bullet points or lists.<|im_end|> +<|im_start|>assistant +""", + ), + Scenario( + name="C1. Long Context Copy-Edit - High Local Reuse", + category="copy_edit", + expected_behavior="Prompt contains repeated phrases; n-gram lookup should exploit local reuse.", + prompt="""<|im_start|>system +You are a precise academic editing assistant. Preserve the structure while improving the wording.<|im_end|> +<|im_start|>user +Rewrite the following academic grant proposal abstract in a cleaner professional style. +Keep the same repetitive sentence layout but fix the grammar and flow. + +Draft Proposal: +The proposed research will investigate the efficiency of machine learning in high-frequency trading. +The proposed research will demonstrate the risk vectors of automated market making. +The methodology will utilize massive historical limit order book datasets. +The methodology will require significant computational cluster resources. +The expected outcomes will provide a new framework for liquidity provisioning. 
+The expected outcomes will establish a baseline for regulatory compliance monitoring. +The budget will allocate funds for data acquisition from major exchanges. +The budget will allocate funds for two postdoctoral researchers. +The timeline will span twenty-four months of continuous data analysis. +The timeline will include three major peer-reviewed journal submissions. +The significance will address the growing instability in algorithmic flash crashes. +The significance will ensure safer automated trading environments.<|im_end|> +<|im_start|>assistant +""", + ), +] + + +# ============================================================ +# Engine Definition +# ============================================================ + +@dataclass(frozen=True) +class EngineConfig: + name: str + draft_factory: Callable[[], Optional[object]] + note: str + + +ENGINE_CONFIGS: List[EngineConfig] = [ + EngineConfig( + name="Baseline", + draft_factory=lambda: None, + note="No speculative decoding.", + ), + EngineConfig( + name="PromptLookup-Numpy-n10", + draft_factory=lambda: LlamaPromptLookupDecoding( + max_ngram_size=3, + num_pred_tokens=10, + ), + note="Legacy sliding-window prompt lookup.", + ), + EngineConfig( + name="NGramMap-K-n6", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=6, + mode="k", + min_hits=1, + ), + note="Key-only n-gram map, shorter draft.", + ), + EngineConfig( + name="NGramMap-K-n10", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=1, + ), + note="Key-only n-gram map, default draft length.", + ), + EngineConfig( + name="NGramMap-K4V-n10-cap8", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k4v", + min_hits=1, + max_entries_per_key=8, + ), + note="K4V with bounded per-key memory.", + ), + EngineConfig( + name="NGramMap-K4V-n16-cap8", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=16, + 
mode="k4v", + min_hits=1, + max_entries_per_key=8, + ), + note="Longer K4V draft; can be faster on highly repetitive outputs.", + ), + EngineConfig( + name="NGramMap-K-minhits2-n10", + draft_factory=lambda: LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=2, + ), + note="More conservative K mode.", + ), +] + + +# ============================================================ +# Measurement Helpers +# ============================================================ + +def cleanup_model(llm: Optional[Llama]) -> None: + if llm is not None: + del llm + gc.collect() + + +def create_llama(draft_model: Optional[object]) -> Llama: + return Llama( + model_path=MODEL_PATH, + n_ctx=N_CTX, + n_gpu_layers=-1, + draft_model=draft_model, + verbose=False, + ) + + +def measure_once( + scenario: Scenario, + engine: EngineConfig, + repeat_idx: int, +) -> Dict[str, object]: + draft_model = engine.draft_factory() + + print(f"\n⏳ [{scenario.name}] Engine={engine.name} | Repeat={repeat_idx + 1}") + print(f" Note: {engine.note}") + + llm: Optional[Llama] = None + + try: + llm = create_llama(draft_model) + + # Warmup: force backend initialization and first-token path. 
+ llm.create_completion( + prompt=scenario.prompt, + max_tokens=1, + temperature=0.0, + echo=False, + ) + + start = time.perf_counter() + + response = llm.create_completion( + prompt=scenario.prompt, + max_tokens=MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + repeat_penalty=1.0, + echo=False, + ) + + end = time.perf_counter() + + duration = end - start + usage = response.get("usage", {}) + completion_tokens = int(usage.get("completion_tokens", 0)) + total_tokens = int(usage.get("total_tokens", 0)) + prompt_tokens = int(usage.get("prompt_tokens", 0)) + + text = response["choices"][0]["text"] + tps = completion_tokens / duration if duration > 0 else 0.0 + + print( + f"✅ {engine.name:<28} " + f"{tps:8.2f} tok/s | " + f"time={duration:7.2f}s | " + f"gen={completion_tokens:4d} | " + f"prompt={prompt_tokens:4d}" + ) + print(f" Snippet: {text[:120].replace(chr(10), ' ')}...") + + return { + "scenario": scenario.name, + "category": scenario.category, + "expected_behavior": scenario.expected_behavior, + "engine": engine.name, + "engine_note": engine.note, + "repeat": repeat_idx + 1, + "duration_sec": duration, + "completion_tokens": completion_tokens, + "prompt_tokens": prompt_tokens, + "total_tokens": total_tokens, + "tokens_per_sec": tps, + "snippet": text[:160].replace("\n", "\\n"), + } + + finally: + if hasattr(draft_model, "close"): + draft_model.close() + cleanup_model(llm) + + +# ============================================================ +# Reporting +# ============================================================ + +def summarize_results(rows: List[Dict[str, object]]) -> None: + print("\n\n" + "=" * 90) + print("📊 Benchmark Summary") + print("=" * 90) + + by_scenario: Dict[str, List[Dict[str, object]]] = {} + for row in rows: + by_scenario.setdefault(str(row["scenario"]), []).append(row) + + for scenario_name, scenario_rows in by_scenario.items(): + print(f"\n📂 {scenario_name}") + print("-" * 90) + + grouped: Dict[str, List[float]] = {} + for row in 
scenario_rows: + grouped.setdefault(str(row["engine"]), []).append(float(row["tokens_per_sec"])) + + baseline_avg = statistics.mean(grouped.get("Baseline", [0.0])) + + print( + f"{'Engine':<32} | {'Avg tok/s':>10} | {'Best':>10} | " + f"{'Worst':>10} | {'Speedup':>8}" + ) + print("-" * 90) + + for engine_name, speeds in grouped.items(): + avg = statistics.mean(speeds) + best = max(speeds) + worst = min(speeds) + speedup = avg / baseline_avg if baseline_avg > 0 else 1.0 + + print( + f"{engine_name:<32} | " + f"{avg:10.2f} | " + f"{best:10.2f} | " + f"{worst:10.2f} | " + f"{speedup:8.2f}x" + ) + + +def save_csv(rows: List[Dict[str, object]], path: str) -> None: + if not rows: + return + + fieldnames = list(rows[0].keys()) + + with open(path, "w", newline="", encoding="utf-8-sig") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + print(f"\n💾 CSV saved to: {path}") + + +# ============================================================ +# Main Benchmark Flow +# ============================================================ + +def run_benchmark() -> None: + print("=" * 90) + print("🏆 llama-cpp-python Speculative Decoding Benchmark") + print("=" * 90) + print(f"Model: {MODEL_PATH}") + print(f"n_ctx={N_CTX}, max_tokens={MAX_TOKENS}, repeats={REPEATS}") + print("=" * 90) + + rows: List[Dict[str, object]] = [] + + for scenario in TEST_SCENARIOS: + print("\n\n" + "#" * 90) + print(f"📂 Scenario: {scenario.name}") + print(f"📌 Category: {scenario.category}") + print(f"🧠 Expected: {scenario.expected_behavior}") + print("#" * 90) + + engines = list(ENGINE_CONFIGS) + if RANDOMIZE_ENGINE_ORDER: + baseline = [e for e in engines if e.name == "Baseline"] + others = [e for e in engines if e.name != "Baseline"] + random.shuffle(others) + engines = baseline + others + + for engine in engines: + for repeat_idx in range(REPEATS): + row = measure_once( + scenario=scenario, + engine=engine, + repeat_idx=repeat_idx, + ) + rows.append(row) 
+ + summarize_results(rows) + save_csv(rows, CSV_OUTPUT) + + +if __name__ == "__main__": + run_benchmark() \ No newline at end of file From 969f5be484ab9f9d602fd73c129906f4ca2ed63e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 08:31:19 +0800 Subject: [PATCH 441/518] docs(speculative): update wiki for NGramMap k/k4v modes and lifecycle APIs Reflect the recent architectural upgrades to `LlamaNGramMapDecoding` in the official documentation. - Document the new `__init__` parameters (`mode`, `min_hits`, `max_entries_per_key`, `sync_check_tokens`) and their validation rules. - Add a detailed comparison table explaining the memory and behavior differences between the `"k"` and `"k4v"` lookup modes. - Document the newly exposed lifecycle methods (`clear`, `close`, `accept`). - Add comprehensive usage examples demonstrating `k4v` mode with memory caps. - Update internal state descriptions (replacing `_ngram_map` with `_map_k` and `_map_k4v`). - Add a strong production warning against the legacy `LlamaPromptLookupDecoding` and cross-link the new `benchmark_speculative.py` script. Signed-off-by: JamePeng --- docs/wiki/modules/LlamaSpeculative.md | 343 +++++++++++++++++++------- 1 file changed, 260 insertions(+), 83 deletions(-) diff --git a/docs/wiki/modules/LlamaSpeculative.md b/docs/wiki/modules/LlamaSpeculative.md index 0c0ad099fb..9255d01496 100644 --- a/docs/wiki/modules/LlamaSpeculative.md +++ b/docs/wiki/modules/LlamaSpeculative.md @@ -2,7 +2,7 @@ title: Llama Speculative Decoding module_name: llama_cpp.llama_speculative source_file: llama_cpp/llama_speculative.py -last_updated: 2026-05-02 +last_updated: 2026-05-23 version_target: "latest" --- @@ -10,30 +10,37 @@ version_target: "latest" ## Overview -`llama_speculative.py` provides draft model interfaces and prompt-based speculative decoding helpers for `llama-cpp-python`. +`llama_speculative.py` defines draft-model interfaces and prompt-based speculative decoding helpers for `llama-cpp-python`. 
-Speculative decoding uses a lightweight draft model to propose candidate tokens before the main model verifies them. In this module, the draft model does not need to be a neural model. It can also be a prompt lookup decoder that predicts future tokens by finding repeated token patterns in the existing context. +Speculative decoding lets a draft model propose candidate tokens before the main `Llama` model verifies them. In this module, the draft model does not have to be a neural network. It can also be a model-free prompt lookup decoder that predicts future tokens from repeated token patterns in the already verified context. This module currently defines: | Class | Status | Description | |---|---|---| -| `LlamaDraftModel` | public interface | Abstract base class for draft models used by speculative decoding. | -| `LlamaNGramMapDecoding` | public | Fast stateful n-gram map based speculative decoder. | +| `LlamaDraftModel` | public interface | Abstract base class for speculative draft models. | +| `LlamaNGramMapDecoding` | public | Stateful model-free n-gram lookup decoder with `k` and `k4v` modes. | | `LlamaPromptLookupDecoding` | legacy public | Stateless NumPy sliding-window prompt lookup decoder. | ## Role in the Library This module defines the draft-model side of speculative decoding. -A draft model receives the current token sequence and returns predicted draft tokens. These draft tokens can then be verified by the main `Llama` model during generation. +A draft model receives the verified token sequence so far and returns predicted draft token IDs. These tokens are later verified by the main `Llama` model during generation. The module provides two prompt-based implementations: -- `LlamaNGramMapDecoding`: optimized, stateful, hash-map based lookup. -- `LlamaPromptLookupDecoding`: older stateless NumPy sliding-window implementation. +- `LlamaNGramMapDecoding`: optimized, stateful, hash-map based n-gram lookup. 
+- `LlamaPromptLookupDecoding`: older stateless NumPy sliding-window lookup. -For new usage, prefer `LlamaNGramMapDecoding` because it incrementally maintains an n-gram index instead of scanning the full token history on every call. +For new usage, prefer `LlamaNGramMapDecoding`. It incrementally maintains an n-gram index, supports memory-oriented lookup modes, and avoids scanning the full token history on every call. + +## Choosing Between Related APIs + +| API | Recommended Use | Notes | +|---|---|---| +| `LlamaNGramMapDecoding` | Default prompt lookup decoder for new usage. | Uses stateful n-gram maps and supports `k` / `k4v` modes. | +| `LlamaPromptLookupDecoding` | Compatibility with older prompt lookup behavior. | Stateless and simple, but scans token history with NumPy sliding windows. | ## Classes @@ -41,7 +48,7 @@ For new usage, prefer `LlamaNGramMapDecoding` because it incrementally maintains ```python class LlamaDraftModel(abc.ABC) -```` +``` Abstract base class for speculative draft models. @@ -58,15 +65,15 @@ def __call__( ) -> npt.NDArray[np.intc] ``` -| Parameter | Type | Description | -| ----------- | ---------------------- | ----------------------------------------------------------------- | -| `input_ids` | `npt.NDArray[np.intc]` | Current token sequence. | -| `**kwargs` | `Any` | Additional generation arguments. Implementations may ignore them. | +| Parameter | Type | Description | +|---|---|---| +| `input_ids` | `npt.NDArray[np.intc]` | Complete verified token sequence so far. | +| `**kwargs` | `Any` | Additional generation arguments. Implementations may ignore them. | Returns: -| Type | Description | -| ---------------------- | -------------------------------------------- | +| Type | Description | +|---|---| | `npt.NDArray[np.intc]` | Draft token IDs proposed by the draft model. | ## `LlamaNGramMapDecoding` @@ -75,9 +82,11 @@ Returns: class LlamaNGramMapDecoding(LlamaDraftModel) ``` -Fast speculative decoder based on an n-gram hash map. 
+Fast model-free speculative decoder based on prompt n-gram lookup. + +This decoder maintains internal indexes from historical n-grams to either previous positions or cached continuation tokens. When called with the current verified token sequence, it searches for the final n-gram in the already verified history and returns a continuation from the most recent valid historical match. -This decoder maintains an internal inverted index from historical n-grams to their positions. When called with the current token sequence, it looks up the final n-gram in the history and returns the following tokens from the most recent matching context. +It does not own or run a separate draft model. Rejected draft tokens do not require manual rollback inside this class, because the next call receives the verified token history through `input_ids`. ### Constructor @@ -86,52 +95,207 @@ def __init__( self, ngram_size: int = 3, num_pred_tokens: int = 10, -) + mode: Literal["k", "k4v"] = "k", + min_hits: int = 2, + max_entries_per_key: Optional[int] = None, + sync_check_tokens: int = 16, +) -> None ``` -| Parameter | Type | Default | Description | -| ----------------- | ----- | ------- | ------------------------------------------------------------------------------------------------------------------------------- | -| `ngram_size` | `int` | `3` | Length of the token sequence used as the lookup key. Larger values require stricter context matches but may produce fewer hits. | -| `num_pred_tokens` | `int` | `10` | Maximum number of draft tokens to return after a matching n-gram is found. | +| Parameter | Type | Default | Source | Description | +|---|---|---|---|---| +| `ngram_size` | `int` | `3` | `__init__` signature | Number of tokens used as the lookup key. Larger values require stricter matches and may reduce hit rate. | +| `num_pred_tokens` | `int` | `10` | `__init__` signature | Maximum number of draft tokens to return. 
| +| `mode` | `Literal["k", "k4v"]` | `"k"` | `__init__` signature | Lookup storage mode. `"k"` stores key-to-position mappings. `"k4v"` stores key-to-continuation mappings. | +| `min_hits` | `int` | `2` | `__init__` signature | Minimum number of historical matches required before returning a draft. Use `1` for maximum recall; use values greater than `1` to reduce low-confidence drafts. | +| `max_entries_per_key` | `Optional[int]` | `None` | `__init__` signature and initialization logic | Optional memory cap per n-gram key. If `mode="k4v"` and this is `None`, it is automatically set to `8`. | +| `sync_check_tokens` | `int` | `16` | `__init__` signature | Number of trailing tokens used to detect whether new input is an incremental append without doing a full prefix comparison. | + +### Parameter Validation + +The constructor raises `ValueError` when: + +| Condition | Error Meaning | +|---|---| +| `ngram_size <= 0` | `ngram_size` must be positive. | +| `num_pred_tokens <= 0` | `num_pred_tokens` must be positive. | +| `min_hits <= 0` | `min_hits` must be positive. | +| `max_entries_per_key is not None and max_entries_per_key <= 0` | The memory cap must be `None` or positive. | +| `sync_check_tokens <= 0` | `sync_check_tokens` must be positive. | +| `mode` is not `"k"` or `"k4v"` after lowercasing | Only the two supported lookup modes are valid. | + +### Lookup Modes + +| Mode | Internal Storage | Memory Use | Behavior | +|---|---|---|---| +| `"k"` | `key -> [position, position, ...]` | Lower | Stores historical positions and slices continuations from `_history` during lookup. | +| `"k4v"` | `key -> {position: continuation}` | Higher | Stores continuation tokens directly and returns the latest cached continuation. | + +Use `"k"` as the general-purpose default. Use `"k4v"` when faster continuation retrieval is preferred and the extra memory use is acceptable. For `"k4v"`, `max_entries_per_key` defaults to `8` when not specified. 
### Important Attributes / State -| Attribute | Type | Source | Description | -| ----------------- | ---------------------------------- | -------------- | -------------------------------------------------------------------------------- | -| `ngram_size` | `int` | constructor | Number of tokens used as the n-gram lookup key. | -| `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | -| `_ngram_map` | `Dict[Tuple[int, ...], List[int]]` | internal cache | Internal inverted index mapping n-gram tuples to positions in the token history. | -| `_history` | `List[int]` | internal cache | Internal token history used to maintain the n-gram map. | +| Attribute | Type | Source | Description | +|---|---|---|---| +| `ngram_size` | `int` | constructor | Number of tokens used as the n-gram lookup key. | +| `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | +| `mode` | `str` | constructor | Active lookup mode: `"k"` or `"k4v"`. | +| `min_hits` | `int` | constructor | Required number of historical matches before returning a draft. | +| `max_entries_per_key` | `Optional[int]` | constructor / initialization logic | Optional per-key memory cap. Automatically becomes `8` for `k4v` mode when not provided. | +| `sync_check_tokens` | `int` | constructor | Trailing-token window used for incremental append detection. | +| `_history` | `List[int]` | internal state | Verified token history mirrored from `input_ids`. | +| `_map_k` | `DefaultDict[Tuple[int, ...], List[int]]` | internal state | Key-to-position index used in `"k"` mode. | +| `_map_k4v` | `DefaultDict[Tuple[int, ...], Dict[int, Tuple[int, ...]]]` | internal state | Key-to-continuation index used in `"k4v"` mode. | +| `_closed` | `bool` | internal state | Marks the decoder as closed. Calling the decoder after `close()` raises `RuntimeError`. | +| `_last_draft_len` | `int` | internal state | Length of the most recent returned draft. 
Currently internal diagnostic state. | + +Internal state should not be mutated directly. + +### Core Methods + +#### `__call__` + +```python +def __call__( + self, + input_ids: npt.NDArray[np.intc], + /, + **kwargs: Any, +) -> npt.NDArray[np.intc] +``` + +Generates draft tokens from verified token history. + +| Parameter | Type | Description | +|---|---|---| +| `input_ids` | `npt.NDArray[np.intc]` | Complete verified token sequence so far. | +| `**kwargs` | `Any` | Accepted for interface compatibility and ignored by this implementation. | + +Returns: + +| Type | Description | +|---|---| +| `npt.NDArray[np.intc]` | Predicted draft tokens. Returns an empty array when no reliable match is found. | + +Raises: + +| Exception | Condition | +|---|---| +| `RuntimeError` | The decoder has been closed with `close()` and is called again. | + +#### `clear` + +```python +def clear(self) -> None +``` + +Clears token history and internal indexes while keeping the decoder reusable. + +Use this when starting a completely unrelated generation with the same decoder instance. + +#### `close` + +```python +def close(self) -> None +``` + +Clears internal containers and marks the decoder as closed. + +This class does not own native memory, but explicit cleanup can be useful in long-running applications that may otherwise keep large Python containers alive. + +#### `accept` + +```python +def accept(self, n_accepted: int) -> None +``` + +Compatibility hook for speculative decoding loops. -`_ngram_map` and `_history` are internal state and should not be modified directly. +This implementation is intentionally a no-op. Accepted tokens are reflected by the next `input_ids` passed to `__call__`, so no separate rollback or acceptance state update is required. ### Behavior When called, `LlamaNGramMapDecoding`: -1. Synchronizes its internal history with the provided `input_ids`. -2. Incrementally updates the n-gram map when tokens are appended. -3. 
Rebuilds the map if the input sequence is no longer a simple continuation, such as after rollback or a new prompt. -4. Uses the last `ngram_size` tokens as the search key. -5. Returns up to `num_pred_tokens` tokens following the most recent historical match. -6. Returns an empty NumPy array if no match is found. +1. Converts `input_ids` to a flat `np.intc` token list. +2. Synchronizes internal history with the verified token sequence. +3. Uses a fast path when the new input is identical to the stored history. +4. Uses an incremental append path when the trailing tokens indicate that the new input extends the previous input. +5. Rebuilds the index after rollback, prompt switch, truncation, or unsafe mutation. +6. Indexes only n-grams with at least one available continuation token, so the current tail n-gram does not match itself. +7. Looks up the final `ngram_size` tokens as the search key. +8. Requires at least `min_hits` historical matches before returning a draft. +9. Returns up to `num_pred_tokens` tokens from the latest valid historical match. +10. Returns an empty NumPy array if no reliable match is available. -### Example +### Example: Direct Prompt Lookup + +Use `min_hits=1` in a small standalone example so that one historical match is enough to return a draft. 
```python import numpy as np + from llama_cpp.llama_speculative import LlamaNGramMapDecoding draft_model = LlamaNGramMapDecoding( ngram_size=3, - num_pred_tokens=5, + num_pred_tokens=2, + min_hits=1, ) -input_ids = np.array([1, 2, 3, 4, 1, 2, 3], dtype=np.intc) - +input_ids = np.array([1, 2, 3, 4, 5, 1, 2, 3], dtype=np.intc) draft_tokens = draft_model(input_ids) print(draft_tokens) +# Expected output: +# [4 5] +``` + +### Example: Use with `Llama` + +```python +from llama_cpp import Llama +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +llm = Llama( + model_path="path/to/model.gguf", + n_ctx=4096, + n_gpu_layers=-1, + draft_model=LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=2, + ), +) + +response = llm.create_chat_completion( + messages=[ + { + "role": "user", + "content": ( + "Write five short Python classes with the same CRUD method layout: " + "User, Product, Order, Review, and Category." + ), + } + ] +) + +print(response["choices"][0]["message"]["content"]) +``` + +### Example: Use `k4v` Mode with a Memory Cap + +```python +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +draft_model = LlamaNGramMapDecoding( + ngram_size=4, + num_pred_tokens=8, + mode="k4v", + min_hits=2, + max_entries_per_key=8, +) ``` ## `LlamaPromptLookupDecoding` @@ -144,7 +308,7 @@ Legacy speculative decoder based on NumPy sliding-window lookup. This implementation is stateless. Each call scans the input token sequence to find previous occurrences of the current n-gram and returns the following tokens as draft predictions. -> Warning: This implementation may have high computational overhead for long contexts. Prefer `LlamaNGramMapDecoding` for new usage. +> Warning: This implementation is not recommended for production. It may have high computational overhead for long contexts and may degrade output quality. Prefer `LlamaNGramMapDecoding` for new usage. 
### Constructor @@ -156,16 +320,16 @@ def __init__( ) ``` -| Parameter | Type | Default | Description | -| ----------------- | ----- | ------- | -------------------------------------------------------------------------- | -| `max_ngram_size` | `int` | `3` | Maximum n-gram size to search for. The decoder tries larger n-grams first. | -| `num_pred_tokens` | `int` | `10` | Maximum number of draft tokens to return. | +| Parameter | Type | Default | Source | Description | +|---|---|---|---|---| +| `max_ngram_size` | `int` | `3` | `__init__` signature | Maximum n-gram size to search for. The decoder tries larger n-grams first. | +| `num_pred_tokens` | `int` | `10` | `__init__` signature | Maximum number of draft tokens to return. | ### Important Attributes / State -| Attribute | Type | Source | Description | -| ----------------- | ----- | ----------- | --------------------------------------------------- | -| `max_ngram_size` | `int` | constructor | Maximum n-gram window size used during lookup. | +| Attribute | Type | Source | Description | +|---|---|---|---| +| `max_ngram_size` | `int` | constructor | Maximum n-gram window size used during lookup. | | `num_pred_tokens` | `int` | constructor | Maximum number of predicted draft tokens to return. | ### Static Method @@ -181,58 +345,71 @@ def find_candidate_pred_tokens( Linearly scans `input_ids` using NumPy sliding windows to find matching n-grams. -| Parameter | Type | Description | -| ----------------- | ---------------------- | ----------------------------------------- | -| `input_ids` | `npt.NDArray[np.intc]` | Complete token sequence. | -| `max_ngram_size` | `int` | Maximum n-gram size to search for. | -| `num_pred_tokens` | `int` | Maximum number of draft tokens to return. | +| Parameter | Type | Description | +|---|---|---| +| `input_ids` | `npt.NDArray[np.intc]` | Complete token sequence. | +| `max_ngram_size` | `int` | Maximum n-gram size to search for. 
| +| `num_pred_tokens` | `int` | Maximum number of draft tokens to return. | Returns: -| Type | Description | -| ---------------------- | --------------------------------------------------------------- | +| Type | Description | +|---|---| | `npt.NDArray[np.intc]` | Candidate draft tokens, or an empty array if no match is found. | -### Example +### Method ```python -from llama_cpp import Llama -from llama_cpp.llama_speculative import LlamaNGramMapDecoding - -llama = Llama( - model_path="path/to/qwen-3.6-27b.gguf", - n_ctx=4096, - n_gpu_layers=-1, - draft_model=LlamaNGramMapDecoding( - ngram_size=3, - num_pred_tokens=10 - ) -) - -response = llama.create_chat_completion( - messages=[{"role": "user", "content": """ - Write a Python script using `sqlite3` to define CRUD (Create, Read, Update, Delete) operations for an e-commerce database. -You need to create 5 separate classes for the following entities: `User`, `Product`, `Order`, `Review`, and `Category`. -Each class MUST have exactly the same internal structure and method names (create, get, update, delete). Do not add extra logic, just the standard boilerplate. - """}] -) +def __call__( + self, + input_ids: npt.NDArray[np.intc], + /, + **kwargs: Any, +) -> npt.NDArray[np.intc] ``` +Calls `find_candidate_pred_tokens` with the instance's `max_ngram_size` and `num_pred_tokens`. + ## Best Practices & Common Patterns -* Prefer `LlamaNGramMapDecoding` for new usage. -* Use `LlamaPromptLookupDecoding` only when compatibility with the older stateless prompt lookup behavior is needed. -* Increase `ngram_size` or `max_ngram_size` for stricter context matching. -* Increase `num_pred_tokens` when you want longer draft proposals, but keep in mind that speculative decoding still depends on later verification by the main model. -* Do not mutate `_ngram_map` or `_history` directly. -* If input token history rolls back or changes unexpectedly, `LlamaNGramMapDecoding` automatically rebuilds its internal cache. 
+- Prefer `LlamaNGramMapDecoding` for new usage. +- Use `mode="k"` as the default memory-efficient mode. +- Use `mode="k4v"` when cached continuations are useful and the additional memory use is acceptable. +- Keep `max_entries_per_key` set for `k4v` mode unless you intentionally want an unbounded per-key cache. +- Use `min_hits=1` for maximum recall in repetitive prompts or benchmarks. +- Use `min_hits > 1` to reduce low-confidence drafts. +- Increase `ngram_size` for stricter pattern matching. +- Increase `num_pred_tokens` to allow longer draft proposals, but remember that the target model still verifies the tokens. +- Call `clear()` before reusing the same decoder for an unrelated prompt or generation session. +- Do not call the decoder again after `close()` unless you create a new instance. +- Do not mutate `_history`, `_map_k`, `_map_k4v`, or other internal state directly. + +## Limitations + +- Prompt lookup only predicts tokens that are already implied by repeated patterns in the verified context. +- It is most useful for repetitive, structured, or boilerplate-heavy output. +- It may return an empty draft when the context has too few repeated n-grams or when `min_hits` is too strict. +- It does not replace target-model verification. +- `LlamaPromptLookupDecoding` is kept for compatibility and is not recommended for production use. ## Deprecated / Changed APIs -`LlamaPromptLookupDecoding` is marked as a legacy NumPy sliding-window implementation in the source code. It is still available, but `LlamaNGramMapDecoding` is the preferred implementation for faster repeated calls over long contexts. +`LlamaPromptLookupDecoding` is the legacy NumPy sliding-window implementation. It remains available, but `LlamaNGramMapDecoding` is the preferred prompt lookup implementation for new code. 
+ +Compared with the older `LlamaNGramMapDecoding` documentation, the current implementation adds: + +- `mode` +- `min_hits` +- `max_entries_per_key` +- `sync_check_tokens` +- `clear()` +- `close()` +- `accept()` +- Separate internal indexes for `k` and `k4v` modes ## Related Links * [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] * [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +* [[Benchmark_Speculative](https://github.com/JamePeng/llama-cpp-python/blob/main/examples/benchmark/benchmark_speculative.py)] From d90895d33c7868d9c949d9c1648d33ad3ebc7f8e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 11:02:33 +0800 Subject: [PATCH 442/518] docs(readme): revamp speculative decoding documentation Expand the Speculative Decoding section to fully document the new `LlamaNGramMapDecoding` capabilities and configuration options. - Clarify that `LlamaNGramMapDecoding` is a model-free prompt lookup decoder that does not require a secondary GGUF draft model. - Add a detailed parameter table explaining `mode` (k vs. k4v), `min_hits`, memory caps, and sync thresholds. - Provide usage examples and tuning recommendations for different hardware (e.g., lowering `num_pred_tokens` for CPU setups). - Demote the older `LlamaPromptLookupDecoding` to a legacy section, warning about its sliding-window overhead on long contexts. - Add practical notes on performance and state management (`clear()`). Signed-off-by: JamePeng --- README.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 85 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 6c56c034c3..1986a6ca54 100644 --- a/README.md +++ b/README.md @@ -1592,44 +1592,116 @@ emb = llm.create_embedding("text") --- -### Speculative Decoding +## Speculative Decoding -`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model. 
+`llama-cpp-python` supports speculative decoding through a `draft_model` passed to the `Llama` class. -The fastest way to use speculative decoding is through the `LlamaNGramMapDecoding`(**Recommend**) or `LlamaPromptLookupDecoding` class. +Speculative decoding lets a draft decoder propose candidate tokens before the main model verifies them. This can improve generation speed, especially for repetitive or structured outputs such as code, JSON, boilerplate text, templates, and long-form responses with repeated patterns. -Just pass this as a draft model to the `Llama` class during initialization. +The recommended built-in draft decoder is `LlamaNGramMapDecoding`. + +Unlike neural draft-model speculative decoding, `LlamaNGramMapDecoding` does not require a second GGUF model. It is a model-free prompt n-gram lookup decoder that predicts draft tokens from already verified token history. ```python from llama_cpp import Llama from llama_cpp.llama_speculative import LlamaNGramMapDecoding llama = Llama( - model_path="path/to/qwen-3.6-27b.gguf", + model_path="path/to/model.gguf", n_ctx=4096, n_gpu_layers=-1, draft_model=LlamaNGramMapDecoding( ngram_size=3, - num_pred_tokens=10 - ) + num_pred_tokens=10, + ), ) response = llama.create_chat_completion( - messages=[{"role": "user", "content": "Write a python script..."}] + messages=[ + { + "role": "user", + "content": "Write a Python script using sqlite3 with repeated CRUD classes.", + } + ] +) +```` + +`LlamaNGramMapDecoding` maintains an internal n-gram index and can reuse repeated token patterns from the current prompt and generated context. Compared with the legacy sliding-window prompt lookup decoder, it avoids scanning the full token history on every call, making draft generation much cheaper for long contexts. 
+ +#### Advanced configuration + +```python +from llama_cpp.llama_speculative import LlamaNGramMapDecoding + +draft_model = LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + mode="k", + min_hits=2, + max_entries_per_key=None, + sync_check_tokens=16, +) +``` + +| Parameter | Default | Description | +| --------------------- | ----------------------------------------: | ------------------------------------------------------------------------------------------------------------------------------------------------ | +| `ngram_size` | `3` | Number of tokens used as the lookup key. Larger values require stricter matches. | +| `num_pred_tokens` | `10` | Maximum number of draft tokens to propose. | +| `mode` | `"k"` | N-gram map mode. `"k"` stores key-to-position mappings. `"k4v"` stores key-to-continuation mappings. | +| `min_hits` | `2` | Minimum number of historical matches required before returning draft tokens. Use `1` for higher recall, or `2+` to reduce low-confidence drafts. | +| `max_entries_per_key` | `None` in `"k"` mode, `8` in `"k4v"` mode | Optional memory cap per n-gram key. Strongly recommended for `"k4v"` mode. | +| `sync_check_tokens` | `16` | Number of trailing tokens used to detect whether the new input is an incremental append or requires rebuilding the internal index. | + +#### Choosing a mode + +`LlamaNGramMapDecoding` supports two modes: + +* `mode="k"`: stores n-gram keys mapped to historical positions. This is the default and is usually the best starting point. +* `mode="k4v"`: stores n-gram keys mapped directly to continuation tokens. This can make continuation lookup cheaper, but uses more memory. When using `"k4v"`, keeping `max_entries_per_key` enabled is recommended. 
+ +For most users, the default configuration is enough: + +```python +draft_model=LlamaNGramMapDecoding() +``` + +For higher recall, especially when the prompt has fewer repeated patterns, you can lower `min_hits`: + +```python +draft_model=LlamaNGramMapDecoding( + ngram_size=3, + num_pred_tokens=10, + min_hits=1, ) ``` -Note: `LlamaPromptLookupDecoding.num_pred_tokens` is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. Now, `LlamaNGramMapDecoding` with the new Hash Map algorithm, draft generation becomes instantaneous $O(1)$, and the time consumption is almost 0 regardless of whether you set the prediction to 2 or 10 words. -### Adjusting the Context Window +For CPU-only machines, smaller draft lengths such as `num_pred_tokens=2` may still be a better tradeoff. For GPU inference, larger values such as `num_pred_tokens=10` are often reasonable, but the best value depends on model size, prompt structure, backend, and acceptance rate. -The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements. +#### Legacy prompt lookup decoder -For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object: +`LlamaPromptLookupDecoding` is still available for compatibility: ```python -llm = Llama(model_path="./models/llama-model.gguf", n_ctx=2048) +from llama_cpp.llama_speculative import LlamaPromptLookupDecoding + +draft_model = LlamaPromptLookupDecoding( + max_ngram_size=3, + num_pred_tokens=10, +) ``` +However, it uses a legacy NumPy sliding-window lookup and may have higher overhead on long contexts. For new usage, prefer `LlamaNGramMapDecoding`. + +#### Notes + +* Speculative decoding still requires the main model to verify proposed draft tokens. 
+* Speedup depends on how many draft tokens are accepted. +* Prompt n-gram speculative decoding works best when the current context contains repeated patterns. +* It is especially useful for code generation, structured text, repeated templates, and boilerplate-heavy completions. +* `LlamaNGramMapDecoding` stores internal Python-side history and indexes. If you want to reuse the same decoder instance for an unrelated generation, call `draft_model.clear()`. + +--- + ## Docker image See here: https://github.com/JamePeng/llama-cpp-python/tree/main/docker#cuda_simple From 5364cf914b590065690eacaaf94ecb2453766a67 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 12:58:02 +0800 Subject: [PATCH 443/518] feat(LlamaContext): add safety checks and docstrings to logits retrieval - Add explicit null pointer validation to `get_logits` and `get_logits_ith`. These methods now raise a `RuntimeError` instead of silently returning invalid pointers when logits are unavailable or the index is out of bounds. - Add comprehensive docstrings to both methods, detailing the underlying buffer shape and memory layout. - Include a performance warning in `get_logits_ith` about the internal synchronization/reordering overhead to discourage its use on the hot path. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index c026440a2d..fda9187855 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -755,12 +755,36 @@ def synchronize(self): llama_cpp.llama_synchronize(self.ctx) def get_logits(self): + """ + Token logits obtained from the last call to llama_decode() + The logits for which llama_batch.logits[i] != 0 are stored contiguously + in the order they have appeared in the batch. 
+ Rows: number of tokens for which llama_batch.logits[i] != 0 + Cols: n_vocab + + Returns: + Pointer to the logits buffer of shape (n_tokens, n_vocab) + """ self._assert_ctx() - return llama_cpp.llama_get_logits(self.ctx) + logits = llama_cpp.llama_get_logits(self.ctx) + if not logits: + raise RuntimeError(f"LlamaContext.get_logits: failed to get logits") + return logits def get_logits_ith(self, i: int): + """ + Return logits for the ith output row from the last llama_decode call. + + Note: + This calls llama_get_logits_ith(), which may reorder/synchronize + the output buffer internally. Avoid calling it on the hot path unless + Python-side logits are required. + """ self._assert_ctx() - return llama_cpp.llama_get_logits_ith(self.ctx, i) + logits = llama_cpp.llama_get_logits_ith(self.ctx, i) + if not logits: + raise RuntimeError(f"LlamaContext.get_logits_ith: invalid logits index {i}") + return logits def set_embeddings(self, embeddings: bool): self._assert_ctx() From 7e0cd122d0af2f9971ebdfc40fb177366c394280 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 13:02:09 +0800 Subject: [PATCH 444/518] build(cmake): disable building of upstream unified binary Set `LLAMA_BUILD_APP` to `OFF` to prevent the compilation of the new unified `llama` binary introduced in upstream llama.cpp. Since the Python package only requires the underlying shared libraries and specific targets, explicitly disabling the standalone application reduces build times and prevents unnecessary executable artifacts from being compiled. 
Signed-off-by: JamePeng --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f6dfb7c136..6f09cdb783 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,6 +222,9 @@ if (LLAMA_BUILD) # Disable building of server set(LLAMA_BUILD_SERVER OFF CACHE BOOL "llama: build server example" FORCE) + # Disable building of unified binary + set(LLAMA_BUILD_APP OFF CACHE BOOL "llama: build the unified binary" FORCE) + # Disable build the embedded Web UI for server set(LLAMA_BUILD_UI OFF CACHE BOOL "llama: build the embedded Web UI for server" FORCE) set(LLAMA_USE_PREBUILT_UI OFF CACHE BOOL "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" FORCE) From 615e45a47f47387e741c12eeca397339fee0e74b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 13:20:06 +0800 Subject: [PATCH 445/518] perf(eval): skip unnecessary logit array copies during native sampling - Introduce the `copy_logits` parameter to `Llama.eval()` to control whether C-level logits are copied into the Python `self.scores` array. - Automatically disable `copy_logits` during the generation loop unless Python-side hooks (`logits_processor`, `stopping_criteria`) or `logits_all` explicitly require them. - Skip logit copies entirely for intermediate prompt evaluations (e.g., before hybrid checkpoints). - Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch the final token's logits when copying is required. In a PDF-reading summarization workload, this reduced the end-to-end completion time from 41.32s to 25.93s, a ~37.2% improvement. 
The main generation hot path also improved noticeably: - `_create_completion`: 41.32s -> 25.93s - `generate`: 37.82s -> below the top sampled entries - `eval`: 35.14s -> 21.96s - logits retrieval/copy path: 29.89s `get_logits()` -> 18.68s `get_logits_ith()` - `decode`: 3.89s -> 2.25s - `detokenize`: 2.60s -> 1.33s - `sample`: 2.35s -> 2.03s This significantly reduces CPU overhead and memory bandwidth during generation, as the native `llama.cpp` sampler reads directly from the C context without needing to expose the `n_vocab` array to Python on every token. Signed-off-by: JamePeng --- llama_cpp/llama.py | 52 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 734485802e..e9d16438e5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1035,11 +1035,20 @@ def eval( tokens: Sequence[int], active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + copy_logits: bool = True, ): """Evaluate a list of tokens. Args: - tokens: The list of tokens to evaluate. + tokens: The token ids to evaluate. + active_loras: Optional LoRA adapters to apply for this evaluation. + Each item should contain a ``name`` and an optional ``scale``. + control_vector: Optional control vector configuration to apply during + this evaluation. + copy_logits: Whether to copy the final logits into ``self.scores`` when + ``logits_all`` is disabled. Set to ``False`` for native sampler paths + that sample directly from the llama context and do not need + Python-side logits. 
""" n_eval = len(tokens) if n_eval == 0: @@ -1246,9 +1255,11 @@ def eval( if self.verbose: print(f"Llama.eval: [Periodic Checkpoint] HybridCheckpoint save failed at pos {current_pos}, skipping update", file=sys.stderr) - # Save the final logit if not in _logits_all mode - if not self._logits_all: - logits_ptr = self._ctx.get_logits() + # Save the final logits only when Python-side logits are required. + # Native sampler can sample directly from ctx, so normal generation does not + # need to copy n_vocab floats into self.scores on every token. + if not self._logits_all and copy_logits: + logits_ptr = self._ctx.get_logits_ith(-1) logits_view = np.ctypeslib.as_array(logits_ptr, shape=(self._n_vocab,)) self.scores[0, :] = logits_view @@ -1666,6 +1677,14 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): self._sampling_ctx = LlamaSamplingContext(params, self._model) + # Native sampler samples directly from ctx. Python-side logits are only needed + # for compatibility hooks that explicitly consume self._scores. + copy_logits = ( + self._logits_all + or logits_processor is not None + or stopping_criteria is not None + ) + sample_idx = self.n_tokens + len(tokens) - 1 tokens = list(tokens) @@ -1685,8 +1704,13 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): body_tokens = tokens[:-1] last_token = [tokens[-1]] - # 1. Evaluate up to N-1 - self.eval(body_tokens, active_loras=active_loras, control_vector=control_vector) + # 1. Evaluate up to N-1 without copying logits. + self.eval( + body_tokens, + active_loras=active_loras, + control_vector=control_vector, + copy_logits=False, + ) # 2. Save the N-1 state snapshot current_history = self._input_ids[:self.n_tokens].tolist() @@ -1695,11 +1719,21 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array): tokens=current_history, seq_id=0 ) - # 3. Evaluate the final token to refresh logits - self.eval(last_token, active_loras=active_loras, control_vector=control_vector) + # 3. 
Evaluate final token. Copy logits only if Python-side hooks need them. + self.eval( + last_token, + active_loras=active_loras, + control_vector=control_vector, + copy_logits=copy_logits, + ) else: # Standard evaluation or single-token generation step - self.eval(tokens, active_loras=active_loras, control_vector=control_vector) + self.eval( + tokens, + active_loras=active_loras, + control_vector=control_vector, + copy_logits=copy_logits, + ) # Sample loop while sample_idx < self.n_tokens: From 4d50e5860798ac4e1706e0de250e01c298a0f126 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 16:34:00 +0800 Subject: [PATCH 446/518] =?UTF-8?q?docs(CUDA):=20Add=20note=20about=20PDL?= =?UTF-8?q?=20optimization=20for=20newer=20NVIDIA=20GPUs=20(CC=20=E2=89=A5?= =?UTF-8?q?=2090)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: JamePeng --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 1986a6ca54..cc83e9814c 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,9 @@ $env:CMAKE_ARGS = "-DGGML_CUDA=on" pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" ``` +Note: **Programmatic Dependent Launch (PDL)** is a CUDA optimization for newer NVIDIA GPUs (CC >= 90; does not include Ada). +It enables stream-level dependency-driven concurrent execution of CUDA kernels within the same stream, achieving similar kernel launch overhead reduction as CUDA Graphs. If you have a newer NVIDIA GPU (e.g. `Hopper`, `Blackwell` and above), you can achieve significant speedups and latency reduction in token generation across nearly all models when compiling with `-DGGML_CUDA_PDL=ON`. + **Pre-built Wheel (New)** It is also possible to install a pre-built wheel with CUDA support.
Make sure your system meets the following requirements: From 8a107375f0e4e2d482ce64ad1e886ffb6ac5df37 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 16:51:25 +0800 Subject: [PATCH 447/518] docs(development): add AI agent prompt for git commit generation Introduce `git-commit-generation-agent.md` to the development wiki to standardize the creation of high-quality git commit messages using LLM assistants. - Define the system persona, core principles (Conventional Commits, DCO), and strict formatting rules for generating commits. - Provide concrete template examples for build, performance, and documentation updates. - Ensure future maintainers and contributors can easily generate consistent, maintainer-level commits that explicitly explain the "Why" and "How" of code changes. Signed-off-by: JamePeng --- .../git-commit-generation-agent.md | 214 ++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 docs/wiki/development/git-commit-generation-agent.md diff --git a/docs/wiki/development/git-commit-generation-agent.md b/docs/wiki/development/git-commit-generation-agent.md new file mode 100644 index 0000000000..4cce635154 --- /dev/null +++ b/docs/wiki/development/git-commit-generation-agent.md @@ -0,0 +1,214 @@ +--- +title: Git Commit Generation Agent +page_type: development-helper +source_file: docs/wiki/development/git-commit-generation-agent.md +last_updated: 2026-05-23 +version_target: "latest" +author: JamePeng +audience: maintainers +--- + +# Git Commit Generation Agent for `llama-cpp-python` + +## Overview + +This page defines a maintainer-facing LLM helper workflow for generating +high-quality, descriptive, and standardized Git commit messages for +`llama-cpp-python`. + +## System Persona +You are an expert C++/Python developer and a core maintainer of the +`llama-cpp-python` project. 
Your task is to generate clear, accurate, and +standardized Git commit messages based on provided diffs, source snippets, +benchmark notes, issue references, or maintainer summaries. + +## Core Principles + +The project follows the **Conventional Commits** specification and requires a +**Developer Certificate of Origin (DCO) Sign-off**. + +Generated commit messages must prioritize: + +- **Why** the change was needed. +- **How** the change was implemented. +- **What** user-visible, runtime, build, packaging, or documentation behavior + changed. +- **What** future maintainers need to know when reading the project history. + +## Input Requirements + +The agent may receive: + +- A full Git diff +- A changed file list +- Source snippets +- Benchmark results +- Maintainer notes +- Issue or PR references +- A natural-language summary of changes + +When the input is incomplete, generate the best possible commit message from the +provided information, but do not invent implementation details. + +## Formatting Rules + +### 1. 
Header Line (Subject) +Use the following format: + +```text +<type>(<scope>): <subject> +```` + +Allowed types: + +| Type | Use for | +| ---------- | ----------------------------------------------------------- | +| `feat` | New features or user-facing capabilities | +| `fix` | Bug fixes | +| `docs` | Documentation-only changes | +| `build` | CMake, build scripts, compiler flags, packaging build logic | +| `perf` | Performance optimizations | +| `ci` | GitHub Actions or other workflow changes | +| `chore` | Maintenance, cleanup, or non-user-facing changes | +| `refactor` | Internal restructuring without behavior change | +| `test` | Test additions or updates | + +Recommended scopes: + +* `llama` +* `core` +* `bindings` +* `sampling` +* `speculative` +* `cache` +* `chat` +* `multimodal` +* `embedding` +* `types` +* `cmake` +* `windows` +* `cuda` +* `metal` +* `ci` +* `docs` +* `readme` +* `packaging` + +Subject rules: + +* Use imperative mood, such as `add`, `fix`, `update`, `skip`, `expose`. +* Do not use past tense, such as `added`, `fixed`, or `updated`. +* Keep the subject under 72 characters when possible. +* Use lowercase unless a proper noun, symbol, or API name requires otherwise. +* Do not end the subject with a period. + +### 2. Body +Leave one blank line between the header and the body. +The body should: +* Start with a short paragraph explaining the motivation or problem. +* Use bullets when the diff contains multiple logical changes. +* Mention important files, classes, functions, flags, or APIs using Markdown + backticks. +* Keep lines wrapped at around 72-80 characters. +* Mention user-visible behavior changes when relevant. +* Mention performance impact only when supported by the input. + +### 3. Footer (Sign-off) +* Leave one blank line after the body. +* You MUST append a generic DCO sign-off line at the very end. +* **Format:** `Signed-off-by: Developer Name <email@example.com>` + +--- + +## Accuracy Rules + +* Do not invent changed files, functions, APIs, benchmarks, flags, or behavior.
+* Do not claim performance improvements unless benchmark data is provided or the + diff clearly supports the optimization. +* Do not mention issue or PR numbers unless provided by the user. +* Do not include migration notes unless the change affects user-facing APIs. +* If the change is documentation-only, do not imply runtime behavior changed. +* If the change is internal-only, do not overstate it as a user-facing feature. +* Prefer specific technical descriptions over generic wording. + +## Output Rules + +When the user provides a code diff or a summary of changes, analyze the intent +and output only the raw Git commit message. + +Do not: + +* Wrap the commit message in Markdown code fences. +* Add explanations before or after the commit message. +* Add headings such as `Commit message:`. +* Include alternative versions unless explicitly requested. + +## Output Examples + +### Example 1: Build System Change +```text +build(cmake): package LLVM OpenMP runtime DLL for Windows wheels + +Dynamically loaded GGML CPU backends compiled with LLVM/Clang and OpenMP +require `libomp140.x86_64.dll` at runtime. Since this dependency is not +always caught by `$<TARGET_RUNTIME_DLLS:...>`, it must be packaged manually. + +- Add `llama_cpp_python_install_windows_runtime_file` to handle installing + arbitrary extra DLLs with proper CMake path normalization. +- Add fallback search logic to locate the OpenMP DLL in common Visual Studio + directories. +- Execute the installation before the dev-file cleanup step to ensure the + DLL is correctly packaged in the final Python wheel. + +Signed-off-by: Developer Name <developer@example.com> + +``` + +### Example 2: Performance Optimization + +```text +perf(eval): skip unnecessary logit array copies during native sampling + +Introduce a `copy_logits` flag to `Llama.eval()` to control whether C-level +logits are copied into the Python `self.scores` array. 
+ +- Automatically disable `copy_logits` during the generation loop unless + Python-side hooks (`logits_processor`, `stopping_criteria`) explicitly + require them. +- Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch + the final token's logits when copying is required. + +This significantly reduces CPU overhead and memory bandwidth during generation, +as the native `llama.cpp` sampler reads directly from the C context without +needing to expose the `n_vocab` array to Python on every token. + +Signed-off-by: Developer Name <developer@example.com> + +``` + +### Example 3: Documentation Update + +```text +docs(speculative): document n-gram map k/k4v modes and new parameters + +Reflect the recent architectural upgrades to `LlamaNGramMapDecoding` in +the official documentation. + +- Document the new `__init__` parameters (`mode`, `min_hits`, + `max_entries_per_key`) and their validation rules. +- Add a detailed comparison table explaining the memory and behavior + differences between the `"k"` and `"k4v"` lookup modes. +- Add a strong production warning against the legacy `LlamaPromptLookupDecoding` + implementation. + +Signed-off-by: Developer Name <developer@example.com> + +``` + +## Execution + +When the user provides a code diff or a summary of changes, analyze the intent and output ONLY the raw Git commit message following the exact structure and tone demonstrated above. + +## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] From 1b0ae7097688a0c328f5c4149afa7b9f519318fd Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 23 May 2026 17:14:35 +0800 Subject: [PATCH 448/518] docs(wiki): add development helper to index Introduce the development section in the wiki index so maintainer-facing workflows and LLM-assisted helper tools are discoverable from the main navigation. - Add a Development section with a link to the Git commit generation agent. Include the helper in the recommended reading order for new wiki users. 
- Add development/git-commit-generation-agent.md to the available pages list. Signed-off-by: JamePeng jame_peng@sina.com --- docs/wiki/index.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/wiki/index.md b/docs/wiki/index.md index 143d6e629b..c721fc4e89 100644 --- a/docs/wiki/index.md +++ b/docs/wiki/index.md @@ -34,6 +34,18 @@ These pages document major source modules and related classes. --- +### Development + +This section contains maintainer-facing development notes, workflows, and LLM-assisted helper tools for working on `llama-cpp-python`. + +#### Pages + +| Page | Description | +|---|---| +| [[development/Git Commit Generation Agent]] | Helper workflow for generating clear, structured, and source-aware Git commit messages. | + +--- + ### Wiki Maintenance These pages define how the wiki should be written, updated, and reviewed. @@ -55,9 +67,9 @@ If you are new to this wiki, read the pages in this order: 4. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] 5. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] 6. [[modules/Logger\|Logger](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/Logger.md)] +7. [[development/Git Commit Generation Agent](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/development/git-commit-generation-agent.md)] If you are contributing documentation, start with: - 1. [[SCHEMA|Wiki Schema](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md)] 2. 
[[contributing-to-wiki|Contributing to the Wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md)] @@ -75,6 +87,7 @@ Currently available pages: - `modules/LlamaGrammar.md` - `modules/LlamaSpeculative.md` - `modules/Logger.md` +- `development/git-commit-generation-agent.md` - `SCHEMA.md` - `contributing-to-wiki.md` From 0239328f3f22ba87fd74351d96c5c65f6c95f95a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 25 May 2026 19:55:26 +0800 Subject: [PATCH 449/518] Update Submodule vendor/llama.cpp 1acee6b..328874d Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 1 + vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index ec2b665a16..238a1a4fe1 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2804,6 +2804,7 @@ def llama_state_seq_load_file( LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 1 # // keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load) +# // Getting the state for a seq_id with this flag invalidates all prior states gotten for that seq_id with this flag. 
LLAMA_STATE_SEQ_FLAGS_ON_DEVICE = 2 llama_state_seq_flags = ctypes.c_uint32 diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1acee6bf89..328874d054 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1acee6bf8939948f9bcbf4b14034e4b475f06069 +Subproject commit 328874d054e0eb44591202a23c209cf02c18e3cb From a32daf797a5e1aea527c8957da1f25f631ba98e9 Mon Sep 17 00:00:00 2001 From: Jay0360 Date: Wed, 27 May 2026 21:12:15 +0800 Subject: [PATCH 450/518] fix: wire LFM VL chat handlers into server loader --- llama_cpp/server/model.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 37c5195687..6b3fd1dd15 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -199,6 +199,34 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = llama_cpp.llama_chat_format.Qwen25VLChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) + elif settings.chat_format == "lfm2-vl": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.LFM2VLChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.LFM2VLChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) + elif settings.chat_format == "lfm2.5-vl": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.LFM25VLChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.LFM25VLChatHandler( + clip_model_path=settings.clip_model_path, 
verbose=settings.verbose + ) elif settings.chat_format == "hf-autotokenizer": assert ( settings.hf_pretrained_model_name_or_path is not None From d9cc25bcb4e563eed910454f6fb5faa5b736124a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 27 May 2026 22:10:47 +0800 Subject: [PATCH 451/518] Update Submodule vendor/llama.cpp 328874d..617255d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 328874d054..617255d437 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 328874d054e0eb44591202a23c209cf02c18e3cb +Subproject commit 617255d437898fcef6c3d80d4994b307454da850 From 4a6c311364ca3463619c107d37e0ae8a4c0cd98b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 28 May 2026 00:55:17 +0800 Subject: [PATCH 452/518] refactor(internals): align model metadata wrappers with llama.cpp API - Use `llama_vocab_n_tokens()` instead of the old vocab size helper. - Add Python wrappers for model description, size, chat template, and trained RoPE frequency scaling. - Clarify model capability helpers with docstrings matching llama.cpp semantics. - Rename `desc()` and `size()` to `model_desc()` and `model_size()` to make their scope explicit. - Drop the unused `get_tensor()` stub since llama.cpp does not expose it. - Route rerank template lookup through `LlamaModel.model_chat_template()` for consistency with the internal model abstraction. 
Signed-off-by: JamePeng --- llama_cpp/_internals.py | 65 +++++++++++++++++++++++++++--------- llama_cpp/llama.py | 9 ++++- llama_cpp/llama_embedding.py | 4 +-- 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index fda9187855..5416ce2416 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -102,7 +102,7 @@ def vocab_type(self) -> int: return llama_cpp.llama_vocab_type(self.model) def n_vocab(self) -> int: - return llama_cpp.llama_n_vocab(self.vocab) + return llama_cpp.llama_vocab_n_tokens(self.vocab) def n_ctx_train(self) -> int: return llama_cpp.llama_model_n_ctx_train(self.model) @@ -131,41 +131,76 @@ def n_head_kv(self) -> int: def n_swa(self) -> int: return llama_cpp.llama_model_n_swa(self.model) + def rope_freq_scale_train(self) -> float: + """ + Get the model's RoPE frequency scaling factor + """ + return llama_cpp.llama_model_rope_freq_scale_train(self.model) + + def model_desc(self) -> str: + """ + Get a string describing the model type + """ + buf = ctypes.create_string_buffer(256) + llama_cpp.llama_model_desc(self.model, buf, 256) + return buf.value.decode("utf-8") + + def model_size(self) -> int: + """ + Returns the total size of all the tensors in the model in bytes + """ + return llama_cpp.llama_model_size(self.model) + + def model_chat_template(self, name: bytes) -> str: + """ + Get the default chat template. 
Returns nullptr if not available + If name is NULL, returns the default chat template + """ + return llama_cpp.llama_model_chat_template(self.model, name).decode("utf-8") + def n_params(self) -> int: + """ + Returns the total number of parameters in the model + """ return llama_cpp.llama_model_n_params(self.model) def has_encoder(self) -> bool: + """ + Returns true if the model contains an encoder that requires llama_encode() call + """ return llama_cpp.llama_model_has_encoder(self.model) def has_decoder(self) -> bool: + """ + Returns true if the model contains a decoder that requires llama_decode() call + """ return llama_cpp.llama_model_has_decoder(self.model) def decoder_start_token(self) -> int: + """ + For encoder-decoder models, this function returns id of the token that must be provided + to the decoder to start generating output sequence. For other models, it returns -1. + """ return llama_cpp.llama_model_decoder_start_token(self.model) def is_recurrent(self) -> bool: + """ + Returns true if the model is recurrent (like Mamba, RWKV, etc.) + """ return llama_cpp.llama_model_is_recurrent(self.model) def is_hybrid(self) -> bool: + """ + Returns true if the model is hybrid (like Jamba, Granite, etc.) + """ return llama_cpp.llama_model_is_hybrid(self.model) def is_diffusion(self) -> bool: + """ + Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) 
+ """ return llama_cpp.llama_model_is_diffusion(self.model) - def rope_freq_scale_train(self) -> float: - return llama_cpp.llama_model_rope_freq_scale_train(self.model) - - def desc(self) -> str: - buf = ctypes.create_string_buffer(1024) - llama_cpp.llama_model_desc(self.model, buf, 1024) - return buf.value.decode("utf-8") - - def size(self) -> int: - return llama_cpp.llama_model_size(self.model) - - def get_tensor(self, name: str) -> ctypes.c_void_p: - raise NotImplementedError("get_tensor is not implemented in llama.cpp") - # Vocab def token_get_text(self, token: int) -> str: diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index e9d16438e5..c2d2757e13 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -696,13 +696,20 @@ def __init__( try: self.metadata = self._model.metadata() + self.model_desc = self._model.model_desc() + # The total size of all the tensors in the model in bytes + self.model_size = self._model.model_size() + except Exception as e: self.metadata = {} if self.verbose: print(f"Failed to load metadata: {e}", file=sys.stderr) if self.verbose: - print(f"Model metadata: {self.metadata}", file=sys.stderr) + print(f"Model desc: {self.model_desc}, " + f"Model size: {self.model_size / (1024 * 1024):.2f} MB, " + f"Model metadata: {self.metadata}", + file=sys.stderr) eos_token_id = self.token_eos() bos_token_id = self.token_bos() diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py index 7c8ad1e90f..0c1df339ce 100644 --- a/llama_cpp/llama_embedding.py +++ b/llama_cpp/llama_embedding.py @@ -303,9 +303,7 @@ def rank(self, query: str, documents: List[str]) -> List[float]: # 1. Attempt to retrieve the built-in 'rerank' chat template from model metadata. # Modern GGUF models often include a template for formatting query/document pairs. 
- rerank_template = llama_cpp.llama_model_chat_template(self._model.model, b"rerank") - if rerank_template: - rerank_template = rerank_template.decode("utf-8") + rerank_template = self._model.model_chat_template(b"rerank") batch_inputs: List[List[int]] = [] From 677db7b0d5b834ae3d3831af4702ec21986ab335 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Thu, 28 May 2026 00:12:35 +0200 Subject: [PATCH 453/518] Resolve file conflicts. --- .github/workflows/build-wheels-cu131-win.yml | 25 -------------------- 1 file changed, 25 deletions(-) diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml index 5f77003a5f..14bea65d19 100644 --- a/.github/workflows/build-wheels-cu131-win.yml +++ b/.github/workflows/build-wheels-cu131-win.yml @@ -67,31 +67,6 @@ jobs: echo LIB=%LIB%>>%GITHUB_ENV% echo LIBPATH=%LIBPATH%>>%GITHUB_ENV% - - name: Copy LLVM OpenMP runtime - shell: pwsh - run: | - # GGML CPU all-variant backends are built with LLVM OpenMP on Windows. - # The dynamically loaded ggml-cpu-*.dll files depend on this runtime. - # If it is missing from the wheel, ggml_backend_load_all_from_path() - # may fail to load CPU backend DLLs at runtime. - $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib" - New-Item -ItemType Directory -Force $packageLibDir | Out-Null - - $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" ` - -Recurse ` - -Filter "libomp140.x86_64.dll" ` - -ErrorAction SilentlyContinue | - Where-Object { $_.FullName -match "OpenMP\.LLVM" } | - Select-Object -First 1 - - if (!$omp) { - Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables." 
- exit 1 - } - - Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force - Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)" - - name: Build wheel run: | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') From 4794c8c20ee731838cbc2c8d601ccb2c245d6893 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Thu, 28 May 2026 01:52:48 +0200 Subject: [PATCH 454/518] Added support when using the keyword 'audio' instead of 'audio_url'. --- llama_cpp/llama_chat_format.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f9b9d52367..254195f95a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2996,13 +2996,13 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa media_items.append({"url": url, "type": "image"}) # 2. Audio Processing - elif content_type in ["audio_url", "input_audio"]: + elif content_type in ["audio", "audio_url", "input_audio"]: if not self.is_support_audio: raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") # Case A: Handle custom/forward-compatible audio_url format - if content_type == "audio_url": - audio_url = content["audio_url"] + if content_type == "audio_url" or content_type == "audio": + audio_url = content[content_type] url = audio_url if isinstance(audio_url, str) else audio_url["url"] media_items.append({"url": url, "type": "audio"}) # Case B: Handle OpenAI standard input_audio format From 103639ce04b72d09e09ce895f3c8d8cfba518e13 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 28 May 2026 21:48:01 +0800 Subject: [PATCH 455/518] Update Submodule vendor/llama.cpp 617255d..6ed481e Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 617255d437..6ed481eea4 160000 --- a/vendor/llama.cpp +++ 
b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 617255d437898fcef6c3d80d4994b307454da850 +Subproject commit 6ed481eea4cf4ed40777db2fa29e8d08eb712b3b From 6c9e7bf92c346806f91ef06f2522b0def7611f10 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Fri, 29 May 2026 22:51:33 +0800 Subject: [PATCH 456/518] feat(chat_handler): update multimodal handlers for Qwen2.5-VL, Qwen3-VL, and PaddleOCR - Update PaddleOCRChatHandler to support version 1.6 - Add token configuration and stop sequences for Qwen2.5-VL and Qwen3-VL - Standardize input_ids initialization in __call__ methods for Qwen2.5-VL, Qwen3-ASR, and Qwen3-VL handlers Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0365d8f871..cf5dca2492 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -5324,7 +5324,7 @@ def __call__(self, **kwargs): class PaddleOCRChatHandler(MTMDChatHandler): """ - Handler for PaddleOCR 1.5 multimodal models. + Handler for PaddleOCR 1.5/1.6 multimodal models. 
""" PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" @@ -5431,6 +5431,11 @@ def __call__(self, **kwargs): class Qwen25VLChatHandler(MTMDChatHandler): + + QWEN25_VL_BOS_TOKEN = "<|endoftext|>" + QWEN25_VL_PAD_TOKEN = "<|endoftext|>" + QWEN25_VL_EOS_TOKEN = "<|im_end|>" + CHAT_FORMAT = ( "{% set image_count = namespace(value=0) %}" "{% for message in messages %}" @@ -5462,6 +5467,8 @@ class Qwen25VLChatHandler(MTMDChatHandler): ) def __call__(self, **kwargs): + kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN] + llama = kwargs['llama'] if hasattr(llama, 'input_ids'): @@ -5547,12 +5554,22 @@ def __call__(self, **kwargs): # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN] + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + if self.verbose: print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") return super().__call__(**kwargs) class Qwen3VLChatHandler(MTMDChatHandler): + + QWEN3_VL_BOS_TOKEN = "<|endoftext|>" + QWEN3_VL_PAD_TOKEN = "<|endoftext|>" + QWEN3_VL_EOS_TOKEN = "<|im_end|>" + CHAT_FORMAT = ( "{{- '<|im_start|>system\n' -}}" "{%- if messages[0].content is string and messages[0].role == 'system' -%}" @@ -5661,6 +5678,8 @@ def __init__( self.extra_template_arguments["add_vision_id"] = add_vision_id def __call__(self, **kwargs): + kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN] + llama = kwargs['llama'] if hasattr(llama, 'input_ids'): From 69e740ce51b064be36fa5e28214839429f89c94e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 30 May 2026 01:21:11 +0800 Subject: [PATCH 457/518] Update Submodule vendor/llama.cpp 6ed481e..06d26df Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6ed481eea4..06d26dfdff 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 
@@ -Subproject commit 6ed481eea4cf4ed40777db2fa29e8d08eb712b3b +Subproject commit 06d26dfdff4097dc51eac20155371a9cfd53e094 From e7976f42b23ce29491d1b48bd044682ce4f261a2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 30 May 2026 01:24:56 +0800 Subject: [PATCH 458/518] feat(mtmd): improve fallback chat template for multimodal models - Add BOS/EOS token handling to the default MTMD chat format. - Use a clearer role-based template with explicit USER and ASSISTANT prefixes. - Append a newline after each message to keep generated prompts readable. - Treat EOS as the end marker for the serialized conversation history before the optional generation prompt. - Improve fallback behavior for multimodal GGUF models that do not provide a chat template, such as OCR-oriented models like DeepSeek-OCR 1/2. - Make the default system prompt a single normalized string while preserving its original meaning. - Clean up minor formatting around MTMD context parameter initialization. This improves prompt compatibility for multimodal models that either lack a GGUF chat template or are not yet covered by a complete custom chat handler. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index cf5dca2492..71228d0627 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2811,21 +2811,20 @@ def generate_streaming(tools, functions, function_call, prompt): class MTMDChatHandler: DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( -"""You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, -while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful.""" +"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " +"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful." 
) CHAT_FORMAT = ( + "{{ bos_token if bos_token is defined else '' }}" "{% for message in messages %}" "{% if message.role == 'system' %}" "{{ message.content }}" - "{% endif %}" - - "{% if message.role == 'user' %}" + "{% elif message.role == 'user' %}" + "USER: " "{% if message.content is string %}" - "\nUSER: {{ message.content }}" + "{{ message.content }}" "{% elif message.content is iterable %}" - "\nUSER: " "{% for content in message.content %}" "{% if content.type == 'image_url' %}" "{{ content.image_url if content.image_url is string else content.image_url.url }}" @@ -2842,15 +2841,19 @@ class MTMDChatHandler: "{% endif %}" "{% endfor %}" "{% endif %}" - "{% endif %}" - "{% if message.role == 'assistant' and message.content is not none %}" - "\nASSISTANT: {{ message.content }}" + "{% elif message.role == 'assistant' and message.content is not none %}" + "ASSISTANT: {{ message.content }}" "{% endif %}" + "{{ \"\n\" }}" "{% endfor %}" + "{% if eos_token is defined %}" + "{{ eos_token }}" + "{% endif %}" + "{% if add_generation_prompt %}" - "\nASSISTANT: " + "ASSISTANT: " "{% endif %}" ) @@ -2906,7 +2909,7 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): self.mctx_params.use_gpu = self.use_gpu self.mctx_params.print_timings = self.verbose self.mctx_params.n_threads = llama_model.n_threads - self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO self.mctx_params.warmup = True if self.image_min_tokens > 0: self.mctx_params.image_min_tokens = self.image_min_tokens From 1df7ffc07b7a8f52000614d9f63a90f8b80f0d6f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 30 May 2026 02:16:12 +0800 Subject: [PATCH 459/518] docs(Readme): Update Deepseek-OCR-2-GGUF Link Signed-off-by: JamePeng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index cc83e9814c..c39df4abd7 100644 --- 
a/README.md +++ b/README.md @@ -953,6 +953,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [granite-docling](https://huggingface.co/ibm-granite/granite-docling-258M-GGUF) | `GraniteDoclingChatHandler` | `granite-docling` | | [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | `lfm2-vl` | | [lfm2.5-vl](https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF) | `LFM25VLChatHandler` | `lfm2.5-vl` | +| [deepseek-ocr](https://huggingface.co/JamePeng2023/DeepSeek-OCR-2-GGUF) | `MTMDChatHandler` | `None` | | [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-asr](https://huggingface.co/JamePeng2023/Qwen3-ASR-1.7B-GGUF) | `Qwen3ASRChatHandler` | `qwen3-asr` | From c4efcff5c1534e0a3946809bec6d0e97e374bf4a Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 19:13:39 +0800 Subject: [PATCH 460/518] Update Submodule vendor/llama.cpp 06d26df..d4c8e2c Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 06d26dfdff..d4c8e2c29c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 06d26dfdff4097dc51eac20155371a9cfd53e094 +Subproject commit d4c8e2c29ce2fb9a251a0a4a16d6c857b4f70f8c From 6a7fde40a2d96bee1da4c004bf3ac0c31b2432d4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 19:27:11 +0800 Subject: [PATCH 461/518] ci : update metal build/test job to macos-26/macos-15-intel - Build on the Tahoe runners in order to enable the tensor API for M5 and A19. 
Signed-off-by: JamePeng --- .github/workflows/build-wheels-metal.yaml | 7 +++---- .github/workflows/test.yaml | 12 ++++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index a809909720..2b00d1abaa 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -8,8 +8,8 @@ permissions: jobs: build_wheels: - name: Build wheels (Metal macos) - runs-on: macos-latest + name: Build wheels (Metal macos-26) + runs-on: macos-26 outputs: version: ${{steps.get_version.outputs.version}} @@ -53,8 +53,7 @@ jobs: -DCMAKE_CROSSCOMPILING=on -DGGML_METAL=on -DGGML_METAL_USE_BF16=on - -DGGML_METAL_EMBED_LIBRARY=off - -DGGML_METAL_SHADER_DEBUG=on" + -DGGML_METAL_EMBED_LIBRARY=on" with: package-dir: . output-dir: wheelhouse2 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 420c5e9495..a9f359d1cd 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -28,21 +28,21 @@ jobs: python-version: ["3.9", "3.14"] include: # macOS Non-Metal - - os: macos-14 + - os: macos-15-intel python-version: "3.9" - cmake_args: "-DLLAMA_METAL=off" + cmake_args: "-DLLAMA_METAL=off -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3" metal_status: "(No Metal)" - - os: macos-14 + - os: macos-15-intel python-version: "3.14" - cmake_args: "-DLLAMA_METAL=off" + cmake_args: "-DLLAMA_METAL=off -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3" metal_status: "(No Metal)" # macOS Metal - - os: macos-14 + - os: macos-26 python-version: "3.9" cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" metal_status: "(Metal)" - - os: macos-14 + - os: macos-26 python-version: "3.14" cmake_args: "-DLLAMA_METAL=on -DGGML_METAL_USE_BF16=on -DGGML_METAL_EMBED_LIBRARY=on" metal_status: "(Metal)" From ea0907d3870aabbeaf669f42bd1b484a2d7e7c83 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 20:06:49 +0800 Subject: [PATCH 
462/518] refactor(llama_cpp): wrap llama constants into enum.IntEnum - Group global `LLAMA_*` constants into `enum.IntEnum` classes (`llama_vocab_type`, `llama_vocab_pre_type`, `llama_rope_type`, etc.) for better type safety and organization. - Sync new values for `llama_vocab_pre_type` (`SARVAM_MOE`, `MINICPM5`, `WHITESPACE`). Signed-off-by: JamePeng --- llama_cpp/llama_cpp.py | 245 +++++++++++++++++++++-------------------- 1 file changed, 128 insertions(+), 117 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 238a1a4fe1..62c4c81ef9 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -122,20 +122,21 @@ # LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization # LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming # }; -LLAMA_VOCAB_TYPE_NONE = 0 -"""For models without vocab""" -LLAMA_VOCAB_TYPE_SPM = 1 -"""LLaMA tokenizer based on byte-level BPE with byte fallback""" -LLAMA_VOCAB_TYPE_BPE = 2 -"""GPT-2 tokenizer based on byte-level BPE""" -LLAMA_VOCAB_TYPE_WPM = 3 -"""BERT tokenizer based on WordPiece""" -LLAMA_VOCAB_TYPE_UGM = 4 -"""T5 tokenizer based on Unigram""" -LLAMA_VOCAB_TYPE_RWKV = 5 -"""RWKV tokenizer based on greedy tokenization""" -LLAMA_VOCAB_TYPE_PLAMO2 = 6 -"""PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming""" +class llama_vocab_type(enum.IntEnum): + LLAMA_VOCAB_TYPE_NONE = 0 + """For models without vocab""" + LLAMA_VOCAB_TYPE_SPM = 1 + """LLaMA tokenizer based on byte-level BPE with byte fallback""" + LLAMA_VOCAB_TYPE_BPE = 2 + """GPT-2 tokenizer based on byte-level BPE""" + LLAMA_VOCAB_TYPE_WPM = 3 + """BERT tokenizer based on WordPiece""" + LLAMA_VOCAB_TYPE_UGM = 4 + """T5 tokenizer based on Unigram""" + LLAMA_VOCAB_TYPE_RWKV = 5 + """RWKV tokenizer based on greedy tokenization""" + LLAMA_VOCAB_TYPE_PLAMO2 = 6 + """PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming""" # NOTE: Deprecated and will be removed in 
the future. (already gone in llama.cpp) @@ -193,58 +194,65 @@ # LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, # LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, # LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, +# LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51, +# LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52, +# LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53, # }; -LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 -LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 -LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2 -LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3 -LLAMA_VOCAB_PRE_TYPE_FALCON = 4 -LLAMA_VOCAB_PRE_TYPE_MPT = 5 -LLAMA_VOCAB_PRE_TYPE_STARCODER = 6 -LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 -LLAMA_VOCAB_PRE_TYPE_REFACT = 8 -LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9 -LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10 -LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11 -LLAMA_VOCAB_PRE_TYPE_OLMO = 12 -LLAMA_VOCAB_PRE_TYPE_DBRX = 13 -LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 -LLAMA_VOCAB_PRE_TYPE_PORO = 15 -LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 -LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 -LLAMA_VOCAB_PRE_TYPE_VIKING = 18 -LLAMA_VOCAB_PRE_TYPE_JAIS = 19 -LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20 -LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21 -LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22 -LLAMA_VOCAB_PRE_TYPE_BLOOM = 23 -LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24 -LLAMA_VOCAB_PRE_TYPE_EXAONE = 25 -LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 -LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 -LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 -LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 -LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 -LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 -LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 -LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33 -LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 -LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 -LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36 -LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37 -LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38 -LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39 -LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40 -LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41 -LLAMA_VOCAB_PRE_TYPE_AFMOE = 42 -LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43 -LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 -LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 -LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 -LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 
-LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48 -LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49 -LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50 +class llama_vocab_pre_type(enum.IntEnum): + LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 + LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2 + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3 + LLAMA_VOCAB_PRE_TYPE_FALCON = 4 + LLAMA_VOCAB_PRE_TYPE_MPT = 5 + LLAMA_VOCAB_PRE_TYPE_STARCODER = 6 + LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 + LLAMA_VOCAB_PRE_TYPE_REFACT = 8 + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9 + LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10 + LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11 + LLAMA_VOCAB_PRE_TYPE_OLMO = 12 + LLAMA_VOCAB_PRE_TYPE_DBRX = 13 + LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 + LLAMA_VOCAB_PRE_TYPE_PORO = 15 + LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16 + LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17 + LLAMA_VOCAB_PRE_TYPE_VIKING = 18 + LLAMA_VOCAB_PRE_TYPE_JAIS = 19 + LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20 + LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21 + LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22 + LLAMA_VOCAB_PRE_TYPE_BLOOM = 23 + LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24 + LLAMA_VOCAB_PRE_TYPE_EXAONE = 25 + LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 + LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 + LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 + LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 + LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 + LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 + LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33 + LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 + LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 + LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36 + LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37 + LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38 + LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39 + LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40 + LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41 + LLAMA_VOCAB_PRE_TYPE_AFMOE = 42 + LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43 + LLAMA_VOCAB_PRE_TYPE_YOUTU = 44 + LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45 + LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46 + LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47 + LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48 + LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49 + LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50 + 
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51 + LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52 + LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53 # // note: these values should be synchronized with ggml_rope @@ -257,12 +265,13 @@ # LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE, # LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION, # }; -LLAMA_ROPE_TYPE_NONE = -1 -LLAMA_ROPE_TYPE_NORM = 0 -LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 -LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 -LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 -LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 +class llama_rope_type(enum.IntEnum): + LLAMA_ROPE_TYPE_NONE = -1 + LLAMA_ROPE_TYPE_NORM = 0 + LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 + LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 + LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 + LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 # enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file @@ -274,13 +283,14 @@ # LLAMA_TOKEN_TYPE_UNUSED = 5, # LLAMA_TOKEN_TYPE_BYTE = 6, # }; -LLAMA_TOKEN_TYPE_UNDEFINED = 0 -LLAMA_TOKEN_TYPE_NORMAL = 1 -LLAMA_TOKEN_TYPE_UNKNOWN = 2 -LLAMA_TOKEN_TYPE_CONTROL = 3 -LLAMA_TOKEN_TYPE_USER_DEFINED = 4 -LLAMA_TOKEN_TYPE_UNUSED = 5 -LLAMA_TOKEN_TYPE_BYTE = 6 +class llama_token_type(enum.IntEnum): + LLAMA_TOKEN_TYPE_UNDEFINED = 0 + LLAMA_TOKEN_TYPE_NORMAL = 1 + LLAMA_TOKEN_TYPE_UNKNOWN = 2 + LLAMA_TOKEN_TYPE_CONTROL = 3 + LLAMA_TOKEN_TYPE_USER_DEFINED = 4 + LLAMA_TOKEN_TYPE_UNUSED = 5 + LLAMA_TOKEN_TYPE_BYTE = 6 # enum llama_token_attr { @@ -355,45 +365,46 @@ # # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; -LLAMA_FTYPE_ALL_F32 = 0 -LLAMA_FTYPE_MOSTLY_F16 = 1 -LLAMA_FTYPE_MOSTLY_Q4_0 = 2 -LLAMA_FTYPE_MOSTLY_Q4_1 = 3 -LLAMA_FTYPE_MOSTLY_Q8_0 = 7 -LLAMA_FTYPE_MOSTLY_Q5_0 = 8 -LLAMA_FTYPE_MOSTLY_Q5_1 = 9 -LLAMA_FTYPE_MOSTLY_Q2_K = 10 -LLAMA_FTYPE_MOSTLY_Q3_K_S = 11 -LLAMA_FTYPE_MOSTLY_Q3_K_M = 12 -LLAMA_FTYPE_MOSTLY_Q3_K_L = 13 -LLAMA_FTYPE_MOSTLY_Q4_K_S = 14 
-LLAMA_FTYPE_MOSTLY_Q4_K_M = 15 -LLAMA_FTYPE_MOSTLY_Q5_K_S = 16 -LLAMA_FTYPE_MOSTLY_Q5_K_M = 17 -LLAMA_FTYPE_MOSTLY_Q6_K = 18 -LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19 -LLAMA_FTYPE_MOSTLY_IQ2_XS = 20 -LLAMA_FTYPE_MOSTLY_Q2_K_S = 21 -LLAMA_FTYPE_MOSTLY_IQ3_XS = 22 -LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23 -LLAMA_FTYPE_MOSTLY_IQ1_S = 24 -LLAMA_FTYPE_MOSTLY_IQ4_NL = 25 -LLAMA_FTYPE_MOSTLY_IQ3_S = 26 -LLAMA_FTYPE_MOSTLY_IQ3_M = 27 -LLAMA_FTYPE_MOSTLY_IQ2_S = 28 -LLAMA_FTYPE_MOSTLY_IQ2_M = 29 -LLAMA_FTYPE_MOSTLY_IQ4_XS = 30 -LLAMA_FTYPE_MOSTLY_IQ1_M = 31 -LLAMA_FTYPE_MOSTLY_BF16 = 32 -# LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33 -# LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34 -# LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35 -LLAMA_FTYPE_MOSTLY_TQ1_0 = 36 -LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 -LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38 -LLAMA_FTYPE_MOSTLY_NVFP4 = 39 -LLAMA_FTYPE_MOSTLY_Q1_0 = 40 -LLAMA_FTYPE_GUESSED = 1024 +class llama_ftype(enum.IntEnum): + LLAMA_FTYPE_ALL_F32 = 0 + LLAMA_FTYPE_MOSTLY_F16 = 1 + LLAMA_FTYPE_MOSTLY_Q4_0 = 2 + LLAMA_FTYPE_MOSTLY_Q4_1 = 3 + LLAMA_FTYPE_MOSTLY_Q8_0 = 7 + LLAMA_FTYPE_MOSTLY_Q5_0 = 8 + LLAMA_FTYPE_MOSTLY_Q5_1 = 9 + LLAMA_FTYPE_MOSTLY_Q2_K = 10 + LLAMA_FTYPE_MOSTLY_Q3_K_S = 11 + LLAMA_FTYPE_MOSTLY_Q3_K_M = 12 + LLAMA_FTYPE_MOSTLY_Q3_K_L = 13 + LLAMA_FTYPE_MOSTLY_Q4_K_S = 14 + LLAMA_FTYPE_MOSTLY_Q4_K_M = 15 + LLAMA_FTYPE_MOSTLY_Q5_K_S = 16 + LLAMA_FTYPE_MOSTLY_Q5_K_M = 17 + LLAMA_FTYPE_MOSTLY_Q6_K = 18 + LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19 + LLAMA_FTYPE_MOSTLY_IQ2_XS = 20 + LLAMA_FTYPE_MOSTLY_Q2_K_S = 21 + LLAMA_FTYPE_MOSTLY_IQ3_XS = 22 + LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23 + LLAMA_FTYPE_MOSTLY_IQ1_S = 24 + LLAMA_FTYPE_MOSTLY_IQ4_NL = 25 + LLAMA_FTYPE_MOSTLY_IQ3_S = 26 + LLAMA_FTYPE_MOSTLY_IQ3_M = 27 + LLAMA_FTYPE_MOSTLY_IQ2_S = 28 + LLAMA_FTYPE_MOSTLY_IQ2_M = 29 + LLAMA_FTYPE_MOSTLY_IQ4_XS = 30 + LLAMA_FTYPE_MOSTLY_IQ1_M = 31 + LLAMA_FTYPE_MOSTLY_BF16 = 32 + # LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33 + # LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34 + # LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35 + LLAMA_FTYPE_MOSTLY_TQ1_0 = 36 + 
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 + LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38 + LLAMA_FTYPE_MOSTLY_NVFP4 = 39 + LLAMA_FTYPE_MOSTLY_Q1_0 = 40 + LLAMA_FTYPE_GUESSED = 1024 # enum llama_rope_scaling_type { # LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, From ca81fd457969bba20d183d27962e28b45d9207ea Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 20:33:38 +0800 Subject: [PATCH 463/518] feat: add `ReasoningBudgetState` enum and `TokenMatcher` helper class to _internals.py Introduce `ReasoningBudgetState` enum and `TokenMatcher` helper class to `_internals.py`. This lays the groundwork for the upcoming `ReasoningBudgetSampler`, mirroring the state machine defined in `common/reasoning-budget.h`. - `ReasoningBudgetState`: Tracks the lifecycle of the first reasoning block. - `TokenMatcher`: Handles incremental matching for multi-token sequences. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 5416ce2416..5b5c533c52 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1265,6 +1265,60 @@ class CommonSamplerType(enum.IntEnum): CUSTOM = 99 + +# common/reasssoning-budget.h +# +# enum common_reasoning_budget_state { +# REASONING_BUDGET_IDLE, // waiting for start sequence +# REASONING_BUDGET_COUNTING, // counting down tokens +# REASONING_BUDGET_FORCING, // forcing budget message + end sequence +# REASONING_BUDGET_WAITING_UTF8, // budget exhausted, waiting for UTF-8 completion +# REASONING_BUDGET_DONE, // passthrough forever +# }; +class ReasoningBudgetState(enum.IntEnum): + """ + State machine for the generic first-reasoning-block budget controller. + + This sampler only controls the first reasoning block. Once the first block + naturally ends or is forcibly closed, the sampler enters DONE and becomes a + permanent passthrough. + """ + + IDLE = 0 # Waiting for the first reasoning_start sequence. 
+ COUNTING = 1 # Counting generated tokens inside the first reasoning block. + FORCING = 2 # Forcing reasoning_budget_message + reasoning_end. + WAITING_UTF8 = 3 # Budget exhausted; waiting for a complete UTF-8 boundary. + DONE = 4 # Permanent passthrough; later reasoning tags are ignored. + + +class TokenMatcher: + """ + Incremental matcher for a multi-token sequence. + Accepts None as tokens to represent no matcher. + """ + def __init__(self, tokens: Optional[Sequence[int]]): + # If None, matcher never matches anything + self.tokens = list(tokens) if tokens is not None else [] + self.pos = 0 + + def advance(self, token: int) -> bool: + if not self.tokens: + return False + if token == self.tokens[self.pos]: + self.pos += 1 + if self.pos >= len(self.tokens): + self.pos = 0 + return True + else: + self.pos = 0 + if token == self.tokens[0]: + self.pos = 1 + return False + + def reset(self) -> None: + self.pos = 0 + + @dataclass class LlamaSamplingParams: seed: int = llama_cpp.LLAMA_DEFAULT_SEED # the seed used to initialize llama_sampler From ab42b8664313a30c390fcf26caaec9602199c0f4 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 22:48:42 +0800 Subject: [PATCH 464/518] docs(readme): update supported embeddings models table - Add jina-embeddings-v2-base-zh - Add jina-embeddings-v3 - Minor table formatting clean up Signed-off-by: JamePeng --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c39df4abd7..c5aa1a1b26 100644 --- a/README.md +++ b/README.md @@ -1463,7 +1463,9 @@ run_inference( | Model | Type | Link | Status | |--------------------|-----------|--------------------------------------------------------|--------------| -| `bge-m3` | Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`bge-m3`| Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`jina-embeddings-v2-base-zh`| Embedding 
|[jina-embeddings-v2-base-zh-GGUF](https://huggingface.co/gpustack/jina-embeddings-v2-base-zh-GGUF) | Useful ✅ | +|`jina-embeddings-v3`| Embedding |[jina-embeddings-v3-GGUF](https://huggingface.co/second-state/jina-embeddings-v3-GGUF) | Useful ✅ | |`bge-reranker-v2-m3`| Rerank |[bge-reranker-v2-m3-GGUF](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF) | Useful ✅ | |`qwen3-reranker`| Rerank |[Qwen3-Reranker-GGUF](https://huggingface.co/JamePeng2023/Qwen3-Reranker-GGUF) | Useful ✅ | From 90d610ffd7b491603ca23c3b0027629553731658 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 31 May 2026 23:11:38 +0800 Subject: [PATCH 465/518] docs(llama_embedding): update supported embeddings models table - Add jina-embeddings-v2-base-zh - Add jina-embeddings-v3 - Minor table formatting clean up Signed-off-by: JamePeng --- docs/wiki/modules/LlamaEmbedding.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/wiki/modules/LlamaEmbedding.md b/docs/wiki/modules/LlamaEmbedding.md index 1279db5cab..3aa2427227 100644 --- a/docs/wiki/modules/LlamaEmbedding.md +++ b/docs/wiki/modules/LlamaEmbedding.md @@ -3,7 +3,7 @@ title: Llama Embedding module_name: llama_cpp.llama_embedding source_file: llama_cpp/llama_embedding.py class_name: LlamaEmbedding -last_updated: 2026-05-01 +last_updated: 2026-05-31 version_target: "latest" --- @@ -18,7 +18,9 @@ version_target: "latest" | Model | Type | Link | Status | |--------------------|-----------|--------------------------------------------------------|--------------| -| `bge-m3` | Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`bge-m3`| Embedding |[bge-m3-GGUF](https://huggingface.co/gpustack/bge-m3-GGUF) | Useful ✅ | +|`jina-embeddings-v2-base-zh`| Embedding |[jina-embeddings-v2-base-zh-GGUF](https://huggingface.co/gpustack/jina-embeddings-v2-base-zh-GGUF) | Useful ✅ | +|`jina-embeddings-v3`| Embedding 
|[jina-embeddings-v3-GGUF](https://huggingface.co/second-state/jina-embeddings-v3-GGUF) | Useful ✅ | |`bge-reranker-v2-m3`| Rerank |[bge-reranker-v2-m3-GGUF](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF) | Useful ✅ | |`qwen3-reranker`| Rerank |[Qwen3-Reranker-GGUF](https://huggingface.co/JamePeng2023/Qwen3-Reranker-GGUF) | Useful ✅ | From e174c1073c3c9408b6325ea1fac63688efacbb2e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 1 Jun 2026 09:07:29 +0800 Subject: [PATCH 466/518] feat(sampling): add reasoning budget configurations Introduce reasoning budget and block control parameters to `LlamaSamplingParams` to mirror llama.cpp CLI semantics. This includes: - `reasoning_budget` - `reasoning_start` / `reasoning_end` - `reasoning_budget_message` - `reasoning_start_in_prompt` - `reasoning_start_max_tokens` - Fix typo from typ_p to typical_p in logs Also updated `print_params()` to include these new metrics. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 64 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 5b5c533c52..9a22096a26 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1363,6 +1363,59 @@ class LlamaSamplingParams: default_factory=lambda: ["\n", ":", "\"", "*"] # default sequence breakers for DRY ) + # Reasoning Budget Params + # + # Generic first-reasoning-block budget control. + # + # This is intentionally model-agnostic: + # - It does not infer model families. + # - It does not guess reasoning tags from chat templates. + # - Downstream code should pass reasoning_start / reasoning_end explicitly + # for models that do not use the default ... tags. + # + # The sampler only controls the first visible reasoning block. After that + # block naturally ends or is forcibly closed, later reasoning tags are ignored. 
+ # Matches llama.cpp CLI semantics: + # --reasoning-budget N + reasoning_budget: int = -1 # -1 = unrestricted / disabled, 0 = immediate end, N > 0 = token budget + + # Token/text sequence that marks the beginning of the first reasoning block. + # This sequence is tokenized with add_bos=False, special=True before building + # the ReasoningBudgetSampler. + reasoning_start: str = "" + + # Token/text sequence that marks the natural end of the reasoning block. + # When the budget is exhausted, the sampler forces: + # reasoning_budget_message + reasoning_end + reasoning_end: str = "" + + # Optional message injected before reasoning_end when the budget is exhausted. + # Mirrors llama.cpp CLI semantics: + # --reasoning-budget-message MESSAGE + # + # Example forced text: + # "[reasoning budget exhausted]\n" + reasoning_budget_message: Optional[str] = None + + # True when the prompt/chat template has already inserted reasoning_start. + # + # In that case, the sampler will not see the start tag during generation, so + # it must start directly in COUNTING state from the first generated token. + reasoning_start_in_prompt: bool = False + + # Safety window for non-reasoning models. + # + # If reasoning_start is not generated within this many output tokens, the + # sampler permanently switches to DONE and becomes a no-op. This prevents + # later literal mentions of "" in normal answer text from accidentally + # activating the budget controller. + # + # Ignored when reasoning_start_in_prompt=True because counting starts from + # the first generated token. + # + # Set to None to keep waiting for reasoning_start indefinitely. 
+ reasoning_start_max_tokens: Optional[int] = 32 + custom_samplers: List['CustomSampler'] = field(default_factory=list) samplers: List[CommonSamplerType] = field( @@ -1402,11 +1455,18 @@ def print_params(self) -> str: f"\ttop_k = {self.top_k}, top_p = {self.top_p:.3f}, min_p = {self.min_p:.3f}, " f"xtc_probability = {self.xtc_probability:.3f}, xtc_threshold = {self.xtc_threshold:.3f}, " - f"typical_p = {self.typ_p:.3f}, top_n_sigma = {self.top_n_sigma:.3f}, temp = {self.temp:.3f}\n" + f"typical_p = {self.typical_p:.3f}, top_n_sigma = {self.top_n_sigma:.3f}, temp = {self.temp:.3f}\n" f"\tmirostat = {self.mirostat}, mirostat_lr = {self.mirostat_eta:.3f}, " f"mirostat_ent = {self.mirostat_tau:.3f}, adaptive_target = {self.adaptive_target:.3f}, " - f"adaptive_decay = {self.adaptive_decay:.3f}" + f"adaptive_decay = {self.adaptive_decay:.3f}\n" + + f"\treasoning_budget = {self.reasoning_budget}, " + f"reasoning_start = {self.reasoning_start!r}, reasoning_end = {self.reasoning_end!r}\n" + + f"\treasoning_budget_message = {self.reasoning_budget_message!r}, " + f"reasoning_start_in_prompt = {self.reasoning_start_in_prompt}, " + f"reasoning_start_max_tokens = {self.reasoning_start_max_tokens}" ) return result From 9bb06dacc676cda4678e20ba3171f90e4e9e9362 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 01:01:31 +0800 Subject: [PATCH 467/518] Update Submodule vendor/llama.cpp d4c8e2c..27d9ed8 Signed-off-by: JamePeng --- llama_cpp/llama.py | 11 +++++++++-- llama_cpp/llama_cpp.py | 6 ++++++ vendor/llama.cpp | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c2d2757e13..b9a1265b49 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -120,6 +120,7 @@ def __init__( n_ubatch: int = 512, n_seq_max: int = 1, n_rs_seq: int = 0, + n_outputs_max: int = 0, n_threads: Optional[int] = None, n_threads_batch: Optional[int] = None, ctx_type: Optional[ @@ -478,7 +479,8 @@ def __init__( self.n_batch = 
min(n_ctx, n_batch) # ??? self.n_keep = n_keep if n_keep > 0 else 256 self.n_seq_max = n_seq_max - self.n_rs_seq = n_rs_seq + self.n_rs_seq = n_rs_seq + self.n_outputs_max = n_outputs_max self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() @@ -490,8 +492,13 @@ def __init__( self.context_params.n_ctx = n_ctx self.context_params.n_batch = self.n_batch self.context_params.n_ubatch = min(self.n_batch, n_ubatch) - self.context_params.n_seq_max = self.n_seq_max + + self.context_params.n_seq_max = max(1, self.n_seq_max) + if self.context_params.n_seq_max > llama_cpp_lib.LLAMA_MAX_SEQ: + raise RuntimeError(f"n_seq_max must be <= {llama_cpp_lib.LLAMA_MAX_SEQ}") + self.context_params.n_rs_seq = self.n_rs_seq + self.context_params.n_outputs_max = self.n_batch if self.n_outputs_max == 0 else self.n_outputs_max self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 62c4c81ef9..01aa8cce9b 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -55,6 +55,8 @@ LLAMA_MAX_DEVICES = _lib.llama_max_devices() +LLAMA_MAX_SEQ = 256 + # define LLAMA_DEFAULT_SEED 0xFFFFFFFF LLAMA_DEFAULT_SEED = 0xFFFFFFFF @@ -847,6 +849,7 @@ class llama_sampler_seq_config(ctypes.Structure): # uint32_t n_ubatch; // physical maximum batch size # uint32_t n_seq_max; // max number of sequences (i.e. 
distinct states for recurrent models) # uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] +# uint32_t n_outputs_max; // max outputs in a ubatch (0 = n_batch) # int32_t n_threads; // number of threads to use for generation # int32_t n_threads_batch; // number of threads to use for batch processing @@ -905,6 +908,7 @@ class llama_context_params(ctypes.Structure): n_ubatch (int): physical maximum batch size n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models) n_rs_seq (int): number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] + n_outputs_max (int): max outputs in a ubatch (0 = n_batch) n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing @@ -949,6 +953,7 @@ class llama_context_params(ctypes.Structure): n_ubatch: int n_seq_max: int n_rs_seq: int + n_outputs_max: int n_threads: int n_threads_batch: int ctx_type: int @@ -985,6 +990,7 @@ class llama_context_params(ctypes.Structure): ("n_ubatch", ctypes.c_uint32), ("n_seq_max", ctypes.c_uint32), ("n_rs_seq", ctypes.c_uint32), + ("n_outputs_max", ctypes.c_uint32), ("n_threads", ctypes.c_int32), ("n_threads_batch", ctypes.c_int32), ("ctx_type", ctypes.c_int), diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d4c8e2c29c..27d9ed8397 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d4c8e2c29ce2fb9a251a0a4a16d6c857b4f70f8c +Subproject commit 27d9ed839713e31c7a0ba45e342109a04549834f From a7db23afd86269bb9c08c00b00f2d23288880e50 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 03:09:17 +0800 Subject: [PATCH 468/518] feat(chat-format): improve Jinja2ChatFormatter HF compatibility Enhance Jinja2ChatFormatter to better support HuggingFace-style chat templates while keeping the formatter lightweight and aligned with llama-cpp-python's prompt-rendering needs. 
This change adds a custom Jinja extension for `{% generation %}` blocks. HuggingFace Transformers uses this tag to track assistant-token spans for assistant masks, but llama-cpp-python only needs the final rendered prompt. The new IgnoreGenerationTags extension therefore treats the tag as a transparent wrapper: it removes the generation/endgeneration tag pair while rendering the inner template body normally. This allows templates that contain `{% generation %}` blocks to render successfully without introducing span tracking overhead. The Jinja environment is also expanded to more closely match Transformers' chat-template runtime behavior. It now enables `jinja2.ext.loopcontrols` for templates that use `{% break %}` or `{% continue %}`, registers a plain JSON `tojson` filter that avoids Jinja's HTML escaping behavior, and exposes `raise_exception` and `strftime_now` as globals instead of passing them on every render call. The formatter now accepts an optional `special_tokens_map`, making additional tokenizer special tokens available to templates. This improves compatibility with templates that reference variables such as `pad_token`, `unk_token`, `sep_token`, or model-specific special tokens beyond `bos_token` and `eos_token`. This also adds optional `documents` support to `__call__`, allowing RAG-style or document-aware chat templates to receive a `documents` variable in the render context. Finally, static stop fields are precomputed during initialization. Text stop sequences and token-id stopping criteria are now built once instead of being recreated for every chat formatting call. The token-id stopping callback also guards against empty token arrays before reading the last token. Key changes: - Add IgnoreGenerationTags Jinja extension for HF `{% generation %}` blocks. - Enable Jinja loop controls for chat templates using break/continue. - Register Transformers-compatible `tojson` behavior. - Register `raise_exception` and `strftime_now` as Jinja globals. 
- Add `special_tokens_map` support for additional template variables. - Add optional `documents` argument for document-aware templates. - Precompute text stop sequences and token-id stopping criteria. - Improve type normalization for `stop_token_ids`. - Expand docstrings for formatter initialization and render-time variables. Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 264 +++++++++++++++++++++++++++++---- 1 file changed, 232 insertions(+), 32 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 71228d0627..f91844bbb7 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -26,6 +26,7 @@ ) import jinja2 +from jinja2.ext import Extension from jinja2.sandbox import ImmutableSandboxedEnvironment import numpy as np @@ -220,6 +221,46 @@ def __call__( class Jinja2ChatFormatter(ChatFormatter): + class IgnoreGenerationTags(Extension): + """Render HuggingFace `{% generation %}` blocks without tracking. + + HuggingFace chat templates may wrap assistant text with: + + {% generation %} + ... + {% endgeneration %} + + Transformers uses this tag to compute assistant-token masks. In + llama-cpp-python chat formatting we only need the final rendered prompt, + so this extension simply removes the tag pair and renders the inner + content as normal Jinja template content. + + This keeps compatibility with HF templates while avoiding the overhead + of span tracking. + + More information see: + https://github.com/huggingface/transformers/blob/39603d0e5cdb6f00e8d473d7fcbb01032d709181/src/transformers/utils/chat_template_utils.py#L425 + """ + + tags = {"generation"} + + def parse(self, parser: jinja2.parser.Parser): + # Consume the opening `{% generation %}` token. + lineno = next(parser.stream).lineno + + # Parse and return the block body until `{% endgeneration %}`. + # Returning the body directly makes the tag a transparent wrapper. 
+ body = parser.parse_statements( + ("name:endgeneration",), + drop_needle=True, + ) + + # Preserve line numbers for better template error messages. + for node in body: + node.set_lineno(lineno) + + return body + def __init__( self, template: str, @@ -227,21 +268,118 @@ def __init__( bos_token: str, add_generation_prompt: bool = True, stop_token_ids: Optional[List[int]] = None, + special_tokens_map: Optional[Dict[str, str]] = None, ): - """A chat formatter that uses jinja2 templates to format the prompt.""" + """Format chat messages with a HuggingFace-style Jinja2 chat template. + + Args: + template: + Raw HuggingFace chat template string. + eos_token: + Text form of the model EOS token. + bos_token: + Text form of the model BOS token. + add_generation_prompt: + Whether to ask the template to append the assistant generation + prefix. This mirrors Transformers' `add_generation_prompt`. + stop_token_ids: + Optional token ids that should stop generation when they appear + as the last generated token. This is llama-cpp-python specific. + special_tokens_map: + Optional tokenizer special-token map. Some HF templates may + reference extra variables such as `pad_token`, `unk_token`, + `sep_token`, or model-specific special tokens. + """ self.template = template self.eos_token = eos_token self.bos_token = bos_token self.add_generation_prompt = add_generation_prompt + self.special_tokens_map = special_tokens_map or {} + self.stop_token_ids = ( - set(stop_token_ids) if stop_token_ids is not None else None + {int(token_id) for token_id in stop_token_ids} + if stop_token_ids is not None + else None ) - self._environment = ImmutableSandboxedEnvironment( + environment = ImmutableSandboxedEnvironment( loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True, - ).from_string(self.template) + # Keep this aligned with Transformers' chat-template Jinja setup: + # - IgnoreGenerationTags supports `{% generation %}` blocks. 
+ # - loopcontrols supports `{% break %}` and `{% continue %}`. + extensions=[ + Jinja2ChatFormatter.IgnoreGenerationTags, + jinja2.ext.loopcontrols, + ], + ) + + # Match Transformers' chat-template JSON behavior. + # Jinja's default `tojson` escapes HTML characters, which is not what + # plain-text chat templates usually expect. + environment.filters["tojson"] = self.tojson + + # Register these as globals once instead of passing them on every render. + environment.globals["raise_exception"] = self.raise_exception + environment.globals["strftime_now"] = self.strftime_now + + self._environment = environment + self._template = environment.from_string(self.template) + + # Precompute static stop fields once. This avoids rebuilding closures and + # StoppingCriteriaList objects for every chat completion request. + self._stop = [self.eos_token] if self.eos_token else [] + self._stopping_criteria = self._build_stopping_criteria() + + @staticmethod + def raise_exception(message: str): + """Raise a Jinja template error from inside a chat template.""" + raise jinja2.exceptions.TemplateError(message) + + @staticmethod + def strftime_now(format_string: str = "%Y-%m-%d %H:%M:%S") -> str: + """Return the current local time formatted with `datetime.strftime`.""" + return datetime.datetime.now().strftime(format_string) + + @staticmethod + def tojson( + x: Any, + ensure_ascii: bool = False, + indent: Optional[int] = None, + separators: Optional[Tuple[str, str]] = None, + sort_keys: bool = False, + ) -> str: + """Serialize an object to JSON for chat-template rendering. + + This intentionally bypasses Jinja's built-in `tojson` filter because + the built-in filter escapes HTML-sensitive characters. HuggingFace chat + templates expect plain JSON text instead. 
+ """ + return json.dumps( + x, + ensure_ascii=ensure_ascii, + indent=indent, + separators=separators, + sort_keys=sort_keys, + ) + + def _build_stopping_criteria(self): + """Create stopping criteria once during initialization.""" + if self.stop_token_ids is None: + return None + + stop_token_ids = self.stop_token_ids + + def stop_on_last_token( + tokens: npt.NDArray[np.intc], + logits: npt.NDArray[np.single], + ) -> bool: + # Defensive guard: generation normally calls this with at least one + # token, but the callback should never crash on empty input. + return len(tokens) > 0 and int(tokens[-1]) in stop_token_ids + + return llama_core.StoppingCriteriaList([stop_on_last_token]) def __call__( self, @@ -251,44 +389,106 @@ def __call__( function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + documents: Optional[List[Dict[str, Any]]] = None, **kwargs: Any, ) -> ChatFormatterResponse: - def raise_exception(message: str): - raise ValueError(message) + """Render OpenAI-style chat messages into a model prompt. - def strftime_now(format_string="%Y-%m-%d %H:%M:%S") -> str: - """ - Returns the current time formatted as a string. - """ - return datetime.datetime.now().strftime(format_string) + The method builds the variable context expected by HuggingFace-style + Jinja chat templates and renders the final prompt string used by + llama-cpp-python. - prompt = self._environment.render( - messages=messages, - eos_token=self.eos_token, - bos_token=self.bos_token, - raise_exception=raise_exception, - strftime_now=strftime_now, - add_generation_prompt=self.add_generation_prompt, - functions=functions, - function_call=function_call, - tools=tools, - tool_choice=tool_choice, - ) + Template variables provided by default: + messages: + The chat history to render. 
Each item is expected to be an + OpenAI-style message dictionary, usually containing at least + `role` and `content`. - stopping_criteria = None - if self.stop_token_ids is not None: + eos_token: + The model's end-of-sequence token string. + + bos_token: + The model's beginning-of-sequence token string. + + add_generation_prompt: + Whether the template should append the assistant generation + prefix. This mirrors Transformers' `add_generation_prompt`. + + functions: + Legacy OpenAI-compatible function definitions, if provided. - def stop_on_last_token( - tokens: npt.NDArray[np.intc], logits: npt.NDArray[np.single] - ) -> bool: - return tokens[-1] in self.stop_token_ids + function_call: + Legacy OpenAI-compatible function-call selection, if provided. - stopping_criteria = llama_core.StoppingCriteriaList([stop_on_last_token]) + tools: + OpenAI/HuggingFace-compatible tool definitions, if provided. + This formatter expects tools to already be normalized into + JSON-schema-like dictionaries. It does not auto-convert Python + callables into JSON schemas like Transformers can. + + tool_choice: + Optional tool-choice instruction, such as `"auto"`, `"none"`, + or a specific tool/function selection object. + + documents: + Optional RAG/document context. Some HF chat templates reference + this variable when rendering retrieval-augmented prompts. + + **kwargs: + Extra model-specific or template-specific variables. These are + merged into the template context last, so they can intentionally + override the defaults above when needed. + + Additional variables: + Values from `special_tokens_map` are also exposed to the template, + such as `pad_token`, `unk_token`, `sep_token`, or custom + model-specific special tokens. Core variables like `messages`, + `eos_token`, and `bos_token` override `special_tokens_map` entries + by default. 
+ + Returns: + ChatFormatterResponse: + Contains the rendered prompt, text stop sequences, optional + token-id stopping criteria, and `added_special=True` because the + chat template is responsible for adding model special tokens. + + Raises: + jinja2.exceptions.TemplateError: + If the template calls `raise_exception(...)` or Jinja rendering + fails. + """ + template_kwargs: Dict[str, Any] = {} + + # Make extra tokenizer special tokens available to templates, e.g. + # `pad_token`, `unk_token`, `sep_token`, or model-specific tokens. + template_kwargs.update(self.special_tokens_map) + + # Explicit core variables should override values from special_tokens_map. + template_kwargs.update( + { + "messages": messages, + "eos_token": self.eos_token, + "bos_token": self.bos_token, + "add_generation_prompt": self.add_generation_prompt, + "functions": functions, + "function_call": function_call, + "tools": tools, + "tool_choice": tool_choice, + "documents": documents, + } + ) + + # Let caller-provided kwargs extend the template context. + # If a caller intentionally passes a same-name key, it will override the + # defaults above. This is useful for model-specific template variables. + template_kwargs.update(kwargs) + + prompt = self._template.render(**template_kwargs) return ChatFormatterResponse( prompt=prompt, - stop=[self.eos_token], - stopping_criteria=stopping_criteria, + stop=self._stop, + stopping_criteria=self._stopping_criteria, added_special=True, ) From bbede198b8012b702bc1e6d241f0887b6e3336a2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 03:40:31 +0800 Subject: [PATCH 469/518] feat(llama): enhance chat template initialization with full special tokens Update Llama.__init__ to register additional tokenizer special tokens and improve stop token handling for chat templates. - Expose extra special tokens (EOT, SEP, NL, PAD, MASK) via `special_tokens_map` to Jinja2ChatFormatter. 
- Keep BOS and EOS tokens as explicit parameters, no longer redundantly put them in `special_tokens_map`. - Build `stop_token_ids` once, including EOS and EOT tokens, skipping invalid (-1) ids. - Update try-block comment: now `{% generation %}` blocks are supported, guard only against malformed or model-specific templates. - This ensures better compatibility with HuggingFace-style chat templates while maintaining llama-cpp-python prompt-rendering behavior. Signed-off-by: JamePeng --- llama_cpp/llama.py | 48 +++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b9a1265b49..43e3d6f1fd 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -692,9 +692,6 @@ def __init__( self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() - self._token_nl = self.token_nl() - self._token_eos = self.token_eos() - self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) self.n_tokens = 0 @@ -720,13 +717,38 @@ def __init__( eos_token_id = self.token_eos() bos_token_id = self.token_bos() + eot_token_id = self.token_eot() + sep_token_id = self.token_sep() + nl_token_id = self.token_nl() + pad_token_id = self.token_pad() + mask_token_id = self.token_mask() + + def _token_text(token_id: int) -> str: + return self._model.token_get_text(token_id) if token_id != -1 else "" + + bos_token = _token_text(bos_token_id) + eos_token = _token_text(eos_token_id) + + special_tokens_map = { + name: text + for name, token_id in { + "eot_token": eot_token_id, + "sep_token": sep_token_id, + "nl_token": nl_token_id, + "pad_token": pad_token_id, + "mask_token": mask_token_id, + }.items() + if token_id != -1 and (text := _token_text(token_id)) + } - eos_token = ( - self._model.token_get_text(eos_token_id) if eos_token_id != -1 else "" - ) - bos_token = ( - self._model.token_get_text(bos_token_id) if bos_token_id != -1 else "" - ) + stop_token_ids = [ + token_id + for token_id in 
(eos_token_id, eot_token_id) + if token_id != -1 + ] + + if not stop_token_ids: + stop_token_ids = None # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates template_choices = dict( @@ -750,14 +772,14 @@ def __init__( for name, template in template_choices.items(): try: # Attempt to parse and register the template as a valid chat handler. - # We wrap this in a try-block because some models (like LLaVA) contain - # non-standard Jinja2 tags (e.g., {% generation %}) that cause the - # standard parser to crash. + # Keep this guarded because model metadata may contain malformed or + # model-specific Jinja templates that still cannot be rendered by this runtime. self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter( template=template, eos_token=eos_token, bos_token=bos_token, - stop_token_ids=[eos_token_id], + stop_token_ids=stop_token_ids, + special_tokens_map=special_tokens_map, ).to_chat_handler() except Exception as e: # If parsing fails (e.g., TemplateSyntaxError), log a warning but do not crash. From e6b58356323d116df141b163f40be3ec988cf290 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 04:54:18 +0800 Subject: [PATCH 470/518] docs: update SCHEMA.md to v0.4 with full wiki path layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added comprehensive docs/wiki/ directory structure overview. - Reorganized modules description; removed hardcoded module page list. - Clarified top-level file purposes and update guidance. - Updated page type examples and templates (Class/Module, Feature, Example, Development). - Strengthened cross-linking rules and update/placeholder guidance. - Bumped schema version from 0.3 → 0.4 and last_modified date. 
Signed-off-by: JamePeng --- docs/wiki/SCHEMA.md | 141 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 21 deletions(-) diff --git a/docs/wiki/SCHEMA.md b/docs/wiki/SCHEMA.md index 23954a156e..1ffcb1e227 100644 --- a/docs/wiki/SCHEMA.md +++ b/docs/wiki/SCHEMA.md @@ -4,14 +4,15 @@ - **Author**: JamePeng - **Maintainer**: LLM-assisted documentation workflow - **Project**: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) wiki -- **Last Modified**: 2026-05-16 +- **Last Modified**: 2026-06-02 - **Version Target**: latest source code -- **Schema Version**: 0.3 +- **Schema Version**: 0.4 **Purpose**: - Maintain a living, always-up-to-date, structured documentation wiki for the `llama-cpp-python` library, with LLMs acting as the primary documentation maintainer. - The wiki must help users understand the latest public API, core classes, modules, configuration options, examples, and migration paths based on the current source code. - The wiki should explain not only *how to call an API*, but also *what role the class/module plays in the library*, *how its state is configured*, and *how users should choose between related APIs*. +- The schema also defines the expected wiki directory layout, page ownership, and update rules so new pages can be generated consistently. **Core Principles**: - The source of truth is the latest code in `llama_cpp/`, especially: @@ -29,7 +30,7 @@ - Prefer documenting public and user-facing APIs first. Internal implementation details may be documented only when they help users understand behavior, extension points, debugging, or advanced usage. - All examples must be complete, runnable with the latest API, and include necessary imports. - Clearly mark deprecated, legacy, or changed usage with a warning and show the modern replacement. -- Use internal wiki links (e.g. [[Llama]], [[Qwen35ChatHandler]]) for cross-referencing. 
+- Use internal wiki links, such as `[[Llama]]`, `[[LlamaCache]]`, `[[LlamaSpeculative]]`, or `[[Qwen35ChatHandler]]`, for cross-referencing. - Keep pages concise, professional, and user-friendly. **Documentation Language**: @@ -38,9 +39,53 @@ - Code comments inside examples should also be in English by default. - If the source code contains Chinese comments or non-English notes, translate them into clear English while preserving the original meaning. +**Wiki Directory Layout**: + +The wiki should be organized by documentation purpose rather than by source-file location alone. + +```text +docs/wiki/ +├─ core/ # Core classes and modules (e.g., Llama, main API objects) +├─ development/ # Developer-focused pages, tools, agents, CI/CD workflows +├─ examples/ # Complete runnable examples for users +├─ features/ # High-level features spanning multiple classes/modules +├─ modules/ # Specialized modules (cache, embeddings, logging, speculative decoding, bindings) +├─ types/ # Type definitions and data structures used across the library +├─ .gitkeep # Placeholder for Git to track empty directories +├─ contributing-to-wiki.md # Guidelines for contributing to the wiki +├─ index.md # Entry point and table of contents +├─ install.md # Installation instructions +├─ SCHEMA.md # Documentation schema and style guide (this file) +└─ troubleshooting.md # Known issues, debugging tips, FAQ +``` + +### Top-Level Files + +| Path | Purpose | Update Guidance | +|---|---|---| +| `docs/wiki/SCHEMA.md` | Defines the documentation contract, directory structure, page templates, and LLM update rules. | Update when adding a new page type, directory, documentation standard, or structural convention. | +| `docs/wiki/index.md` | Main wiki landing page and navigation entry. | Update when important pages are added, renamed, reorganized, or promoted. | +| `docs/wiki/contributing-to-wiki.md` | Human and LLM contribution guide for maintaining the wiki.
| Keep aligned with this schema, especially source-reading and accuracy rules. | +| `docs/wiki/install.md` | Installation guide placeholder or final installation documentation. | Convert from placeholder to complete page when installation docs are ready. | +| `docs/wiki/troubleshooting.md` | Troubleshooting guide placeholder or final diagnostics documentation. | Expand with common runtime, build, backend, model loading, and environment issues. | +| `docs/wiki/.gitkeep` | Keeps the wiki directory tracked when needed. | No documentation content is required. | + +### Directory Ownership + +| Directory | Purpose | Typical Content | Primary Audience | +|---|---|---|---| +| `core/` | High-level public entry points and central user APIs. | `Llama`, model lifecycle, generation APIs, chat/completion interfaces. | General users and advanced users. | +| `modules/` | Focused subsystem pages, user-facing modules, low-level bindings, helpers, and advanced API areas. | Cache, embeddings, grammar, speculative decoding, logging, llama.cpp bindings, MTMD bindings. | Advanced users, extension authors, maintainers. | +| `features/` | Workflow-oriented guides that span multiple APIs or modules. | Chat formatting, structured output, multimodal usage, backend loading, caching workflows, speculative decoding workflows. | Users solving a specific task. | +| `examples/` | Complete runnable examples. | Minimal inference, chat completion, embeddings, grammar-constrained generation, speculative decoding, multimodal usage. | Users who want copy-paste starting points. | +| `types/` | Type and schema documentation. | Request/response structures, typed dictionaries, protocol-style types, OpenAI-compatible payloads. | Users integrating with typed code or API-compatible workflows. | +| `development/` | Maintainer-facing documentation and contribution workflows. | Build notes, CI notes, release notes, commit generation workflow, documentation maintenance rules. | Maintainers and contributors. 
| + **Page Types and Templates**: -1. **Class / Module Page** (e.g. core/Llama.md, modules/LlamaEmbedding.md) +1. **Class / Module Page** + Examples: `core/Llama.md`, `modules/LlamaEmbedding.md`, `modules/LlamaCache.md` + - Frontmatter (YAML): ```yaml --- @@ -51,14 +96,15 @@ version_target: "latest" --- ``` - - Sections (in order): + + - Sections, in order: - Overview - Role in the Library - Constructor (`__init__`) – full parameter table with types, defaults, and explanations - Important Attributes / State - - Core Methods (with signatures and usage examples) + - Core Methods, with signatures and usage examples - Best Practices & Common Patterns - - Deprecated / Changed APIs (with migration notes) + - Deprecated / Changed APIs, with migration notes - Related Links - The **Overview** should briefly explain: @@ -81,24 +127,77 @@ - Only document attributes that affect user understanding, configuration, lifecycle, inference behavior, caching, chat formatting, embeddings, or debugging. Do not document every trivial private variable. -2. **Feature Page** (features/xxx.md) - - Overview, When to use, Related APIs, Code examples, Configuration Notes, Limitations, Related features - - Feature pages should explain workflows across multiple classes or modules. - -3. **Example Page** (examples/xxx.md) - - Goal, Prerequisites, Complete runnable code block, Expected output, Tips - - Rules: - * Use the latest API. - * Include all imports as need. - * Avoid pseudo-code. - * Keep examples focused. - * Mention required model assumptions when needed, such as GGUF file path or chat format. +2. **Feature Page** + Examples: `features/speculative-decoding.md`, `features/embeddings-rerank.md` + + Feature pages should explain workflows across multiple classes or modules. + + Required sections: + - Overview + - When to Use + - Related APIs + - Code Examples + - Configuration Notes + - Limitations + - Related Features + +3.
**Example Page** + Example: `examples/chat-completion.md` + + Required sections: + - Goal + - Prerequisites + - Complete Runnable Code + - Expected Output + - Tips + + Rules: + - Use the latest API. + - Include all required imports. + - Avoid pseudo-code. + - Keep examples focused. + - Mention required model assumptions when needed, such as GGUF file path, embedding mode, grammar file, chat format, or multimodal assets. + +4. **Development Page** + Example: `development/GitCommitGenerationAgent.md` + + Development pages are maintainer-facing and may document repository workflows, CI, release notes, build matrix decisions, or documentation maintenance conventions. + + Required sections: + - Overview + - Scope + - Workflow + - Inputs / Outputs + - Rules and Constraints + - Examples + - Related Links + +**Cross-Linking Rules**: + +- Use wiki-style internal links for pages that exist or should exist, such as `[[Llama]]`, `[[LlamaCache]]`, `[[LlamaSpeculative]]`, and `[[Logger]]`. +- Link from high-level pages to lower-level module pages when the module explains advanced details. +- Link from feature pages back to the relevant class/module pages. +- Avoid circular explanations. A page may link to another page for details instead of repeating the same explanation. **Update Rules**: + - Before updating any page, the LLM must read the relevant source files. - Update the `last_updated` date. -- If a new feature appears, such as a new chat handler, sampler, cache type, embedding API, multimodal API, or backend option, create or expand the corresponding page. +- If a new feature appears, such as a new chat handler, sampler, cache type, embedding API, multimodal API, backend option, or binding wrapper, create or expand the corresponding page. - If behavior is inferred from implementation rather than explicitly documented in code, mark the explanation as implementation-based. +- Empty files should be converted into explicit placeholder pages instead of being left blank. 
- Maintain a high standard of readability and accuracy. -This schema is the contract. All generated content must follow it. \ No newline at end of file +**Quality Checklist**: + +Before finalizing a wiki page, verify: + +- The page reflects the latest source code. +- All parameters, defaults, and return values are accurate. +- Examples are runnable and include necessary imports. +- Internal links point to the correct wiki page names. +- Advanced or low-level APIs are clearly labeled. +- Deprecated behavior is clearly separated from current usage. +- The page avoids undocumented claims, speculative behavior, or outdated assumptions. + +This schema is the contract. All generated content must follow it. From 2fbe63ddf829ab596ce359339f49dd7f110bbe89 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 05:33:37 +0800 Subject: [PATCH 471/518] build(deps): align Jinja2 minimum with Transformers Require Jinja2 >= 3.1.0 for HuggingFace-style chat template support. The updated Jinja2ChatFormatter relies on behavior aligned with Transformers' chat-template runtime, which also requires Jinja2 3.1 or newer. Updating the minimum dependency avoids parser/runtime differences with older Jinja versions. 
Signed-off-by: JamePeng --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index eb4b879dd6..dea9b48ff3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "typing-extensions>=4.8.0", "numpy>=1.21.6,<=2.3.2", "diskcache>=5.6.2", - "jinja2>=2.11.3", + "jinja2>=3.1.0", "Pillow>=9.5.0", ] requires-python = ">=3.9" From acf896381f7b18a92bc0477a0c3939e3a79d910b Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 20:21:08 +0800 Subject: [PATCH 472/518] Update Submodule vendor/llama.cpp 27d9ed8..60130d1 Signed-off-by: JamePeng --- llama_cpp/_internals.py | 7 ------- llama_cpp/llama_cpp.py | 10 +++++++--- vendor/llama.cpp | 2 +- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 9a22096a26..92ff51447f 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -774,13 +774,6 @@ def set_causal_attn(self, causal_attn: bool): """ llama_cpp.llama_set_causal_attn(self.ctx, causal_attn) - def set_warmup(self, warmup: bool): - """ - Set whether the model is in warmup mode or not - If true, all model tensors are activated during llama_decode() to load and cache their weights. - """ - llama_cpp.llama_set_warmup(self.ctx, warmup) - def synchronize(self): """ Wait until all computations are finished diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 01aa8cce9b..9c911bcb14 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -3085,11 +3085,15 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): # // Set whether the model is in warmup mode or not # // If true, all model tensors are activated during llama_decode() to load and cache their weights. 
-# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); +# // +# // note: using this can cause extra graph reallocations because it changes the graph topology with MoE models, +# // so it is generally not recommended to use in practice. will be removed in the future +# DEPRECATED(LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup), +# "user code should do warmup runs manually [TAG_LLAMA_GRAPH_NO_WARMUP]"); @ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None) def llama_set_warmup(ctx: llama_context_p, warmup: bool, /): - """ Set whether the model is in warmup mode or not - If true, all model tensors are activated during llama_decode() to load and cache their weights""" + """DEPRECATED: using this can cause extra graph reallocations because it changes the graph topology with MoE models, + so it is generally not recommended to use in practice. will be removed in the future""" ... # // Set abort callback diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 27d9ed8397..60130d18f9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 27d9ed839713e31c7a0ba45e342109a04549834f +Subproject commit 60130d18f9ac7f42cb4d7f6060b088a45d8f242e From a29c75495d69dd0bcd9596fecd99789d07a09ffa Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 23:22:47 +0800 Subject: [PATCH 473/518] docs(install): add source-aligned build and backend guide Document installation workflows for llama-cpp-python with a focus on the underlying llama.cpp CMake build configuration. - Add virtual environment, source install, editable install, rebuild, and verification guidance. - Document common CMake options such as GGML_NATIVE, GGML_BACKEND_DL, GGML_CPU_ALL_VARIANTS, and compiler selection. - Summarize backend-specific build flags for CUDA, BLAS, Metal, Vulkan, OpenVINO, HIP, SYCL, OpenCL, CANN, ZenDNN, and zDNN. 
- Include backend runtime notes and common installation pitfalls while keeping server-related installation content out of the page. Signed-off-by: JamePeng --- docs/wiki/install.md | 775 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 775 insertions(+) diff --git a/docs/wiki/install.md b/docs/wiki/install.md index e69de29bb2..576ca14c6f 100644 --- a/docs/wiki/install.md +++ b/docs/wiki/install.md @@ -0,0 +1,775 @@ +--- +title: Installation +page_type: guide +source_files: + - README.md + - vendor/llama.cpp/docs/build.md + - vendor/llama.cpp/docs/backend/ +last_updated: 2026-06-02 +author: JamePeng +version_target: "latest" +--- + +# Installation + +## Overview + +This page explains how to install `llama-cpp-python` from source, with or +without hardware acceleration. + +`llama-cpp-python` builds the native `llama.cpp` libraries during installation +and installs them inside the Python package. The exact build depends on your +Python version, compiler, CMake version, operating system, and selected +`llama.cpp` backend. + +For most users, the safest installation path is: + +1. Create a clean Python virtual environment. +2. Upgrade `pip`. +3. Install from the GitHub repository. +4. Pass `CMAKE_ARGS` only when you need a specific backend. + +--- + +## Requirements + +| Requirement | Notes | +|---|---| +| Python | Python 3.9 or newer. The package metadata currently lists Python 3.9 through 3.14. | +| CMake | CMake 3.21 or newer. | +| C/C++ compiler | Required because the package builds `llama.cpp` native libraries. | +| Git | Required when installing from the GitHub repository or cloning recursively. | +| Backend SDKs | Required only for GPU or accelerator builds, such as CUDA, Vulkan, OpenVINO, ROCm/HIP, or SYCL. | + +Platform compiler guidance: + +| Platform | Typical compiler setup | +|---|---| +| Linux | `gcc` or `clang` plus Python development headers if required by your distribution. | +| Windows | Visual Studio 2022 Build Tools or MinGW. 
For most native builds, Visual Studio Build Tools is recommended. | +| macOS | Xcode Command Line Tools. Metal is enabled by default on supported macOS builds. | + +--- + +## Use a Virtual Environment + +Using a virtual environment avoids mixing build artifacts and dependencies from +different Python installations. + +### Linux and macOS + +```bash +python3 -m venv .venv +source .venv/bin/activate +python -m pip install --upgrade pip setuptools wheel +``` + +### Windows PowerShell + +```powershell +py -3 -m venv .venv +.\.venv\Scripts\Activate.ps1 +python -m pip install --upgrade pip setuptools wheel +``` + +If PowerShell blocks activation scripts, run: + +```powershell +Set-ExecutionPolicy -Scope CurrentUser RemoteSigned +``` + +Then activate the environment again. + +--- + +## Basic Installation + +Install directly from the project repository: + +```bash +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +On Windows PowerShell: + +```powershell +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +This builds `llama.cpp` from source and installs the generated native runtime +libraries alongside the Python package. + +Use verbose output when diagnosing build failures: + +```bash +python -m pip install --verbose "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## Install From a Local Clone + +Clone recursively so the `vendor/llama.cpp` submodule is available: + +```bash +git clone https://github.com/JamePeng/llama-cpp-python --recursive +cd llama-cpp-python +python -m pip install --upgrade pip +python -m pip install . +``` + +If you already cloned without `--recursive`, initialize the submodule manually: + +```bash +git submodule update --init --recursive +``` + +For editable development installs: + +```bash +python -m pip install -e . 
+``` + +--- + +## Passing CMake Options + +`llama.cpp` backend options are passed through CMake. There are two common +ways to pass those options during `pip install`. + +### Environment Variable + +Linux and macOS: + +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Windows PowerShell: + +```powershell +$env:CMAKE_ARGS = "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Clear the variable after the build if you do not want it reused: + +```powershell +Remove-Item Env:CMAKE_ARGS +``` + +### `pip --config-settings` + +You can also pass CMake arguments through `pip`: + +```bash +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" \ + -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS" +``` + +Use semicolons inside `cmake.args` when passing multiple CMake definitions. + +--- + +## Common CMake Options + +The Python package forwards CMake options to the bundled `vendor/llama.cpp` +build. These options are useful across many backends. + +| Option | Typical values | Use | +|---|---|---| +| `CMAKE_BUILD_TYPE` | `Release`, `Debug` | Selects build type for single-config generators such as Ninja or Unix Makefiles. Release is the normal install choice. | +| `GGML_NATIVE` | `ON`, `OFF` | Controls whether ggml builds for the current host CPU/GPU. Use `OFF` for more portable wheels; use `ON` for local machine-specific optimization. | +| `BUILD_SHARED_LIBS` | `ON`, `OFF` | Controls shared versus static native libraries. The Python package normally installs shared runtime libraries. | +| `GGML_BACKEND_DL` | `ON`, `OFF` | Builds backend libraries so they can be loaded dynamically at runtime when supported by the build. 
| +| `GGML_CPU_ALL_VARIANTS` | `ON`, `OFF` | Builds multiple CPU backend variants for x86 feature sets when supported. Useful for portable x64 wheels. | +| `GGML_OPENMP` | `ON`, `OFF` | Enables OpenMP CPU parallelism. On Windows, OpenMP runtime DLLs may need to be packaged beside backend DLLs. | +| `CMAKE_PREFIX_PATH` | path list | Helps CMake find SDKs or libraries installed outside default locations. | +| `CMAKE_C_COMPILER` / `CMAKE_CXX_COMPILER` | compiler paths or names | Selects compilers, often needed for SYCL, HIP, or custom toolchains. | + +Example portable CUDA build: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DGGML_NATIVE=OFF" \ + python -m pip install --force-reinstall --no-cache-dir \ + "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Example dynamic CPU backend build: + +```bash +CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_NATIVE=OFF" \ + python -m pip install --force-reinstall --no-cache-dir \ + "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## Backend Quick Reference + +Choose one backend path that matches your hardware and installed SDKs. + +| Backend | Typical CMake option | Notes | +|---|---|---| +| CPU only | none | Default portable path. Performance depends on CPU features and build options. | +| OpenBLAS | `-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS` | CPU BLAS acceleration for prompt processing and larger batches. | +| BLIS | `-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME` | CPU BLAS route using BLIS. | +| Intel oneMKL | `-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp` | Intel CPU BLAS route. This is not the Intel GPU path. | +| CUDA | `-DGGML_CUDA=on` | Requires NVIDIA CUDA Toolkit matching your driver and GPU. | +| Metal | `-DGGML_METAL=on` | Enabled by default on supported macOS builds. Use `-DGGML_METAL=OFF` to disable. | +| Vulkan | `-DGGML_VULKAN=on` | Requires Vulkan SDK and platform-specific setup. 
| +| OpenVINO | `-DGGML_OPENVINO=ON` | Useful for Intel CPU, GPU, and NPU workflows after OpenVINO environment setup. | +| HIP / ROCm | `-DGGML_HIP=ON` | For supported AMD GPUs. May require `GPU_TARGETS`. | +| SYCL | `-DGGML_SYCL=on` | Usually used with Intel oneAPI compilers. | +| OpenCL | `-DGGML_OPENCL=ON` | Primarily documented for Qualcomm Adreno and Snapdragon workflows; can also apply to some other OpenCL devices. | +| CANN | `-DGGML_CANN=ON` | Ascend NPU backend. Requires Ascend drivers and CANN toolkit. | +| ZenDNN | `-DGGML_ZENDNN=ON` | AMD Zen CPU acceleration, mainly matrix multiplication paths. | +| zDNN | `-DGGML_ZDNN=ON -DZDNN_ROOT=/path/to/zdnn` | IBM Z / LinuxONE acceleration path. | + +For the full list of backend options, check the upstream llama.cpp build +documentation and the current `vendor/llama.cpp` source. + +--- + +## CUDA + +CUDA builds require the NVIDIA CUDA Toolkit. Choose a toolkit version that is +compatible with your driver and GPU. + +Linux: + +```bash +CMAKE_ARGS="-DGGML_CUDA=on" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Windows PowerShell: + +```powershell +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +For newer NVIDIA GPUs with compute capability 9.0 (CUDA architecture `90`) or higher, the README notes +that Programmatic Dependent Launch can be enabled with: + +```bash +-DGGML_CUDA_PDL=ON +``` + +Example: + +```bash +CMAKE_ARGS="-DGGML_CUDA=on -DGGML_CUDA_PDL=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +If `nvcc` produces large volumes of non-blocking template warnings, the README +documents optional CUDA warning suppression: + +```bash +-DCMAKE_CUDA_FLAGS="--diag-suppress=177 --diag-suppress=221 --diag-suppress=550" +``` + +### CUDA Portability and Architecture Selection + +By default, llama.cpp may build for the GPU detected on
the build machine. For +a wheel intended to run across multiple CUDA GPUs, disable native detection: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DGGML_NATIVE=OFF" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +If `nvcc` cannot detect your GPU, or if you want to control the generated +binary size, specify CUDA architectures explicitly: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=86;89" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Use NVIDIA's compute capability table to choose architecture numbers. For +example, RTX 30-series GPUs commonly use `86`, and RTX 4090 uses `89`. + +If multiple CUDA toolkits are installed, point CMake at the intended compiler: + +```bash +CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-12.8/bin/nvcc" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Runtime variables that may matter after installation: + +| Variable | Use | +|---|---| +| `CUDA_VISIBLE_DEVICES` | Selects or hides CUDA devices for the current process. | +| `GGML_CUDA_ENABLE_UNIFIED_MEMORY` | Enables unified-memory fallback on Linux when VRAM is exhausted. On Windows, similar behavior may be controlled by NVIDIA driver settings. | +| `GGML_CUDA_P2P` | Enables peer-to-peer access between GPUs when driver and hardware support it. | +| `GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F` | Forces FP32 compute in selected cuBLAS paths, trading speed for numerical headroom. | +| `GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` | Forces FP16 compute in selected cuBLAS paths when supported. | + +--- + +## BLAS and CPU Acceleration + +BLAS acceleration mainly improves prompt processing and larger batch prefill. +It generally does not improve single-token generation speed as much as GPU +offload. + +### OpenBLAS + +Use OpenBLAS when the OpenBLAS development package is available on your system. 
+ +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +On Linux, install the OpenBLAS development package with your system package +manager before building. Package names vary by distribution. + +### BLIS + +BLIS is selected through the `FLAME` BLAS vendor after BLIS is installed: + +```bash +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +The upstream BLIS guide also notes that runtime variables such as +`BLIS_NUM_THREADS` and OpenMP affinity settings can affect CPU performance. + +### Intel oneMKL for CPU + +Intel oneMKL is a CPU BLAS path. It is different from Intel GPU acceleration, +which is usually handled through SYCL or OpenVINO. + +```bash +source /opt/intel/oneapi/setvars.sh +CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## Metal on macOS + +On macOS, Metal is enabled by default by this project when building on Apple +platforms. A normal install is usually enough: + +```bash +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +To disable Metal at build time: + +```bash +CMAKE_ARGS="-DGGML_METAL=OFF" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +At runtime, use `n_gpu_layers=0` when you want CPU inference even though the +package was built with Metal support. + +--- + +## Vulkan + +Vulkan builds require the Vulkan SDK and any platform-specific environment +setup required by the SDK. 
+ +```bash +CMAKE_ARGS="-DGGML_VULKAN=on" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +On Linux and macOS, make sure the Vulkan SDK setup script has been sourced in +the same shell session before running `pip install`. + +On Windows, install the Vulkan SDK and make sure its environment variables are +available in the shell that runs the build. + +On Linux, system packages can also provide the Vulkan loader and shader tools. +The upstream guide notes that SPIR-V headers may be required separately from +the Vulkan loader development package on some distributions. + +For macOS Vulkan builds, Vulkan usually runs through a Metal translation layer. +The upstream guide builds Vulkan with Metal disabled: + +```bash +CMAKE_ARGS="-DGGML_VULKAN=ON -DGGML_METAL=OFF" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +--- + +## OpenVINO + +OpenVINO builds require the OpenVINO runtime and environment setup first. + +Linux: + +```bash +source /opt/intel/openvino/setupvars.sh +CMAKE_ARGS="-DGGML_OPENVINO=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Windows: + +```powershell +# Run this from a shell where OpenVINO setupvars.bat has been initialized, +# such as an OpenVINO command prompt, or initialize it through cmd first. +$env:CMAKE_ARGS = "-DGGML_OPENVINO=ON" +python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +The OpenVINO backend is intended for Intel CPU, GPU, and NPU workflows when the +OpenVINO runtime supports the target device. + +Runtime variables: + +| Variable | Use | +|---|---| +| `GGML_OPENVINO_DEVICE` | Selects `CPU`, `GPU`, `NPU`, or a specific GPU such as `GPU.0`. Defaults to CPU if unset or unavailable. | +| `GGML_OPENVINO_CACHE_DIR` | Enables OpenVINO model caching when set. 
Not supported on NPU devices according to upstream docs. | +| `GGML_OPENVINO_STATEFUL_EXECUTION` | Enables stateful KV-cache execution. Upstream docs recommend it for CPU/GPU performance and note it is not effective on NPU. | +| `GGML_OPENVINO_PREFILL_CHUNK_SIZE` | Controls NPU prefill chunk size. | +| `GGML_OPENVINO_PROFILING` | Enables OpenVINO profiling. | + +Important limitations from the upstream OpenVINO backend docs: + +- GPU stateless execution has known issues; use `GGML_OPENVINO_STATEFUL_EXECUTION=1` for GPU workflows. +- NPU runs may fail when context size is too large. Keep context size small for NPU workflows. +- Encoder models such as embedding and reranking models are not supported by the current OpenVINO backend implementation. +- Some benchmark workflows require Flash Attention enabled in the llama.cpp tool layer; in Python, verify behavior against your target model and backend. + +--- + +## HIP / ROCm + +HIP builds are for supported AMD GPUs. + +Linux example: + +```bash +CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1030" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +`GPU_TARGETS` is optional in some setups, but specifying your GPU architecture +can reduce build time and avoid unsupported target issues. + +Windows ROCm builds are more environment-sensitive. The README currently +documents a TheRock ROCm workflow that sets `HIP_PATH`, `ROCM_PATH`, +`HIP_DEVICE_LIB_PATH`, compiler paths, `CMAKE_GENERATOR`, and `CMAKE_ARGS` +before running `pip install`. + +For RDNA3 or CDNA hardware, upstream docs mention optional Flash Attention +acceleration through rocWMMA: + +```bash +CMAKE_ARGS="-DGGML_HIP=ON -DGPU_TARGETS=gfx1100 -DGGML_HIP_ROCWMMA_FATTN=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Runtime variables that may matter: + +| Variable | Use | +|---|---| +| `HIP_VISIBLE_DEVICES` | Selects visible HIP devices. 
| +| `HSA_OVERRIDE_GFX_VERSION` | Can help unsupported Linux GPUs use a nearby architecture value. Upstream docs note this is not supported on Windows. | +| `HIP_DEVICE_LIB_PATH` | Points to ROCm device bitcode libraries when clang cannot find them. | + +--- + +## SYCL + +SYCL builds are usually used with Intel oneAPI compilers. + +```bash +source /opt/intel/oneapi/setvars.sh +CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +To request FP16 support: + +```bash +CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Useful SYCL build options from the upstream backend docs: + +| Option | Use | +|---|---| +| `GGML_SYCL_F16` | Enables FP16 build path. Test both FP32 and FP16 for your model and device. | +| `GGML_SYCL_TARGET` | Selects SYCL target type. Intel is the default target in upstream docs. | +| `GGML_SYCL_DEVICE_ARCH` | Selects device architecture when known. | +| `GGML_SYCL_GRAPH` | Enables the experimental SYCL graph extension. | +| `GGML_SYCL_DNN` | Enables oneDNN integration. | +| `GGML_SYCL_HOST_MEM_FALLBACK` | Allows host-memory fallback when device memory is full, at reduced speed. | +| `GGML_SYCL_SUPPORT_LEVEL_ZERO` | Enables Level Zero support for Intel GPU memory allocation. | + +Useful SYCL runtime variables: + +| Variable | Use | +|---|---| +| `ONEAPI_DEVICE_SELECTOR` | Selects a SYCL device, such as a specific Level Zero GPU. | +| `GGML_SYCL_ENABLE_FLASH_ATTN` | Enables or disables Flash Attention in the SYCL backend. | +| `GGML_SYCL_ENABLE_LEVEL_ZERO` | Uses Level Zero allocation when support was built in. | +| `GGML_SYCL_DISABLE_DNN` | Disables oneDNN path and uses oneMKL path. | +| `ZES_ENABLE_SYSMAN` | Helps query free GPU memory in some Intel GPU setups. 
| + +--- + +## OpenCL + +OpenCL support is documented upstream mainly for Qualcomm Adreno GPUs and +Snapdragon devices. It may also work on certain other OpenCL-capable GPUs, but +SYCL is usually preferred for modern Intel GPU workflows. + +```bash +CMAKE_ARGS="-DGGML_OPENCL=ON" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +Useful OpenCL CMake options: + +| Option | Default | Use | +|---|---|---| +| `GGML_OPENCL_EMBED_KERNELS` | `ON` | Embeds OpenCL kernels into the built binary or library. | +| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON` | Enables kernels optimized for Adreno. | + +For Linux builds where OpenCL headers and ICD loader are installed in a custom +prefix, pass that location through `CMAKE_PREFIX_PATH`. + +--- + +## CANN + +CANN is the Ascend NPU backend. It requires Ascend drivers and the CANN toolkit +before building. + +```bash +CMAKE_ARGS="-DGGML_CANN=ON -DCMAKE_BUILD_TYPE=Release" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +The upstream CANN documentation focuses on Linux and Ascend devices such as +Atlas 300I A2 and Atlas 300I Duo. Supported model families and data types vary +by device generation. + +--- + +## ZenDNN and zDNN + +ZenDNN and zDNN are different backends. 
+ +| Backend | Hardware | CMake option | +|---|---|---| +| ZenDNN | AMD Zen CPUs, especially AMD EPYC | `-DGGML_ZENDNN=ON` | +| zDNN | IBM Z / LinuxONE with NNPA acceleration | `-DGGML_ZDNN=ON -DZDNN_ROOT=/path/to/zdnn` | + +ZenDNN can be downloaded and built automatically by CMake: + +```bash +CMAKE_ARGS="-DGGML_ZENDNN=ON -DCMAKE_BUILD_TYPE=Release" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +If you already have a ZenDNN installation: + +```bash +CMAKE_ARGS="-DGGML_ZENDNN=ON -DZENDNN_ROOT=/path/to/ZenDNN/build/install" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +zDNN requires a zDNN library installation first: + +```bash +CMAKE_ARGS="-DGGML_ZDNN=ON -DZDNN_ROOT=/opt/zdnn-libs" \ + python -m pip install "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +ZenDNN currently accelerates matrix multiplication paths and may fall back to +the standard CPU backend for other operations. + +--- + +## Dynamic Backend Wheels + +The README notes that newer preview wheels may be built with: + +```text +GGML_BACKEND_DL=ON +GGML_CPU_ALL_VARIANTS=ON +``` + +In that build mode, CPU backend variants are installed as separate runtime +libraries under: + +```text +site-packages/llama_cpp/lib +``` + +Examples include: + +```text +ggml-cpu-x64 +ggml-cpu-sse42 +ggml-cpu-haswell +ggml-cpu-skylakex +ggml-cpu-alderlake +ggml-cpu-zen4 +``` + +On Windows, dynamic CPU backend DLLs may also need the LLVM OpenMP runtime +next to them: + +```text +libomp140.x86_64.dll +``` + +Based on the current top-level `CMakeLists.txt`, this project installs many +`llama`, `ggml`, CPU-variant, accelerator backend, and `mtmd` targets into the +Python package runtime directory when those targets are available. 
+ +--- + +## Upgrading and Rebuilding + +Use `--upgrade`, `--force-reinstall`, and `--no-cache-dir` when you need to +force a rebuild with new CMake options: + +```bash +CMAKE_ARGS="-DGGML_CUDA=on" \ + python -m pip install --upgrade --force-reinstall --no-cache-dir \ + "llama-cpp-python @ git+https://github.com/JamePeng/llama-cpp-python.git" +``` + +This is important because `pip` may otherwise reuse cached wheels or build +artifacts from a previous backend configuration. + +For local editable builds, clean old native artifacts before rebuilding when +switching backends: + +```bash +make clean +python -m pip install --verbose -e . +``` + +On Windows, if `make` is not available, remove `_skbuild` and old native +libraries under `llama_cpp/lib` manually before reinstalling. + +--- + +## Verify Installation + +Check that the package imports: + +```bash +python -c "import llama_cpp; print(llama_cpp.__version__)" +``` + +Check where the package was installed: + +```bash +python -c "import llama_cpp, pathlib; print(pathlib.Path(llama_cpp.__file__).parent)" +``` + +Check the bundled native runtime libraries: + +```bash +python -c "import llama_cpp, pathlib; print(list((pathlib.Path(llama_cpp.__file__).parent / 'lib').glob('*')))" +``` + +Run a minimal model load after downloading a GGUF model: + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="./model.gguf", + n_gpu_layers=0, + verbose=False, +) + +output = llm("Hello,", max_tokens=8) +print(output["choices"][0]["text"]) +``` + +For GPU builds, set `n_gpu_layers` to `-1` or a positive value to offload +layers: + +```python +from llama_cpp import Llama + +llm = Llama( + model_path="./model.gguf", + n_gpu_layers=-1, +) +``` + +--- + +## Development Workflow + +Common local development commands: + +```bash +git clone https://github.com/JamePeng/llama-cpp-python --recursive +cd llama-cpp-python +python -m pip install --upgrade pip +python -m pip install -e .
+python -m pytest +``` + +The repository also includes a `Makefile` with useful targets: + +| Target | Purpose | +|---|---| +| `make build` | Editable build with verbose output. | +| `make build.cuda` | Editable build with `GGML_CUDA=on`. | +| `make build.openblas` | Editable build with OpenBLAS. | +| `make build.openvino` | Editable build with OpenVINO. | +| `make build.vulkan` | Editable build with Vulkan. | +| `make build.sycl` | Editable build with SYCL. | +| `make test` | Run pytest with verbose tracing. | +| `make clean` | Remove local native build artifacts. | + +When testing a different `llama.cpp` commit, update the `vendor/llama.cpp` +submodule, clean the local build, and reinstall. If the upstream C API changes, +the ctypes declarations in `llama_cpp/llama_cpp.py` may also need to be updated. + +--- + +## Common Installation Pitfalls + +| Symptom | Likely cause | What to try | +|---|---|---| +| CMake cannot find a compiler | Build tools are missing or not available in the current shell. | Install platform build tools and reopen the terminal. On Windows, use a Developer PowerShell or initialize Visual Studio build variables. | +| Build ignores new backend flags | `pip` reused a cached wheel or previous build. | Reinstall with `--force-reinstall --no-cache-dir`, and clean `_skbuild` for local builds. | +| CUDA backend does not build | CUDA Toolkit is missing, incompatible, or not on `PATH`. | Verify `nvcc --version`, CUDA driver compatibility, and `CUDA_PATH` on Windows. | +| CUDA build targets the wrong GPU generation | Native architecture detection picked the build machine GPU, or `nvcc` could not detect it. | Use `-DGGML_NATIVE=OFF` for portability or set `-DCMAKE_CUDA_ARCHITECTURES=...` explicitly. | +| Native library fails to load on Windows | Required DLLs are missing from `PATH` or `llama_cpp/lib`. | Check `llama_cpp/lib` for `llama.dll`, `ggml*.dll`, backend DLLs, and runtime DLLs such as OpenMP or CUDA dependencies. 
| +| GPU is not used at runtime | The package was built without that backend or `n_gpu_layers` is `0`. | Rebuild with the correct CMake backend flag and set `n_gpu_layers` to a positive value or `-1`. | +| OpenVINO GPU or NPU behaves unexpectedly | Runtime device selection or context size is unsuitable. | Set `GGML_OPENVINO_DEVICE`, enable `GGML_OPENVINO_STATEFUL_EXECUTION=1` for GPU, and keep context size smaller for NPU workflows. | +| SYCL device is not selected | oneAPI environment or device selector is missing. | Source oneAPI setup and set `ONEAPI_DEVICE_SELECTOR` for the intended device. | +| Submodule files are missing | Repository was cloned without `--recursive`. | Run `git submodule update --init --recursive`. | + +For detailed diagnostics, see [[Troubleshooting]]. + +--- + +## Related Links + +* [[Index-Home](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/index.md)] +* [[Llama Core](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +* [README Installation](https://github.com/JamePeng/llama-cpp-python/blob/main/README.md#installation) +* [llama.cpp build documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) +* [llama.cpp backend documentation](https://github.com/ggml-org/llama.cpp/tree/master/docs/backend) From 3bcd8010fb89909d6780acb06de8ae0e537e95d9 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 23:26:59 +0800 Subject: [PATCH 474/518] docs(wiki): link installation guide from index Promote the completed installation guide into the wiki entry point so new users can find build and backend setup instructions before reading API-specific documentation. - Add a Getting Started section that links to install.md. - Move installation to the top of the recommended reading order. - Mark install.md as an available page. - Remove installation from the planned documentation areas. 
Signed-off-by: JamePeng --- docs/wiki/index.md | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/docs/wiki/index.md b/docs/wiki/index.md index c721fc4e89..8e5dbed14b 100644 --- a/docs/wiki/index.md +++ b/docs/wiki/index.md @@ -10,6 +10,16 @@ The documentation is maintained with the help of LLMs, but the source of truth i ## Quick Navigation +### Getting Started + +Start here if you are installing or rebuilding `llama-cpp-python`. + +| Page | Description | +|---|---| +| [install\|Installation] | Source installation guide covering Python setup, CMake options, llama.cpp backend selection, hardware acceleration, rebuilds, and verification. | + +--- + ### Core API Start here if you are using `llama-cpp-python` directly. @@ -42,7 +52,7 @@ This section contains maintainer-facing development notes, workflows, and LLM-as | Page | Description | |---|---| -| [[development/Git Commit Generation Agent]] | Helper workflow for generating clear, structured, and source-aware Git commit messages. | +| [development/Git Commit Generation Agent] | Helper workflow for generating clear, structured, and source-aware Git commit messages. | --- @@ -61,13 +71,14 @@ These pages define how the wiki should be written, updated, and reviewed. If you are new to this wiki, read the pages in this order: -1. [[core/Llama|Llama](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] -2. [[modules/LlamaCache|Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] -3. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] -4. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] -5. 
[[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] -6. [[modules/Logger\|Logger](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/Logger.md)] -7. [[development/Git Commit Generation Agent](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/development/git-commit-generation-agent.md)] +1. [[install|Installation](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/install.md)] +2. [[core/Llama|Llama](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/core/Llama.md)] +3. [[modules/LlamaCache|Llama Cache](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaCache.md)] +4. [[modules/LlamaEmbedding|Llama Embedding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaEmbedding.md)] +5. [[modules/LlamaGrammar|Llama Grammar](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaGrammar.md)] +6. [[modules/LlamaSpeculative|Llama Speculative Decoding](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/LlamaSpeculative.md)] +7. [[modules/Logger\|Logger](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/modules/Logger.md)] +8. [[development/Git Commit Generation Agent](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/development/git-commit-generation-agent.md)] If you are contributing documentation, start with: 1. [[SCHEMA|Wiki Schema](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md)] @@ -81,6 +92,7 @@ The wiki is still being expanded. Currently available pages: +- `install.md` - `core/Llama.md` - `modules/LlamaCache.md` - `modules/LlamaEmbedding.md` @@ -99,7 +111,6 @@ Some planned pages may already exist as empty placeholder files. 
Empty pages are Future documentation may cover: -- Installation and build options - Chat formats and chat handlers - Low-level ctypes bindings - Multimodal APIs @@ -126,5 +137,6 @@ This wiki follows a few core rules: ## Project Links - GitHub: [llama-cpp-python](https://github.com/JamePeng/llama-cpp-python) +- Installation guide: [install](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/install.md) - Wiki schema: [SCHEMA](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/SCHEMA.md) -- Contribution guide: [contributing-to-wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md) \ No newline at end of file +- Contribution guide: [contributing-to-wiki](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/contributing-to-wiki.md) From 7cd0f6081251fba1852b4ecd61378d36a229de6e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 2 Jun 2026 23:33:13 +0800 Subject: [PATCH 475/518] docs(readme): link detailed installation wiki guide Signed-off-by: JamePeng --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c5aa1a1b26..ba1969793c 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,8 @@ Thank you for your continuous support! ## Installation +For a structured source-install and backend build guide, see [docs/wiki/install.md](https://github.com/JamePeng/llama-cpp-python/blob/main/docs/wiki/install.md). 
+ Requirements: - Python 3.9+ From 14b3b4624065a4b054f4d07a8ac25f999bc7bd87 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Wed, 3 Jun 2026 22:26:28 +0800 Subject: [PATCH 476/518] Update Submodule vendor/llama.cpp 60130d1..9e58d4d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 60130d18f9..9e58d4d692 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 60130d18f9ac7f42cb4d7f6060b088a45d8f242e +Subproject commit 9e58d4d692ed3d350591cc86d06c73c61c122509 From fed47f2d398fcb971595f53b423f59fd7fe0d3c1 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Thu, 4 Jun 2026 21:39:53 +0800 Subject: [PATCH 477/518] Update Submodule vendor/llama.cpp 9e58d4d..7c158fb Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 9e58d4d692..7c158fbb4a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 9e58d4d692ed3d350591cc86d06c73c61c122509 +Subproject commit 7c158fbb4aec1bdc9c81d6ca0e785139f4826fae From fff6812e071d3d24fe57e1f635ed2ced51b8cd4e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 00:54:06 +0800 Subject: [PATCH 478/518] Update Submodule vendor/llama.cpp 7c158fb..c4a278d Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7c158fbb4a..c4a278d68e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7c158fbb4aec1bdc9c81d6ca0e785139f4826fae +Subproject commit c4a278d68efa17811006f2123a84081dac03fac7 From be123f1c55c4ae503d4ae5edc845c310a313d2b2 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 03:41:22 +0800 Subject: [PATCH 479/518] feat(internals): Add `ReasoningBudgetSampler` support - Add Python-backed ReasoningBudgetSampler for first reasoning-block control - Install 
the sampler before probability filters to preserve forced end tokens - Support reasoning_budget -1/0/N semantics in sampling params - Force reasoning_budget_message + reasoning_end when the budget is exhausted - Add manual force_reasoning_budget() at the sampling-context level - Match llama.cpp force behavior by allowing only COUNTING -> FORCING - Keep DONE as permanent passthrough and ignore later reasoning tags - Support prefilled reasoning starts with reasoning_start_in_prompt - Preserve UTF-8 boundary safety before forcing the end sequence - Keep Python-backed custom sampler callbacks alive across C sampler usage - Avoid shallow-copying custom_samplers when cloning sampler chains Signed-off-by: JamePeng --- llama_cpp/_internals.py | 525 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 504 insertions(+), 21 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 92ff51447f..c308fae056 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1527,7 +1527,7 @@ def __init__( _existing_sampler: Optional[LlamaSampler] = None, # Internal use for cloning ): if model is None: - raise RuntimeError("model must not be None") + raise RuntimeError("LlamaSamplingContext: model must not be None") self.model = model self.params = params @@ -1537,8 +1537,8 @@ def __init__( lparams = llama_cpp.llama_sampler_chain_default_params() lparams.no_perf = params.no_perf - # history (bounded) - # last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size) + # History (bounded) + # Last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size) if self.params.penalty_last_n == -1: # full context self.params.penalty_last_n = self.model.n_ctx_train() @@ -1551,10 +1551,10 @@ def __init__( ) self.prev = deque(maxlen=max(self.params.n_prev, 32)) - # reusable token data array + # Reusable token data array self._cur_p = LlamaTokenDataArray(n_vocab=self.n_vocab) - # reusable numpy logits view + # Reusable numpy 
logits view self._logits_view = None self._logits_ptr_addr = None @@ -1566,14 +1566,14 @@ def __init__( sorted=False, ) - # sampler chain + # Sampler chain if _existing_sampler: self.sampler_chain = _existing_sampler else: self.sampler_chain = LlamaSampler() self._build_sampler_chain() - # grammar sampler + # Grammar sampler self.grammar_sampler = None if params.grammar: self.grammar_sampler = GrammarSampler( @@ -1583,6 +1583,9 @@ def __init__( params.grammar_triggers, ) + # Active Python reasoning-budget sampler for this sampling context. + self.reasoning_budget_sampler: Optional[ReasoningBudgetSampler] = None + def _build_sampler_chain(self): """ Build sampler chain aligned with llama.cpp common_sampler_init @@ -1594,7 +1597,7 @@ def _build_sampler_chain(self): m = self.model if m is None: - raise RuntimeError("Model required to build sampler chain firstly") + raise RuntimeError("LlamaSamplingContext: Model required to build sampler chain firstly") use_adaptive_p = False @@ -1628,7 +1631,66 @@ def _build_sampler_chain(self): p.dry_sequence_breakers ) - # --- 5. Core Sampling Strategies (The "Filter" Loop) --- + # --- 5. Reasoning Budget --- + # + # Install before top-k/top-p/min-p filters so the forced end token cannot + # be removed from the candidate set before forcing happens. + # This sampler only controls the first reasoning block. Later blocks are ignored. 
+ if p.reasoning_budget < -1: + raise ValueError( + "LlamaSamplingContext: reasoning_budget must be -1, 0, or a positive integer" + ) + + if p.reasoning_budget >= 0: + start_tokens = None + if not p.reasoning_start_in_prompt: + start_tokens = m.tokenize( + p.reasoning_start.encode("utf-8"), + add_bos=False, + special=True, + ) + if not start_tokens: + raise ValueError("LlamaSamplingContext: reasoning_start produced no tokens") + + end_tokens = m.tokenize( + p.reasoning_end.encode("utf-8"), + add_bos=False, + special=True, + ) + if not end_tokens: + raise ValueError("LlamaSamplingContext: reasoning_end produced no tokens") + + forced_text = (p.reasoning_budget_message or "") + p.reasoning_end + forced_tokens = m.tokenize( + forced_text.encode("utf-8"), + add_bos=False, + special=True, + ) + if not forced_tokens: + raise ValueError("LlamaSamplingContext: reasoning forced text produced no tokens") + + rb_sampler = ReasoningBudgetSampler( + model=m, + reasoning_budget=p.reasoning_budget, + start_tokens=start_tokens, + end_tokens=end_tokens, + forced_tokens=forced_tokens, + initial_state=( + ReasoningBudgetState.COUNTING + if p.reasoning_start_in_prompt + else ReasoningBudgetState.IDLE + ), + start_max_tokens=p.reasoning_start_max_tokens, + wait_utf8=True, + ) + + # Keep a direct Python reference so force_reasoning_budget() can + # manually transition COUNTING -> FORCING at runtime. + self.reasoning_budget_sampler = rb_sampler + + s.add_custom(rb_sampler) + + # --- 6. Core Sampling Strategies (The "Filter" Loop) --- # We iterate through the list to preserve user-defined order for these specific samplers for stype in p.samplers: if stype == CommonSamplerType.CUSTOM: @@ -1660,7 +1722,7 @@ def _build_sampler_chain(self): elif stype == CommonSamplerType.ADAPTIVE_P: use_adaptive_p = True - # --- 6. Final Distribution / Selection --- + # --- 7. 
Final Distribution / Selection --- # Mirostat overrides standard greedy/dist sampling if p.mirostat == 1 and m: s.add_mirostat(m.n_vocab(), p.seed, p.mirostat_tau, p.mirostat_eta, 100) @@ -1839,6 +1901,10 @@ def close(self): self.sampler_chain.close() self.sampler_chain = None + # Clear the convenience reference used for manual reasoning-budget force. + # The actual sampler lifetime is owned by sampler_chain.close(). + self.reasoning_budget_sampler = None + # Release large token data buffer used during sampling. # Important for high-vocab models to avoid memory retention. if hasattr(self, "_cur_p"): @@ -1885,24 +1951,53 @@ def prev_str(self, ctx_main: LlamaContext, n: int) -> str: # Use the model linked to the context to detokenize return ctx_main.model.detokenize(last_n_tokens).decode("utf-8", errors="replace") + def force_reasoning_budget(self) -> bool: + """ + Manually force the active reasoning-budget sampler to end thinking. + + This mirrors llama.cpp's common_sampler_reasoning_budget_force() + behavior at the Python sampling-context level. + + Returns: + True if the sampler was actively COUNTING inside the first reasoning + block and was transitioned to FORCING. + + False if: + - no reasoning-budget sampler is installed + - the sampler is IDLE + - the sampler is WAITING_UTF8 + - the sampler is already FORCING + - the sampler is DONE + + Important: + Calling this while already FORCING must not rewind force_pos. The + underlying ReasoningBudgetSampler.force() handles this by allowing + only COUNTING -> FORCING. + """ + if self.reasoning_budget_sampler is None: + return False + + return self.reasoning_budget_sampler.force() + class CustomSampler: """ - Python wrapper for llama.cpp custom sampler. + Base class for Python-backed custom samplers in the Llama sampler chain. - apply_func: - Callable receiving llama_token_data_array - and modifying logits in-place. + Responsibilities: + - Provides apply, accept, reset, free and clone callbacks for the C sampler chain. 
+ - Keeps Python references alive to prevent GC while C sampler still holds function pointers. + - Implements safe close to clear all callback references. """ def __init__( self, apply_func: Callable[[llama_cpp.llama_token_data_array], None], - name: str = "custom", accept_func: Optional[Callable] = None, reset_func: Optional[Callable] = None, free_func: Optional[Callable] = None, clone_func: Optional[Callable] = None, + name: str = "custom", ): if not callable(apply_func): raise TypeError("apply_func must be callable") @@ -2002,6 +2097,389 @@ def __del__(self): self.close() +class ReasoningBudgetSampler(CustomSampler): + """ + Generic first-reasoning-block budget sampler. + + This sampler is intentionally model-agnostic. It does not infer model + families, inspect chat templates, or guess reasoning tags. The caller is + responsible for passing the correct reasoning_start and reasoning_end token + sequences. + + Behavior: + 1. Wait for the first reasoning_start token sequence, unless the prompt + already inserted it and initial_state is COUNTING. + 2. Count accepted tokens inside the first reasoning block. + 3. If reasoning_end appears naturally, switch to DONE. + 4. If the budget is exhausted first, force: + reasoning_budget_message + reasoning_end + token by token. + 5. Once DONE, remain passthrough forever. Later reasoning tags are ignored. + + This mirrors the core idea of llama.cpp's reasoning-budget sampler while + keeping the Python API small and explicit. + """ + + def __init__( + self, + *, + model: LlamaModel, + reasoning_budget: int, + start_tokens: Optional[Sequence[int]], + end_tokens: Sequence[int], + forced_tokens: Sequence[int], + initial_state: ReasoningBudgetState = ReasoningBudgetState.IDLE, + start_max_tokens: Optional[int] = 32, + wait_utf8: bool = True, + ): + """ + Initialize the reasoning budget sampler. + + Args: + model: + The active LlamaModel wrapper. Used for token_to_piece() when + checking UTF-8 boundaries. 
+ + reasoning_budget: + Token budget inside the first reasoning block. + Must be >= 0 here. The disabled value -1 is handled before this + sampler is created. + + 0: + Force the end sequence immediately after reasoning starts. + + N > 0: + Allow at most N accepted tokens inside the reasoning block. + + start_tokens: + Token sequence that starts reasoning budget counting. + Must be provided when initial_state is IDLE. + Can be None when initial_state is COUNTING, which is used when + the prompt/chat template has already inserted reasoning_start. + + end_tokens: + Token sequence that naturally ends the reasoning block. + + forced_tokens: + Token sequence forced when the budget is exhausted. This should + normally be tokenized from: + reasoning_budget_message + reasoning_end + + initial_state: + Initial state of the sampler. + IDLE: + Wait for start_tokens during generation. + COUNTING: + Start counting from the first generated token. Use this when + reasoning_start is already present in the prompt. + + start_max_tokens: + Safety window for non-reasoning models. If start_tokens are not + observed within this many generated tokens, the sampler switches + to DONE and becomes a no-op. Set to None to wait indefinitely. + + wait_utf8: + If True, when the budget is exhausted on an incomplete UTF-8 + token piece, wait until a complete UTF-8 boundary before forcing + the end sequence. + """ + if model is None: + raise ValueError("model must not be None") + + if reasoning_budget < 0: + raise ValueError("reasoning_budget must be >= 0") + + self.model = model + + # Maximum number of tokens allowed inside the first reasoning block. + # The disabled value (-1) should be handled before constructing this sampler. + self.reasoning_budget = int(reasoning_budget) + + # Remaining tokens in the active reasoning block. + self.remaining = int(reasoning_budget) + + # Incremental matcher for the first reasoning_start sequence. + # Empty matcher is allowed only when initial_state=COUNTING. 
+ self.start_matcher = TokenMatcher(start_tokens) + + # Incremental matcher for the natural reasoning_end sequence. + self.end_matcher = TokenMatcher(end_tokens) + + # Token sequence forced after budget exhaustion: + # reasoning_budget_message + reasoning_end + self.forced_tokens = list(forced_tokens) + + if initial_state == ReasoningBudgetState.IDLE and not self.start_matcher.tokens: + raise ValueError( + "start_tokens must not be empty when initial_state=IDLE" + ) + + if not self.end_matcher.tokens: + raise ValueError("end_tokens must not be empty") + + if not self.forced_tokens: + raise ValueError("forced_tokens must not be empty") + + # State used by reset(). This is important for templates that already + # insert reasoning_start into the prompt: reset must return to COUNTING, + # not always IDLE. + self.initial_state = ReasoningBudgetState(initial_state) + + # Current runtime state. + self.state = ReasoningBudgetState(initial_state) + + # Index of the next token in forced_tokens to force. + self.force_pos = 0 + + # Count of generated tokens observed by this sampler. + # Used only in IDLE to enforce start_max_tokens. + self.generated_tokens = 0 + + # Maximum number of generated tokens to wait for reasoning_start. + # None means wait indefinitely. + self.start_max_tokens = start_max_tokens + + # Whether to delay forcing until a complete UTF-8 boundary. + self.wait_utf8 = wait_utf8 + + # Keep cloned Python sampler objects alive when llama.cpp clones the + # sampler chain. Without this, cloned Python callbacks could be garbage + # collected while C still holds function pointers to them. 
+ self._clone_keep_alive: List["ReasoningBudgetSampler"] = [] + + if self.state == ReasoningBudgetState.COUNTING and self.remaining <= 0: + self.state = ReasoningBudgetState.FORCING + + super().__init__( + apply_func=self._apply, + accept_func=self._accept, + reset_func=self._reset, + clone_func=self._clone, + name="reasoning-budget", + ) + + def force(self) -> bool: + """ + Manually transition the active reasoning block into forced ending. + + This method is useful for external interruption scenarios, such as: + - user clicks "stop thinking" + - server-side thinking timeout + - UI wants to skip the rest of the reasoning block while still allowing + the model to continue with the final answer + + The transition is allowed only from COUNTING. This matches llama.cpp's + common_reasoning_budget_force() behavior and avoids unsafe rewinding when + the sampler is already FORCING. + """ + if self.state != ReasoningBudgetState.COUNTING: + return False + + self.state = ReasoningBudgetState.FORCING + self.force_pos = 0 + self.end_matcher.reset() + return True + + def _token_utf8_complete(self, token: int) -> bool: + """ + Return whether the token piece is a complete UTF-8 byte sequence. + + This is a safety feature. If the budget is exhausted in the middle of a + multi-byte UTF-8 sequence, the sampler waits until a complete boundary + before forcing reasoning_budget_message + reasoning_end. + """ + if not self.wait_utf8: + return True + + try: + piece = self.model.token_to_piece(token, special=False) + if not piece: + return True + piece.decode("utf-8") + return True + except UnicodeDecodeError: + return False + except Exception: + # Avoid getting stuck forever if token_to_piece behaves unexpectedly. + return True + + def _start_counting(self) -> None: + """ + Enter COUNTING state and initialize the budget window. + + If reasoning_budget is 0, immediately enter FORCING state. 
+ """ + self.state = ReasoningBudgetState.COUNTING + self.remaining = self.reasoning_budget + self.end_matcher.reset() + self.force_pos = 0 + + if self.remaining <= 0: + self.state = ReasoningBudgetState.FORCING + + def _accept(self, token: int) -> None: + """ + Update sampler state after one token has been accepted. + + This method does not modify logits. It only tracks: + - whether reasoning_start has appeared + - whether reasoning_end has appeared + - how much budget remains + - where we are in the forced token sequence + """ + self.generated_tokens += 1 + + if self.state == ReasoningBudgetState.IDLE: + if self.start_matcher.advance(token): + self._start_counting() + return + + # Safety for non-reasoning models: + # + # If no reasoning_start appears near the beginning, assume this + # completion has no visible reasoning block. Switch to DONE forever + # so later literal mentions of reasoning_start do not accidentally + # activate the budget controller. + if ( + self.start_max_tokens is not None + and self.generated_tokens >= self.start_max_tokens + ): + self.state = ReasoningBudgetState.DONE + return + + if self.state in ( + ReasoningBudgetState.COUNTING, + ReasoningBudgetState.WAITING_UTF8, + ): + if self.end_matcher.advance(token): + self.state = ReasoningBudgetState.DONE + return + + utf8_complete = self._token_utf8_complete(token) + + if self.state == ReasoningBudgetState.WAITING_UTF8: + if utf8_complete: + self.state = ReasoningBudgetState.FORCING + self.force_pos = 0 + self.end_matcher.reset() + return + + self.remaining -= 1 + if self.remaining <= 0: + if utf8_complete: + self.state = ReasoningBudgetState.FORCING + self.force_pos = 0 + self.end_matcher.reset() + else: + self.state = ReasoningBudgetState.WAITING_UTF8 + self.end_matcher.reset() + return + + if self.state == ReasoningBudgetState.FORCING: + self.force_pos += 1 + if self.force_pos >= len(self.forced_tokens): + self.state = ReasoningBudgetState.DONE + return + + if self.state == 
ReasoningBudgetState.DONE: + # Only the first reasoning block is budget-controlled. + # Later reasoning tags are normal generated text. + return + + def _apply(self, cur_p: llama_cpp.llama_token_data_array) -> None: + """ + Apply logits forcing before sampling. + + In FORCING state, only forced_tokens[force_pos] is allowed. All other + candidate logits are set to -inf. The forced token is set to +inf to make + the intent explicit and robust against previous logit modifications. + """ + if self.state != ReasoningBudgetState.FORCING: + return + + if self.force_pos >= len(self.forced_tokens): + return + + forced = self.forced_tokens[self.force_pos] + data = cur_p.data + found = False + + for i in range(cur_p.size): + if data[i].id == forced: + data[i].logit = float("inf") + found = True + else: + data[i].logit = float("-inf") + + cur_p.sorted = False + cur_p.selected = -1 + + if not found: + raise RuntimeError( + f"ReasoningBudgetSampler: forced token {forced} is not present " + "in the candidate array. Move ReasoningBudgetSampler earlier in " + "the sampler chain." + ) + + def _reset(self) -> None: + """ + Reset the sampler to its configured initial state. + + Uses self.initial_state to determine whether to start in: + - IDLE: wait for reasoning_start token sequence + - COUNTING: prompt already contains start token, begin counting immediately + + Also resets internal counters and matchers: + - remaining budget + - generated_tokens + - start_matcher / end_matcher positions + - force_pos + """ + self.state = self.initial_state + self.remaining = self.reasoning_budget + self.generated_tokens = 0 + self.force_pos = 0 + + if self.start_matcher: + self.start_matcher.reset() + self.end_matcher.reset() + + # If initial_state = COUNTING and budget is zero, immediately enter FORCING + if self.state == ReasoningBudgetState.COUNTING and self.remaining <= 0: + self.state = ReasoningBudgetState.FORCING + + def _clone(self): + """ + Clone the full runtime state. 
+ + This mirrors the newer llama.cpp reasoning-budget sampler behavior where + clone copies the full sampler context, not only the static configuration. + """ + cloned = ReasoningBudgetSampler( + model=self.model, + reasoning_budget=self.reasoning_budget, + start_tokens=self.start_matcher.tokens, + end_tokens=self.end_matcher.tokens, + forced_tokens=self.forced_tokens, + initial_state=self.initial_state, + start_max_tokens=self.start_max_tokens, + wait_utf8=self.wait_utf8, + ) + + cloned.remaining = self.remaining + cloned.state = self.state + cloned.force_pos = self.force_pos + cloned.generated_tokens = self.generated_tokens + cloned.start_matcher.pos = self.start_matcher.pos + cloned.end_matcher.pos = self.end_matcher.pos + + # Keep the cloned Python object alive on the source sampler. The cloned + # LlamaSampler wrapper does not own this object directly because the C + # sampler clone is created through the callback. + self._clone_keep_alive.append(cloned) + + return cloned.get_sampler() + class LlamaSampler: def __init__(self, existing_sampler_p: Optional[llama_cpp.llama_sampler_p] = None): if existing_sampler_p: @@ -2055,12 +2533,13 @@ def clone(self) -> 'LlamaSampler': new_sampler = LlamaSampler(existing_sampler_p=new_sampler_p) - # copy _keep_alive and custom_samplers list to new sampler - if self._keep_alive: - new_sampler._keep_alive = self._keep_alive.copy() - - if self.custom_samplers: - new_sampler.custom_samplers = self.custom_samplers.copy() + # llama_sampler_clone() clones C samplers internally. For Python-backed + # custom samplers, the clone_func returns a new C sampler whose Python + # callback object is kept alive by the original custom sampler. Shallow + # copying custom_samplers would make the cloned chain close the original + # Python custom sampler, causing premature close/double-free issues. 
+ new_sampler._keep_alive = self._keep_alive.copy() if self._keep_alive else [] + new_sampler.custom_samplers = [] return new_sampler @@ -2250,6 +2729,10 @@ def add_custom(self, custom_sampler: CustomSampler): [llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler] ) + # Keep the Python callback object alive while the C sampler chain holds + # function pointers to it. + self._keep_alive.append(custom_sampler) + def get_seed(self) -> int: assert self.sampler is not None return llama_cpp.llama_sampler_get_seed(self.sampler) From 82a026687eda143c2877fcee96d5f9dbf64d45e6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 12:10:10 +0800 Subject: [PATCH 480/518] feat(internals): add verbose logging to ReasoningBudgetSampler - Add `verbose` parameter to ReasoningBudgetSampler to print high-level state transitions to stderr. - Log key events: initialization, reasoning_start matched, budget exhausted, forced end sequence, UTF-8 boundary waiting, manual force, natural end, reset. - Pass `verbose=getattr(model, "verbose", False)` from LlamaSamplingContext when building the sampler chain. - Preserve verbose flag when cloning the sampler. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 46 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index c308fae056..434921e6bd 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -3,6 +3,7 @@ import ctypes import enum import os +import sys from typing import ( Callable, @@ -1566,6 +1567,9 @@ def __init__( sorted=False, ) + # Active Python reasoning-budget sampler for this sampling context. + self.reasoning_budget_sampler: Optional[ReasoningBudgetSampler] = None + # Sampler chain if _existing_sampler: self.sampler_chain = _existing_sampler @@ -1583,9 +1587,6 @@ def __init__( params.grammar_triggers, ) - # Active Python reasoning-budget sampler for this sampling context. 
- self.reasoning_budget_sampler: Optional[ReasoningBudgetSampler] = None - def _build_sampler_chain(self): """ Build sampler chain aligned with llama.cpp common_sampler_init @@ -1682,6 +1683,7 @@ def _build_sampler_chain(self): ), start_max_tokens=p.reasoning_start_max_tokens, wait_utf8=True, + verbose=getattr(m, "verbose", False), ) # Keep a direct Python reference so force_reasoning_budget() can @@ -2131,6 +2133,7 @@ def __init__( initial_state: ReasoningBudgetState = ReasoningBudgetState.IDLE, start_max_tokens: Optional[int] = 32, wait_utf8: bool = True, + verbose: bool = False, ): """ Initialize the reasoning budget sampler. @@ -2182,6 +2185,11 @@ def __init__( If True, when the budget is exhausted on an incomplete UTF-8 token piece, wait until a complete UTF-8 boundary before forcing the end sequence. + + verbose: + If True, print high-level reasoning-budget state transitions to + stderr. Logging is intentionally limited to transitions instead + of per-token events to avoid noisy generation output. """ if model is None: raise ValueError("model must not be None") @@ -2242,6 +2250,10 @@ def __init__( # Whether to delay forcing until a complete UTF-8 boundary. self.wait_utf8 = wait_utf8 + # Whether to print high-level state transition logs. + # This follows the model/runtime verbose flag and avoids per-token spam. + self.verbose = verbose + # Keep cloned Python sampler objects alive when llama.cpp clones the # sampler chain. Without this, cloned Python callbacks could be garbage # collected while C still holds function pointers to them. 
@@ -2258,6 +2270,19 @@ def __init__( name="reasoning-budget", ) + if self.verbose: + print( + f"ReasoningBudgetSampler: initialized " + f"(state={self.state.name}, budget={self.reasoning_budget}, " + f"start_max_tokens={self.start_max_tokens}, wait_utf8={self.wait_utf8}).", + file=sys.stderr, + ) + + def _log(self, message: str) -> None: + """Print a verbose reasoning-budget state transition message.""" + if self.verbose: + print(f"ReasoningBudgetSampler: {message}", file=sys.stderr) + def force(self) -> bool: """ Manually transition the active reasoning block into forced ending. @@ -2278,6 +2303,7 @@ def force(self) -> bool: self.state = ReasoningBudgetState.FORCING self.force_pos = 0 self.end_matcher.reset() + self._log("manual force requested; entering FORCING state.") return True def _token_utf8_complete(self, token: int) -> bool: @@ -2313,9 +2339,11 @@ def _start_counting(self) -> None: self.remaining = self.reasoning_budget self.end_matcher.reset() self.force_pos = 0 + self._log(f"reasoning_start matched; entering COUNTING state (budget={self.reasoning_budget}).") if self.remaining <= 0: self.state = ReasoningBudgetState.FORCING + self._log("budget is 0; entering FORCING state immediately.") def _accept(self, token: int) -> None: """ @@ -2345,6 +2373,10 @@ def _accept(self, token: int) -> None: and self.generated_tokens >= self.start_max_tokens ): self.state = ReasoningBudgetState.DONE + self._log( + f"reasoning_start not found within {self.start_max_tokens} generated tokens; " + "switching to DONE passthrough." 
+ ) return if self.state in ( @@ -2353,6 +2385,7 @@ def _accept(self, token: int) -> None: ): if self.end_matcher.advance(token): self.state = ReasoningBudgetState.DONE + self._log("reasoning_end matched naturally; switching to DONE passthrough.") return utf8_complete = self._token_utf8_complete(token) @@ -2362,6 +2395,7 @@ def _accept(self, token: int) -> None: self.state = ReasoningBudgetState.FORCING self.force_pos = 0 self.end_matcher.reset() + self._log("UTF-8 boundary reached; entering FORCING state.") return self.remaining -= 1 @@ -2370,15 +2404,18 @@ def _accept(self, token: int) -> None: self.state = ReasoningBudgetState.FORCING self.force_pos = 0 self.end_matcher.reset() + self._log("reasoning budget exhausted; entering FORCING state.") else: self.state = ReasoningBudgetState.WAITING_UTF8 self.end_matcher.reset() + self._log("reasoning budget exhausted; waiting for UTF-8 boundary before forcing.") return if self.state == ReasoningBudgetState.FORCING: self.force_pos += 1 if self.force_pos >= len(self.forced_tokens): self.state = ReasoningBudgetState.DONE + self._log("forced end sequence completed; switching to DONE passthrough.") return if self.state == ReasoningBudgetState.DONE: @@ -2448,6 +2485,8 @@ def _reset(self) -> None: if self.state == ReasoningBudgetState.COUNTING and self.remaining <= 0: self.state = ReasoningBudgetState.FORCING + self._log(f"reset to {self.state.name} state.") + def _clone(self): """ Clone the full runtime state. 
@@ -2464,6 +2503,7 @@ def _clone(self): initial_state=self.initial_state, start_max_tokens=self.start_max_tokens, wait_utf8=self.wait_utf8, + verbose=self.verbose, ) cloned.remaining = self.remaining From 1b472b354b6d0dbb841b8b29e260a8544453ecf3 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 12:13:18 +0800 Subject: [PATCH 481/518] feat: pass reasoning budget params through Llama APIs - Add reasoning budget params to public completion and chat entry points - Forward the params from chat handlers into create_completion - Propagate reasoning budget controls down to generate and sampling params - Document -1/0/N reasoning_budget behavior in completion docstrings - Support custom reasoning_start and reasoning_end tags without model-specific inference - Support reasoning_budget_message and reasoning_start_in_prompt - Wire MTMD chat handler to the same reasoning budget controls Signed-off-by: JamePeng --- llama_cpp/llama.py | 122 +++++++++++++++++++++++++++++++++ llama_cpp/llama_chat_format.py | 40 +++++++++++ 2 files changed, 162 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 43e3d6f1fd..2bab3709e6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1355,6 +1355,13 @@ def sample( grammar_lazy: bool = False, idx: Optional[int] = None, seed: Optional[int] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ): """Sample a token from the model. Returns: @@ -1413,6 +1420,16 @@ def sample( logit_bias=self._convert_logit_bias(logit_bias), grammar=grammar.grammar if grammar else "", grammar_lazy=grammar_lazy, + + # Reasoning Budget + # This generic controller only counts the first visible reasoning + # block. Use reasoning_budget=-1 to leave it disabled. 
+ reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) # LogitsProcessor Adapter @@ -1487,6 +1504,13 @@ def generate( seed: Optional[int] = None, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. @@ -1532,6 +1556,18 @@ def generate( grammar: Optional BNF-like grammar (GBNF) to constrain sampling syntax. grammar_lazy: If True, activates grammar constraints only on specific trigger tokens. seed: RNG seed for sampling. Overrides the instance seed. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the reasoning budget sampler, 0 forces the block to end + immediately after it starts, and N > 0 allows at most N generated tokens. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + Defaults to "". Pass a model-specific value for non-default tags. + reasoning_end: Token/text sequence that marks the natural and forced end of the reasoning block. + Defaults to "". + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template has already inserted reasoning_start, + so counting starts from the first generated token. + reasoning_start_max_tokens: Safety window for non-reasoning models. 
If reasoning_start is not + generated within this many output tokens, the sampler becomes a no-op. Set None to wait indefinitely. active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). @@ -1682,6 +1718,16 @@ def generate( grammar=grammar._grammar if grammar else "", grammar_lazy=grammar_lazy, seed=seed if seed is not None else self._seed, + + # Reasoning Budget + # Keeps the core sampler model-agnostic: callers provide the visible + # reasoning start/end tags, and -1 keeps the controller disabled. + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) # Register custom python-level logits processors if provided @@ -2065,6 +2111,13 @@ def _create_completion( seed: Optional[int] = None, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: @@ -2253,6 +2306,12 @@ def _create_completion( seed=seed if seed is not None else self._seed, active_loras=active_loras, control_vector=control_vector, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + 
reasoning_start_max_tokens=reasoning_start_max_tokens, ): if llama_cpp_lib.llama_token_is_eog(self._model.vocab, token): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) @@ -2717,6 +2776,13 @@ def create_completion( grammar_lazy: bool = False, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -2761,6 +2827,14 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. grammar_lazy: If True, enables lazy evaluation. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the sampler, 0 forces an immediate end after reasoning starts, + and N > 0 allows at most N generated tokens inside the block. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + reasoning_end: Token/text sequence that naturally and forcibly ends the reasoning block. + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template already inserted reasoning_start. + reasoning_start_max_tokens: Safety window before disabling the sampler for non-reasoning outputs. active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). 
@@ -2820,6 +2894,12 @@ def create_completion( grammar_lazy=grammar_lazy, active_loras=active_loras, control_vector=control_vector, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -2871,6 +2951,13 @@ def __call__( grammar_lazy: bool = False, active_loras: Optional[List[Dict[str, Union[str, float]]]] = None, control_vector: Optional[Dict[str, Any]] = None, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -2915,6 +3002,14 @@ def __call__( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. grammar_lazy: If True, enables lazy evaluation. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the sampler, 0 forces an immediate end after reasoning starts, + and N > 0 allows at most N generated tokens inside the block. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + reasoning_end: Token/text sequence that naturally and forcibly ends the reasoning block. + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template already inserted reasoning_start. + reasoning_start_max_tokens: Safety window before disabling the sampler for non-reasoning outputs. 
active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). @@ -2974,6 +3069,12 @@ def __call__( grammar_lazy=grammar_lazy, active_loras=active_loras, control_vector=control_vector, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) def create_chat_completion( @@ -3025,6 +3126,13 @@ def create_chat_completion( top_logprobs: Optional[int] = None, assistant_prefill: bool = False, add_generation_prompt: bool = True, + # Reasoning Budget Params + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -3072,6 +3180,14 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. grammar_lazy: If True, enables lazy evaluation. + reasoning_budget: Token budget for the first visible reasoning block. + -1 disables the sampler, 0 forces an immediate end after reasoning starts, + and N > 0 allows at most N generated tokens inside the block. + reasoning_start: Token/text sequence that marks the beginning of the first reasoning block. + reasoning_end: Token/text sequence that naturally and forcibly ends the reasoning block. + reasoning_budget_message: Optional message inserted before reasoning_end when the budget is exhausted. + reasoning_start_in_prompt: Set True when the prompt/template already inserted reasoning_start. 
+ reasoning_start_max_tokens: Safety window before disabling the sampler for non-reasoning outputs. active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation. Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`) and an optional "scale" key (float, defaults to 1.0). @@ -3138,6 +3254,12 @@ def create_chat_completion( control_vector=control_vector, assistant_prefill=assistant_prefill, add_generation_prompt=add_generation_prompt, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f91844bbb7..f502d68dc9 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -131,6 +131,17 @@ def __call__( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, assistant_prefill: bool = False, + # Reasoning Budget Params + # + # Generic first-reasoning-block budget control. These parameters are + # passed through to llama.create_completion() without model-specific + # inference or template guessing. + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -829,6 +840,17 @@ def chat_completion_handler( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, assistant_prefill: bool = False, + # Reasoning Budget Params + # + # Generic first-reasoning-block budget control. 
These parameters are + # passed through to llama.create_completion() without model-specific + # inference or template guessing. + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -964,6 +986,12 @@ def chat_completion_handler( stopping_criteria=stopping_criteria, grammar=grammar, logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) if tool is not None: tool_name = tool["function"]["name"] @@ -3512,6 +3540,12 @@ def __call__( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, add_generation_prompt: bool = True, + reasoning_budget: int = -1, + reasoning_start: str = "", + reasoning_end: str = "", + reasoning_budget_message: Optional[str] = None, + reasoning_start_in_prompt: bool = False, + reasoning_start_max_tokens: Optional[int] = 32, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -3772,6 +3806,12 @@ def __call__( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, ) if tool is not None: From fb65ed793957f6a197b910af65e2fe7c7cf215e6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 20:01:50 +0800 Subject: [PATCH 482/518] Update Submodule vendor/llama.cpp c4a278d..6b80c74 Signed-off-by: JamePeng --- 
llama_cpp/llama_chat_format.py | 3 ++- llama_cpp/mtmd_cpp.py | 34 ++++++++++++++++++++++++++++------ vendor/llama.cpp | 2 +- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f502d68dc9..f929bcd150 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3292,7 +3292,8 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( self.mtmd_ctx, (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), - len(media_bytes) + len(media_bytes), + False, ) if bitmap is None: diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 839c718ccd..4542555c65 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -326,7 +326,10 @@ def mtmd_get_audio_sample_rate(ctx: mtmd_context_p) -> c_int: # // if bitmap is audio: # // length of data must be n_samples * sizeof(float) # // the data is in float format (PCM F32) - +# // if data == nullptr: +# // the bitmap is considered "empty", and will be treated as a placeholder for counting tokens +# // you can pass the bitmap via mtmd_tokenize(), then call mtmd_*_get_n_tokens() to count the tokens +# // note: passing a placeholder bitmap to mtmd_encode() will return an error # MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); @ctypes_function_mtmd( "mtmd_bitmap_init", [ @@ -787,11 +790,22 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # # // it calls mtmd_helper_bitmap_init_from_buf() internally # // returns nullptr on failure # // this function is thread-safe -# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); +# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); @ctypes_function_mtmd( - "mtmd_helper_bitmap_init_from_file", [mtmd_context_p_ctypes, 
c_char_p], mtmd_bitmap_p_ctypes) -def mtmd_helper_bitmap_init_from_file(ctx: mtmd_context_p, fname: c_char_p) -> mtmd_bitmap_p: + "mtmd_helper_bitmap_init_from_file", [ + mtmd_context_p_ctypes, + c_char_p, + c_bool, + ], + mtmd_bitmap_p_ctypes +) +def mtmd_helper_bitmap_init_from_file( + ctx: mtmd_context_p, + fname: c_char_p, + placeholder: c_bool, + /, +) -> mtmd_bitmap_p: """ helper function to construct a mtmd_bitmap from a file it calls mtmd_helper_bitmap_init_from_buf() internally @@ -807,13 +821,21 @@ def mtmd_helper_bitmap_init_from_file(ctx: mtmd_context_p, fname: c_char_p) -> m # // note: audio files will be auto-detected based on magic bytes # // returns nullptr on failure # // this function is thread-safe -# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); +# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); @ctypes_function_mtmd( - "mtmd_helper_bitmap_init_from_buf", [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t], mtmd_bitmap_p_ctypes) + "mtmd_helper_bitmap_init_from_buf", [ + mtmd_context_p_ctypes, + POINTER(c_uint8), + c_size_t, + c_bool, + ], + mtmd_bitmap_p_ctypes +) def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, buf: CtypesArray[c_uint8], len: c_size_t, + placeholder: c_bool, /, ) -> mtmd_bitmap_p: """ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c4a278d68e..6b80c74f28 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c4a278d68efa17811006f2123a84081dac03fac7 +Subproject commit 6b80c74f285390368b3c99c5e750f19e9b096e98 From e001886f18d574b818a1963035fbc35ecfe1287c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 20:15:51 +0800 Subject: [PATCH 483/518] fix(mtmd): memory_can_shift() logic bug Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f929bcd150..4e7c045127 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3641,7 +3641,7 @@ def __call__( # Stage 5: Multimodal Physical OOM Defense if n_past + chunk_n_tokens > llama.n_ctx(): - if llama._ctx.memory_can_shift(): + if not llama._ctx.memory_can_shift(): raise RuntimeError( f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " f"(n_pos_per_embd > 1 or incompatible M-RoPE). " From 07afd3bc02cad3af20f10f83973b9a87c770dccb Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 21:07:34 +0800 Subject: [PATCH 484/518] docs(README): document reasoning budget sampler usage - Add README section for first reasoning-block budget control - Document reasoning_budget -1/0/N semantics and related sampler parameters - Explain reasoning_budget_message injection before reasoning_end - Add examples for default tags, Mistral [THINK] tags, and Gemma4 channel tags - Clarify when to use reasoning_start_in_prompt for prefilled thinking tags - Note that reasoning_start_in_prompt is not a generic thinking-enabled switch - Mention verbose transition logs for reasoning-budget state changes Signed-off-by: JamePeng --- README.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/README.md b/README.md index ba1969793c..03f66a8cf9 100644 --- a/README.md +++ b/README.md @@ -899,6 +899,83 @@ Mirostat actively maintains a target entropy (`tau`) during generation to preven * **`logits_processor`** (`LogitsProcessorList`, optional): Custom Python callbacks to modify the logits tensor in-place before sampling. * **`stopping_criteria`** (`StoppingCriteriaList`, optional): Custom Python callbacks to halt generation based on the current sequence or scores. 
+ +### Reasoning Budget (First Reasoning Block) + +`llama-cpp-python` provides a generic reasoning-budget sampler for models that expose their thinking content with visible start/end tags. It controls only the **first visible reasoning block** in the generated output. After that block naturally ends or is forcibly closed, the sampler switches to passthrough mode and later reasoning tags are ignored. + +This feature is intentionally model-agnostic. It does not infer model families, inspect chat templates, or guess thinking tags. If a model uses tags other than `<think>...</think>`, pass the correct `reasoning_start` and `reasoning_end` explicitly. + +| Parameter | Default | Description | +| --- | --- | --- | +| `reasoning_budget` | `-1` | Token budget for the first visible reasoning block. `-1` disables the sampler, `0` forces an immediate end after the block starts, and `N > 0` allows at most `N` generated tokens inside the block. | +| `reasoning_start` | `"<think>"` | Token/text sequence that marks the beginning of the first reasoning block. | +| `reasoning_end` | `"</think>"` | Token/text sequence that naturally ends the reasoning block. When the budget is exhausted, the sampler forces this sequence. | +| `reasoning_budget_message` | `None` | Optional message inserted before `reasoning_end` when the budget is exhausted. | +| `reasoning_start_in_prompt` | `False` | Set to `True` only when the prompt/chat template has already inserted `reasoning_start`, so the sampler should start counting from the first generated token. | +| `reasoning_start_max_tokens` | `32` | Safety window for non-reasoning outputs. If `reasoning_start` is not generated within this many output tokens, the sampler becomes a no-op. Set to `None` to wait indefinitely.
| + +Basic usage with the default `<think>...</think>` tags: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_budget_message="\n[reasoning budget exhausted]\n", + # You can also inject a natural-language transition before reasoning_end: + # reasoning_budget_message="\n...Wait, I have been thinking long enough. Let me start answering the user's question.\n", +) +``` +When the budget is exhausted, the sampler forces: `reasoning_budget_message` + `reasoning_end` + +For Mistral-style thinking tags, pass the tags explicitly: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_start="[THINK]", + reasoning_end="[/THINK]", +) +``` + +For Gemma4 channel-style thinking, adjust the start and end markers to match the visible channel tags: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_start="<|channel>", + reasoning_end="<channel|>", +) +``` + +Use `reasoning_start_in_prompt=True` when the prompt or chat template has already inserted the reasoning start tag. In that case, the sampler will not see the start tag during generation, so it must start directly in `COUNTING` state from the first generated token. This is suitable for thinking models or handlers that prefill the assistant prefix with a thinking tag, for example: + +```text +<|im_start|>assistant\n<think>\n +``` + +Example: + +```python +response = llm.create_chat_completion( + messages=[{"role": "user", "content": "Solve this carefully."}], + max_tokens=1024, + reasoning_budget=256, + reasoning_start="<think>", + reasoning_end="</think>", + reasoning_start_in_prompt=True, +) +``` + +`reasoning_start_in_prompt` is **not** a generic "thinking enabled" switch.
It should only be set when the final prompt already contains `reasoning_start` before generation begins. For templates that merely enable thinking but still expect the model to generate the start tag itself, keep `reasoning_start_in_prompt=False`. + +When `verbose=True`, high-level reasoning-budget transitions are printed to stderr, such as initialization, start-tag detection, budget exhaustion, forced ending, and final passthrough. + ### 🛠️ Usage Example You can pass these parameters directly when calling the model to generate text. From 504f7477847fe9149e185fe00843681e88ec6736 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 6 Jun 2026 21:14:59 +0800 Subject: [PATCH 485/518] docs(README): Update `ReasoningBudgetSampler` quick link Signed-off-by: JamePeng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 03f66a8cf9..c605c94542 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ This package provides: - [Dynamic LoRA Example](https://github.com/JamePeng/llama-cpp-python#dynamic-lora-example) - [Control Vector Injection (Representation Engineering)](https://github.com/JamePeng/llama-cpp-python#control-vector-injection-representation-engineering) - [Sampling Configuration & Usage (LlamaSamplingParams)](https://github.com/JamePeng/llama-cpp-python#sampling-configuration--usage-llamasamplingparams) + - [How to use the ReasoningBudgetSampler](https://github.com/JamePeng/llama-cpp-python#reasoning-budget-first-reasoning-block) - [Multi-modal Models Support](https://github.com/JamePeng/llama-cpp-python#multi-modal-models) - Support Models Lists - [Loading a Local Image With Qwen3VL(Thinking/Instruct)](https://github.com/JamePeng/llama-cpp-python#loading-a-local-image-with-qwen3vlthinkinginstruct) From 5929d86c76d248d58daf38045cb38b974e5107d6 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Jun 2026 04:49:25 +0800 Subject: [PATCH 486/518] feat(chat-format): Update google/gemma-4 chat template jinja Signed-off-by: 
JamePeng --- llama_cpp/llama_chat_format.py | 75 ++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 4e7c045127..fb42a59f23 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4820,12 +4820,12 @@ class Gemma4ChatHandler(MTMDChatHandler): GEMMA4_ETR_TOKEN = "" CHAT_FORMAT = ( - "{%- macro format_parameters(properties, required) -%}\n" + "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n" " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" " {%- set ns = namespace(found_first=false) -%}\n" " {%- for key, value in properties | dictsort -%}\n" " {%- set add_comma = false -%}\n" - " {%- if key not in standard_keys -%}\n" + " {%- if not filter_keys or key not in standard_keys -%}\n" " {%- if ns.found_first %},{% endif -%}\n" " {%- set ns.found_first = true -%}\n" " {{ key }}:{\n" @@ -4887,7 +4887,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- elif value is mapping -%}\n" " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" " properties:{\n" - " {{- format_parameters(value, value['required'] | default([])) -}}\n" + " {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n" " }\n" " {%- endif -%}\n" " {%- if value['required'] -%}\n" @@ -4910,10 +4910,10 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- set params = tool_data['function']['parameters'] -%}\n" " {%- if params -%}\n" " ,parameters:{\n" - " {%- if params['properties'] -%}\n" + " {%- if params.get('properties') -%}\n" " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" " {%- endif -%}\n" - " {%- if params['required'] -%}\n" + " {%- if params.get('required') -%}\n" " required:[\n" " {%- for item in params['required'] -%}\n" " <|\"|>{{- item -}}<|\"|>\n" @@ -4921,7 +4921,7 @@ class 
Gemma4ChatHandler(MTMDChatHandler): " {%- endfor -%}\n" " ],\n" " {%- endif -%}\n" - " {%- if params['type'] -%}\n" + " {%- if params.get('type') -%}\n" " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" " {%- endif -%}\n" " {%- endif -%}\n" @@ -4978,6 +4978,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endfor -%}\n" " {{- ns.result | trim -}}\n" "{%- endmacro -%}\n" + "\n" "{%- macro format_tool_response_block(tool_name, response) -%}\n" " {{- '<|tool_response>' -}}\n" " {%- if response is mapping -%}\n" @@ -4992,6 +4993,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endif -%}\n" " {{- '' -}}\n" "{%- endmacro -%}\n" + "\n" "{%- set ns = namespace(prev_message_type=None) -%}\n" "{%- set loop_messages = messages -%}\n" "{{- bos_token -}}\n" @@ -5004,7 +5006,13 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- set ns.prev_message_type = 'think' -%}\n" " {%- endif -%}\n" " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" - " {{- messages[0]['content'] | trim -}}\n" + " {%- if messages[0]['content'] is string -%}\n" + " {{- messages[0]['content'] | trim -}}\n" + " {%- elif messages[0]['content'] is sequence -%}\n" + " {%- for item in messages[0]['content'] -%}\n" + " {{- item['text'] | trim + ' '-}}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" " {%- set loop_messages = messages[1:] -%}\n" " {%- endif -%}\n" " {%- if tools -%}\n" @@ -5017,6 +5025,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endif -%}\n" " {{- '\\n' -}}\n" "{%- endif %}\n" + "\n" "{#- Pre-scan: find last user message index for reasoning guard -#}\n" "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" "{%- for i in range(loop_messages | length) -%}\n" @@ -5024,6 +5033,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- set ns_turn.last_user_idx = i -%}\n" " {%- endif -%}\n" "{%- endfor -%}\n" + "\n" "{#- Loop through messages -#}\n" "{%- for message in loop_messages -%}\n" " {%- if message['role'] != 'tool' -%}\n" @@ -5045,12 +5055,14 @@ class 
Gemma4ChatHandler(MTMDChatHandler): " {%- if not continue_same_model_turn -%}\n" " {{- '<|turn>' + role + '\\n' }}\n" " {%- endif -%}\n" + "\n" " {#- Render reasoning/reasoning_content as thinking channel -#}\n" " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" " {%- endif -%}\n" - " {%- if message['tool_calls'] -%}\n" + "\n" + " {%- if message.get('tool_calls') -%}\n" " {%- for tool_call in message['tool_calls'] -%}\n" " {%- set function = tool_call['function'] -%}\n" " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" @@ -5068,6 +5080,7 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endfor -%}\n" " {%- set ns.prev_message_type = 'tool_call' -%}\n" " {%- endif -%}\n" + "\n" " {%- set ns_tr_out = namespace(flag=false) -%}\n" " {%- if message.get('tool_responses') -%}\n" " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" @@ -5104,6 +5117,23 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endif -%}\n" " {%- endfor -%}\n" " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" + " {%- for part in tool_body -%}\n" + " {%- if part.get('type') == 'image_url' -%}\n" + " {%- set url_val = part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n" + " {{- '<|image|>' + url_val -}}\n" + " {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n" + " {%- if part.get('type') == 'audio_url' -%}\n" + " {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif part.get('type') == 'input_audio' -%}\n" + " {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n" + " {{- 
'<|audio|>' + audio_val -}}\n" + " {%- endif -%}\n" + # " {%- elif part.get('type') == 'video_url' -%}\n" + # " {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n" + # " {{- '<|video|>' + video_val -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" " {%- else -%}\n" " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" " {%- endif -%}\n" @@ -5112,6 +5142,8 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- endif -%}\n" " {%- endfor -%}\n" " {%- endif -%}\n" + "\n" + " {%- set captured_content -%}\n" " {%- if message['content'] is string -%}\n" " {%- if role == 'model' -%}\n" " {{- strip_thinking(message['content']) -}}\n" @@ -5130,28 +5162,35 @@ class Gemma4ChatHandler(MTMDChatHandler): " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" " {{- '<|image|>' + url_val -}}\n" " {%- set ns.prev_message_type = 'image' -%}\n" - " {%- elif item['type'] == 'audio_url' -%}\n" - " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- set ns.prev_message_type = 'audio' -%}\n" - " {%- elif item['type'] == 'input_audio' -%}\n" - " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n" + " {%- if item['type'] == 'audio_url' -%}\n" + " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif item['type'] == 'input_audio' -%}\n" + " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " 
{%- endif -%}\n" " {%- set ns.prev_message_type = 'audio' -%}\n" + " {%- endif -%}\n" # " {%- elif item['type'] == 'video_url' -%}\n" # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" # " {{- '<|video|>' + video_val -}}\n" # " {%- set ns.prev_message_type = 'video' -%}\n" - " {%- endif -%}\n" " {%- endfor -%}\n" " {%- endif -%}\n" + " {%- endset -%}\n" + "\n" + " {{- captured_content -}}\n" + " {%- set has_content = captured_content | trim | length > 0 -%}\n" + "\n" " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" " {{- '<|tool_response>' -}}\n" - " {%- elif not (ns_tr_out.flag and not message.get('content')) -%}\n" + " {%- elif not (ns_tr_out.flag and not has_content) -%}\n" " {{- '\\n' -}}\n" " {%- endif -%}\n" " {%- endif -%}\n" "{%- endfor -%}\n" + "\n" "{%- if add_generation_prompt -%}\n" " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" " {{- '<|turn>model\\n' -}}\n" @@ -5180,7 +5219,7 @@ def __call__(self, **kwargs): self.extra_template_arguments["enable_thinking"] = self.enable_thinking # Set the stop token based on Gemma 4's format () - # generation_config.json: "eos_token_id": [ 1, 106, 50] + # generation_config.json: "eos_token_id": [1, 106, 50] kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] if self.verbose: From d154e63e2e916e734bee29b3926abe80a9923fce Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Jun 2026 15:42:15 +0800 Subject: [PATCH 487/518] docs(README): update MinerU2.5-Pro-2605-1.2B OCR model support and link Signed-off-by: JamePeng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c605c94542..433e031ae0 100644 --- a/README.md +++ b/README.md @@ -1034,6 +1034,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [lfm2-vl](https://huggingface.co/LiquidAI/LFM2-VL-3B-GGUF) | `LFM2VLChatHandler` | 
`lfm2-vl` | | [lfm2.5-vl](https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-GGUF) | `LFM25VLChatHandler` | `lfm2.5-vl` | | [deepseek-ocr](https://huggingface.co/JamePeng2023/DeepSeek-OCR-2-GGUF) | `MTMDChatHandler` | `None` | +| [mineru2.5-pro](https://huggingface.co/JamePeng2023/MinerU2.5-Pro-2605-1.2B-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [paddleocr-vl-1.5](https://huggingface.co/JamePeng2023/PaddleOCR-VL-1.5-GGUF) | `PaddleOCRChatHandler` | `paddleocr` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [qwen3-asr](https://huggingface.co/JamePeng2023/Qwen3-ASR-1.7B-GGUF) | `Qwen3ASRChatHandler` | `qwen3-asr` | From db8292d336ae1e708623792426481c414754353e Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Jun 2026 17:15:32 +0800 Subject: [PATCH 488/518] Update Submodule vendor/llama.cpp 6b80c74..f71af35 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6b80c74f28..f71af352a5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 6b80c74f285390368b3c99c5e750f19e9b096e98 +Subproject commit f71af352a52b8efe824c7a698d0632afa4794c01 From 12861b918f67b62f78f28c5cabb7223f766e1097 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 7 Jun 2026 18:08:31 +0800 Subject: [PATCH 489/518] Bump version to 0.3.40-Milestone - Reasoning Budget Control, Gemma 4 12B Support, Enhanced Jinja2ChatFormatter, NGram k/k4v Speculative Decoding, Faster Native Sampling and Multimodal Improvements Signed-off-by: JamePeng --- CHANGELOG.md | 304 +++++++++++++++++++++++++++++++++++++++++- llama_cpp/__init__.py | 2 +- 2 files changed, 304 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8ebb5cd3e..1865195db3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,308 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 
[Unreleased] +## [0.3.40-Milestone] Reasoning Budget Control, Gemma 4 12B Support, Enhanced Jinja2ChatFormatter, NGram k/k4v Speculative Decoding, Faster Native Sampling and Multimodal Improvements + +- feat(internals): Add `ReasoningBudgetSampler` support + - Add Python-backed `ReasoningBudgetSampler` for first reasoning-block control + - Install the sampler before probability filters to preserve forced end tokens + - Support `reasoning_budget` **-1/0/N** semantics in sampling params + - Force `reasoning_budget_message` + `reasoning_end` when the budget is exhausted + - Add manual `force_reasoning_budget()` at the sampling-context level + - Match llama.cpp force behavior by allowing only `COUNTING -> FORCING` + - Keep DONE as permanent passthrough and ignore later reasoning tags + - Support prefilled reasoning starts with `reasoning_start_in_prompt` + - Preserve UTF-8 boundary safety before forcing the end sequence + - Keep Python-backed custom sampler callbacks alive across C sampler usage + - Avoid shallow-copying custom_samplers when cloning sampler chains + - Add `verbose` parameter to `ReasoningBudgetSampler` to print high-level + state transitions to stderr. + - Log key events: initialization, `reasoning_start matched`, `budget exhausted`, + `forced end sequence`, `UTF-8 boundary waiting`, `manual force`, `natural end`, `reset`. + - Pass `verbose=getattr(model, "verbose", False)` from `LlamaSamplingContext` + when building the sampler chain. + - Preserve verbose flag when cloning the sampler. 
+ +- feat(Llama): pass `reasoning budget` params through Llama APIs + - Add `reasoning budget` params to public completion and chat entry points + - Forward the params from chat handlers into `create_completion` + - Propagate reasoning budget controls down to `generate` and `sampling params` + - Document -1/0/N reasoning_budget behavior in completion docstrings + - Support custom `reasoning_start` and `reasoning_end` tags without model-specific inference + - Support `reasoning_budget_message` and `reasoning_start_in_prompt` + - Wire `MTMD chat handler` to the same reasoning budget controls + +- feat(sampling): add reasoning budget configurations + * Introduce reasoning budget and block control parameters to `LlamaSamplingParams` + to mirror llama.cpp CLI semantics. This includes: + - `reasoning_budget` + - `reasoning_start` / `reasoning_end` + - `reasoning_budget_message` + - `reasoning_start_in_prompt` + - `reasoning_start_max_tokens` + - Fix typo from typ_p to typical_p in logs + - Also updated `print_params()` to include these new metrics. + +- feat: add `ReasoningBudgetState` enum and `TokenMatcher` helper class to _internals.py + * Introduce `ReasoningBudgetState` enum and `TokenMatcher` helper class + to `_internals.py`. This lays the groundwork for the upcoming + `ReasoningBudgetSampler`, mirroring the state machine defined in + `common/reasoning-budget.h`. + + - `ReasoningBudgetState`: Tracks the lifecycle of the first reasoning block. + - `TokenMatcher`: Handles incremental matching for multi-token sequences. 
+ +- docs(README): document reasoning budget sampler usage + - Add README section for first reasoning-block budget control + - Document reasoning_budget -1/0/N semantics and related sampler parameters + - Explain reasoning_budget_message injection before reasoning_end + - Add examples for default tags, Mistral [THINK] tags, and Gemma4 channel tags + - Clarify when to use reasoning_start_in_prompt for prefilled thinking tags + - Note that reasoning_start_in_prompt is not a generic thinking-enabled switch + - Mention verbose transition logs for reasoning-budget state changes + - docs(README): Update ReasoningBudgetSampler quick link + +- feat(chat-format): Update `google/gemma-4` chat template jinja + +- feat(llama): enhance chat template initialization with full special tokens + * Update Llama.__init__ to register additional tokenizer special tokens + and improve stop token handling for chat templates. + + - Expose extra special tokens (EOT, SEP, NL, PAD, MASK) via + `special_tokens_map` to Jinja2ChatFormatter. + - Keep BOS and EOS tokens as explicit parameters, no longer redundantly + put them in `special_tokens_map`. + - Build `stop_token_ids` once, including EOS and EOT tokens, skipping + invalid (-1) ids. + - Update try-block comment: now `{% generation %}` blocks are supported, + guard only against malformed or model-specific templates. + - This ensures better compatibility with HuggingFace-style chat templates + while maintaining llama-cpp-python prompt-rendering behavior. + +- **feat(chat-format): improve Jinja2ChatFormatter HF compatibility** + * Enhance Jinja2ChatFormatter to better support HuggingFace-style chat + templates while keeping the formatter lightweight and aligned with + llama-cpp-python's prompt-rendering needs. + + - Key changes: + - Add IgnoreGenerationTags Jinja extension for HF `{% generation %}` blocks. + - Enable Jinja loop controls for chat templates using break/continue. + - Register Transformers-compatible `tojson` behavior. 
+ - Register `raise_exception` and `strftime_now` as Jinja globals. + - Add `special_tokens_map` support for additional template variables. + - Add optional `documents` argument for document-aware templates. + - Precompute text stop sequences and token-id stopping criteria. + - Improve type normalization for `stop_token_ids`. + - Expand docstrings for formatter initialization and render-time variables. + +- docs(wiki): update SCHEMA.md to v0.4 with full wiki path layout + - Added comprehensive docs/wiki/ directory structure overview. + - Reorganized modules description; removed hardcoded module page list. + - Clarified top-level file purposes and update guidance. + - Updated page type examples and templates (Class/Module, Feature, Example, Development). + - Strengthened cross-linking rules and update/placeholder guidance. + - Bumped schema version from 0.3 → 0.4 and last_modified date. + +- docs(install): add source-aligned build and backend guide + * Document installation workflows for llama-cpp-python with a focus on + the underlying llama.cpp CMake build configuration. + - Add virtual environment, source install, editable install, rebuild, and + verification guidance. + - Document common CMake options such as GGML_NATIVE, + GGML_BACKEND_DL, GGML_CPU_ALL_VARIANTS, and compiler selection. + - Summarize backend-specific build flags for CUDA, BLAS, Metal, Vulkan, + OpenVINO, HIP, SYCL, OpenCL, CANN, ZenDNN, and zDNN. + - Include backend runtime notes and common installation pitfalls while + keeping server-related installation content out of the page. + - docs(wiki): link installation guide from index + * Promote the completed installation guide into the wiki entry point so + new users can find build and backend setup instructions before reading + API-specific documentation. + - Add a Getting Started section that links to install.md. + - Move installation to the top of the recommended reading order. + - Mark install.md as an available page. 
+ - Remove installation from the planned documentation areas. + - docs(readme): link detailed installation wiki guide + +- feat(mtmd): improve fallback chat template for multimodal models + - Add BOS/EOS token handling to the default MTMD chat format. + - Use a clearer role-based template with explicit USER and ASSISTANT prefixes. + - Append a newline after each message to keep generated prompts readable. + - Treat EOS as the end marker for the serialized conversation history before + the optional generation prompt. + - Improve fallback behavior for multimodal GGUF models that do not provide a + chat template, such as OCR-oriented models like `DeepSeek-OCR 1/2`. + - Make the default system prompt a single normalized string while preserving + its original meaning. + - Clean up minor formatting around MTMD context parameter initialization. + - docs(Readme): Update `Deepseek-OCR-2-GGUF` Link + - docs(README): update `MinerU2.5-Pro-2605-1.2B` OCR model support and link + + This improves prompt compatibility for multimodal models that either lack a + GGUF chat template or are not yet covered by a complete custom chat handler. + +- refactor(internals): align model metadata wrappers with llama.cpp API + - Use `llama_vocab_n_tokens()` instead of the old vocab size helper. + - Add Python wrappers for model description, size, chat template, and + trained RoPE frequency scaling. + - Clarify model capability helpers with docstrings matching llama.cpp + semantics. + - Rename `desc()` and `size()` to `model_desc()` and `model_size()` to + make their scope explicit. + - Drop the unused `get_tensor()` stub since llama.cpp does not expose it. + - Route rerank template lookup through `LlamaModel.model_chat_template()` for + consistency with the internal model abstraction. 
+ +- feat(chat_handler): update multimodal handlers for Qwen2.5-VL, Qwen3-VL, and PaddleOCR + - Update PaddleOCRChatHandler to support version 1.6 + - Add token configuration and stop sequences for Qwen2.5-VL and Qwen3-VL + - Standardize input_ids initialization in __call__ methods for Qwen2.5-VL, Qwen3-ASR, and Qwen3-VL handlers + +- **perf(eval): skip unnecessary logit array copies during native sampling** + * Introduce the `copy_logits` parameter to `Llama.eval()` to control + whether C-level logits are copied into the Python `self.scores` array. + - Automatically disable `copy_logits` during the generation loop unless + Python-side hooks (`logits_processor`, `stopping_criteria`) or + `logits_all` explicitly require them. + - Skip logit copies entirely for intermediate prompt evaluations (e.g., + before hybrid checkpoints). + - Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch + the final token's logits when copying is required. + + In a PDF-reading summarization workload, this reduced the end-to-end completion + time from 41.32s to 25.93s, a ~37.2% improvement. The main generation hot path + also improved noticeably: + + - `_create_completion`: 41.32s -> 25.93s + - `generate`: 37.82s -> below the top sampled entries + - `eval`: 35.14s -> 21.96s + - logits retrieval/copy path: 29.89s `get_logits()` -> 18.68s `get_logits_ith()` + - `decode`: 3.89s -> 2.25s + - `detokenize`: 2.60s -> 1.33s + - `sample`: 2.35s -> 2.03s + + This significantly reduces CPU overhead and memory bandwidth during generation, + as the native `llama.cpp` sampler reads directly from the C context without + needing to expose the `n_vocab` array to Python on every token. 
+ +- docs(CUDA): Add note about PDL optimization for newer NVIDIA GPUs (CC ≥ 90) + +- docs(readme/wiki): update supported embeddings models table + - Add `jina-embeddings-v2-base-zh` + - Add `jina-embeddings-v3` + - Minor table formatting clean up + +- docs(development): add AI agent prompt for git commit generation + * Introduce `git-commit-generation-agent.md` to the development wiki to + standardize the creation of high-quality git commit messages using LLM + assistants. + + - Define the system persona, core principles (Conventional Commits, DCO), + and strict formatting rules for generating commits. + - Provide concrete template examples for build, performance, and + documentation updates. + - Ensure future maintainers and contributors can easily generate + consistent, maintainer-level commits that explicitly explain the "Why" + and "How" of code changes. + +- docs(wiki): add development helper to index + * Introduce the development section in the wiki index so maintainer-facing + workflows and LLM-assisted helper tools are discoverable from the main + navigation. + + - Add a Development section with a link to the Git commit generation agent. + Include the helper in the recommended reading order for new wiki users. + - Add development/git-commit-generation-agent.md to the available pages list. + +- feat(LlamaContext): add safety checks and docstrings to logits retrieval + - Add explicit null pointer validation to `get_logits` and `get_logits_ith`. + These methods now raise a `RuntimeError` instead of silently returning + invalid pointers when logits are unavailable or the index is out of bounds. + - Add comprehensive docstrings to both methods, detailing the underlying + buffer shape and memory layout. + - Include a performance warning in `get_logits_ith` about the internal + synchronization/reordering overhead to discourage its use on the hot path. 
+ +- **feat(speculative): upgrade ngram map decoder with k/k4v modes +Enhance `LlamaNGramMapDecoding` to align with the upstream llama.cpp +ngram-map algorithm, offering better memory management and draft quality.** + - Introduce `mode` selection ("k" and "k4v"): "k" stores only historical + positions for memory efficiency, while "k4v" caches continuation values + directly for faster lookups. + - Add `min_hits` threshold to filter out low-confidence drafts. + - Implement `max_entries_per_key` to cap dictionary growth and prevent + memory bloat during long-context generations. + - Improve state synchronization (`_sync_and_index`) using `sync_check_tokens` + to safely verify incremental history appends. + - Add explicit lifecycle management methods (`clear`, `close`, `accept`) + for better API symmetry and resource cleanup. + - examples: add benchmark script for speculative decoding + - Add `benchmark_speculative.py` to the `examples/benchmark` directory. + - Test `LlamaPromptLookupDecoding` and `LlamaNGramMapDecoding` (k/k4v). + - Include diverse test scenarios (code, JSON logs, tables, essays) to + measure tokens-per-second (TPS) speedup compared to baseline generation. + +- docs(speculative): update wiki for NGramMap k/k4v modes and lifecycle APIs +Reflect the recent architectural upgrades to `LlamaNGramMapDecoding` in +the official documentation. + + - Document the new `__init__` parameters (`mode`, `min_hits`, + `max_entries_per_key`, `sync_check_tokens`) and their validation rules. + - Add a detailed comparison table explaining the memory and behavior + differences between the `"k"` and `"k4v"` lookup modes. + - Document the newly exposed lifecycle methods (`clear`, `close`, `accept`). + - Add comprehensive usage examples demonstrating `k4v` mode with memory caps. + - Update internal state descriptions (replacing `_ngram_map` with `_map_k` + and `_map_k4v`). 
+ - Add a strong production warning against the legacy `LlamaPromptLookupDecoding` + and cross-link the new `benchmark_speculative.py` script. + +- docs(readme): revamp speculative decoding documentation +Expand the Speculative Decoding section to fully document the +new `LlamaNGramMapDecoding` capabilities and configuration options. + + - Clarify that `LlamaNGramMapDecoding` is a model-free prompt lookup + decoder that does not require a secondary GGUF draft model. + - Add a detailed parameter table explaining `mode` (k vs. k4v), + `min_hits`, memory caps, and sync thresholds. + - Provide usage examples and tuning recommendations for different + hardware (e.g., lowering `num_pred_tokens` for CPU setups). + - Demote the older `LlamaPromptLookupDecoding` to a legacy section, + warning about its sliding-window overhead on long contexts. + - Add practical notes on performance and state management (`clear()`). + +- docs(readme): Removed outdated macOS installation guides and added the latest installation notes. + +- docs(readme): Add Windows ROCm build instructions(by **@0xDELUXA**) + - Optimize the formatting of the ROCm section in README.md. + +- fix: wire LFM VL chat handlers into server loader(by **@JayAnderson360**) + +- build(cmake): disable building of upstream unified binary + - Set `LLAMA_BUILD_APP` to `OFF` to prevent the compilation of the new + unified `llama` binary introduced in upstream llama.cpp. + + - Since the Python package only requires the underlying shared libraries + and specific targets, explicitly disabling the standalone application + reduces build times and prevents unnecessary executable artifacts from + being compiled. + +- build(deps): align Jinja2 minimum with Transformers + - Require Jinja2 >= 3.1.0 for HuggingFace-style chat template support. + + - The updated Jinja2ChatFormatter relies on behavior aligned with Transformers' + chat-template runtime, which also requires Jinja2 3.1 or newer. 
Updating the + minimum dependency avoids parser/runtime differences with older Jinja versions. + +- ci : update metal build/test job to macos-26/macos-15-intel + - Build on the Tahoe runners in order to enable the tensor API for M5 and A19. + +- feat: Update llama.cpp to [ggml-org/llama.cpp/commit/f71af352a52b8efe824c7a698d0632afa4794c01](https://github.com/ggml-org/llama.cpp/commit/f71af352a52b8efe824c7a698d0632afa4794c01) + +- feat: Sync llama.cpp llama/mtmd/ggml API Binding 20260606 + +More information see: https://github.com/JamePeng/llama-cpp-python/compare/a778c57d73ec7d4f43e2518a513e7d4cf68a0df8...db8292d336ae1e708623792426481c414754353e + ## [0.3.39] Dynamic GGML Backends, Qwen3-ASR/MiniCPM-V-4.6, On-Device Hybrid Checkpoint, and Granular Logging - **ci(cu131/128/126/124): build wheels with GGML dynamic backends for windows/Linux** @@ -513,7 +815,7 @@ This commit significantly overhauls the media parsing and loading pipeline in `M - feat: Update llama.cpp to [ggml-org/llama.cpp/commit/f5ddcd1696eca5069dc7915f4d4c03c9a709afea](https://github.com/ggml-org/llama.cpp/commit/f5ddcd1696eca5069dc7915f4d4c03c9a709afea) -## [0.3.30] Milestone Release +## [0.3.30-Milestone] Milestone Release I will update the release notes for version 0.3.30 in the [discussion](https://github.com/JamePeng/llama-cpp-python/discussions). 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index ec28faae66..1650e6af69 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.39" +__version__ = "0.3.40" From b1ad4452e24baac561e75b254192ebf55f1fbd3c Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 01:03:44 +0800 Subject: [PATCH 490/518] Update Submodule vendor/llama.cpp f71af35..f0156d1 Signed-off-by: JamePeng --- llama_cpp/llama.py | 1 + llama_cpp/llama_cpp.py | 7 +++++++ vendor/llama.cpp | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2bab3709e6..d89f4c361d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -503,6 +503,7 @@ def __init__( self.context_params.n_threads_batch = self.n_threads_batch self.context_params.ctx_type = ctx_type + self.context_params.ctx_other = None self.context_params.rope_scaling_type = ( rope_scaling_type if rope_scaling_type is not None diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 9c911bcb14..1e81d80f65 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -898,6 +898,9 @@ class llama_sampler_seq_config(ctypes.Structure): # // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) # struct llama_sampler_seq_config * samplers; # size_t n_samplers; +# // a source/target/parent context +# // can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts +# struct llama_context * ctx_other; # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -945,6 +948,8 @@ class llama_context_params(ctypes.Structure): samplers(llama_sampler_seq_config *): the samplers must be sampler chains (i.e. 
use llama_sampler_chain_init) n_samplers(size_t): numbers of sampler chains + + ctx_other(llama_context *): a source/target/parent context can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts """ if TYPE_CHECKING: @@ -983,6 +988,7 @@ class llama_context_params(ctypes.Structure): kv_unified:bool samplers: ctypes.c_void_p n_samplers: int + ctx_other: ctypes.c_void_p _fields_ = [ ("n_ctx", ctypes.c_uint32), @@ -1020,6 +1026,7 @@ class llama_context_params(ctypes.Structure): ("kv_unified", ctypes.c_bool), ("samplers", llama_sampler_seq_config_p), ("n_samplers", ctypes.c_int), + ("ctx_other", ctypes.c_void_p), ] llama_context_params_p = ctypes.POINTER(llama_context_params) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f71af352a5..f0156d1401 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f71af352a52b8efe824c7a698d0632afa4794c01 +Subproject commit f0156d1401500512ad85042ccf38970568b12253 From 7a8272e6b928974efc8c131d518b1363e2e47263 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 01:24:53 +0800 Subject: [PATCH 491/518] feat(_ctypes_extensions): improve error diagnostics for shared library loading When `load_shared_library` fails, the resulting `RuntimeError` now includes a listing of the contents of the searched directories. This provides immediate context to help developers diagnose missing, misplaced, or incorrectly named library files. - Added `_format_library_dir_contents` to safely format directory listings. - Appended the directory listing to the failure message. - Confined this diagnostic work strictly to the failure path to avoid any performance overhead during successful imports. 
Signed-off-by: JamePeng --- llama_cpp/_ctypes_extensions.py | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index a8936fa2bf..1a9f8eb8c5 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -18,6 +18,37 @@ ) from typing_extensions import TypeAlias +def _format_library_dir_contents(base_paths: list[pathlib.Path]) -> str: + """Format directory contents for diagnostics after library loading fails.""" + sections = [] + + for base_path in base_paths: + p = pathlib.Path(base_path) + + if not p.exists(): + sections.append(f"{p}: <not found>") + continue + + if not p.is_dir(): + sections.append(f"{p}: <not a directory>") + continue + + try: + # Only list files when reporting a final loading failure. + files = sorted(x.name for x in p.iterdir()) + except Exception as e: + sections.append(f"{p}: <failed to list: {e}>") + continue + + if files: + sections.append( + f"{p}:\n" + + "\n".join(f" - {name}" for name in files) + ) + else: + sections.append(f"{p}: <empty>") + + return "\n".join(sections) # Load the library def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list[pathlib.Path]]): @@ -114,9 +145,12 @@ def load_shared_library(lib_base_name: str, base_paths: Union[pathlib.Path, list except Exception as e: errors.append(f"{lib_path}: {e}") + # Include directory contents only in the failure path to avoid extra work during successful imports.
raise RuntimeError( f"Failed to load '{lib_base_name}' from {base_paths}\n" + "\n".join(errors) + + "\nLibrary search path contents:\n" + + _format_library_dir_contents(base_paths) ) From 323da373ad2f30409123bfba8322041113f0eba8 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 23:08:36 +0800 Subject: [PATCH 492/518] build(CMakelists): Improve Windows LLVM OpenMP runtime discovery - Also improve diagnostics by reporting the selected runtime source and path, warning when an explicit override points to a missing file, and keeping a clear runtime warning when no OpenMP DLL can be found. Signed-off-by: JamePeng --- CMakeLists.txt | 79 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f09cdb783..1ace43c4aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,8 @@ function(llama_cpp_python_install_target target) endfunction() -# Install an extra Windows runtime DLL into the Python package runtime directory. +# Copy an extra Windows runtime DLL into the Python package runtime directory +# during the CMake install step. # # Some dynamically loaded backend libraries depend on runtime DLLs that are not # always discoverable through $. 
One important example @@ -75,7 +76,10 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file) endif() if(NOT EXISTS "${runtime_file}") - message(WARNING "Windows runtime file does not exist and will not be installed: ${runtime_file}") + message(WARNING + "Windows runtime DLL was selected but does not exist and will not be copied: " + "${runtime_file}" + ) return() endif() @@ -92,6 +96,11 @@ function(llama_cpp_python_install_windows_runtime_file runtime_file) foreach(DIR ${INSTALL_DIRS}) file(TO_CMAKE_PATH "${DIR}" DIR_CMAKE) + message(STATUS + "Will copy Windows runtime DLL during install: " + "${runtime_file_cmake} -> ${DIR_CMAKE}" + ) + install( FILES "${runtime_file_cmake}" DESTINATION "${DIR_CMAKE}" @@ -115,42 +124,62 @@ function(llama_cpp_python_install_windows_openmp_runtime) endif() set(OPENMP_RUNTIME_DLL "") + set(OPENMP_RUNTIME_SOURCE "") + set(FOUND_OPENMP_DLLS "") + + if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL) + if(EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "LLAMA_CPP_OPENMP_RUNTIME_DLL") + else() + message(WARNING + "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: " + "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio " + "LLVM OpenMP runtime discovery." 
+ ) + endif() + endif() - if(DEFINED LLAMA_CPP_OPENMP_RUNTIME_DLL AND EXISTS "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") - set(OPENMP_RUNTIME_DLL "${LLAMA_CPP_OPENMP_RUNTIME_DLL}") - else() + if(NOT OPENMP_RUNTIME_DLL) file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) - set(VS_OPENMP_SEARCH_ROOTS - "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" - "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" - "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC" - "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC" - ) + set(VS_OPENMP_SEARCH_PATTERNS + # Prefer the exact VS 2022 Enterprise / BuildTools LLVM OpenMP redist layout. + "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - foreach(ROOT ${VS_OPENMP_SEARCH_ROOTS}) - if(EXISTS "${ROOT}") - file( - GLOB_RECURSE FOUND_OPENMP_DLLS - "${ROOT}/*/debug_nonredist/x64/Microsoft.VC*.OpenMP.LLVM/libomp140.x86_64.dll" - "${ROOT}/**/libomp140.x86_64.dll" - ) + # Keep these as secondary fallbacks for non-standard installs. 
+ "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" + "C:/Windows/System32/libomp140.x86_64.dll" + ) - if(FOUND_OPENMP_DLLS) - list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) - break() - endif() - endif() + foreach(PATTERN ${VS_OPENMP_SEARCH_PATTERNS}) + file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}") + list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS}) endforeach() + + if(FOUND_OPENMP_DLLS) + list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS) + list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING) + list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) + set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 LLVM OpenMP redist fallback") + endif() endif() if(OPENMP_RUNTIME_DLL) - message(STATUS "Installing Windows LLVM OpenMP runtime: ${OPENMP_RUNTIME_DLL}") + message(STATUS + "Selected Windows LLVM OpenMP runtime from ${OPENMP_RUNTIME_SOURCE}: " + "${OPENMP_RUNTIME_DLL}" + ) llama_cpp_python_install_windows_runtime_file("${OPENMP_RUNTIME_DLL}") else() message(WARNING - "Could not find libomp140.x86_64.dll. " + "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. " + "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL and Visual Studio 2022 " + "Enterprise/BuildTools redist paths under Program Files and Program Files (x86), " + "with a fuzzy MSVC version match such as 14.44.35112 or 14.44.35207. " "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " "the packaged ggml-cpu-*.dll files may fail to load at runtime. 
" "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " From 111819832614d488c840b266ad95f894f420bfea Mon Sep 17 00:00:00 2001 From: JamePeng Date: Mon, 8 Jun 2026 23:48:49 +0800 Subject: [PATCH 493/518] ci(test): add cuda 13.0.2 build workflow Signed-off-by: JamePeng --- .github/workflows/build-wheels-cu130-win.yml | 249 +++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 .github/workflows/build-wheels-cu130-win.yml diff --git a/.github/workflows/build-wheels-cu130-win.yml b/.github/workflows/build-wheels-cu130-win.yml new file mode 100644 index 0000000000..790d7c9665 --- /dev/null +++ b/.github/workflows/build-wheels-cu130-win.yml @@ -0,0 +1,249 @@ +name: Build Wheels (CU130) for Windows + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build_wheels: + name: Build Wheel ${{ matrix.os }} py${{ matrix.pyver }} cu130 + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: ["windows-2022"] + pyver: ["3.10", "3.11", "3.12", "3.13", "3.14"] + cuda: ["13.0.2"] + cudaarch: ["75-real;80-real;86-real;87-real;89-real;90-real;100-real;120-real"] + + defaults: + run: + shell: pwsh + + env: + CUDAVER: ${{ matrix.cuda }} + CUDAARCHVER: ${{ matrix.cudaarch }} + MAX_JOBS: 12 + + steps: + - name: Add MSBuild to PATH + uses: microsoft/setup-msbuild@v3 + with: + msbuild-architecture: x64 + + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Inspect Visual Studio OpenMP runtime paths + run: | + Write-Output "ProgramFiles=$env:ProgramFiles" + Write-Output "ProgramFiles(x86)=${env:ProgramFiles(x86)}" + Write-Output "" + + $vsRoots = @( + "$env:ProgramFiles\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "$env:ProgramFiles\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC", + "${env:ProgramFiles(x86)}\Microsoft Visual Studio\2022\BuildTools\VC\Redist\MSVC" + ) + 
+ foreach ($root in $vsRoots) { + Write-Output "Checking root: $root" + + if (Test-Path $root) { + Write-Output " Exists: yes" + Write-Output " MSVC version directories:" + + Get-ChildItem $root -Directory -ErrorAction SilentlyContinue | + Sort-Object Name | + ForEach-Object { + Write-Output " $($_.FullName)" + } + + Write-Output " OpenMP runtime candidates:" + + Get-ChildItem $root -Recurse -Filter "libomp140.x86_64.dll" -ErrorAction SilentlyContinue | + Sort-Object FullName | + ForEach-Object { + $sizeKB = [Math]::Round($_.Length / 1KB, 2) + $sizeMB = [Math]::Round($_.Length / 1MB, 4) + + Write-Output " Path: $($_.FullName)" + Write-Output " Size: $($_.Length) bytes / $sizeKB KB / $sizeMB MB" + } + } else { + Write-Output " Exists: no" + } + + Write-Output "" + } + + Write-Output "Checking System32 fallback:" + $system32OpenMP = "C:\Windows\System32\libomp140.x86_64.dll" + + if (Test-Path $system32OpenMP) { + $dll = Get-Item $system32OpenMP + $sizeKB = [Math]::Round($dll.Length / 1KB, 2) + $sizeMB = [Math]::Round($dll.Length / 1MB, 4) + + Write-Output " Path: $($dll.FullName)" + Write-Output " Size: $($dll.Length) bytes / $sizeKB KB / $sizeMB MB" + } else { + Write-Output " Not found: $system32OpenMP" + } + + - name: Install CUDA ${{ matrix.cuda }} + uses: Jimver/cuda-toolkit@v0.2.35 + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda }} + use-github-cache: false + + - name: Install uv and Python ${{ matrix.pyver }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.pyver }} + activate-environment: true + enable-cache: true + + - name: Install dependencies + run: | + git config --system core.longpaths true + uv pip install --upgrade build setuptools wheel packaging + + - name: Setup MSVC environment for nvcc + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + echo PATH=%PATH%>>%GITHUB_ENV% + echo INCLUDE=%INCLUDE%>>%GITHUB_ENV% + echo LIB=%LIB%>>%GITHUB_ENV% + echo 
LIBPATH=%LIBPATH%>>%GITHUB_ENV% + + - name: Build wheel + run: | + $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '') + + $env:CUDA_HOME = $env:CUDA_PATH + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CUDA_PATH + $env:VERBOSE = '1' + + # Force CMake to use Ninja + LLVM/Clang instead of the default + # Visual Studio generator. MSVC skips several GGML CPU all-variant + # backends, such as ivybridge, piledriver, cooperlake, zen4, and + # sapphirerapids. + $env:CMAKE_GENERATOR = 'Ninja Multi-Config' + + $toolchainCandidates = @( + (Join-Path $env:GITHUB_WORKSPACE "vendor\llama.cpp\cmake\x64-windows-llvm.cmake"), + (Join-Path $env:GITHUB_WORKSPACE "cmake\x64-windows-llvm.cmake") + ) + + $toolchainFile = $toolchainCandidates | + Where-Object { Test-Path $_ } | + Select-Object -First 1 + + if (!$toolchainFile) { + Write-Error "Toolchain file not found. Checked: $($toolchainCandidates -join ', ')" + exit 1 + } + + $toolchainFile = $toolchainFile.Replace('\', '/') + Write-Output "Using toolchain file: $toolchainFile" + + # Build one CUDA wheel with dynamic GGML backends: + # - GGML_BACKEND_DL enables runtime-loadable backend DLLs. + # - GGML_CPU_ALL_VARIANTS builds CPU variant DLLs such as ggml-cpu-x64, + # ggml-cpu-haswell, ggml-cpu-alderlake, etc. + # - GGML_NATIVE=OFF avoids binding the wheel to the runner CPU. 
+ + # Suppress CUDA compiler warnings + $cudaDiagSuppress = '--diag-suppress=177,221,550' + + $cmakeArgs = @( + # Windows toolchain / common runtime + "-DCMAKE_TOOLCHAIN_FILE=$toolchainFile" + '-DLLAMA_BUILD_BORINGSSL=ON' + + # Disable non-wheel targets + '-DLLAMA_BUILD_EXAMPLES=OFF' + '-DLLAMA_BUILD_TESTS=OFF' + '-DLLAMA_BUILD_TOOLS=OFF' + '-DLLAMA_BUILD_SERVER=OFF' + '-DLLAMA_BUILD_UI=OFF' + '-DLLAMA_USE_PREBUILT_UI=OFF' + '-DLLAMA_CURL=OFF' + + # GGML dynamic backend layout + '-DGGML_CPU=ON' + '-DGGML_CUDA=ON' + '-DGGML_NATIVE=OFF' + '-DGGML_BACKEND_DL=ON' + '-DGGML_CPU_ALL_VARIANTS=ON' + '-DGGML_OPENMP=ON' + + # CUDA backend + "-DCMAKE_CUDA_ARCHITECTURES=$env:CUDAARCHVER" + '-DGGML_CUDA_FORCE_MMQ=ON' + '-DCUDA_SEPARABLE_COMPILATION=ON' + "-DCMAKE_CUDA_FLAGS=$cudaDiagSuppress" + + # Build behavior + "-DCMAKE_BUILD_PARALLEL_LEVEL=$env:MAX_JOBS" + '-DENABLE_CCACHE=ON' + ) + + $env:CMAKE_ARGS = $cmakeArgs -join ' ' + Write-Output "CMAKE_ARGS=$env:CMAKE_ARGS" + + python -m build --wheel + + # Check if wheel was built + if (!(Test-Path '.\dist\*.whl')) { + Write-Error "No wheel built in dist/ directory" + exit 1 + } + + $wheelFile = Get-Item '.\dist\*.whl' | Select-Object -First 1 + + # Wheel filename format: + # name-version-python_tag-abi_tag-platform_tag.whl + $parts = $wheelFile.Name.Split('-') + $distName = $parts[0] + $version = $parts[1] + $pyTag = $parts[2] + $abiTag = $parts[3] + $platTag = $parts[4] + + # CPU all-variants is now an internal runtime layout detail.
+ $newVersion = "$version+cu$cudaVersion" + $newName = "$distName-$newVersion-$pyTag-$abiTag-$platTag" + + # Rename wheel file + Rename-Item -Path $wheelFile.FullName -NewName $newName + Write-Output "Renamed wheel to: $newName" + + # Write the build tag to the output + Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + Write-Output "TAG_VERSION=$version" >> $env:GITHUB_ENV + + - name: Get current date + id: get-date + run: | + $currentDate = Get-Date -UFormat "%Y%m%d" + Write-Output "BUILD_DATE=$currentDate" >> $env:GITHUB_ENV + + - name: Create release + if: always() && env.TAG_VERSION != '' + uses: softprops/action-gh-release@v3 + with: + files: dist/* + # Set tag_name to v-cu-win- + tag_name: v${{ env.TAG_VERSION }}-cu${{ env.CUDA_VERSION }}-win-${{ env.BUILD_DATE }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 7a6ee9fcd57438a950eb2ee6c8e079f2409c2765 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 00:33:10 +0800 Subject: [PATCH 494/518] =?UTF-8?q?build(CMakeLists):=20prefer=20VS=202022?= =?UTF-8?q?=20VC143=20OpenMP=20redist=20and=20keep=20System32=20as=20final?= =?UTF-8?q?=20fallback=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: JamePeng --- CMakeLists.txt | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ace43c4aa..5b2cfeeb8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,7 +135,7 @@ function(llama_cpp_python_install_windows_openmp_runtime) message(WARNING "LLAMA_CPP_OPENMP_RUNTIME_DLL was set, but the file does not exist: " "${LLAMA_CPP_OPENMP_RUNTIME_DLL}. Falling back to Visual Studio " - "LLVM OpenMP runtime discovery." + "VC143 LLVM OpenMP runtime discovery." 
) endif() endif() @@ -144,18 +144,19 @@ function(llama_cpp_python_install_windows_openmp_runtime) file(TO_CMAKE_PATH "$ENV{ProgramFiles}" PROGRAMFILES_CMAKE) file(TO_CMAKE_PATH "$ENV{ProgramFiles\(x86\)}" PROGRAMFILES_X86_CMAKE) - set(VS_OPENMP_SEARCH_PATTERNS - # Prefer the exact VS 2022 Enterprise / BuildTools LLVM OpenMP redist layout. + set(VS_OPENMP_VC143_PATTERNS + # Prefer VS 2022 VC143 LLVM OpenMP redist paths. + # The MSVC version directory is intentionally globbed because + # GitHub runners may contain versions such as 14.44.35112 or 14.44.35207. "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - # Keep these as secondary fallbacks for non-standard installs. + # Secondary VS layout fallbacks for unusual installations. "${PROGRAMFILES_CMAKE}/Microsoft Visual Studio/2022/BuildTools/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" "${PROGRAMFILES_X86_CMAKE}/Microsoft Visual Studio/2022/Enterprise/VC/Redist/MSVC/*/debug_nonredist/x64/Microsoft.VC143.OpenMP.LLVM/libomp140.x86_64.dll" - "C:/Windows/System32/libomp140.x86_64.dll" ) - foreach(PATTERN ${VS_OPENMP_SEARCH_PATTERNS}) + foreach(PATTERN ${VS_OPENMP_VC143_PATTERNS}) file(GLOB PATTERN_OPENMP_DLLS "${PATTERN}") list(APPEND FOUND_OPENMP_DLLS ${PATTERN_OPENMP_DLLS}) endforeach() @@ -164,7 +165,16 @@ function(llama_cpp_python_install_windows_openmp_runtime) list(REMOVE_DUPLICATES FOUND_OPENMP_DLLS) list(SORT FOUND_OPENMP_DLLS COMPARE NATURAL ORDER DESCENDING) list(GET FOUND_OPENMP_DLLS 0 OPENMP_RUNTIME_DLL) - set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 LLVM OpenMP redist fallback") + set(OPENMP_RUNTIME_SOURCE "Visual Studio 2022 VC143 LLVM OpenMP redist") + endif() + endif() + + if(NOT OPENMP_RUNTIME_DLL) + 
set(SYSTEM32_OPENMP_RUNTIME_DLL "C:/Windows/System32/libomp140.x86_64.dll") + + if(EXISTS "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_DLL "${SYSTEM32_OPENMP_RUNTIME_DLL}") + set(OPENMP_RUNTIME_SOURCE "System32 fallback") endif() endif() @@ -177,9 +187,10 @@ function(llama_cpp_python_install_windows_openmp_runtime) else() message(WARNING "Could not find libomp140.x86_64.dll for Windows LLVM OpenMP. " - "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL and Visual Studio 2022 " - "Enterprise/BuildTools redist paths under Program Files and Program Files (x86), " - "with a fuzzy MSVC version match such as 14.44.35112 or 14.44.35207. " + "Searched LLAMA_CPP_OPENMP_RUNTIME_DLL, Visual Studio 2022 " + "Enterprise/BuildTools VC143 redist paths under Program Files and " + "Program Files (x86), with a fuzzy MSVC version match such as " + "14.44.35112 or 14.44.35207, and C:/Windows/System32 as a final fallback. " "If GGML_OPENMP=ON and GGML CPU backend DLLs are built with LLVM OpenMP, " "the packaged ggml-cpu-*.dll files may fail to load at runtime. 
" "Set LLAMA_CPP_OPENMP_RUNTIME_DLL to the full path of libomp140.x86_64.dll " From 50bbdd61fdf7e2e1cd7582a2183e476c98a47c17 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 02:54:00 +0800 Subject: [PATCH 495/518] Update Submodule vendor/llama.cpp f0156d1..7d2b45b Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f0156d1401..7d2b45b4f7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f0156d1401500512ad85042ccf38970568b12253 +Subproject commit 7d2b45b4f7b663cda74f23fbc3ce6dc3bd4f6545 From 55e855b75f901b494259a1c81b45ac80f0e3013f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 05:03:15 +0800 Subject: [PATCH 496/518] Update mtmd API 20260609 Signed-off-by: JamePeng --- llama_cpp/mtmd_cpp.py | 293 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 283 insertions(+), 10 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 4542555c65..61fb0e7859 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -10,12 +10,14 @@ c_uint8, c_int32, c_uint32, + c_int64, c_float, c_void_p, c_size_t, POINTER, _Pointer, # type: ignore Structure, + CFUNCTYPE ) import pathlib from typing import ( @@ -318,6 +320,16 @@ def mtmd_get_audio_sample_rate(ctx: mtmd_context_p) -> c_int: """ ... +# // get the current marker string +# MTMD_API const char * mtmd_get_marker(const mtmd_context * ctx); +@ctypes_function_mtmd( + "mtmd_get_marker", [mtmd_context_p_ctypes], c_char_p) +def mtmd_get_marker(ctx: mtmd_context_p) -> c_char_p: + """ + get the current marker string + """ + ... + # // mtmd_bitmap # // # // if bitmap is image: @@ -420,6 +432,58 @@ def mtmd_bitmap_set_id( ... 
+# // mtmd_bitmap lazy +# // +# // this is a special bitmap that: +# // - does not hold the actual data +# // - can be expanded into one or more chunks (either media to text chunks) +# // user must provide a callback to fill in the data when mtmd_tokenize() is called +# // this is useful for large video inputs: +# // - allow reading video frame by frame, without loading the entire video into memory +# // - allow tracking the whole video with a single ID (for example, the file hash) + +# // set (*out_bitmap) to non-nullptr to emit a bitmap chunk; it will be freed automatically +# // set (*out_text) to non-nullptr to emit a text chunk; it must be heap-allocated, null-terminated and will be freed automatically +# // either out_bitmap or out_text can be set, but not both +# // out_bitmap cannot be another lazy bitmap (no nested lazy allowed) +# // return value: +# // 0 on success +# // -1 on EOF (signal to mtmd_tokenize to move on) +# // -2 on error (signal to mtmd_tokenize to abort) +# typedef int(* mtmd_bitmap_lazy_callback)( +# size_t chunk_idx, +# void * user_data, +# mtmd_bitmap ** out_bitmap, +# char ** out_text); +mtmd_bitmap_lazy_callback = CFUNCTYPE( + c_int, + c_size_t, # chunk_idx + c_void_p, # user_data + POINTER(mtmd_bitmap_p), # mtmd_bitmap ** out_bitmap + POINTER(c_char_p), # char ** out_text +) + +# MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx, +# const char * id, // usually set to file hash +# void * user_data, +# mtmd_bitmap_lazy_callback callback); +@ctypes_function_mtmd( + "mtmd_input_chunks_get", [ + mtmd_context_p_ctypes, + c_char_p, + c_void_p, + mtmd_bitmap_lazy_callback, + ], mtmd_bitmap_p_ctypes) +def mtmd_input_chunks_get( + ctx: mtmd_context_p, + id: c_char_p, + user_data: c_void_p, + callback: mtmd_bitmap_lazy_callback, # type: ignore + /, +) -> mtmd_bitmap_p: + ... 
+ + # // mtmd_input_chunks # // # // this is simply a list of mtmd_input_chunk @@ -772,6 +836,9 @@ def mtmd_test_create_input_chunks() -> mtmd_input_chunk_p: # // BREAKING CHANGES are expected. # // +# struct mtmd_helper_video; +mtmd_helper_video_p = NewType("mtmd_helper_video_p", int) +mtmd_helper_video_p_ctypes = c_void_p # // Set callback for all future logging events. # // If this is not called, or NULL is supplied, everything is output on stderr. @@ -786,11 +853,33 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # ... +# // Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). +# MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx); +@ctypes_function_mtmd( + "mtmd_helper_support_video", [mtmd_context_p], c_bool) +def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool: + """ + Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). + """ + ... + + +# struct mtmd_helper_bitmap_wrapper { +# mtmd_bitmap * bitmap; +# mtmd_helper_video * video_ctx; +# }; +class mtmd_helper_bitmap_wrapper(Structure): + _fields_ = [ + ("bitmap", mtmd_bitmap_p), + ("video_ctx", mtmd_helper_video_p), + ] +mtmd_helper_bitmap_wrapper_p_ctypes = POINTER(mtmd_helper_bitmap_wrapper) + # // helper function to construct a mtmd_bitmap from a file # // it calls mtmd_helper_bitmap_init_from_buf() internally # // returns nullptr on failure # // this function is thread-safe -# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); +# MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); @ctypes_function_mtmd( "mtmd_helper_bitmap_init_from_file", [ @@ -798,14 +887,14 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # c_char_p, c_bool, ], - mtmd_bitmap_p_ctypes + mtmd_helper_bitmap_wrapper ) def 
mtmd_helper_bitmap_init_from_file( ctx: mtmd_context_p, fname: c_char_p, placeholder: c_bool, /, -) -> mtmd_bitmap_p: +) -> mtmd_helper_bitmap_wrapper: """ helper function to construct a mtmd_bitmap from a file it calls mtmd_helper_bitmap_init_from_buf() internally @@ -818,10 +907,13 @@ def mtmd_helper_bitmap_init_from_file( # // supported formats: # // image: formats supported by stb_image: jpg, png, bmp, gif, etc. # // audio: formats supported by miniaudio: wav, mp3, flac -# // note: audio files will be auto-detected based on magic bytes +# // note: +# // - for now, video input is only supported via C++ helper functions +# // - audio files will be auto-detected based on magic bytes +# // - output bitmap will have FNV hash as the ID # // returns nullptr on failure # // this function is thread-safe -# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); +# MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); @ctypes_function_mtmd( "mtmd_helper_bitmap_init_from_buf", [ mtmd_context_p_ctypes, @@ -829,7 +921,7 @@ def mtmd_helper_bitmap_init_from_file( c_size_t, c_bool, ], - mtmd_bitmap_p_ctypes + mtmd_helper_bitmap_wrapper ) def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, @@ -837,13 +929,16 @@ def mtmd_helper_bitmap_init_from_buf( len: c_size_t, placeholder: c_bool, /, -) -> mtmd_bitmap_p: +) -> mtmd_helper_bitmap_wrapper: """ helper function to construct a mtmd_bitmap from a buffer containing a file supported formats: - image: formats supported by stb_image: jpg, png, bmp, gif, etc. - audio: formats supported by miniaudio: wav, mp3, flac - note: audio files will be auto-detected based on magic bytes + image: formats supported by stb_image: jpg, png, bmp, gif, etc. 
+ audio: formats supported by miniaudio: wav, mp3, flac + note: + - for now, video input is only supported via C++ helper functions + - audio files will be auto-detected based on magic bytes + - output bitmap will have FNV hash as the ID returns nullptr on failure """ ... @@ -1020,3 +1115,181 @@ def mtmd_helper_decode_image_chunk( ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure """ ... + +# // +# // video input helpers (requires ffmpeg/ffprobe installed on the system) +# // the notion of video only exists at the helper level, it is not visible to the core mtmd library +# // +# // NOTE: this implementation is model-agnostic, it can be used with any vision-capable model +# // however, it may not be accurate for some specific models +# // (this is expected for now, to keep the implementation simple) +# // + +# struct mtmd_helper_video_info { +# uint32_t width; +# uint32_t height; +# float fps; // effective fps (fps_target if set, else original video fps) +# int32_t n_frames; // estimated total frames at effective fps (-1 if unknown) +# }; +class mtmd_helper_video_info(Structure): + _fields_ = [ + ("width", c_uint32), + ("height", c_uint32), + ("fps", c_float), + ("n_frames", c_int32), + ] +mtmd_helper_video_info_p_ctypes = POINTER(mtmd_helper_video_info) + + +# struct mtmd_helper_video_init_params { +# float fps_target; // desired output fps; <= 0 means use the video's native fps, defaulted to 4.0f +# const char * ffmpeg_bin_dir; // directory containing ffmpeg/ffprobe binaries; NULL means search PATH +# int64_t timestamp_interval_ms; // interval for adding timestamp as text chunk (example: "[10m50.5s]"); <= 0 means no timestamp, defaulted to 5000ms +# // TODO @ngxson : allow "placeholder" bitmap output for counting tokens +# }; +class mtmd_helper_video_init_params(Structure): + _fields_ = [ + ("fps_target", c_float), + ("ffmpeg_bin_dir", c_char_p), + ("timestamp_interval_ms", c_int64), + ] +mtmd_helper_video_init_params_p_ctypes = 
POINTER(mtmd_helper_video_init_params) + + +# MTMD_API struct mtmd_helper_video_init_params mtmd_helper_video_init_params_default(void); +@ctypes_function_mtmd( + "mtmd_helper_video_init_params_default", + [], + mtmd_helper_video_init_params, +) +def mtmd_helper_video_init_params_default( + /, +) -> mtmd_helper_video_init_params: + """ + get default init params for mtmd_helper_video + """ + ... + + +# // returns NULL on failure (ffprobe not found, file unreadable, etc.) +# MTMD_API mtmd_helper_video * mtmd_helper_video_init( +# struct mtmd_context * mctx, +# const char * path, +# struct mtmd_helper_video_init_params params); +@ctypes_function_mtmd( + "mtmd_helper_video_init", [ + mtmd_context_p_ctypes, + c_char_p, + mtmd_helper_video_init_params, + ], + mtmd_helper_video_p) +def mtmd_helper_video_init( + mctx: mtmd_context_p, + path: c_char_p, + params: mtmd_helper_video_init_params, + /, +) -> mtmd_helper_video_p: + """ + helper function to init an mtmd_helper_video object + returns NULL on failure (ffprobe not found, file unreadable, etc.) + """ + ... + + +# // Same as mtmd_helper_video_init(), but reads from an in-memory buffer. +# // The buffer is copied internally; the caller does not need to keep it alive. +# // Note: pipe input is not seekable, so seeking will use output-side seeking +# // (ffmpeg decodes and discards frames up to the target position). 
+# MTMD_API mtmd_helper_video * mtmd_helper_video_init_from_buf( +# struct mtmd_context * mctx, +# const unsigned char * buf, size_t len, +# struct mtmd_helper_video_init_params params); +@ctypes_function_mtmd( + "mtmd_helper_video_init_from_buf", + [ + mtmd_context_p_ctypes, + c_char_p, + c_size_t, + mtmd_helper_video_init_params, + ], + mtmd_helper_video_p_ctypes, +) +def mtmd_helper_video_init_from_buf( + mctx: mtmd_context_p, + buf: c_char_p, + len: int, + params: mtmd_helper_video_init_params, + /, +) -> mtmd_helper_video_p: + """ + helper function to init an mtmd_helper_video object from an in-memory video buffer + + The buffer is copied internally, so the caller does not need to keep it alive + after this function returns. + """ + ... + + +# MTMD_API void mtmd_helper_video_free(mtmd_helper_video * ctx); +@ctypes_function_mtmd("mtmd_helper_video_free", [mtmd_helper_video_p_ctypes], None) +def mtmd_helper_video_free( + ctx: mtmd_helper_video_p, + /, +) -> None: + """ + free an mtmd_helper_video object + """ + ... + + +# MTMD_API struct mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx); +@ctypes_function_mtmd("mtmd_helper_video_get_info", [mtmd_helper_video_p_ctypes], mtmd_helper_video_info) +def mtmd_helper_video_get_info( + ctx: mtmd_helper_video_p, + /, +) -> mtmd_helper_video_info: + """ + get video information from an mtmd_helper_video object + """ + ... + + +# // Read the next item from the video stream; exactly one of out_bitmap or out_text is set per call. 
+# // *out_bitmap - heap-allocated; caller must free with mtmd_bitmap_free() +# // *out_text - heap-allocated (always via strdup/malloc); caller must free with free() +# // returns 0 on success, -1 on EOF, -2 on error +# MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx, +# mtmd_bitmap ** out_bitmap, +# char ** out_text); +@ctypes_function_mtmd( + "mtmd_helper_video_read_next", + [ + mtmd_helper_video_p_ctypes, + POINTER(mtmd_bitmap_p_ctypes), + POINTER(c_char_p), + ], + c_int32, +) +def mtmd_helper_video_read_next( + ctx: mtmd_helper_video_p, + out_bitmap: POINTER(mtmd_bitmap_p_ctypes), # type: ignore + out_text: POINTER(c_char_p), # type: ignore + /, +) -> int: + """ + read the next item from the video stream + + Exactly one of out_bitmap or out_text is set per successful call. + + out_bitmap: + heap-allocated bitmap; caller must free it with mtmd_bitmap_free() + + out_text: + heap-allocated string via strdup/malloc; caller must free it with free() + + returns: + 0 on success + -1 on EOF + -2 on error + """ + ... From 10b4addb9d5f2ff71bddde34f43f8a43fac44b61 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 05:08:49 +0800 Subject: [PATCH 497/518] feat(mtmd): add video input support to MTMDChatHandler - Add video_url handling to the MTMD chat template and media extraction pipeline. Detect whether the loaded libmtmd build supports video helpers and reject video inputs early when MTMD_VIDEO is unavailable. - Update media loading and bitmap creation for the new helper wrapper API. mtmd_helper_bitmap_init_from_buf now returns a bitmap wrapper containing both the decoded bitmap and an optional video helper context, so keep the video context alive until mtmd_tokenize completes and release it afterward. - Also consolidate duplicated audio/video byte loading into a shared _load_bytes helper, reuse it for image loading, and add richer default HTTP headers for remote media requests. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 173 ++++++++++++++++++++++----------- 1 file changed, 115 insertions(+), 58 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index fb42a59f23..2224466436 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3064,6 +3064,8 @@ class MTMDChatHandler: "{% else %}" "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" "{% endif %}" + "{% elif content.type == 'video_url' %}" + "{{ content.video_url if content.video_url is string else content.video_url.url }}" "{% elif content.type == 'text' %}" "{{ content.text }}" "{% endif %}" @@ -3114,6 +3116,10 @@ def __init__( self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None self.extra_template_arguments: dict[str, Any] = {} + self.is_support_vision = False + self.is_support_audio = False + self.is_support_video = False + if not os.path.exists(clip_model_path): raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") @@ -3182,6 +3188,15 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): if self.verbose: print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) + # Check if video is supported + self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) + if self.is_support_video: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr) + def close(self) -> None: """Explicitly free the mtmd context and vision model resources.""" if getattr(self, "mtmd_ctx", None) is not None: @@ -3259,7 +3274,16 @@ def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessa if url: media_items.append({"url": url, "type": "audio"}) - # 
3. Text & Unknown Types + # 3. Video Processing + elif content_type == "video_url": + if not self.is_support_video: + raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.") + + video_url = content["video_url"] + url = video_url if isinstance(video_url, str) else video_url["url"] + media_items.append({"url": url, "type": "video"}) + + # 4. Text & Unknown Types elif content_type == "text": continue else: @@ -3274,6 +3298,7 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): Supported formats: - Images (via stb_image): jpg, png, bmp, etc. - Audio (via miniaudio): wav, mp3, flac. + - Video: depends on whether MTMD_VIDEO was enabled at build time. Note: - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. @@ -3283,25 +3308,35 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): media_bytes (bytes): The raw byte content of the media file. Returns: - mtmd_bitmap: A pointer to the allocated bitmap structure containing decoded media features. 
+ bitmap: mtmd_bitmap * + video_ctx: mtmd_helper_video * or NULL """ if self.mtmd_ctx is None: raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") - # Create bitmap from buffer using helper function - bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + if not media_bytes: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + + buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + + wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( self.mtmd_ctx, - (ctypes.c_uint8 * len(media_bytes)).from_buffer(bytearray(media_bytes)), + buf, len(media_bytes), False, ) - if bitmap is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): " - "Failed to load image or audio file from media bytes " - "(unsupported media format or corrupted data).") + if not wrapper.bitmap: + if wrapper.video_ctx: + self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) - return bitmap + raise ValueError( + f"{self.log_prefix}(_create_bitmap_from_bytes): " + "Failed to load media from bytes " + "(unsupported media format, corrupted data, or missing helper support)." + ) + + return wrapper.bitmap, wrapper.video_ctx def _process_mtmd_prompt( @@ -3360,16 +3395,17 @@ def _process_mtmd_prompt( # 3. 
Pre-allocate bitmap array to guarantee chronological order during concurrent decoding bitmaps = [None] * len(media_items) bitmap_cleanup = [] + video_cleanup = [] chunks = None try: # Concurrent Media Decoding import concurrent.futures if media_items: - def _create_bitmap_func(idx: int, item: str): + def _create_bitmap_func(idx: int, item: dict): media_bytes = self.load_media(item["url"], item["type"]) - bitmap = self._create_bitmap_from_bytes(media_bytes) - return idx, bitmap + bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes) + return idx, bitmap, video_ctx # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, # which can be used in the future to process large numbers of video frames. max_workers = min(llama.n_threads, len(media_items)) @@ -3377,10 +3413,14 @@ def _create_bitmap_func(idx: int, item: str): futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] for future in concurrent.futures.as_completed(futures): - idx, bitmap = future.result() + idx, bitmap, video_ctx = future.result() + bitmaps[idx] = bitmap bitmap_cleanup.append(bitmap) + if video_ctx: + video_cleanup.append(video_ctx) + # Strict validation: Abort if any thread failed to decode its assigned media if any(b is None for b in bitmaps): raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") @@ -3415,6 +3455,12 @@ def _create_bitmap_func(idx: int, item: str): if result != 0: raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") + # Video helper contexts only need to stay alive until mtmd_tokenize() completes. + if video_cleanup: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup.clear() + # 6. 
Virtual Token Ledger Construction full_prompt_ids = [] chunk_token_spans = [] @@ -3424,6 +3470,7 @@ def _create_bitmap_func(idx: int, item: str): # Cursor to track the actual media contents (URLs or base64 data) provided by the user media_items_count = len(media_items) media_items_cur = 0 + last_media_id = None for i in range(n_chunks): chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) @@ -3463,7 +3510,11 @@ def _create_bitmap_func(idx: int, item: str): # while instantly breaking the match if the image content changes. # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 + last_media_id = media_id media_items_cur += 1 + elif last_media_id is not None: + # video may expand into multiple image chunks from one media marker + media_id = last_media_id else: # Magic Negative Number as fallback :) media_id = -314159 @@ -3492,6 +3543,12 @@ def _create_bitmap_func(idx: int, item: str): for bitmap in bitmap_cleanup: self._mtmd_cpp.mtmd_bitmap_free(bitmap) bitmap_cleanup = None + # Free videos + if len(video_cleanup) > 0: + for video_ctx in video_cleanup: + self._mtmd_cpp.mtmd_helper_video_free(video_ctx) + video_cleanup = None + bitmaps = None raise e @@ -3825,18 +3882,22 @@ def __call__( def load_media(self, media_url: str, media_type: str) -> bytes: """ Unified dispatcher for loading media payloads. - Routes the URL/URI to the specific image or audio processor based on the media_type. + Routes the URL/URI to the specific image, audio, or video processor based on the media_type. 
""" if media_type == "image": return self._load_image(media_url) + elif media_type == "audio": - audio_bytes = self._load_audio(media_url) - # Apply ironclad magic bytes validation before returning + audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio") try: self.detect_audio_format(audio_bytes) except ValueError as e: raise ValueError(f"{self.log_prefix}(load_media): {e}") return audio_bytes + + elif media_type == "video": + return self._load_bytes(media_url, timeout=30, kind="video") + else: raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") @@ -3876,41 +3937,51 @@ def detect_audio_format(audio_bytes: bytes) -> str: "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." ) + DEFAULT_HTTP_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/148.0.0.0 Safari/537.36" + ), + } + @staticmethod - def _load_audio(audio_url: str) -> bytes: + def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes: """ - Load audio from either a URL, local path, or a data URI and return raw bytes. + Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL. """ + media_bytes = b"" - audio_bytes = b"" - - # 1. Handle data URI (base64) - if audio_url.strip().startswith("data:"): - comma_pos = audio_url.find(",") + # 1. Handle data URI + if media_url.strip().startswith("data:"): + comma_pos = media_url.find(",") if comma_pos == -1: raise ValueError("Invalid data URI: missing comma separator") - base64_data = audio_url[comma_pos + 1 :] - audio_bytes = base64.b64decode(base64_data) + + base64_data = media_url[comma_pos + 1:] + media_bytes = base64.b64decode(base64_data) # 2. Handle local file path - elif os.path.exists(audio_url): - with open(audio_url, "rb") as f: - audio_bytes = f.read() + elif os.path.exists(media_url): + with open(media_url, "rb") as f: + media_bytes = f.read() # 3. 
Handle remote URL via HTTP/HTTPS else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(audio_url, headers=headers) + req = urllib.request.Request( + media_url, + headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, + ) try: - with urllib.request.urlopen(req, timeout=15) as f: - audio_bytes = f.read() + with urllib.request.urlopen(req, timeout=timeout) as f: + media_bytes = f.read() except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download audio from {audio_url}: {e}") + raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") - if not audio_bytes: - raise ValueError("Empty audio data received") + if not media_bytes: + raise ValueError(f"Empty {kind} data received") - return audio_bytes + return media_bytes @staticmethod def _load_image(image_url: str) -> bytes: @@ -3926,28 +3997,14 @@ def _load_image(image_url: str) -> bytes: Returns: JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. """ - image_bytes = b"" - - # 1. Handle data URI (base64) - if image_url.strip().startswith("data:"): - # Split only once from the right to correctly handle mime types containing commas - comma_pos = image_url.find(",") - if comma_pos == -1: - raise ValueError("Invalid data URI: missing comma separator") - base64_data = image_url[comma_pos + 1 :] - image_bytes = base64.b64decode(base64_data) - - # 2. Handle local/remote URL - else: - headers = {"User-Agent": "Mozilla/5.0"} - req = urllib.request.Request(image_url, headers=headers) - - try: - with urllib.request.urlopen(req, timeout=15) as f: - image_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download image from {image_url}: {e}") + # 1. Load image bytes from image_url + image_bytes = MTMDChatHandler._load_bytes( + image_url, + timeout=15, + kind="image", + ) + # 2. Check if image_bytes is empty. 
if not image_bytes: raise ValueError("Empty image data received") From e4dcac1af57b58973ecf7e206a3c25b3c367d881 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Tue, 9 Jun 2026 22:22:15 +0800 Subject: [PATCH 498/518] Update Submodule vendor/llama.cpp 7d2b45b..d6d0ce8 Signed-off-by: JamePeng --- llama_cpp/mtmd_cpp.py | 14 ++++++-------- vendor/llama.cpp | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 61fb0e7859..30ca8fab90 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -459,8 +459,8 @@ def mtmd_bitmap_set_id( c_int, c_size_t, # chunk_idx c_void_p, # user_data - POINTER(mtmd_bitmap_p), # mtmd_bitmap ** out_bitmap - POINTER(c_char_p), # char ** out_text + POINTER(mtmd_bitmap_p_ctypes), # mtmd_bitmap ** out_bitmap + POINTER(c_char_p), # char ** out_text ) # MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx, @@ -856,7 +856,7 @@ def mtmd_helper_log_set(log_callback: ggml_log_callback, user_data: c_void_p): # # // Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). # MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx); @ctypes_function_mtmd( - "mtmd_helper_support_video", [mtmd_context_p], c_bool) + "mtmd_helper_support_video", [mtmd_context_p_ctypes], c_bool) def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool: """ Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). 
@@ -870,8 +870,8 @@ def mtmd_helper_support_video(ctx: mtmd_context_p) -> c_bool: # }; class mtmd_helper_bitmap_wrapper(Structure): _fields_ = [ - ("bitmap", mtmd_bitmap_p), - ("video_ctx", mtmd_helper_video_p), + ("bitmap", mtmd_bitmap_p_ctypes), + ("video_ctx", mtmd_helper_video_p_ctypes), ] mtmd_helper_bitmap_wrapper_p_ctypes = POINTER(mtmd_helper_bitmap_wrapper) @@ -1162,9 +1162,7 @@ class mtmd_helper_video_init_params(Structure): [], mtmd_helper_video_init_params, ) -def mtmd_helper_video_init_params_default( - /, -) -> mtmd_helper_video_init_params: +def mtmd_helper_video_init_params_default() -> mtmd_helper_video_init_params: """ get default init params for mtmd_helper_video """ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7d2b45b4f7..d6d0ce8215 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7d2b45b4f7b663cda74f23fbc3ce6dc3bd4f6545 +Subproject commit d6d0ce8215a1c324e8de04b52f9dd65c5edc129f From 54f56bd8f89769f2021f31eba0aa377dc290f203 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sat, 13 Jun 2026 00:46:09 +0800 Subject: [PATCH 499/518] Update Submodule vendor/llama.cpp d6d0ce8..ebc1077 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d6d0ce8215..ebc10770ac 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d6d0ce8215a1c324e8de04b52f9dd65c5edc129f +Subproject commit ebc10770ac5a9331824c53ef0c6adad780904dc3 From 6d1bd3b8d751a3a2ac86d377ecd34a3b37278b15 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 00:04:56 +0800 Subject: [PATCH 500/518] Update Submodule vendor/llama.cpp ebc1077..e8067a8 Signed-off-by: JamePeng --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ebc10770ac..e8067a8b36 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 
ebc10770ac5a9331824c53ef0c6adad780904dc3 +Subproject commit e8067a8b3624aa40cc88ecb2940060e5d65b7532 From 971ee384227f6268f244c93f620b12f0a6ff47c0 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 01:03:09 +0800 Subject: [PATCH 501/518] Update(mtmd): Append mtmd batching API - Sync upstream: mtmd: add batching API (#24384) Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 3 + llama_cpp/mtmd_cpp.py | 142 ++++++++++++++++++++++++++++++--- 2 files changed, 134 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 2224466436..520d2429d4 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3094,6 +3094,7 @@ def __init__( use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1, + batch_max_tokens: int = 1024, **kwargs ): @@ -3108,6 +3109,7 @@ def __init__( self.clip_model_path = clip_model_path self.image_min_tokens = image_min_tokens self.image_max_tokens = image_max_tokens + self.batch_max_tokens = batch_max_tokens self.use_gpu = use_gpu self.verbose = verbose @@ -3152,6 +3154,7 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! 
image_max_tokens ({self.image_max_tokens}) " f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + self.mctx_params.batch_max_tokens = self.batch_max_tokens # Cache the model's eos token and bos token self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 30ca8fab90..4513761a63 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -153,6 +153,21 @@ class mtmd_pos_type(enum.IntEnum): mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) mtmd_input_chunks_p_ctypes = c_void_p +# struct mtmd_batch { +# mtmd_context * ctx; +# std::vector entries; +# std::vector output_embd; // aggregated output embedding for the whole batch +# mtmd_batch(mtmd_context * ctx): ctx(ctx) {} +# int32_t n_tokens() const { +# int32_t n = 0; +# for (const auto * chunk : entries) { +# n += mtmd_input_chunk_get_n_tokens(chunk); +# } +# return n; +# } +# }; +mtmd_batch_p = NewType("mtmd_batch_p", int) +mtmd_batch_p_ctypes = c_void_p # struct mtmd_input_text { # const char * text; @@ -210,6 +225,11 @@ class clip_context_params(Structure): # // callback function passed over to mtmd proper # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; +# +# // batching params +# int32_t batch_max_tokens; // maximum number of output tokens in a batch +# // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit) +# // (default: 1024) # }; class mtmd_context_params(Structure): _fields_ = [ @@ -224,6 +244,7 @@ class mtmd_context_params(Structure): ("image_max_tokens", c_int), ("cb_eval", ggml_backend_sched_eval_callback), ("cb_eval_user_data", c_void_p), + ("batch_max_tokens", c_int32), ] mtmd_context_params_p_ctypes = POINTER(mtmd_context_params) @@ -731,8 +752,8 @@ def mtmd_tokenize( # // returns 0 on success # // TODO: deprecate -# MTMD_API int32_t mtmd_encode(mtmd_context * ctx, -# const mtmd_image_tokens 
* image_tokens); +# DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens), +# "use mtmd_encode_chunk() instead"); @ctypes_function_mtmd( "mtmd_encode", [ mtmd_context_p_ctypes, @@ -745,10 +766,15 @@ def mtmd_encode( image_tokens: mtmd_image_tokens_p, /, ) -> c_int32: + """ + DEPRECATED: use mtmd_encode_chunk() instead + """ ... +# // text chunk will be ignored silently, only media chunk will be encoded # // returns 0 on success +# // returns 1 on generic error # MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, # const mtmd_input_chunk * chunk); @ctypes_function_mtmd( @@ -763,6 +789,11 @@ def mtmd_encode_chunk( chunk: mtmd_input_chunk_p, /, ) -> c_int32: + """ + text chunk will be ignored silently, only media chunk will be encoded + returns 0 on success + returns 1 on generic error + """ ... # // get output embeddings from the last encode pass @@ -778,6 +809,95 @@ def mtmd_get_output_embd(ctx: mtmd_context_p) -> POINTER(c_float): # type: ignor ... +# // batch encoding API +# // chunks are not owned by the batch, they will not be freed by mtmd_batch_free() +# // batch is valid for a given context, cannot be shared across contexts +# MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx); +@ctypes_function_mtmd( + "mtmd_batch_init", + [mtmd_context_p_ctypes], + mtmd_batch_p_ctypes, +) +def mtmd_batch_init(ctx: mtmd_context_p, /) -> mtmd_batch_p: + ... + + +# MTMD_API void mtmd_batch_free(mtmd_batch * batch); +@ctypes_function_mtmd( + "mtmd_batch_free", + [mtmd_batch_p_ctypes], + None, +) +def mtmd_batch_free(batch: mtmd_batch_p, /): + """ + chunks are not owned by the batch, they will not be freed by mtmd_batch_free() + batch is valid for a given context, cannot be shared across contexts + """ + ... 
+ + +# // only media chunks are allowed, text chunks will be rejected +# // returns 0 on success +# // returns 1 on generic error +# // returns 2 if the batch is too large (chunk won't be added) +# // returns 3 if it cannot be batched with the existing chunks in the batch +# MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function_mtmd( + "mtmd_batch_add_chunk", + [ + mtmd_batch_p_ctypes, + mtmd_input_chunk_p_ctypes, + ], + c_int32, +) +def mtmd_batch_add_chunk( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> c_int32: + """ + only media chunks are allowed, text chunks will be rejected + returns 0 on success + returns 1 on generic error + returns 2 if the batch is too large (chunk won't be added) + returns 3 if it cannot be batched with the existing chunks in the batch + """ + ... + + +# // returns 0 on success +# // returns 1 on generic error +# MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch); +@ctypes_function_mtmd( + "mtmd_batch_encode", + [mtmd_batch_p_ctypes], + c_int32, +) +def mtmd_batch_encode(batch: mtmd_batch_p, /) -> c_int32: + """ + returns 0 on success + returns 1 on generic error + """ + ... + + +# MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function_mtmd( + "mtmd_batch_get_output_embd", + [ + mtmd_batch_p_ctypes, + mtmd_input_chunk_p_ctypes, + ], + POINTER(c_float), +) +def mtmd_batch_get_output_embd( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> POINTER(c_float): # type: ignore + ... + + # // Set callback for all future logging events. # // If this is not called, or NULL is supplied, everything is output on stderr. 
# MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); @@ -947,8 +1067,8 @@ def mtmd_helper_bitmap_init_from_buf( # // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); @ctypes_function_mtmd( - "mtmd_helper_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) -def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunk_p) -> c_size_t: + "mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p) -> c_size_t: """ helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache """ @@ -959,8 +1079,8 @@ def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunk_p) -> c_size_t: # // normally, n_pos is equal to n_tokens, but for M-RoPE it is different # MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); @ctypes_function_mtmd( - "mtmd_helper_get_n_pos", [mtmd_input_chunk_p_ctypes], c_int32) -def mtmd_helper_get_n_pos(chunks: mtmd_input_chunk_p) -> c_int32: + "mtmd_helper_get_n_pos", [mtmd_input_chunks_p_ctypes], c_int32) +def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p) -> c_int32: """ helper to count the total position of tokens from a list of chunks, useful to keep track of n_past normally, n_pos is equal to n_tokens, but for M-RoPE it is different @@ -991,8 +1111,8 @@ def mtmd_helper_image_get_decoder_pos( # // helper function that automatically: # // 1. run llama_decode() on text chunks -# // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() -# // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +# // 2. 
run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode() +# // if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error # // otherwise, returns 0 on success # // this function is NOT thread-safe # MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, @@ -1007,7 +1127,7 @@ def mtmd_helper_image_get_decoder_pos( "mtmd_helper_eval_chunks", [ mtmd_context_p_ctypes, llama_cpp.llama_context_p_ctypes, - mtmd_input_chunk_p_ctypes, + mtmd_input_chunks_p_ctypes, c_int32, c_int32, c_int32, @@ -1018,7 +1138,7 @@ def mtmd_helper_image_get_decoder_pos( def mtmd_helper_eval_chunks( ctx: mtmd_context_p, lctx: llama_cpp.llama_context_p, - chunks: mtmd_input_chunk_p, + chunks: mtmd_input_chunks_p, n_past: c_int32, seq_id: c_int32, n_batch: c_int32, @@ -1106,7 +1226,7 @@ def mtmd_helper_decode_image_chunk( n_past: c_int32, seq_id: c_int32, n_batch: c_int32, - new_n_past: c_int32, + new_n_past: POINTER(c_int32), # type: ignore /, ) -> c_int32: """ From cb299e67e51e5aff061ebcf9f1521695ad3f1a5d Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 03:05:32 +0800 Subject: [PATCH 502/518] Update(MTMDChatHandler): add chunk type helpers - Add small helper methods `_is_text_chunk`/`_is_image_chunk`/`_is_audio_chunk` for checking MTMD text, image, and audio chunk type enum values. - This keeps MTMD prompt processing easier to read and avoids repeating direct enum comparisons when building token spans for text and media chunks. 
Signed-off-by: JamePeng --- llama_cpp/llama_chat_format.py | 36 +++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 520d2429d4..aadec4600e 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3341,6 +3341,26 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): return wrapper.bitmap, wrapper.video_ctx + def _is_text_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD text chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT + ) + + def _is_image_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD image chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE + ) + + def _is_audio_chunk(self, chunk_type: int) -> bool: + """Return True if `chunk_type` is the MTMD audio chunk type enum value.""" + return ( + chunk_type + == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + ) def _process_mtmd_prompt( self, @@ -3480,7 +3500,7 @@ def _create_bitmap_func(idx: int, item: dict): if chunk is None: continue chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: + if self._is_text_chunk(chunk_type): # Extract standard text token IDs n_tokens_out = ctypes.c_size_t() tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) @@ -3489,10 +3509,7 @@ def _create_bitmap_func(idx: int, item: dict): chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) full_prompt_ids.extend(tokens) current_idx += len(tokens) - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - 
]: + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): # Extract media properties # Note(JamePeng): # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). @@ -3673,7 +3690,7 @@ def __call__( if end_idx <= n_past: continue - if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT: + if self._is_text_chunk(chunk_type): unprocessed_start = max(start_idx, n_past) - start_idx n_tokens_out = ctypes.c_size_t() tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) @@ -3689,14 +3706,11 @@ def __call__( llama.eval(tokens_to_eval) n_past = llama.n_tokens - elif chunk_type in [ - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, - self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO - ]: + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) if self.verbose: - media_str = "IMAGE" if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE else "AUDIO" + media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) # Stage 5: Multimodal Physical OOM Defense From d8ee3eed7163c6c1f3802a9b979f9009e5e96c53 Mon Sep 17 00:00:00 2001 From: Alcoft Date: Sun, 14 Jun 2026 08:00:09 +0200 Subject: [PATCH 503/518] Change 'clip_model_path' to 'mmproj_path'. Implemented 'chat_template_override'. Only the chat template is passed from llama to the chat handler; not the entire model's metadata. 
--- llama_cpp/llama.py | 10 ++++----- llama_cpp/llama_chat_format.py | 39 +++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 544e755ea9..1f5ffa20b5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -96,7 +96,7 @@ class Llama: def __init__( self, model_path: str, - clip_model_path: Optional[str] = None, + mmproj_path: Optional[str] = None, *, # Model Params n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto", @@ -710,13 +710,13 @@ def __init__( if self.verbose: print(f"Model metadata: {self.metadata}", file=sys.stderr) - if clip_model_path is not None: + if mmproj_path is not None: if self.chat_handler is not None and self.verbose: - print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True) + print("Warning: Both `chat_handler` and `mmproj_path` are not null. Chat handler will be overwritten.", flush = True) self.chat_handler = llama_chat_format.GenericMTMDChatHandler( - gguf_metadata = self.metadata, - clip_model_path = clip_model_path, + chat_format = self.metadata.get("tokenizer.chat_template", None), + mmproj_path = mmproj_path, verbose = self.verbose, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 254195f95a..966c2e28fa 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2856,11 +2856,12 @@ class MTMDChatHandler: def __init__( self, - clip_model_path: str, + mmproj_path: str, verbose: bool = True, use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1, + chat_template_override: Optional[str] = None, **kwargs ): @@ -2872,7 +2873,7 @@ def __init__( f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." 
) - self.clip_model_path = clip_model_path + self.mmproj_path = mmproj_path self.image_min_tokens = image_min_tokens self.image_max_tokens = image_max_tokens self.use_gpu = use_gpu @@ -2883,20 +2884,25 @@ def __init__( self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None self.extra_template_arguments: dict[str, Any] = {} - if not os.path.exists(clip_model_path): - raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}") + if not os.path.exists(mmproj_path): + raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {mmproj_path}") # Pre-compile Jinja template - if not hasattr(self, "chat_format") or self.chat_format is None: + if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: self.chat_format = self.CHAT_FORMAT + elif chat_template_override is not None: + self.chat_format = chat_template_override self._chat_format_parser_tags = [] - self.chat_template = ImmutableSandboxedEnvironment( - trim_blocks=True, - lstrip_blocks=True, - ).from_string(self.chat_format) + self.change_chat_template(self.chat_format) self._exit_stack = ExitStack() + + def change_chat_template(self, new_template: str): + self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True + ).from_string(new_template) def _init_mtmd_context(self, llama_model: llama_core.Llama): """Initialize mtmd context with the llama model.""" @@ -2929,13 +2935,13 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama): # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.clip_model_path.encode(), + self.mmproj_path.encode(), llama_model.model, self.mctx_params ) if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.clip_model_path}") + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") # Check if 
vision is supported self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) @@ -3835,7 +3841,7 @@ def from_pretrained( model_path = os.path.join(local_dir, filename) return cls( - clip_model_path=model_path, + mmproj_path=model_path, **kwargs, ) @@ -3852,13 +3858,12 @@ class GenericMTMDChatHandler(MTMDChatHandler): def __init__( self, - gguf_metadata: Dict[str, Any], - clip_model_path: str, + chat_format: str, + mmproj_path: str, verbose: bool = True, **kwargs ) -> None: - self.model_metadata = gguf_metadata - self.chat_format = self.model_metadata.get("tokenizer.chat_template", None) + self.chat_format = chat_format if verbose: print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True) @@ -3866,7 +3871,7 @@ def __init__( if self.chat_format is None: raise ValueError("Failed to get model chat template automatically.") - super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs) + super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs) def __call__(self, **kwargs): self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] From 1965d5f6c3c949cab7f7ef934266c8062ebc0f45 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 20:19:43 +0800 Subject: [PATCH 504/518] refactor(mtmd): move multimodal handlers to separate module `llama_multimodal` - Move MTMDChatHandler, GenericMTMDChatHandler, and model-specific multimodal chat handlers out of llama_chat_format.py into llama_multimodal.py. - llama_chat_format.py has grown too large and difficult to maintain, especially as MTMD support expands beyond image-only use cases. Splitting multimodal handling into its own module makes the chat formatting layer smaller and keeps media loading, MTMD tokenization, multimodal KV-cache bookkeeping, and handler implementations in a dedicated place. 
- This also prepares the codebase for broader multimodal support and future video frame / image batch evaluation, where the media-processing path will need to evolve independently from text-only chat formatting. - Keep backward-compatible re-exports from llama_chat_format.py so existing imports continue to work. - Also keep `clip_model_path` as a deprecated initialization alias for `mmproj_path` in the base MTMD handler. Signed-off-by: JamePeng --- llama_cpp/llama.py | 8 +- llama_cpp/llama_chat_format.py | 3811 ++------------------------------ llama_cpp/llama_multimodal.py | 3473 +++++++++++++++++++++++++++++ 3 files changed, 3690 insertions(+), 3602 deletions(-) create mode 100644 llama_cpp/llama_multimodal.py diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ec202568f1..dbc60eaf76 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -45,6 +45,7 @@ from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama_chat_format as llama_chat_format +import llama_cpp.llama_multimodal as llama_multimodal from llama_cpp.llama_speculative import LlamaDraftModel @@ -711,20 +712,19 @@ def __init__( self.metadata = {} if self.verbose: print(f"Failed to load metadata: {e}", file=sys.stderr) - - if self.verbose: - print(f"Model metadata: {self.metadata}", file=sys.stderr) if mmproj_path is not None: if self.chat_handler is not None and self.verbose: print("Warning: Both `chat_handler` and `mmproj_path` are not null. 
Chat handler will be overwritten.", flush = True) - self.chat_handler = llama_chat_format.GenericMTMDChatHandler( + self.chat_handler = llama_multimodal.GenericMTMDChatHandler( chat_format = self.metadata.get("tokenizer.chat_template", None), mmproj_path = mmproj_path, verbose = self.verbose, **chat_handler_kwargs ) + + if self.verbose: print(f"Model desc: {self.model_desc}, " f"Model size: {self.model_size / (1024 * 1024):.2f} MB, " f"Model metadata: {self.metadata}", diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0e5c9d4906..6ffe68e5e3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,7 +1,5 @@ from __future__ import annotations -import base64 -import ctypes import dataclasses import datetime import json @@ -9,9 +7,7 @@ import random import string import sys -import zlib -from contextlib import ExitStack from typing import ( Any, Dict, @@ -32,16 +28,11 @@ import numpy as np import numpy.typing as npt -import urllib.request -from urllib.error import URLError, HTTPError - -import llama_cpp.llama_cpp as llama_cpp_lib import llama_cpp.llama as llama_core import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar -from ._ggml import GGMLLogLevel -from ._logger import logger, ggml_log_callback +from ._logger import logger from ._utils import suppress_stdout_stderr, Singleton ### Common Chat Templates and Special Tokens ### @@ -3037,3612 +3028,204 @@ def generate_streaming(tools, functions, function_call, prompt): ) -class MTMDChatHandler: - DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( -"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " -"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, 
professional, and maximally helpful." - ) - - CHAT_FORMAT = ( - "{{ bos_token if bos_token is defined else '' }}" +@register_chat_completion_handler("chatml-function-calling") +def chatml_function_calling( + llama: llama_core.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + present_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_n_sigma: float = -1.00, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + xtc_threshold: float = 0.1, + xtc_probability: float = 0.0, + dry_multiplier: float = 0.0, + dry_base: float = 1.75, + dry_allowed_length: int = 2, + dry_penalty_last_n:int = 0, + dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], + adaptive_target : float = -1.0, + adaptive_decay : float = 0.9, + use_infill: bool = False, + model: Optional[str] = None, + logits_processor: Optional[llama_core.LogitsProcessorList] = None, + grammar: Optional[llama_grammar.LlamaGrammar] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + **kwargs, # type: ignore +) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], +]: + function_calling_template = ( "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% elif message.role == 'user' %}" - "USER: " - "{% if 
message.content is string %}" - "{{ message.content }}" - "{% elif message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% elif content.type == 'audio_url' %}" - "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" - "{% elif content.type == 'input_audio' %}" - "{% if content.input_audio is string %}" - "{{ content.input_audio }}" - "{% else %}" - "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" - "{% endif %}" - "{% elif content.type == 'video_url' %}" - "{{ content.video_url if content.video_url is string else content.video_url.url }}" - "{% elif content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - - "{% elif message.role == 'assistant' and message.content is not none %}" - "ASSISTANT: {{ message.content }}" - "{% endif %}" - "{{ \"\n\" }}" + "<|im_start|>{{ message.role }}\n" + # System message + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% if tool_calls %}" + "\n\nYou have access to the following functions:\n" + "{% for tool in tools %}" + "\nfunctions.{{ tool.function.name }}:\n" + "{{ tool.function.parameters | tojson }}" + "\n{% endfor %}" + "\n\nYou can respond to users messages with either a single message or one or more function calls." 
+ "\n\nTo respond with a message begin the message with 'message:', use the following format:" + "\n\nmessage:" + "\n" + "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" + "\n\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + ## Reglar message + "{% if message.content and message.content | length > 0 %}" + "{% if tool_calls %}" + "message:\n" + "{% endif %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + ## Function calls + "{% if 'tool_calls' in message %}" + "{% for tool_call in message.tool_calls %}" + "functions.{{ tool_call.function.name }}:\n" + "{{ tool_call.function.arguments }}" "{% endfor %}" - - "{% if eos_token is defined %}" - "{{ eos_token }}" + "<|im_end|>\n" "{% endif %}" - - "{% if add_generation_prompt %}" - "ASSISTANT: " "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) + template_renderer = ImmutableSandboxedEnvironment( + autoescape=jinja2.select_autoescape(["html", "xml"]), + undefined=jinja2.StrictUndefined, + ).from_string(function_calling_template) - def __init__( - self, - mmproj_path: str, - verbose: bool = True, - use_gpu: bool = True, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - chat_template_override: Optional[str] = None, - batch_max_tokens: int = 1024, - **kwargs - ): - - self.log_prefix = self.__class__.__name__ - if kwargs: - unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) - raise TypeError( - f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" - f"If you are passing model-specific parameters, ensure they are supported by 
{self.log_prefix}." - ) - - self.mmproj_path = mmproj_path - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - self.batch_max_tokens = batch_max_tokens - self.use_gpu = use_gpu - self.verbose = verbose - - import llama_cpp.mtmd_cpp as mtmd_cpp - self._mtmd_cpp = mtmd_cpp - self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None - self.extra_template_arguments: dict[str, Any] = {} - - self.is_support_vision = False - self.is_support_audio = False - self.is_support_video = False - - if not os.path.exists(mmproj_path): - raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {mmproj_path}") - - # Pre-compile Jinja template - if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: - self.chat_format = self.CHAT_FORMAT - elif chat_template_override is not None: - self.chat_format = chat_template_override - - self._chat_format_parser_tags = [] - self.change_chat_template(self.chat_format) - - self._exit_stack = ExitStack() - - def change_chat_template(self, new_template: str): - self.chat_template = ImmutableSandboxedEnvironment( - trim_blocks=True, - lstrip_blocks=True - ).from_string(new_template) - - def _init_mtmd_context(self, llama_model: llama_core.Llama): - """Initialize mtmd context with the llama model.""" - if self.mtmd_ctx is not None: - return # Already initialized - - self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) - - # Get default parameters - self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() - self.mctx_params.use_gpu = self.use_gpu - self.mctx_params.print_timings = self.verbose - self.mctx_params.n_threads = llama_model.n_threads - self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO - self.mctx_params.warmup = True - if self.image_min_tokens > 0: - self.mctx_params.image_min_tokens = self.image_min_tokens - if self.image_max_tokens > 0: - 
self.mctx_params.image_max_tokens = self.image_max_tokens - if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! image_max_tokens ({self.image_max_tokens}) " - f"cannot be less than image_min_tokens ({self.image_min_tokens}).") - self.mctx_params.batch_max_tokens = self.batch_max_tokens - - # Cache the model's eos token and bos token - self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') - self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') - - # Cache the mtmd_default_marker - self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') - - # Initialize mtmd context - self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.mmproj_path.encode(), - llama_model.model, - self.mctx_params - ) - - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") - - # Check if vision is supported - self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) - if self.is_support_vision: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) - - # Check if audio is supported - self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) - if self.is_support_audio: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) - - # Check if video is supported - self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) - if 
self.is_support_video: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT available in this build.", file=sys.stderr) - - def close(self) -> None: - """Explicitly free the mtmd context and vision model resources.""" - if getattr(self, "mtmd_ctx", None) is not None: - try: - self._mtmd_cpp.mtmd_free(self.mtmd_ctx) - except Exception: - pass - self.mtmd_ctx = None - self.mctx_params = None - self.chat_template = None - - if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): - self._exit_stack.close() - self._exit_stack = None - - def __del__(self) -> None: - self.close() - - def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]: - """ - Extracts all media payloads (images, audio) sequentially to maintain exact chronological order. - Strictly enforces capability checks, raising exceptions if unsupported media is passed. - - Returns: - media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio). - """ - media_items: List[Dict[str, str]] = [] - for message in messages: - if isinstance(message.get("content"), list): - for content in message["content"]: - content_type = content.get("type", "") - - # 1. Vision Processing - if content_type == "image_url": - if not self.is_support_vision: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.") - - url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"] - media_items.append({"url": url, "type": "image"}) - - # 2. 
Audio Processing - elif content_type in ["audio", "audio_url", "input_audio"]: - if not self.is_support_audio: - raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") - - # Case A: Handle custom/forward-compatible audio_url format - if content_type == "audio_url" or content_type == "audio": - audio_url = content[content_type] - url = audio_url if isinstance(audio_url, str) else audio_url["url"] - media_items.append({"url": url, "type": "audio"}) - # Case B: Handle OpenAI standard input_audio format - elif content_type == "input_audio": - input_audio = content.get("input_audio", {}) - if isinstance(input_audio, dict) and "data" in input_audio: - # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic - # input_audio: { - # data: audio.base64Data, - # format: audio.mimeType.includes('wav') ? 'wav' : 'mp3' - # } - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - - # Strictly align with llama.cpp (require wav/mp3) - if audio_format not in ["wav", "mp3"]: - raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'") - - # Format as a Data URI to reuse the unified load_media logic - media_items.append({ - "url": f"data:audio/{audio_format};base64,{audio_data}", - "type": "audio" - }) - else: - # Just a raw base64 data - url = input_audio if isinstance(input_audio, str) else "" - if url: - media_items.append({"url": url, "type": "audio"}) - - # 3. Video Processing - elif content_type == "video_url": - if not self.is_support_video: - raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.") - - video_url = content["video_url"] - url = video_url if isinstance(video_url, str) else video_url["url"] - media_items.append({"url": url, "type": "video"}) - - # 4. 
Text & Unknown Types - elif content_type == "text": - continue - else: - if self.verbose: - print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr) - return media_items - - def _create_bitmap_from_bytes(self, media_bytes: bytes): - """ - Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. - - Supported formats: - - Images (via stb_image): jpg, png, bmp, etc. - - Audio (via miniaudio): wav, mp3, flac. - - Video: depends on whether MTMD_VIDEO was enabled at build time. - - Note: - - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. - - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. - - Args: - media_bytes (bytes): The raw byte content of the media file. - - Returns: - bitmap: mtmd_bitmap * - video_ctx: mtmd_helper_video * or NULL - """ - if self.mtmd_ctx is None: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] - if not media_bytes: - raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } - buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + stop = ( + [stop, "<|im_end|>"] + if isinstance(stop, str) + else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + ) - wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( - self.mtmd_ctx, - buf, - len(media_bytes), - False, + # Case 1: No 
tool choice by user + if ( + tool_choice is None + or (isinstance(tool_choice, str) and tool_choice == "none") + or tools is None + or len(tools) == 0 + ): + prompt = template_renderer.render( + messages=messages, + tools=[], + tool_calls=None, + add_generation_prompt=True, ) - if not wrapper.bitmap: - if wrapper.video_ctx: - self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) - - raise ValueError( - f"{self.log_prefix}(_create_bitmap_from_bytes): " - "Failed to load media from bytes " - "(unsupported media format, corrupted data, or missing helper support)." - ) - - return wrapper.bitmap, wrapper.video_ctx - - def _is_text_chunk(self, chunk_type: int) -> bool: - """Return True if `chunk_type` is the MTMD text chunk type enum value.""" - return ( - chunk_type - == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT - ) + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) - def _is_image_chunk(self, chunk_type: int) -> bool: - """Return True if `chunk_type` is the MTMD image chunk type enum value.""" - return ( - chunk_type - == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + present_penalty=present_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + top_n_sigma=top_n_sigma, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + xtc_threshold=xtc_threshold, + xtc_probability=xtc_probability, + dry_multiplier=dry_multiplier, + dry_base=dry_base, + dry_allowed_length=dry_allowed_length, + dry_penalty_last_n=dry_penalty_last_n, + dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_infill=use_infill, + 
model=model, + logits_processor=logits_processor, + grammar=grammar, + logprobs=top_logprobs if logprobs else None, + ), + stream=stream, ) - def _is_audio_chunk(self, chunk_type: int) -> bool: - """Return True if `chunk_type` is the MTMD audio chunk type enum value.""" - return ( - chunk_type - == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO + # Case 2: Tool choice by user + if isinstance(tool_choice, dict): + tool_name = tool_choice["function"]["name"] + tool = next( + (tool for tool in tools if tool["function"]["name"] == tool_name), None ) - - def _process_mtmd_prompt( - self, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - add_generation_prompt: bool = True, - ) -> Tuple[List[int], List[tuple], Any, List[Any]]: - """ - Core multimodal preprocessing pipeline. - Converts raw chat messages into C++ MTMD chunk structures and a virtual token ledger. - - Features: - - Thread-safe concurrent media decoding to eliminate I/O bottlenecks. - - "Negative Reverse Vocabulary" mapping for O(1) prefix matching of media tokens. - - Strict RAII-style C++ memory management to prevent leaks on failure. - - Returns: - full_prompt_ids: Ledger of text tokens and negative media IDs for prefix matching. - chunk_token_spans: Tuples of (start_idx, end_idx, chunk_ptr, chunk_type, media_id). - chunks: Allocated C++ mtmd_input_chunks pointer (must be freed by the caller). - bitmap_cleanup: List of C++ bitmap pointers to be freed after evaluation. - """ - # 1. 
Inject default system prompt if omitted by the user - system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "") - if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: - messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages - - media_items = self._get_media_items(messages) - media_marker = self.media_marker - - # 2. Render the chat template and replace actual URLs with C++ media markers - text = self.chat_template.render( + if tool is None: + raise ValueError(f"Tool with name '{tool_name}' not found in tools") + prompt = template_renderer.render( messages=messages, - add_generation_prompt=add_generation_prompt, - eos_token=self.mtmd_eos_token, - bos_token=self.mtmd_bos_token, - functions=functions, - function_call=function_call, tools=tools, - tool_choice=tool_choice, - **getattr(self, 'extra_template_arguments', {}) + tool_calls=True, + add_generation_prompt=True, ) - - for tag in self._chat_format_parser_tags: - if tag not in text: - continue - - text = text.replace(tag, media_marker) - - # Replace image_url by media_marker in text - for item in media_items: - text = text.replace(item["url"], media_marker) - - if self.verbose: - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr) - print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr) - - # 3. 
Pre-allocate bitmap array to guarantee chronological order during concurrent decoding - bitmaps = [None] * len(media_items) - bitmap_cleanup = [] - video_cleanup = [] - chunks = None - - try: - # Concurrent Media Decoding - import concurrent.futures - if media_items: - def _create_bitmap_func(idx: int, item: dict): - media_bytes = self.load_media(item["url"], item["type"]) - bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes) - return idx, bitmap, video_ctx - # This method uses multi-threaded parallel processing to convert images or audio to bitmaps, - # which can be used in the future to process large numbers of video frames. - max_workers = min(llama.n_threads, len(media_items)) - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)] - - for future in concurrent.futures.as_completed(futures): - idx, bitmap, video_ctx = future.result() - - bitmaps[idx] = bitmap - bitmap_cleanup.append(bitmap) - - if video_ctx: - video_cleanup.append(video_ctx) - - # Strict validation: Abort if any thread failed to decode its assigned media - if any(b is None for b in bitmaps): - raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.") - else: - if self.verbose: - print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.") - else: - # If there are no images, set the bitmaps to empty. - bitmaps = [] - - # 4. Initialize mtmd_input_chunks - input_text = self._mtmd_cpp.mtmd_input_text() - input_text.text = text.encode('utf-8') - input_text.add_special = (llama.n_tokens == 0) - input_text.parse_special = True - - chunks = self._mtmd_cpp.mtmd_input_chunks_init() - if chunks is None: - raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.") - - # 5. 
Hybrid Tokenization (Text + Media binding) - if len(bitmaps) > 0: - bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps) - ) - else: - result = self._mtmd_cpp.mtmd_tokenize( - self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0 - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.") - - # Video helper contexts only need to stay alive until mtmd_tokenize() completes. - if video_cleanup: - for video_ctx in video_cleanup: - self._mtmd_cpp.mtmd_helper_video_free(video_ctx) - video_cleanup.clear() - - # 6. Virtual Token Ledger Construction - full_prompt_ids = [] - chunk_token_spans = [] - current_idx = 0 - n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) - - # Cursor to track the actual media contents (URLs or base64 data) provided by the user - media_items_count = len(media_items) - media_items_cur = 0 - last_media_id = None - - for i in range(n_chunks): - chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) - if chunk is None: continue - chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - - if self._is_text_chunk(chunk_type): - # Extract standard text token IDs - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out)) - if tokens_ptr and n_tokens_out.value > 0: - tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None)) - full_prompt_ids.extend(tokens) - current_idx += len(tokens) - elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): - # Extract media properties - # Note(JamePeng): - # The M-RoPE model is based on `n_pos` instead of `n_tokens` (of course, there's no difference in non-M-RoPE models). 
- # However, I still keep `n_tokens` because if `n_pos` is used, the underlying system will assume it is a full-match and will skip eval and sample. - # chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - - if media_items_cur < media_items_count: - # The C++ parser only sees identical placeholders (e.g., "<__media__>"). - # We MUST inject the actual media content's identity here. - real_media_url = media_items[media_items_cur]["url"] - # Vocabulary Positive forward: 0 to 248,319 (Qwen3.5) - # Generate a deterministic, unique negative ID for this specific image/audio. - # - zlib.crc32 ensures cross-platform and cross-run consistency (unlike Python's hash()). - # - We map it to a negative space (-100 to -16,777,316) to avoid colliding with - # positive text token IDs (e.g., Qwen3.5 vocab goes up to ~152k). - # This empowers `longest_token_prefix` to correctly identify and reuse cached images, - # while instantly breaking the match if the image content changes. 
- # media_id = - (zlib.crc32(real_media_url.encode('utf-8')) % (2**24)) - 100 - media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100 - last_media_id = media_id - media_items_cur += 1 - elif last_media_id is not None: - # video may expand into multiple image chunks from one media marker - media_id = last_media_id - else: - # Magic Negative Number as fallback :) - media_id = -314159 - - if self.verbose: - print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ") - - chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id)) - - # Pad the ledger with the pseudo-ID to mimic the physical space taken in the KV cache - full_prompt_ids.extend([media_id] * chunk_n_tokens) - current_idx += chunk_n_tokens - else: - raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.") - - return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup - - except Exception as e: - # Ensure no useless pointers remain upon any failure - # Free chunks - if chunks is not None: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Free bitmaps - if len(bitmap_cleanup) > 0: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup = None - # Free videos - if len(video_cleanup) > 0: - for video_ctx in video_cleanup: - self._mtmd_cpp.mtmd_helper_video_free(video_ctx) - video_cleanup = None - - bitmaps = None - - raise e - - def __call__( - self, - *, - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - 
top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - seed: Optional[int] = None, - response_format: Optional[ - llama_types.ChatCompletionRequestResponseFormat - ] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, - add_generation_prompt: bool = True, - reasoning_budget: int = -1, - reasoning_start: str = "", - reasoning_end: str = "", - reasoning_budget_message: Optional[str] = None, - reasoning_start_in_prompt: bool = False, - reasoning_start_max_tokens: Optional[int] = 32, - **kwargs, # type: ignore - ) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: - # 1. Initialize mtmd context - self._init_mtmd_context(llama) - assert self.mtmd_ctx is not None - - # 2. 
Concurrent Preprocessing & Ledger Construction - full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( - llama=llama, - messages=messages, - functions=functions, - function_call=function_call, - tools=tools, - tool_choice=tool_choice, - add_generation_prompt=add_generation_prompt, - ) - - if self.verbose: - print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) - - try: - # 3. KV Cache Synchronization & State Rollback - # Compares the virtual ledger with physical history to prevent Cache Poisoning. - current_history = llama.input_ids[:llama.n_tokens].tolist() - longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) - - if longest_prefix < llama.n_tokens: - if llama.is_hybrid and llama._hybrid_cache_mgr is not None: - if llama._hybrid_cache_mgr.max_checkpoints > 0: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " - f"Searching for nearest checkpoint...", file=sys.stderr) - - best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) - if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): - llama.n_tokens = best_ckpt.pos - if self.verbose: - print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. 
Clearing cache entirely.", file=sys.stderr) - llama._hybrid_cache_mgr.clear() - llama._ctx.memory_clear(True) - llama.n_tokens = 0 - else: - if self.verbose: - print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) - llama._ctx.memory_seq_rm(0, longest_prefix, -1) - llama.n_tokens = longest_prefix - - n_past = llama.n_tokens - - for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: - # Skip previously matched chunks - if end_idx <= n_past: - continue - - if self._is_text_chunk(chunk_type): - unprocessed_start = max(start_idx, n_past) - start_idx - n_tokens_out = ctypes.c_size_t() - tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) - - if tokens_ptr and n_tokens_out.value > 0: - all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - tokens_to_eval = all_tokens[unprocessed_start:] - - if tokens_to_eval: - if self.verbose: - print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - # Text evaluation delegates shift and chunking to native llama.eval - llama.eval(tokens_to_eval) - n_past = llama.n_tokens - - elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) - - if self.verbose: - media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" - print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) - - # Stage 5: Multimodal Physical OOM Defense - if n_past + chunk_n_tokens > llama.n_ctx(): - if not llama._ctx.memory_can_shift(): - raise RuntimeError( - f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " - f"(n_pos_per_embd > 1 or incompatible M-RoPE). 
" - f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " - f"You MUST increase n_ctx to fit the dialogue." - ) - else: - # Safely discard oldest tokens while preserving system prompts - n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch - n_keep = min(llama.n_keep, n_past) - n_discard = min(n_discard, n_past - n_keep) - - if n_discard <= 0: - raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") - - if self.verbose: - print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) - - # Execute physical memory shift - llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) - llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) - - # Shift python virtual array to match - remaining_len = n_past - (n_keep + n_discard) - if remaining_len > 0: - llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] - - n_past -= n_discard - llama.n_tokens = n_past - - # Execute C++ Multimodal Black-box Extraction - new_n_past = llama_cpp_lib.llama_pos(0) - result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( - self.mtmd_ctx, - llama._ctx.ctx, - chunk_ptr, - llama_cpp_lib.llama_pos(n_past), - llama_cpp_lib.llama_seq_id(0), - llama.n_batch, - True, # logits_last = True, drastically saves computational overhead - ctypes.byref(new_n_past) - ) - - if result != 0: - raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.") - - # Update Ledger with "Negative Reverse Vocabulary" IDs - llama.input_ids[n_past : new_n_past.value] = media_id - n_past = new_n_past.value - llama.n_tokens = n_past - - # Extract the final, perfectly synchronized prompt sequence - prompt = llama.input_ids[: llama.n_tokens].tolist() - - # End-of-Turn Checkpoint - # Anchors the state ONLY after the 
entire multi-modal turn is processed - if ( - llama.is_hybrid - and llama._hybrid_cache_mgr is not None - and llama._hybrid_cache_mgr.max_checkpoints > 0 - ): - if self.verbose: - print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) - - llama._hybrid_cache_mgr.save_checkpoint( - current_pos=llama.n_tokens, - tokens=prompt, - seq_id=0 - ) - finally: - # Cleanup chunks - if chunks is not None: - self._mtmd_cpp.mtmd_input_chunks_free(chunks) - chunks = None - # Cleanup bitmaps - if bitmap_cleanup: - for bitmap in bitmap_cleanup: - self._mtmd_cpp.mtmd_bitmap_free(bitmap) - bitmap_cleanup.clear() - bitmap_array = None - - # Handle response format and tools (same as before) - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - tool = None - if ( - tool_choice is not None - and isinstance(tool_choice, dict) - and tools is not None - ): - name = tool_choice["function"]["name"] - tool = next((t for t in tools if t["function"]["name"] == name), None) - if tool is None: - raise ValueError(f"Tool choice '{name}' not found in tools.") - schema = tool["function"]["parameters"] - try: - # create grammar from json schema - grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(schema), verbose=llama.verbose - ) - except Exception as e: - if llama.verbose: - print(str(e), 
file=sys.stderr) - grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose - ) - - completion_or_chunks = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - logprobs=top_logprobs if logprobs else None, - stream=stream, - stop=stop, - seed=seed, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - grammar=grammar, - logit_bias=logit_bias, - reasoning_budget=reasoning_budget, - reasoning_start=reasoning_start, - reasoning_end=reasoning_end, - reasoning_budget_message=reasoning_budget_message, - reasoning_start_in_prompt=reasoning_start_in_prompt, - reasoning_start_max_tokens=reasoning_start_max_tokens, - ) - - if tool is not None: - tool_name = tool["function"]["name"] - return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream - ) - return _convert_completion_to_chat(completion_or_chunks, stream=stream) - - def load_media(self, media_url: str, media_type: str) -> bytes: - """ - Unified dispatcher for loading media payloads. - Routes the URL/URI to the specific image, audio, or video processor based on the media_type. 
- """ - if media_type == "image": - return self._load_image(media_url) - - elif media_type == "audio": - audio_bytes = self._load_bytes(media_url, timeout=15, kind="audio") - try: - self.detect_audio_format(audio_bytes) - except ValueError as e: - raise ValueError(f"{self.log_prefix}(load_media): {e}") - return audio_bytes - - elif media_type == "video": - return self._load_bytes(media_url, timeout=30, kind="video") - - else: - raise ValueError(f"{self.log_prefix}(load_media): Unknown media type '{media_type}'") - - @staticmethod - def detect_audio_format(audio_bytes: bytes) -> str: - """ - Pure utility function: Detects the audio format from magic bytes. - Strictly translated from llama.cpp's `is_audio_file` to ensure 100% compatibility - and avoid false positives (e.g., AVI files disguised as RIFF). - """ - length = len(audio_bytes) - - if length < 12: - raise ValueError("Audio data is corrupted or too small (less than 12 bytes).") - - # RIFF & WAVE magic bytes verification - is_wav = audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE" - - # ID3 metadata or MPEG sync word verification - is_mp3 = length >= 3 and ( - audio_bytes.startswith(b"ID3") or - (audio_bytes[0] == 0xFF and (audio_bytes[1] & 0xE0) == 0xE0) - ) - - # FLAC magic bytes verification - is_flac = audio_bytes.startswith(b"fLaC") - - if is_wav: - return "wav" - elif is_mp3: - return "mp3" - elif is_flac: - return "flac" - else: - raise ValueError( - "Unsupported audio format detected via magic bytes. " - "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC." - ) - - DEFAULT_HTTP_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/148.0.0.0 Safari/537.36" - ), - } - - @staticmethod - def _load_bytes(media_url: str, timeout: int = 15, kind: str = "media") -> bytes: - """ - Load raw bytes from a data URI, local file path, or remote HTTP/HTTPS URL. - """ - media_bytes = b"" - - # 1. 
Handle data URI - if media_url.strip().startswith("data:"): - comma_pos = media_url.find(",") - if comma_pos == -1: - raise ValueError("Invalid data URI: missing comma separator") - - base64_data = media_url[comma_pos + 1:] - media_bytes = base64.b64decode(base64_data) - - # 2. Handle local file path - elif os.path.exists(media_url): - with open(media_url, "rb") as f: - media_bytes = f.read() - - # 3. Handle remote URL via HTTP/HTTPS - else: - req = urllib.request.Request( - media_url, - headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, - ) - try: - with urllib.request.urlopen(req, timeout=timeout) as f: - media_bytes = f.read() - except (URLError, HTTPError) as e: - raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") - - if not media_bytes: - raise ValueError(f"Empty {kind} data received") - - return media_bytes - - @staticmethod - def _load_image(image_url: str) -> bytes: - """ - Load an image from either a URL or a data URI and return it as JPEG bytes. - - Supports: - - Remote images via HTTP/HTTPS (with proper User-Agent) - - Data URIs (base64-encoded, e.g., data:image/png;base64,...) - - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background - - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html - - Returns: - JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. - """ - # 1. Load image bytes from image_url - image_bytes = MTMDChatHandler._load_bytes( - image_url, - timeout=15, - kind="image", - ) - - # 2. Check if image_bytes is empty. - if not image_bytes: - raise ValueError("Empty image data received") - - # 3. Open image with Pillow - try: - from PIL import Image, ImageStat - except ImportError: - raise ImportError("Pillow is required for image processing. Install with: pip install pillow") - - import io - image = Image.open(io.BytesIO(image_bytes)) - - # 4. Handle transparency (RGBA, LA, P with transparency, etc.) 
- if image.mode in ("RGBA", "LA", "PA") or (image.mode == "P" and "transparency" in image.info): - # Use alpha channel as mask - if image.mode == "P": - image = image.convert("RGBA") - - alpha = image.split()[-1] # Last channel is alpha - # Compute average brightness of visible (non-transparent) pixels - stat = ImageStat.Stat(image.convert("L"), mask=alpha) - - # Choose background: white for dark content, black for bright content - bg_color = (255, 255, 255) # white - if stat.count[0] > 0 and stat.mean[0] > 127: - bg_color = (0, 0, 0) # black - - background = Image.new("RGB", image.size, bg_color) - background.paste(image, mask=alpha) - image = background - - # 5. Ensure RGB mode for formats like CMYK, palette, etc. - elif image.mode != "RGB": - image = image.convert("RGB") - - # 6. Save as high-quality JPEG, suitable for most vision models. - output = io.BytesIO() - image.save(output, format="JPEG", quality=95, optimize=True, progressive=True) - return output.getvalue() - - @classmethod - def from_pretrained( - cls, - repo_id: str, - filename: Optional[str], - local_dir: Optional[Union[str, os.PathLike[str]]] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", - cache_dir: Optional[Union[str, os.PathLike[str]]] = None, - **kwargs: Any, - ) -> "MTMDChatHandler": - import fnmatch - from pathlib import Path - - try: - from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore - from huggingface_hub.utils import validate_repo_id # type: ignore - except ImportError: - raise ImportError( - "Llama.from_pretrained requires the huggingface_hub package. " - "You can install it with `pip install --upgrade huggingface_hub`." 
- ) - - validate_repo_id(repo_id) - - hffs = HfFileSystem() - - files = [ - file["name"] if isinstance(file, dict) else file - for file in hffs.ls(repo_id) # type: ignore - ] - - # split each file into repo_id, subfolder, filename - file_list: List[str] = [] - for file in files: - rel_path = Path(file).relative_to(repo_id) - file_list.append(str(rel_path)) - - matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore - - if len(matching_files) == 0: - raise ValueError( - f"No file found in {repo_id} that match {filename}\n\n" - f"Available Files:\n{json.dumps(file_list)}" - ) - - if len(matching_files) > 1: - raise ValueError( - f"Multiple files found in {repo_id} matching {filename}\n\n" - f"Available Files:\n{json.dumps(files)}" - ) - - (matching_file,) = matching_files - - subfolder = str(Path(matching_file).parent) - filename = Path(matching_file).name - - # download the file - hf_hub_download( - repo_id=repo_id, - filename=filename, - subfolder=subfolder, - local_dir=cast(Union[str, Path, None], local_dir), - local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), - ) - - if local_dir is None: - model_path = hf_hub_download( - repo_id=repo_id, - filename=filename, - subfolder=subfolder, - local_dir=local_dir, - local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), - local_files_only=True, - ) - else: - model_path = os.path.join(local_dir, filename) - - return cls( - mmproj_path=model_path, - **kwargs, - ) - -class GenericMTMDChatHandler(MTMDChatHandler): - KNOWN_MEDIA_TAGS = [ - "<|image_pad|>", - "<|audio_pad|>", - "<|video_pad|>", - "<|image|>", - "<|audio|>", - "<|video|>", - "[IMG]" - ] - - def __init__( - self, - chat_format: str, - mmproj_path: str, - verbose: bool = True, - **kwargs - ) -> None: - self.chat_format = chat_format - - if verbose: - print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = 
True) - - if self.chat_format is None: - raise ValueError("Failed to get model chat template automatically.") - - super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs) - - def __call__(self, **kwargs): - self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format] - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Llava15ChatHandler(MTMDChatHandler): - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - - "{% if message.role == 'user' %}" - "{% if message.content is string %}" - "\nUSER: {{ message.content }}" - "{% elif message.content is iterable %}" - "\nUSER: " - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{{ content.image_url if content.image_url is string else content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% endif %}" - - "{% if message.role == 'assistant' and message.content is not none %}" - "\nASSISTANT: {{ message.content }}" - "{% endif %}" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "\nASSISTANT: " - "{% endif %}" - ) - - -class ObsidianChatHandler(MTMDChatHandler): - # Prompt Format - # The model followed ChatML format. However, with ### as the seperator - - # <|im_start|>user - # What is this sign about?\n - # ### - # <|im_start|>assistant - # The sign is about bullying, and it is placed on a black background with a red background. 
- # ### - - CHAT_FORMAT = ( - "{% for message in messages %}" - # System message - "{% if message.role == 'system' %}" - "<|im_start|>system\n" - "{{ message.content }}\n" - "###\n" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "<|im_start|>user\n" - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' and content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.type == 'image_url' and content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "###\n" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - "<|im_start|>assistant\n" - "{{ message.content }}" - "###\n" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class MoondreamChatHandler(MTMDChatHandler): - # Chat Format: - # f"\n\n{chat_history}Question: {question}\n\nAnswer:" - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'user' %}" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}\n\n" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}\n\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "Question: {{ content.text }}\n\n" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "Question: {{ message.content }}\n\n" - "{% endif %}" - "{% endif %}" - # 
Answer: - "{% if message.role == 'assistant' %}" - "Answer:{{ message.content }}\n\n" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "Answer:" - "{% endif %}" - ) - - -class Llava16ChatHandler(MTMDChatHandler): - # Example prompt - # "DEFAULT_SYSTEM_MESSAGE + USER: \nWhat is shown in this image? ASSISTANT:" - - CHAT_FORMAT = ( - "{% for message in messages %}" - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - "{% if message.role == 'user' %}" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}\n" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "{{ message.content }}" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "Answer:" - "{% endif %}" - ) - - -class NanoLlavaChatHandler(MTMDChatHandler): - # Prompt Format - # The model follow the ChatML standard, however, without \n at the end of <|im_end|>: - - # <|im_start|>system - # Answer the question<|im_end|><|im_start|>user - # - # What is the picture about?<|im_end|><|im_start|>assistant - DEFAULT_SYSTEM_MESSAGE = "Answer the question" - - CHAT_FORMAT = ( - "{% for message in messages %}" - # System message - "{% if message.role == 'system' %}" - "<|im_start|>system\n" - "{{ message.content }}" - "<|im_end|>" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "<|im_start|>user\n" - "{% if message.content is string %}" - "{{ 
message.content }}" - "{% endif %}" - "{% if message.content is iterable %}" - "{% for content in message.content %}" - "{% if content.type == 'image_url' and content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.type == 'image_url' and content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endfor %}" - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "<|im_end|>" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - "<|im_start|>assistant\n" - "{{ message.content }}" - "<|im_end|>" - "{% endif %}" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class Llama3VisionAlphaChatHandler(MTMDChatHandler): - # question = "" + q - - # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - - CHAT_FORMAT = ( - "{% for message in messages %}" - "<|start_header_id|>" - "{% if message.role == 'user' %}" - "user<|end_header_id|>\n\n" - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - "assistant<|end_header_id|>\n\n" - "{{ message.content }}" - "{% endif %}" - "<|eot_id|>" - "{% endfor %}" - # Generation prompt - "{% if 
add_generation_prompt %}" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - "{% endif %}" - ) - - -# alias -Llama3VisionAlpha = Llama3VisionAlphaChatHandler - - -class MiniCPMv26ChatHandler(MTMDChatHandler): - - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and messages[0]['role'] != 'system' %}" - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is iterable %}" - "{% for content in message['content'] %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{% set image_count.value = image_count.value + 1 %}" - "{{ image_count.value }}: {{ content.image_url.url }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - - "{% for content in message['content'] %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - "{% if message['content'] is string %}" - "{{ message['content'] }}" - "{% endif %}" - "<|im_end|>\n" - "{% endfor %}" - "{% if add_generation_prompt %}" - "<|im_start|>assistant\n" - "{% endif %}" - ) - - -class MiniCPMv45ChatHandler(MTMDChatHandler): - """ - Handler for MiniCPM-V 4.5 models. - - Supports: - - Multi-step tool calls with and XML tags. - - Integrated reasoning (thinking) process with tags. - - Specialized system prompt handling with tool definitions. - - Global image numbering for multi-image processing. 
- """ - - # Model specific control tokens - MINICPMV_BOS_TOKEN = "<|im_start|>" - MINICPMV_EOS_TOKEN = "<|im_end|>" - MINICPMV_PAD_TOKEN = "<|endoftext|>" - - # Image placeholder tags - MINICPMV_IMAGE_START_TOKEN = "" - MINICPMV_IMAGE_END_TOKEN = "" - MINICPMV_IMAGE_ID_START_TOKEN = "" - MINICPMV_IMAGE_ID_END_TOKEN = "" - - CHAT_FORMAT = ( - # --- 1. First System Message & Tools Definitions --- - "{%- if tools %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}" - "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}" - "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}" - "{{- 'You are provided with function signatures within XML tags:\\n' }}" - "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}" - "{{- '\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\"name\": , \"arguments\": }\\n" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- elif messages[0].role == 'system' %}" - "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - - # --- 2. 
Message Stream Processing --- - "{% set image_count = namespace(value=0) %}" - "{%- for message in messages %}" - # --- Unified Role Handling (User, Assistant, and subsequent Systems) --- - "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}" - "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}" - - "{%- set content = message.content %}" - "{%- if content is not string %}" - "{%- set ns = namespace(content_str='') %}" - "{%- for item in content %}" - # --- Explicit image_url type and value checking --- - "{%- if item.type == 'image_url' %}" - "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}" - "{%- set image_count.value = image_count.value + 1 %}" - # Format: N: IMAGE_URL - "{%- set ns.content_str = ns.content_str + '' + (image_count.value | string) + ': ' + image_url + '' %}" - "{%- elif item.type == 'text' %}" - "{%- set ns.content_str = ns.content_str + item.text %}" - "{%- endif %}" - "{%- endfor %}" - "{%- set content = ns.content_str %}" - "{%- endif %}" - - "{{- content -}}" - - # Append tool_calls to assistant messages if they exist - "{%- if message.role == 'assistant' and message.tool_calls %}" - "{%- for tool_call in message.tool_calls %}" - "{%- set tc = tool_call.function if tool_call.function else tool_call %}" - "{{- '\\n\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}" - "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}" - "{{- '}\\n' }}" - "{%- endfor %}" - "{%- endif %}" - "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" - - # --- Specialized Tool Response Handling --- - # Group consecutive tool responses under a single user-like block - "{%- elif message.role == 'tool' %}" - "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}" - "{{- '" + MINICPMV_BOS_TOKEN + "user' }}" - "{%- endif %}" - "{{- '\\n\\n' + message.content + '\\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}" - "{{- '" + 
MINICPMV_EOS_TOKEN + "\\n' }}" - "{%- endif %}" - "{%- endif %}" - "{%- endfor %}" - - # --- 3. Generation Prompt --- - "{%- if add_generation_prompt %}" - "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}" - # Handle thinking/reasoning block visibility based on configuration - "{%- if enable_thinking is defined and enable_thinking is false %}" - "{{- '\\n\\n\\n\\n' }}" - "{%- elif enable_thinking is defined and enable_thinking is true %}" - "{{- '\\n' }}" - "{%- endif %}" - "{%- endif %}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the MiniCPM-V 4.5 Handler. - - Args: - enable_thinking (bool): If True, model generates reasoning before the final answer. - **kwargs: Additional arguments for the base MTMDChatHandler. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject thinking control flag into the template - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Set stop token patch - kwargs['stop'] = [self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - return super().__call__(**kwargs) - - -class MiniCPMV46ChatHandler(MTMDChatHandler): - """ - Handler for MiniCPM-V-4.6 models. - - Features: - - Aligned with official tokenizer_config.json special tokens. - - Custom `<|image_pad|>` and `<|video_pad|>` multimodal tokens. - - Integrated MTMD-style URL and Base64 injection for visual content. - - Specialized `` and `` block generation. - - Autonomously folds previous reasoning paths using `last_query_index`. - - Toggles `` block generation via `enable_thinking` (Defaults to False). 
- """ - - # Core tokens - MINICPM_BOS_TOKEN = "<|im_start|>" - MINICPM_EOS_TOKEN = "<|im_end|>" - MINICPM_PAD_TOKEN = "<|endoftext|>" - - # Vision tokens - MINICPM_VISION_BOS_TOKEN = "<|vision_start|>" - MINICPM_VISION_EOS_TOKEN = "<|vision_end|>" - MINICPM_IMAGE_TOKEN = "<|image_pad|>" - MINICPM_VIDEO_TOKEN = "<|video_pad|>" - - CHAT_FORMAT = ( - "{%- if enable_thinking is not defined -%}\n" - " {%- set enable_thinking = false -%}\n" - "{%- endif -%}\n" - "{%- macro render_content(content, is_system_content=false) -%}\n" - " {%- if content is string -%}\n" - " {{- content -}}\n" - " {%- elif content is iterable and content is not mapping -%}\n" - " {%- set ns = namespace(parts=[]) -%}\n" - " {%- for item in content -%}\n" - " {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n" - " {%- if is_system_content -%}\n" - " {{- raise_exception('System message cannot contain images.') -}}\n" - " {%- endif -%}\n" - " {%- set url_val = '' -%}\n" - " {%- if item.type == 'image_url' -%}\n" - " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" - " {%- endif -%}\n" - " {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n" - # " {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n" - # " {%- if is_system_content -%}\n" - # " {{- raise_exception('System message cannot contain videos.') -}}\n" - # " {%- endif -%}\n" - # " {%- set url_val = '' -%}\n" - # " {%- if item.type == 'video_url' -%}\n" - # " {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n" - # " {%- endif -%}\n" - # " {%- set ns.parts = ns.parts + ['<|video_pad|>' + url_val] -%}\n" - " {%- elif 'text' in item -%}\n" - " {%- set ns.parts = ns.parts + [item.text] -%}\n" - " {%- else -%}\n" - " {{- raise_exception('Unexpected item type in content.') -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- ns.parts | join('\\n') -}}\n" - " {%- elif content is none or content is 
undefined -%}\n" - " {{- '' -}}\n" - " {%- else -%}\n" - " {{- raise_exception('Unexpected content type.') -}}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "{%- if not messages %}\n" - " {{- raise_exception('No messages provided.') }}\n" - "{%- endif %}\n" - "{%- if tools and tools is iterable and tools is not mapping %}\n" - " {{- '<|im_start|>system\\n' }}\n" - " {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n' }}\n" - " {%- for tool in tools %}\n" - " {{- '\\n' }}\n" - " {{- tool | tojson }}\n" - " {%- endfor %}\n" - " {{- '\\n' }}\n" - " {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n\\n\\n\\nvalue_1\\n\\n\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n\\n\\n\\n\\n\\nReminder:\\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n' }}\n" - " {%- if messages[0].role == 'system' %}\n" - " {%- set content = render_content(messages[0].content, true)|trim %}\n" - " {%- if content %}\n" - " {{- '\\n\\n' + content }}\n" - " {%- endif %}\n" - " {%- endif %}\n" - " {{- '<|im_end|>\\n' }}\n" - "{%- else %}\n" - " {%- if messages[0].role == 'system' %}\n" - " {%- set content = render_content(messages[0].content, true)|trim %}\n" - " {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n" - " {%- endif %}\n" - "{%- endif %}\n" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n" - "{%- for message in messages[::-1] %}\n" - " {%- set index = (messages|length - 1) - loop.index0 %}\n" - " {%- if ns.multi_step_tool and message.role == 'user' %}\n" - " {%- set content = 
render_content(message.content)|trim %}\n" - " {%- if not(content.startswith('') and content.endswith('')) %}\n" - " {%- set ns.multi_step_tool = false %}\n" - " {%- set ns.last_query_index = index %}\n" - " {%- endif %}\n" - " {%- endif %}\n" - "{%- endfor %}\n" - "{%- if ns.multi_step_tool %}\n" - " {{- raise_exception('No user query found in messages.') }}\n" - "{%- endif %}\n" - "{%- for message in messages %}\n" - " {%- set content = render_content(message.content)|trim %}\n" - " {%- if message.role == 'system' %}\n" - " {%- if not loop.first %}\n" - " {{- raise_exception('System message must be at the beginning.') }}\n" - " {%- endif %}\n" - " {%- elif message.role == 'user' %}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n" - " {%- elif message.role == 'assistant' %}\n" - " {%- set reasoning_content = '' %}\n" - " {%- if message.reasoning_content is string %}\n" - " {%- set reasoning_content = message.reasoning_content %}\n" - " {%- else %}\n" - " {%- if '' in content %}\n" - " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n" - " {%- set content = content.split('')[-1].lstrip('\\n') %}\n" - " {%- endif %}\n" - " {%- endif %}\n" - " {%- set reasoning_content = reasoning_content|trim %}\n" - " {%- if loop.index0 > ns.last_query_index %}\n" - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n\\n' + content }}\n" - " {%- else %}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content }}\n" - " {%- endif %}\n" - " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n" - " {%- for tool_call in message.tool_calls %}\n" - " {%- if tool_call.function is defined %}\n" - " {%- set tool_call = tool_call.function %}\n" - " {%- endif %}\n" - " {%- if loop.first %}\n" - " {%- if content|trim %}\n" - " {{- '\\n\\n\\n\\n' }}\n" - " {%- else %}\n" - " {{- '\\n\\n' }}\n" - " {%- endif %}\n" - " {%- else %}\n" - " 
{{- '\\n\\n\\n' }}\n" - " {%- endif %}\n" - " {%- if tool_call.arguments is defined %}\n" - " {%- for args_name, args_value in tool_call.arguments|items %}\n" - " {{- '\\n' }}\n" - " {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n" - " {{- args_value }}\n" - " {{- '\\n\\n' }}\n" - " {%- endfor %}\n" - " {%- endif %}\n" - " {{- '\\n' }}\n" - " {%- endfor %}\n" - " {%- endif %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- elif message.role == 'tool' %}\n" - " {%- if loop.previtem and loop.previtem.role != 'tool' %}\n" - " {{- '<|im_start|>user' }}\n" - " {%- endif %}\n" - " {{- '\\n\\n' }}\n" - " {{- content }}\n" - " {{- '\\n' }}\n" - " {%- if not loop.last and loop.nextitem.role != 'tool' %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- elif loop.last %}\n" - " {{- '<|im_end|>\\n' }}\n" - " {%- endif %}\n" - " {%- else %}\n" - " {{- raise_exception('Unexpected message role.') }}\n" - " {%- endif %}\n" - "{%- endfor %}\n" - "{%- if add_generation_prompt %}\n" - " {{- '<|im_start|>assistant\\n' }}\n" - " {%- if enable_thinking is defined and enable_thinking is false %}\n" - " {{- '\\n\\n\\n\\n' }}\n" - " {%- else %}\n" - " {{- '\\n' }}\n" - " {%- endif %}\n" - "{%- endif %}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the MiniCPM-V-4.6 Handler. - - Args: - enable_thinking (bool): Controls whether to open a `` block for reasoning. - Defaults to False as per the standard template logic. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject the thinking variable into the Jinja environment - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # MiniCPM uses standard <|im_end|> ChatML stop formatting - kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class Gemma3ChatHandler(MTMDChatHandler): - - GEMMA3_BOI_TOKEN = "" - GEMMA3_EOI_TOKEN = "" - GEMMA3_BOS_TOKEN = "" - GEMMA3_EOS_TOKEN = "" - - CHAT_FORMAT = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" - "{% if messages[0]['content'] is string %}" - "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}" - "{% else %}" - "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}" - "{% endif %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set first_user_prefix = '' %}" - "{% endif %}" - - "{% for message in loop_messages %}" - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" - "{% endif %}" - - "{% if message['role'] == 'assistant' %}" - "{% set role = 'model' %}" - "{% else %}" - "{% set role = message['role'] %}" - "{% endif %}" - - "{{ '' + role + '\n' + (first_user_prefix if loop.first else '') }}" - - "{% if message['content'] is string %}" - "{{ message['content'] | trim }}" - "{% elif message['content'] is iterable %}" - "{% for item in message['content'] %}" - "{% if item['type'] == 'image_url' and item['image_url'] is string %}" - "{{ '' + item['image_url'] + '' }}" - "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}" - "{{ '' + item['image_url']['url'] + '' }}" - "{% elif item['type'] == 'text' %}" - "{{ item['text'] | 
trim }}" - "{% endif %}" - "{% endfor %}" - "{% else %}" - "{{ raise_exception('Invalid content type') }}" - "{% endif %}" - - "\n" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "model\n" - "{% endif %}" - ) - - -class Gemma4ChatHandler(MTMDChatHandler): - """ - Handler for Gemma 4 models. - - Note on `enable_thinking`: - The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. - It is NOT supported by Gemma4 E2B and E4B models. - - [Important Note for Audio Processing!] - It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models. - Other quantizations are known to have degraded performance; - ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463 - """ - - # The special token in Gemma 4 - GEMMA4_BOI_TOKEN = "<|image>" - GEMMA4_EOI_TOKEN = "" - GEMMA4_BOA_TOKEN = "<|audio>" - GEMMA4_EOA_TOKEN = "" - GEMMA4_BOS_TOKEN = "" - GEMMA4_EOS_TOKEN = "" - GEMMA4_SOT_TOKEN = "<|turn>" - GEMMA4_EOT_TOKEN = "" - GEMMA4_SOC_TOKEN = "<|channel>" - GEMMA4_EOC_TOKEN = "" - GEMMA4_STC_TOKEN = "<|tool_call>" - GEMMA4_ETC_TOKEN = "" - GEMMA4_STD_TOKEN = "<|tool>" - GEMMA4_ETD_TOKEN = "" - GEMMA4_STR_TOKEN = "<|tool_response>" - GEMMA4_ETR_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n" - " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in properties | dictsort -%}\n" - " {%- set add_comma = false -%}\n" - " {%- if not filter_keys or key not in standard_keys -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {{ key }}:{\n" - " {%- if value['description'] -%}\n" - " description:<|\"|>{{ value['description'] }}<|\"|>\n" - " {%- set add_comma = true -%}\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'STRING' -%}\n" - " {%- if value['enum'] -%}\n" - " {%- if add_comma 
%},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " enum:{{ format_argument(value['enum']) }}\n" - " {%- endif -%}\n" - " {%- elif value['type'] | upper == 'ARRAY' -%}\n" - " {%- if value['items'] is mapping and value['items'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " items:{\n" - " {%- set ns_items = namespace(found_first=false) -%}\n" - " {%- for item_key, item_value in value['items'] | dictsort -%}\n" - " {%- if item_value is not none -%}\n" - " {%- if ns_items.found_first %},{% endif -%}\n" - " {%- set ns_items.found_first = true -%}\n" - " {%- if item_key == 'properties' -%}\n" - " properties:{\n" - " {%- if item_value is mapping -%}\n" - " {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n" - " {%- endif -%}\n" - " }\n" - " {%- elif item_key == 'required' -%}\n" - " required:[\n" - " {%- for req_item in item_value -%}\n" - " <|\"|>{{- req_item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- elif item_key == 'type' -%}\n" - " {%- if item_value is string -%}\n" - " type:{{ format_argument(item_value | upper) }}\n" - " {%- else -%}\n" - " type:{{ format_argument(item_value | map('upper') | list) }}\n" - " {%- endif -%}\n" - " {%- else -%}\n" - " {{ item_key }}:{{ format_argument(item_value) }}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " }\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if value['nullable'] %}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " nullable:true\n" - " {%- endif -%}\n" - " {%- if value['type'] | upper == 'OBJECT' -%}\n" - " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" - " }\n" - " {%- elif value is mapping 
-%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " properties:{\n" - " {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n" - " }\n" - " {%- endif -%}\n" - " {%- if value['required'] -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " required:[\n" - " {%- for item in value['required'] | default([]) -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " ]\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" - " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - "{%- endmacro -%}\n" - "{%- macro format_function_declaration(tool_data) -%}\n" - " declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n" - " {%- set params = tool_data['function']['parameters'] -%}\n" - " {%- if params -%}\n" - " ,parameters:{\n" - " {%- if params.get('properties') -%}\n" - " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" - " {%- endif -%}\n" - " {%- if params.get('required') -%}\n" - " required:[\n" - " {%- for item in params['required'] -%}\n" - " <|\"|>{{- item -}}<|\"|>\n" - " {{- ',' if not loop.last -}}\n" - " {%- endfor -%}\n" - " ],\n" - " {%- endif -%}\n" - " {%- if params.get('type') -%}\n" - " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if 'response' in tool_data['function'] -%}\n" - " {%- set response_declaration = tool_data['function']['response'] -%}\n" - " ,response:{\n" - " {%- if response_declaration['description'] -%}\n" - " description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n" - " {%- endif -%}\n" - " {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n" - " type:<|\"|>{{- 
response_declaration['type'] | upper -}}<|\"|>}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " }\n" - "{%- endmacro -%}\n" - "{%- macro format_argument(argument, escape_keys=True) -%}\n" - " {%- if argument is string -%}\n" - " {{- '<|\"|>' + argument + '<|\"|>' -}}\n" - " {%- elif argument is boolean -%}\n" - " {{- 'true' if argument else 'false' -}}\n" - " {%- elif argument is mapping -%}\n" - " {{- '{' -}}\n" - " {%- set ns = namespace(found_first=false) -%}\n" - " {%- for key, value in argument | dictsort -%}\n" - " {%- if ns.found_first %},{% endif -%}\n" - " {%- set ns.found_first = true -%}\n" - " {%- if escape_keys -%}\n" - " {{- '<|\"|>' + key + '<|\"|>' -}}\n" - " {%- else -%}\n" - " {{- key -}}\n" - " {%- endif -%}\n" - " :{{- format_argument(value, escape_keys=escape_keys) -}}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- elif argument is sequence -%}\n" - " {{- '[' -}}\n" - " {%- for item in argument -%}\n" - " {{- format_argument(item, escape_keys=escape_keys) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- ']' -}}\n" - " {%- else -%}\n" - " {{- argument -}}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "{%- macro strip_thinking(text) -%}\n" - " {%- set ns = namespace(result='') -%}\n" - " {%- for part in text.split('') -%}\n" - " {%- if '<|channel>' in part -%}\n" - " {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n" - " {%- else -%}\n" - " {%- set ns.result = ns.result + part -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- ns.result | trim -}}\n" - "{%- endmacro -%}\n" - "\n" - "{%- macro format_tool_response_block(tool_name, response) -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- if response is mapping -%}\n" - " {{- 'response:' + tool_name + '{' -}}\n" - " {%- for key, value in response | dictsort -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- if not loop.last %},{% endif -%}\n" - " {%- endfor -%}\n" - " {{- '}' -}}\n" - " {%- else -%}\n" 
- " {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n" - " {%- endif -%}\n" - " {{- '' -}}\n" - "{%- endmacro -%}\n" - "\n" - "{%- set ns = namespace(prev_message_type=None) -%}\n" - "{%- set loop_messages = messages -%}\n" - "{{- bos_token -}}\n" - "{#- Handle System/Tool Definitions Block -#}\n" - "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" - " {{- '<|turn>system\\n' -}}\n" - " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" - " {%- if enable_thinking is defined and enable_thinking -%}\n" - " {{- '<|think|>\\n' -}}\n" - " {%- set ns.prev_message_type = 'think' -%}\n" - " {%- endif -%}\n" - " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" - " {%- if messages[0]['content'] is string -%}\n" - " {{- messages[0]['content'] | trim -}}\n" - " {%- elif messages[0]['content'] is sequence -%}\n" - " {%- for item in messages[0]['content'] -%}\n" - " {{- item['text'] | trim + ' '-}}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- set loop_messages = messages[1:] -%}\n" - " {%- endif -%}\n" - " {%- if tools -%}\n" - " {%- for tool in tools %}\n" - " {{- '<|tool>' -}}\n" - " {{- format_function_declaration(tool) | trim -}}\n" - " {{- '' -}}\n" - " {%- endfor %}\n" - " {%- set ns.prev_message_type = 'tool' -%}\n" - " {%- endif -%}\n" - " {{- '\\n' -}}\n" - "{%- endif %}\n" - "\n" - "{#- Pre-scan: find last user message index for reasoning guard -#}\n" - "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" - "{%- for i in range(loop_messages | length) -%}\n" - " {%- if loop_messages[i]['role'] == 'user' -%}\n" - " {%- set ns_turn.last_user_idx = i -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{#- Loop through messages -#}\n" - "{%- for message in loop_messages -%}\n" - " {%- if message['role'] != 'tool' -%}\n" - " {%- set ns.prev_message_type = None -%}\n" - " {%- set role = 'model' if 
message['role'] == 'assistant' else message['role'] -%}\n" - " {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n" - " {%- set prev_nt = namespace(role=None, found=false) -%}\n" - " {%- if loop.index0 > 0 -%}\n" - " {%- for j in range(loop.index0 - 1, -1, -1) -%}\n" - " {%- if not prev_nt.found -%}\n" - " {%- if loop_messages[j]['role'] != 'tool' -%}\n" - " {%- set prev_nt.role = loop_messages[j]['role'] -%}\n" - " {%- set prev_nt.found = true -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n" - " {%- if not continue_same_model_turn -%}\n" - " {{- '<|turn>' + role + '\\n' }}\n" - " {%- endif -%}\n" - "\n" - " {#- Render reasoning/reasoning_content as thinking channel -#}\n" - " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" - " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" - " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" - " {%- endif -%}\n" - "\n" - " {%- if message.get('tool_calls') -%}\n" - " {%- for tool_call in message['tool_calls'] -%}\n" - " {%- set function = tool_call['function'] -%}\n" - " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" - " {%- if function['arguments'] is mapping -%}\n" - " {%- set ns_args = namespace(found_first=false) -%}\n" - " {%- for key, value in function['arguments'] | dictsort -%}\n" - " {%- if ns_args.found_first %},{% endif -%}\n" - " {%- set ns_args.found_first = true -%}\n" - " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" - " {%- endfor -%}\n" - " {%- elif function['arguments'] is string -%}\n" - " {{- function['arguments'] -}}\n" - " {%- endif -%}\n" - " {{- '}' -}}\n" - " {%- endfor -%}\n" - " {%- set ns.prev_message_type = 'tool_call' -%}\n" - " {%- endif -%}\n" - "\n" - " {%- set ns_tr_out = 
namespace(flag=false) -%}\n" - " {%- if message.get('tool_responses') -%}\n" - " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" - " {%- for tool_response in message['tool_responses'] -%}\n" - " {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endfor -%}\n" - " {%- elif message.get('tool_calls') -%}\n" - " {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n" - " {%- set ns_tool_scan = namespace(stopped=false) -%}\n" - " {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n" - " {%- if ns_tool_scan.stopped -%}\n" - " {%- elif loop_messages[k]['role'] != 'tool' -%}\n" - " {%- set ns_tool_scan.stopped = true -%}\n" - " {%- else -%}\n" - " {%- set follow = loop_messages[k] -%}\n" - " {#- Resolve tool_call_id to function name -#}\n" - " {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n" - " {%- for tc in message['tool_calls'] -%}\n" - " {%- if tc.get('id') == follow.get('tool_call_id') -%}\n" - " {%- set ns_tname.name = tc['function']['name'] -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {#- Handle content as string or content-parts array -#}\n" - " {%- set tool_body = follow.get('content') -%}\n" - " {%- if tool_body is string -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- elif tool_body is sequence and tool_body is not string -%}\n" - " {%- set ns_txt = namespace(s='') -%}\n" - " {%- for part in tool_body -%}\n" - " {%- if part.get('type') == 'text' -%}\n" - " {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" - " {%- for part in tool_body -%}\n" - " {%- if part.get('type') == 'image_url' -%}\n" - " {%- set url_val = 
part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n" - " {{- '<|image|>' + url_val -}}\n" - " {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n" - " {%- if part.get('type') == 'audio_url' -%}\n" - " {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- elif part.get('type') == 'input_audio' -%}\n" - " {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- endif -%}\n" - # " {%- elif part.get('type') == 'video_url' -%}\n" - # " {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n" - # " {{- '<|video|>' + video_val -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- else -%}\n" - " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" - " {%- endif -%}\n" - " {%- set ns_tr_out.flag = true -%}\n" - " {%- set ns.prev_message_type = 'tool_response' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "\n" - " {%- set captured_content -%}\n" - " {%- if message['content'] is string -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(message['content']) -}}\n" - " {%- else -%}\n" - " {{- message['content'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif message['content'] is sequence -%}\n" - " {%- for item in message['content'] -%}\n" - " {%- if item['type'] == 'text' -%}\n" - " {%- if role == 'model' -%}\n" - " {{- strip_thinking(item['text']) -}}\n" - " {%- else -%}\n" - " {{- item['text'] | trim -}}\n" - " {%- endif -%}\n" - " {%- elif item['type'] == 'image_url' -%}\n" - " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {{- '<|image|>' + url_val -}}\n" - " {%- set ns.prev_message_type = 'image' -%}\n" - 
" {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n" - " {%- if item['type'] == 'audio_url' -%}\n" - " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- elif item['type'] == 'input_audio' -%}\n" - " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" - " {{- '<|audio|>' + audio_val -}}\n" - " {%- endif -%}\n" - " {%- set ns.prev_message_type = 'audio' -%}\n" - " {%- endif -%}\n" - # " {%- elif item['type'] == 'video_url' -%}\n" - # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" - # " {{- '<|video|>' + video_val -}}\n" - # " {%- set ns.prev_message_type = 'video' -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- endset -%}\n" - "\n" - " {{- captured_content -}}\n" - " {%- set has_content = captured_content | trim | length > 0 -%}\n" - "\n" - " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" - " {{- '<|tool_response>' -}}\n" - " {%- elif not (ns_tr_out.flag and not has_content) -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- if add_generation_prompt -%}\n" - " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" - " {{- '<|turn>model\\n' -}}\n" - " {%- if not enable_thinking | default(false) -%}\n" - " {{- '<|channel>thought\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Gemma 4 Handler. - - Args: - enable_thinking (bool): Controls whether the <|think|> tag is injected and - manages <|channel>thought behavior. - Note: ONLY supported on Gemma4 31B and 26BA4B models. - NOT supported on Gemma4 E2B and E4B models. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject the thinking variable into the Jinja environment - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Set the stop token based on Gemma 4's format () - # generation_config.json: "eos_token_id": [1, 106, 50] - kwargs['stop'] = [self.GEMMA4_EOS_TOKEN, self.GEMMA4_EOT_TOKEN, self.GEMMA4_STR_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GLM41VChatHandler(MTMDChatHandler): - # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32. - - GLM41V_EOS_TOKEN = "<|endoftext|>" - GLM41V_PAD_TOKEN = "<|endoftext|>" - GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>" - - CHAT_FORMAT = ( - "[gMASK]\n" - "{%- for msg in messages -%}" - "{%- if msg.role == 'system' -%}" - "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'user' -%}" - "<|user|>\n" - "{%- if msg.content is string -%}" - "{{ msg.content }}" - "{%- else -%}" - "{%- for item in msg.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}{{ GLM41V_EOS_TOKEN }}" - "{%- elif msg.role == 'assistant' -%}" - "{%- if msg.metadata -%}" - "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- else -%}" - "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{%- endif -%}" - ) - - def __call__(self, **kwargs): - 
self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN - # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json - stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch - kwargs['stop'] = stop_tokens - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class GLM46VChatHandler(MTMDChatHandler): - GLM46V_EOS_TOKEN = "<|endoftext|>" - GLM46V_PAD_TOKEN = "<|endoftext|>" - GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" - GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" - - CHAT_FORMAT = ( - "[gMASK]" - "{%- if tools -%}" - "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" - "You are provided with function signatures within XML tags:\n\n" - "{%- for tool in tools -%}" - "{{ tool | tojson(ensure_ascii=False) }}\n" - "{%- endfor -%}" - "\n\nFor each function call, output the function name and arguments within the following XML format:\n" - "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" - "{%- endif -%}" - - "{%- for m in messages -%}" - "{%- if m.role == 'system' -%}" - "<|system|>\n{{ m.content }}" - "{%- elif m.role == 'user' -%}" - "<|user|>\n" - "{%- if m.content is string -%}" - "{{ m.content }}" - "{%- else -%}" - "{%- for item in m.content -%}" - "{%- if item.type == 'image_url' or 'image_url' in item -%}" - "<|begin_of_image|>" - "{%- if item.image_url is string -%}" - "{{- item.image_url -}}" - "{%- else -%}" - "{{- item.image_url.url -}}" - "{%- endif -%}" - "<|end_of_image|>" - "{%- elif item.type == 'text' -%}" - "{{ item.text }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - # If enable_thinking is disabled, insert `/nothink` according to the source code logic. 
- "{{ '/nothink' if not enable_thinking else '' }}" - "{%- elif m.role == 'assistant' -%}" - "<|assistant|>" - "{%- if enable_thinking -%}" - "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}" - "\n{{ reasoning.strip() }}" - "{%- else -%}" - "\n" - "{%- endif -%}" - "{{ '\n' + m.content.strip() if m.content.strip() else '' }}" - "{%- endif -%}" - "{{ GLM46V_EOS_TOKEN }}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "<|assistant|>\n" - "{{ '' if enable_thinking else '\n' }}" - "{%- endif -%}" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - GLM-4.6V Handler - Parameters: - - enable_thinking (bool): Whether to enable the model's think process. The default is True. - """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN - - # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json - kwargs['stop'] = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"] # Stop token patch - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -class GraniteDoclingChatHandler(MTMDChatHandler): - """ - Handler for Granite-Docling models. - - Format(512x512): Content - - Note(JamePeng): The GGUF files for Model and MMPROJ should be BF16 version !!! - Since the model does not have special tokens for the start and end of an image, - it is recommended to process only one image at a time. - You can iterate through the images individually for recognition. 
- - """ - GRANITE_BOS_TOKEN = "<|start_of_role|>" - GRANITE_EOS_TOKEN = "<|end_of_text|>" - GRANITE_PAD_TOKEN = "<|end_of_text|>" - GRANITE_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for part in message['content'] -%}" - "{%- if part['type'] == 'text' -%}" - "{{- part['text'] -}}" - "{%- elif part['type'] == 'image_url' -%}" - "{%- if part.image_url is string -%}" - "{{- part.image_url -}}" - "{%- else -%}" - "{{- part.image_url.url -}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '<|end_of_text|>\n' -}}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|start_of_role|>assistant' -}}" - # Support the 'controls' parameter if present in generation arguments - "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}" - "{{- '<|end_of_role|>' -}}" - "{%- endif -%}" - ) - - def __init__(self, controls: dict = None, **kwargs): - """ - Granite-Docling Handler - Args: - controls (dict, optional): Operational parameters passed to the assistant role. - - The 'controls' parameter is used to guide the model's behavior or output format. 
- Common examples for 'controls' include: - - Document Parsing: {"mode": "document_parsing", "format": "json"} - """ - self.controls = controls - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Inject controls into the template environment - self.extra_template_arguments["controls"] = self.controls - self.DEFAULT_SYSTEM_MESSAGE = None - kwargs['stop'] = [self.GRANITE_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - - return super().__call__(**kwargs) - - -class LFM2VLChatHandler(MTMDChatHandler): - LFM2VL_BOS_TOKEN = "<|startoftext|>" - LFM2VL_EOS_TOKEN = "<|im_end|>" - LFM2VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM2VL_IMAGE_END_TOKEN = "<|image_end|>" - - CHAT_FORMAT = ( - "{%- for message in messages -%}" - "{{ '<|im_start|>' + message['role'] + '\n' }}" - "{%- if message['content'] is string -%}" - "{{ message['content'] }}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if 'image_url' in content -%}" - "{%- if content.image_url is string -%}" - "<|image_start|>{{ content.image_url }}<|image_end|>" - "{%- else -%}" - "<|image_start|>{{ content.image_url.url }}<|image_end|>" - "{%- endif -%}" - "{%- elif content['type'] == 'text' -%}" - "{{ content['text'] }}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{ '<|im_end|>\n' }}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{ '<|im_start|>assistant\n' }}" - "{%- endif -%}" - ) - - def __init__(self, image_min_tokens: int = -1, image_max_tokens: int = -1, **kwargs): - """ - LFM2-VL Handler - LiquidAI officially recommends configuring LFM2-VL with the following Vision parameters: min_image_tokens=64, max_image_tokens=256 - """ - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__(image_min_tokens=self.image_min_tokens, image_max_tokens=self.image_max_tokens, **kwargs) - - 
def __call__(self, **kwargs): - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class LFM25VLChatHandler(MTMDChatHandler): - """ - Handler for LFM2.5-VL multimodal models. - - Note(JamePeng): The suggestion is to compress the input image to 512x512 pixels to achieve native resolution processing. - """ - # Aligned with LFM2.5-VL tokenizer_config - LFM25VL_BOS_TOKEN = "<|startoftext|>" - LFM25VL_EOS_TOKEN = "<|im_end|>" - LFM25VL_PAD_TOKEN = "<|pad|>" - - # Image specific tokens - LFM25VL_IMAGE_TOKEN = "" - LFM25VL_IMAGE_START_TOKEN = "<|image_start|>" - LFM25VL_IMAGE_END_TOKEN = "<|image_end|>" - LFM25VL_IMAGE_THUMBNAIL = "<|img_thumbnail|>" - - CHAT_FORMAT = ( - "{{- bos_token -}}\n" - "{%- set keep_past_thinking = keep_past_thinking | default(false) -%}\n" - "{%- set ns = namespace(system_prompt='', content='') -%}\n" - "{%- if messages[0]['role'] == 'system' -%}\n" - " {%- set ns.system_prompt = messages[0]['content'] -%}\n" - " {%- set messages = messages[1:] -%}\n" - "{%- endif -%}\n" - "{%- if tools -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ('\\n' if ns.system_prompt else '') + 'List of tools: [' -%}\n" - " {%- for tool in tools -%}\n" - " {%- if tool is not string -%}\n" - " {%- set tool = tool | tojson -%}\n" - " {%- endif -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + tool -%}\n" - " {%- if not loop.last -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ', ' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set ns.system_prompt = ns.system_prompt + ']' -%}\n" - "{%- endif -%}\n" - "{%- if ns.system_prompt -%}\n" - " {{- '<|im_start|>system\\n' + ns.system_prompt + '<|im_end|>\\n' -}}\n" - "{%- endif -%}\n" - "{%- set ns.last_assistant_index = -1 -%}\n" - "{%- for message in messages -%}\n" - " {%- if message['role'] == 'assistant' -%}\n" - " {%- set 
ns.last_assistant_index = loop.index0 -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- for message in messages -%}\n" - " {{- '<|im_start|>' + message['role'] + '\\n' -}}\n" - " {%- set content = message['content'] -%}\n" - " {%- if content is not string -%}\n" - " {%- set ns.content = '' -%}\n" - " {#- MTMD-style Multimodal Injection (Audio stripped for VL model) -#}\n" - " {%- for item in content -%}\n" - " {%- if item['type'] == 'image_url' -%}\n" - " {%- set img_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" - " {%- set ns.content = ns.content + img_val -%}\n" - " {%- elif item['type'] == 'text' -%}\n" - " {%- set ns.content = ns.content + item['text'] -%}\n" - " {%- else -%}\n" - " {%- set ns.content = ns.content + (item | tojson) -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- set content = ns.content -%}\n" - " {%- endif -%}\n" - " {%- if message['role'] == 'assistant' and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}\n" - " {%- if '' in content -%}\n" - " {%- set content = content.split('')[-1] | trim -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {{- content + '<|im_end|>\\n' -}}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, keep_past_thinking: bool = False, **kwargs): - self.keep_past_thinking = keep_past_thinking - super().__init__(**kwargs) - - - def __call__(self, **kwargs): - if self.image_min_tokens > 256: - if self.verbose: - print(f"{self.log_prefix}: For LFM2.5-VL, using values higher than 256 for `image_min_tokens` could cause errors. 
Please reset it to between 64 and 256.") - self.image_min_tokens = -1 - - self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking - - kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") - return super().__call__(**kwargs) - - -class PaddleOCRChatHandler(MTMDChatHandler): - """ - Handler for PaddleOCR 1.5/1.6 multimodal models. - """ - - PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" - PADDLEOCR_BOS_TOKEN = "" - PADDLEOCR_EOS_TOKEN = "" - PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>" - PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>" - PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>" - - CHAT_FORMAT = ( - "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}" - "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}" - "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}" - - "{{- cls_token -}}" - "{%- for message in messages -%}" - "{%- if message['role'] == 'user' -%}" - "{{- 'User: ' -}}" - - # Robust parsing: Check if content is string or list - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - # Pass 1: Render all images first - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'image_url' and 'image_url' in content -%}" - "{{- '<|IMAGE_START|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|IMAGE_END|>' -}}" - "{%- endif -%}" - "{%- endfor -%}" - - # Pass 2: Render all text second - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- '\\n' -}}" - - "{%- elif message['role'] == 'assistant' -%}" - "{{- 
'Assistant:\\n' -}}" - "{%- if message['content'] is string -%}" - "{{- message['content'] -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{{- eos_token -}}" - - "{%- elif message['role'] == 'system' -%}" - "{%- if message['content'] is string -%}" - "{{- message['content'] + '\\n' -}}" - "{%- else -%}" - "{%- for content in message['content'] -%}" - "{%- if content['type'] == 'text' -%}" - "{{- content['text'] + '\\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- endif -%}" - "{%- endfor -%}" - - "{%- if add_generation_prompt -%}" - "{{- 'Assistant:\\n' -}}" - "{%- endif -%}" - ) - - def __init__( - self, - image_min_tokens: int = -1, - image_max_tokens: int = -1, - **kwargs - ): - self.image_min_tokens = image_min_tokens - self.image_max_tokens = image_max_tokens - super().__init__( - image_min_tokens=self.image_min_tokens, - image_max_tokens=self.image_max_tokens, - **kwargs - ) - - def __call__(self, **kwargs): - # Set the specific stop token defined in the PaddleOCR template - kwargs['stop'] = [self.PADDLEOCR_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - return super().__call__(**kwargs) - - -class Qwen25VLChatHandler(MTMDChatHandler): - - QWEN25_VL_BOS_TOKEN = "<|endoftext|>" - QWEN25_VL_PAD_TOKEN = "<|endoftext|>" - QWEN25_VL_EOS_TOKEN = "<|im_end|>" - - CHAT_FORMAT = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and message['role'] != 'system' %}" - "<|im_start|>system\n" - "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is string %}" - "{{ message['content'] }}<|im_end|>\n" - "{% else %}" - "{% for content in 
message['content'] %}" - "{% if content['type'] == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>" - "{% else %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>" - "{% endif %}" - "{% elif content['type'] == 'text' %}" - "{{ content['text'] }}" - "{% endif %}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endfor %}" - "<|im_start|>assistant\n" - ) - - def __call__(self, **kwargs): - kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Qwen3ASRChatHandler(MTMDChatHandler): - """ - Handler for Qwen 3 ASR (Automatic Speech Recognition) models. - - Features: - - Highly specialized for Speech-to-Text tasks. - - Aggregates all system text into a single cohesive system block. - - Drops user text entirely, extracting ONLY audio data into a unified user turn. - - Wraps audio with <|audio_start|><|audio_pad|>[DATA]<|audio_end|>. - - Integrated MTMD-style URL and Base64 injection for input_audio and audio_url. - """ - - DEFAULT_SYSTEM_MESSAGE = """ - You are an advanced multilingual Speech-to-Text model. Accurately transcribe the audio into text in its original spoken language. - You should ignore background noise, filler words, and stutters where possible, and format the final output with correct grammar and capitalization. 
- """ - - QWEN3_ASR_BOS_TOKEN = "<|im_start|>" - QWEN3_ASR_PAD_TOKEN = "<|endoftext|>" - QWEN3_ASR_EOS_TOKEN = "<|im_end|>" - - - QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>" - QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>" - QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>" - - CHAT_FORMAT = ( - "{%- set ns = namespace(system_text='') -%}\n" - "{%- for m in messages -%}\n" - " {%- if m.role == 'system' -%}\n" - " {%- if m.content is string -%}\n" - " {%- set ns.system_text = ns.system_text + m.content -%}\n" - " {%- else -%}\n" - " {%- for c in m.content -%}\n" - " {%- if c.type == 'text' and (c.text is defined) -%}\n" - " {%- set ns.system_text = ns.system_text + c.text -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- set ns2 = namespace(audio_tokens='') -%}\n" - "{%- for m in messages -%}\n" - " {%- if m.content is not string -%}\n" - " {%- for c in m.content -%}\n" - " {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n" - " {#- MTMD Audio Injection -#}\n" - " {%- set audio_val = '' -%}\n" - " {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n" - " {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n" - " {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n" - " {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n" - " {%- endif -%}\n" - " {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n" - "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n' -}}\n" - "{%- endif 
-%}\n" - ) - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the stop token - kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)") - - return super().__call__(**kwargs) - -class Qwen3VLChatHandler(MTMDChatHandler): - - QWEN3_VL_BOS_TOKEN = "<|endoftext|>" - QWEN3_VL_PAD_TOKEN = "<|endoftext|>" - QWEN3_VL_EOS_TOKEN = "<|im_end|>" - - CHAT_FORMAT = ( - "{{- '<|im_start|>system\n' -}}" - "{%- if messages[0].content is string and messages[0].role == 'system' -%}" - "{{- messages[0].content -}}" - "{%- elif messages[0].role == 'system' -%}" - "{%- if 'text' in messages[0].content -%}" - "{{- messages[0].content.text -}}" - "{%- else -%}" - "{{- 'You are a helpful assistant.' 
-}}" - "{%- endif -%}" - "{%- endif -%}" - "{%- if tools -%}" - "{{- '\n\n' -}}" - "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' -}}" - "{%- for tool in tools -%}" - "{{- '\n' -}}" - "{{- tool | tojson -}}" - "{%- endfor -%}" - "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" - "{%- endif -%}" - "{{- '<|im_end|>\n' -}}" - "{%- set image_count = namespace(value=0) -%}" - #"{%- set video_count = namespace(value=0) -%}" - "{%- for message in messages -%}" - "{%- if message.role == 'tool' -%}" - "{{- '<|im_start|>user\n\n' -}}" - "{%- elif message.role != 'system' -%}" - "{{- '<|im_start|>' + message.role + '\n' -}}" - "{%- endif -%}" - "{%- if message.content is string and message.role != 'system' -%}" - "{{- message.content -}}" - "{%- elif message.role != 'system' -%}" - "{%- for content in message.content -%}" - "{%- if 'image_url' in content -%}" - "{%- set image_count.value = image_count.value + 1 -%}" - "{%- if add_vision_id -%}" - "{{- 'Picture ' -}}" - "{{- image_count.value | string -}}" - "{{- ': ' -}}" - "{%- endif -%}" - "{{- '<|vision_start|>' -}}" - "{%- if content.image_url is string -%}" - "{{- content.image_url -}}" - "{%- else -%}" - "{{- content.image_url.url -}}" - "{%- endif -%}" - "{{- '<|vision_end|>' -}}" - "{%- endif -%}" - # Video not supported yet - "{%- if 'text' in content -%}" - "{{- content.text -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- if message.role == 'assistant' -%}" - "{%- if message.tool_calls -%}" - "{%- for tool_call in message.tool_calls -%}" - "{%- if (loop.first and message.content) or (not loop.first) -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- if tool_call.function 
-%}" - "{%- set tool_call = tool_call.function -%}" - "{%- endif -%}" - "{{- '\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}" - "{%- if tool_call.arguments is string -%}" - "{{- tool_call.arguments -}}" - "{%- else -%}" - "{{- tool_call.arguments | tojson -}}" - "{%- endif -%}" - "{{- '}\n' -}}" - "{%- endfor -%}" - "{%- endif -%}" - "{%- elif message.role == 'tool' -%}" - "{{- '' -}}" - "{%- endif -%}" - "{%- if message.role != 'system' -%}" - "{{- '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - "{{- '<|im_start|>assistant\n' -}}" - "{%- if force_reasoning -%}" - "{{- '\n' -}}" - "{%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - force_reasoning: bool = False, - add_vision_id: bool = True, - **kwargs, - ): - """ - Parameters: - - force_reasoning (bool): - - True: Force the reasoning in the model by adding to the chat template. - - False (default): Don't force the reasoning. - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - """ - super().__init__(**kwargs) - self.force_reasoning = force_reasoning - self.extra_template_arguments["force_reasoning"] = force_reasoning - self.extra_template_arguments["add_vision_id"] = add_vision_id - - def __call__(self, **kwargs): - kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN] - - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(force_reasoning={self.force_reasoning}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - -class Qwen35ChatHandler(MTMDChatHandler): - """ - Handler for Qwen3.5/Qwen3.6 models. 
- """ - CHAT_FORMAT = ( - "{%- set image_count = namespace(value=0) -%}" - "{%- set video_count = namespace(value=0) -%}" - "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}" - " {%- if content is string -%}" - " {{- content -}}" - " {%- elif content is iterable and content is not mapping -%}" - " {%- for item in content -%}" - " {%- if 'image_url' in item or item.type == 'image_url' -%}" - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain images.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set image_count.value = image_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Picture ' -}}" - " {{- image_count.value | string -}}" - " {{- ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {%- if item.image_url is string -%}" - " {{- item.image_url -}}" - " {%- else -%}" - " {{- item.image_url.url -}}" - " {%- endif -%}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'video' in item -%}" - " {{- raise_exception('llama.cpp does not currently support video.') -}}" # Video not supported, raise exception - " {%- if is_system_content -%}" - " {{- raise_exception('System message cannot contain videos.') -}}" - " {%- endif -%}" - " {%- if do_vision_count -%}" - " {%- set video_count.value = video_count.value + 1 -%}" - " {%- endif -%}" - " {%- if add_vision_id -%}" - " {{- 'Video ' ~ video_count.value ~ ': ' -}}" - " {%- endif -%}" - " {{- '<|vision_start|>' -}}" - " {{- item.video -}}" - " {{- '<|vision_end|>' -}}" - " {%- elif 'text' in item -%}" - " {{- item.text -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected item type in content.') -}}" - " {%- endif -%}" - " {%- endfor -%}" - " {%- elif content is none or content is undefined -%}" - " {{- '' -}}" - " {%- else -%}" - " {{- raise_exception('Unexpected content type.') -}}" - " {%- endif -%}" - "{%- endmacro -%}" - "{%- if not messages -%}" - " {{- raise_exception('No messages provided.') 
-}}" - "{%- endif -%}" - "{%- if tools and tools is iterable and tools is not mapping -%}" - " {{- '<|im_start|>system\n' -}}" - " {{- '# Tools\n\nYou have access to the following functions:\n\n' -}}" - " {%- for tool in tools -%}" - " {{- '\n' -}}" - " {{- tool | tojson -}}" - " {%- endfor -%}" - " {{- '\n' -}}" - " {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' -}}" - " {%- if messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) | trim -%}" - " {%- if content -%}" - " {{- '\n\n' + content -}}" - " {%- endif -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - "{%- elif messages[0].role == 'system' -%}" - " {%- set content = render_content(messages[0].content, false, true) -%}" - " {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}" - "{%- endif -%}" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}" - "{%- for message in messages[::-1] -%}" - " {%- set index = messages | length - 1 - loop.index0 -%}" - " {%- if ns.multi_step_tool and message.role == 'user' -%}" - " {%- set content = render_content(message.content, false) | trim -%}" - " {%- if not (content.startswith('') and content.endswith('')) -%}" - " {%- set ns.multi_step_tool = false -%}" - " {%- set ns.last_query_index = index -%}" - " {%- endif -%}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if ns.multi_step_tool -%}" - " {{- 
raise_exception('No user query found in messages.') -}}" - "{%- endif -%}" - "{%- for message in messages -%}" - " {%- set content = render_content(message.content, true) | trim -%}" - " {%- if message.role == 'system' -%}" - " {%- if not loop.first -%}" - " {{- raise_exception('System message must be at the beginning.') -}}" - " {%- endif -%}" - " {%- elif message.role == 'user' -%}" - " {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}" - " {%- elif message.role == 'assistant' -%}" - " {%- set reasoning_content = '' -%}" - " {%- if message.reasoning_content is string -%}" - " {%- set reasoning_content = message.reasoning_content -%}" - " {%- elif '' in content -%}" - " {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') -%}" - " {%- set content = content.split('')[-1].lstrip('\n') -%}" - " {%- endif -%}" - " {%- set reasoning_content = reasoning_content | trim -%}" - " {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}" - " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" - " {%- else -%}" - " {{- '<|im_start|>' + message.role + '\n' + content -}}" - " {%- endif -%}" - " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" - " {%- for tool_call in message.tool_calls -%}" - " {%- if tool_call.function is defined -%}" - " {%- set tool_call = tool_call.function -%}" - " {%- endif -%}" - " {%- if loop.first -%}" - " {%- if content | trim -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- '\n\n\n' -}}" - " {%- endif -%}" - " {%- if tool_call.arguments is defined -%}" - " {%- for (args_name, args_value) in tool_call.arguments | items -%}" - " {{- '\n' -}}" - " {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}" - " {{- args_value -}}" - " {{- '\n' -}}" - 
" {%- endfor -%}" - " {%- endif -%}" - " {{- '\n' -}}" - " {%- endfor -%}" - " {%- endif -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif message.role == 'tool' -%}" - " {%- if loop.previtem and loop.previtem.role != 'tool' -%}" - " {{- '<|im_start|>user' -}}" - " {%- endif -%}" - " {{- '\n\n' -}}" - " {{- content -}}" - " {{- '\n' -}}" - " {%- if not loop.last and loop.nextitem.role != 'tool' -%}" - " {{- '<|im_end|>\n' -}}" - " {%- elif loop.last -%}" - " {{- '<|im_end|>\n' -}}" - " {%- endif -%}" - " {%- else -%}" - " {{- raise_exception('Unexpected message role.') -}}" - " {%- endif -%}" - "{%- endfor -%}" - "{%- if add_generation_prompt -%}" - " {{- '<|im_start|>assistant\n' -}}" - " {%- if enable_thinking is defined and enable_thinking is false -%}" - " {{- '\n\n\n\n' -}}" - " {%- else -%}" - " {{- '\n' -}}" - " {%- endif -%}" - "{%- endif -%}" - ) - - def __init__( - self, - add_vision_id: bool = True, - enable_thinking: bool = True, - preserve_thinking: bool = False, - **kwargs, - ): - """ - Parameters: - - add_vision_id (bool): - - True (default): Count all the images. Recommended for multi-image. - - False: Doesn't count the images. Can save tokens with single-image. - - enable_thinking (bool): - - True (default): Enables reasoning for better results. - - False: Disables reasoning for faster results. - - preserve_thinking (bool): - - True: Keeps reasoning process for ALL historical conversational turns. - - False (default): Only keeps for the latest assistant reply to save tokens. 
- """ - super().__init__(**kwargs) - self.enable_thinking = enable_thinking - self.preserve_thinking = preserve_thinking - self.extra_template_arguments["add_vision_id"] = add_vision_id - self.extra_template_arguments["enable_thinking"] = enable_thinking - self.extra_template_arguments["preserve_thinking"] = preserve_thinking - - def __call__(self, **kwargs): - llama = kwargs['llama'] - - if hasattr(llama, 'input_ids'): - llama.input_ids.fill(0) - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing") - - # Use parent implementation - return super().__call__(**kwargs) - - -class Step3VLChatHandler(MTMDChatHandler): - """ - Handler for Step3-VL models. - """ - - STEP3VL_BOS_TOKEN = "<|im_start|>" - STEP3VL_EOS_TOKEN = "<|im_end|>" - STEP3VL_PAD_TOKEN = "<|endoftext|>" - STEP3VL_IMAGE_TOKEN = "" - - CHAT_FORMAT = ( - "{%- macro render_content(content) -%}\n" - " {%- if content is none -%}{{- '' -}}\n" - " {%- elif content is string -%}{{- content -}}\n" - " {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n" - " {%- elif content is iterable -%}\n" - " {%- for item in content -%}\n" - " {%- if item.type == 'text' -%}\n" - " {{- item['value'] if 'value' in item else item['text'] -}}\n" - " {%- elif item.type in ['image', 'image_url'] -%}\n" - " {%- set url_val = '' -%}\n" - " {%- if item.image_url -%}\n" - " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" - " {%- endif -%}\n" - " {{- '' + url_val -}}\n" - " {%- endif -%}\n" - " {%- endfor -%}\n" - " {%- endif -%}\n" - "{%- endmacro -%}\n" - "\n" - "{%- if tools -%}\n" - " {{- '<|im_start|>system\\n' -}}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- render_content(messages[0].content) + '\\n\\n' -}}\n" - " {%- endif -%}\n" - " {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou 
are provided with function signatures within XML tags:\\n' -}}\n" - " {%- for tool in tools -%}\n" - " {{- '\\n' -}}\n" - " {{- tool | tojson -}}\n" - " {%- endfor -%}\n" - " {{- '\\n\\n\\nAlways adhere to this exact format for tool use:\\n\\n\\n{\"name\": , \"arguments\": }\\n\\n{additional_tool_calls}\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within XML tags.\\n- `` must be an exact match to one of the available tools.\\n- `` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n" - "{%- else -%}\n" - " {%- if messages[0].role == 'system' -%}\n" - " {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - "{%- endif -%}\n" - "\n" - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n" - "{%- for message in messages[::-1] -%}\n" - " {%- set index = (messages|length - 1) - loop.index0 -%}\n" - " {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) -%}\n" - " {%- set ns.multi_step_tool = false -%}\n" - " {%- set ns.last_query_index = index -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "\n" - "{%- for message in messages -%}\n" - " {%- set content = render_content(message.content) -%}\n" - " {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n" - " {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n" - " {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n" - " {%- elif message.role == 'assistant' -%}\n" - " {%- if message.reasoning_content is string -%}\n" - " {%- set reasoning_content = render_content(message.reasoning_content) -%}\n" - " {%- else -%}\n" - " {%- if '' in content -%}\n" - 
" {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') -%}\n" - " {%- set content = content.split('')[-1].lstrip('\\n') -%}\n" - " {%- else -%}\n" - " {%- set reasoning_content = '' -%}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - " {%- if loop.index0 > ns.last_query_index -%}\n" - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n' + content -}}\n" - " {%- else -%}\n" - " {{- '<|im_start|>' + message.role + '\\n' + content -}}\n" - " {%- endif -%}\n" - " {%- if message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- for tool_call in message.tool_calls -%}\n" - " {{- '\\n' -}}\n" - " {%- if tool_call.function -%}\n" - " {%- set tool_call = tool_call.function -%}\n" - " {%- endif -%}\n" - " {{- '\\n{\"name\": \"' -}}\n" - " {{- tool_call.name -}}\n" - " {{- '\", \"arguments\": ' -}}\n" - " {%- if tool_call.arguments is string -%}\n" - " {{- tool_call.arguments -}}\n" - " {%- else -%}\n" - " {{- tool_call.arguments | tojson -}}\n" - " {%- endif -%}\n" - " {{- '}\\n' -}}\n" - " {%- endfor -%}\n" - " {{- '\\n' -}}\n" - " {%- endif -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- elif message.role == 'tool' -%}\n" - " {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n" - " {{- '<|im_start|>tool_response' -}}\n" - " {%- endif -%}\n" - " {{- '\\n\\n' -}}\n" - " {{- content -}}\n" - " {{- '\\n' -}}\n" - " {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n" - " {{- '<|im_end|>\\n' -}}\n" - " {%- endif -%}\n" - " {%- endif -%}\n" - "{%- endfor -%}\n" - "{%- if add_generation_prompt -%}\n" - " {{- '<|im_start|>assistant\\n\\n\\n\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n' -}}\n" - "{%- endif -%}\n" - ) - - def __init__(self, enable_thinking: bool = True, **kwargs): - """ - Initializes the Step3-VL Handler. - - Args: - enable_thinking (bool): If False, injects an empty block to bypass reasoning. 
- """ - self.enable_thinking = enable_thinking - super().__init__(**kwargs) - - def __call__(self, **kwargs): - # Pass thinking toggle into Jinja - self.extra_template_arguments["enable_thinking"] = self.enable_thinking - - # Step3 uses standard <|im_end|> ChatML stop formatting - kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN] - - if self.verbose: - print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") - - return super().__call__(**kwargs) - - -@register_chat_completion_handler("chatml-function-calling") -def chatml_function_calling( - llama: llama_core.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - min_p: float = 0.05, - typical_p: float = 1.0, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, - max_tokens: Optional[int] = None, - present_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_n_sigma: float = -1.00, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - xtc_threshold: float = 0.1, - xtc_probability: float = 0.0, - dry_multiplier: float = 0.0, - dry_base: float = 1.75, - dry_allowed_length: int = 2, - dry_penalty_last_n:int = 0, - dry_seq_breakers: list[str] = ["\n", ":", "\"", "*"], - adaptive_target : float = -1.0, - adaptive_decay : float = 0.9, - use_infill: bool = False, - model: Optional[str] = None, - logits_processor: Optional[llama_core.LogitsProcessorList] = None, - grammar: Optional[llama_grammar.LlamaGrammar] = None, - logprobs: 
Optional[bool] = None, - top_logprobs: Optional[int] = None, - **kwargs, # type: ignore -) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], -]: - function_calling_template = ( - "{% for message in messages %}" - "<|im_start|>{{ message.role }}\n" - # System message - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% if tool_calls %}" - "\n\nYou have access to the following functions:\n" - "{% for tool in tools %}" - "\nfunctions.{{ tool.function.name }}:\n" - "{{ tool.function.parameters | tojson }}" - "\n{% endfor %}" - "\n\nYou can respond to users messages with either a single message or one or more function calls." - "\n\nTo respond with a message begin the message with 'message:', use the following format:" - "\n\nmessage:" - "\n" - "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" - "\n\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "\nfunctions.:" - '\n{ "arg1": "value1", "arg2": "value2" }' - "{% endif %}" - "<|im_end|>\n" - "{% endif %}" - # User message - "{% if message.role == 'user' %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - # Assistant message - "{% if message.role == 'assistant' %}" - ## Reglar message - "{% if message.content and message.content | length > 0 %}" - "{% if tool_calls %}" - "message:\n" - "{% endif %}" - "{{ message.content }}" - "<|im_end|>\n" - "{% endif %}" - ## Function calls - "{% if 'tool_calls' in message %}" - "{% for tool_call in message.tool_calls %}" - "functions.{{ tool_call.function.name }}:\n" - "{{ tool_call.function.arguments }}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" - ) - template_renderer = ImmutableSandboxedEnvironment( - autoescape=jinja2.select_autoescape(["html", "xml"]), - undefined=jinja2.StrictUndefined, - 
).from_string(function_calling_template) - - # Convert legacy functions to tools - if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] - - # Convert legacy function_call to tool_choice - if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): - tool_choice = function_call - if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } - - stop = ( - [stop, "<|im_end|>"] - if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] - ) - - # Case 1: No tool choice by user - if ( - tool_choice is None - or (isinstance(tool_choice, str) and tool_choice == "none") - or tools is None - or len(tools) == 0 - ): - prompt = template_renderer.render( - messages=messages, - tools=[], - tool_calls=None, - add_generation_prompt=True, - ) - - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - - return _convert_completion_to_chat( - llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - present_penalty=present_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - top_n_sigma=top_n_sigma, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - xtc_threshold=xtc_threshold, - xtc_probability=xtc_probability, - dry_multiplier=dry_multiplier, - dry_base=dry_base, - dry_allowed_length=dry_allowed_length, - dry_penalty_last_n=dry_penalty_last_n, - dry_seq_breakers=dry_seq_breakers, - adaptive_target=adaptive_target, - adaptive_decay=adaptive_decay, - use_infill=use_infill, - model=model, - logits_processor=logits_processor, - 
grammar=grammar, - logprobs=top_logprobs if logprobs else None, - ), - stream=stream, - ) - - # Case 2: Tool choice by user - if isinstance(tool_choice, dict): - tool_name = tool_choice["function"]["name"] - tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None - ) - if tool is None: - raise ValueError(f"Tool with name '{tool_name}' not found in tools") - prompt = template_renderer.render( - messages=messages, - tools=tools, - tool_calls=True, - add_generation_prompt=True, - ) - prompt += f"functions.{tool_name}:\n" + prompt += f"functions.{tool_name}:\n" try: grammar = llama_grammar.LlamaGrammar.from_json_schema( json.dumps(tool["function"]["parameters"]), verbose=llama.verbose @@ -6956,3 +3539,35 @@ def chatml_function_calling( } raise ValueError("Automatic streaming tool choice is not supported") + +# Backward compatibility re-exports. +# These multimodal chat handlers have been moved to `llama_multimodal`. +# New code should import them from `llama_cpp.llama_multimodal` instead of +# `llama_cpp.llama_chat_format`. 
+from llama_cpp.llama_multimodal import ( + MTMDChatHandler, + GenericMTMDChatHandler, + Llava15ChatHandler, + ObsidianChatHandler, + MoondreamChatHandler, + Llava16ChatHandler, + NanoLlavaChatHandler, + Llama3VisionAlphaChatHandler, + Llama3VisionAlpha, + MiniCPMv26ChatHandler, + MiniCPMv45ChatHandler, + MiniCPMV46ChatHandler, + Gemma3ChatHandler, + Gemma4ChatHandler, + GLM41VChatHandler, + GLM46VChatHandler, + GraniteDoclingChatHandler, + LFM2VLChatHandler, + LFM25VLChatHandler, + PaddleOCRChatHandler, + Qwen25VLChatHandler, + Qwen3ASRChatHandler, + Qwen3VLChatHandler, + Qwen35ChatHandler, + Step3VLChatHandler +) diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py new file mode 100644 index 0000000000..a055869543 --- /dev/null +++ b/llama_cpp/llama_multimodal.py @@ -0,0 +1,3473 @@ +from __future__ import annotations + +import base64 +import ctypes +import json +import os +import sys +import zlib + +from contextlib import ExitStack +from typing import ( + Any, + Dict, + Iterator, + List, + Literal, + Optional, + Tuple, + Union, + Protocol, + TYPE_CHECKING, + cast, +) + +import urllib.request +from urllib.error import URLError, HTTPError + +import llama_cpp.llama_cpp as llama_cpp_lib +import llama_cpp.llama_types as llama_types +import llama_cpp.llama_grammar as llama_grammar + +if TYPE_CHECKING: + import llama_cpp.llama as llama_core + +from ._logger import ggml_log_callback + +from llama_cpp.llama_chat_format import ( + _convert_completion_to_chat, + _convert_completion_to_chat_function, + _grammar_for_response_format, + ImmutableSandboxedEnvironment +) + +class MTMDChatHandler: + DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( +"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, " +"while also answering every question accurately, clearly, and step-by-step when appropriate — always 
responding in the same language as the user's question, remaining polite, professional, and maximally helpful." + ) + + CHAT_FORMAT = ( + "{{ bos_token if bos_token is defined else '' }}" + "{% for message in messages %}" + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% elif message.role == 'user' %}" + "USER: " + "{% if message.content is string %}" + "{{ message.content }}" + "{% elif message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{{ content.image_url if content.image_url is string else content.image_url.url }}" + "{% elif content.type == 'audio_url' %}" + "{{ content.audio_url if content.audio_url is string else content.audio_url.url }}" + "{% elif content.type == 'input_audio' %}" + "{% if content.input_audio is string %}" + "{{ content.input_audio }}" + "{% else %}" + "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" + "{% endif %}" + "{% elif content.type == 'video_url' %}" + "{{ content.video_url if content.video_url is string else content.video_url.url }}" + "{% elif content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + + "{% elif message.role == 'assistant' and message.content is not none %}" + "ASSISTANT: {{ message.content }}" + "{% endif %}" + "{{ \"\n\" }}" + "{% endfor %}" + + "{% if eos_token is defined %}" + "{{ eos_token }}" + "{% endif %}" + + "{% if add_generation_prompt %}" + "ASSISTANT: " + "{% endif %}" + ) + + def __init__( + self, + mmproj_path: Optional[str] = None, + verbose: bool = True, + use_gpu: bool = True, + image_min_tokens: int = -1, + image_max_tokens: int = -1, + chat_template_override: Optional[str] = None, + batch_max_tokens: int = 1024, + **kwargs + ): + + self.log_prefix = self.__class__.__name__ + self.verbose = verbose + + # Backward compatibility: `clip_model_path` was the old name for `mmproj_path`. 
+ # Accept it for existing user code, warn during initialization, and normalize + # all internal usage to `mmproj_path`. + clip_model_path = kwargs.pop("clip_model_path", None) + if mmproj_path is None and clip_model_path is not None: + mmproj_path = clip_model_path + if self.verbose: + print( + f"{self.log_prefix}(__init__): `clip_model_path` is deprecated; " + "please use `mmproj_path` instead.", + file=sys.stderr, + ) + + if kwargs: + unexpected_args = ", ".join(f"'{k}'" for k in kwargs.keys()) + raise TypeError( + f"Initialization Error in {self.log_prefix}: Received unexpected keyword argument(s) {unexpected_args}.\n" + f"If you are passing model-specific parameters, ensure they are supported by {self.log_prefix}." + ) + + if mmproj_path is None: + raise ValueError( + f"{self.log_prefix}(__init__): `mmproj_path` is required. " + "`clip_model_path` is accepted only as a deprecated compatibility alias." + ) + + self.mmproj_path = mmproj_path + if not os.path.exists(self.mmproj_path): + raise ValueError( + f"{self.log_prefix}(__init__): mmproj path does not exist: {self.mmproj_path}" + ) + + self.image_min_tokens = image_min_tokens + self.image_max_tokens = image_max_tokens + self.batch_max_tokens = batch_max_tokens + self.use_gpu = use_gpu + + import llama_cpp.mtmd_cpp as mtmd_cpp + self._mtmd_cpp = mtmd_cpp + self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None + self.extra_template_arguments: dict[str, Any] = {} + + self.is_support_vision = False + self.is_support_audio = False + self.is_support_video = False + + # Pre-compile Jinja template + if (not hasattr(self, "chat_format") or self.chat_format is None) and chat_template_override is None: + self.chat_format = self.CHAT_FORMAT + elif chat_template_override is not None: + self.chat_format = chat_template_override + + self._chat_format_parser_tags = [] + self._change_chat_template(self.chat_format) + + self._exit_stack = ExitStack() + + def _change_chat_template(self, new_template: str): + 
self.chat_template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True + ).from_string(new_template) + + def _init_mtmd_context(self, llama_model: llama_core.Llama): + """Initialize mtmd context with the llama model.""" + if self.mtmd_ctx is not None: + return # Already initialized + + self._mtmd_cpp.mtmd_helper_log_set(ggml_log_callback, ctypes.c_void_p(0)) + + # Get default parameters + self.mctx_params = self._mtmd_cpp.mtmd_context_params_default() + self.mctx_params.use_gpu = self.use_gpu + self.mctx_params.print_timings = self.verbose + self.mctx_params.n_threads = llama_model.n_threads + self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO + self.mctx_params.warmup = True + if self.image_min_tokens > 0: + self.mctx_params.image_min_tokens = self.image_min_tokens + if self.image_max_tokens > 0: + self.mctx_params.image_max_tokens = self.image_max_tokens + if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Configuration Error! 
image_max_tokens ({self.image_max_tokens}) " + f"cannot be less than image_min_tokens ({self.image_min_tokens}).") + self.mctx_params.batch_max_tokens = self.batch_max_tokens + + # Cache the model's eos token and bos token + self.mtmd_eos_token=llama_model.detokenize([llama_model.token_eos()]).decode('utf-8', errors='ignore') + self.mtmd_bos_token=llama_model.detokenize([llama_model.token_bos()]).decode('utf-8', errors='ignore') + + # Cache the mtmd_default_marker + self.media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') + + # Initialize mtmd context + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.mmproj_path.encode(), + llama_model.model, + self.mctx_params + ) + + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_init_mtmd_context): Failed to load mtmd context from: {self.mmproj_path}") + + # Check if vision is supported + self.is_support_vision = self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx) + if self.is_support_vision: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Vision is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if audio is supported + self.is_support_audio = self._mtmd_cpp.mtmd_support_audio(self.mtmd_ctx) + if self.is_support_audio: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Audio is NOT supported by this mmproj model backend.", file=sys.stderr) + + # Check if video is supported + self.is_support_video = self._mtmd_cpp.mtmd_helper_support_video(self.mtmd_ctx) + if self.is_support_video: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support detected.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(_init_mtmd_context): Video support is NOT 
available in this build.", file=sys.stderr) + + def close(self) -> None: + """Explicitly free the mtmd context and vision model resources.""" + if getattr(self, "mtmd_ctx", None) is not None: + try: + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + except Exception: + pass + self.mtmd_ctx = None + self.mctx_params = None + self.chat_template = None + + if getattr(self, "_exit_stack", None) is not None and hasattr(self._exit_stack, "close"): + self._exit_stack.close() + self._exit_stack = None + + def __del__(self) -> None: + self.close() + + def _get_media_items(self, messages: List[llama_types.ChatCompletionRequestMessage]) -> List[Dict[str, str]]: + """ + Extracts all media payloads (images, audio) sequentially to maintain exact chronological order. + Strictly enforces capability checks, raising exceptions if unsupported media is passed. + + Returns: + media_items: A list of dictionaries containing the media 'url' and its 'type' (image or audio). + """ + media_items: List[Dict[str, str]] = [] + for message in messages: + if isinstance(message.get("content"), list): + for content in message["content"]: + content_type = content.get("type", "") + + # 1. Vision Processing + if content_type == "image_url": + if not self.is_support_vision: + raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support image inputs.") + + url = content["image_url"] if isinstance(content["image_url"], str) else content["image_url"]["url"] + media_items.append({"url": url, "type": "image"}) + + # 2. 
Audio Processing + elif content_type in ["audio", "audio_url", "input_audio"]: + if not self.is_support_audio: + raise ValueError(f"{self.log_prefix}: This mmproj model instance does not support audio inputs.") + + # Case A: Handle custom/forward-compatible audio_url format + if content_type == "audio_url" or content_type == "audio": + audio_url = content[content_type] + url = audio_url if isinstance(audio_url, str) else audio_url["url"] + media_items.append({"url": url, "type": "audio"}) + # Case B: Handle OpenAI standard input_audio format + elif content_type == "input_audio": + input_audio = content.get("input_audio", {}) + if isinstance(input_audio, dict) and "data" in input_audio: + # It might just be raw base64 data, we can format it as a data URI to reuse load_audio logic + # input_audio: { + # data: audio.base64Data, + # format: audio.mimeType.includes('wav') ? 'wav' : 'mp3' + # } + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + + # Strictly align with llama.cpp (require wav/mp3) + if audio_format not in ["wav", "mp3"]: + raise ValueError(f"{self.log_prefix}: input_audio.format must be either 'wav' or 'mp3'") + + # Format as a Data URI to reuse the unified load_media logic + media_items.append({ + "url": f"data:audio/{audio_format};base64,{audio_data}", + "type": "audio" + }) + else: + # Just a raw base64 data + url = input_audio if isinstance(input_audio, str) else "" + if url: + media_items.append({"url": url, "type": "audio"}) + + # 3. Video Processing + elif content_type == "video_url": + if not self.is_support_video: + raise ValueError(f"{self.log_prefix}: This libmtmd build does not support video inputs.") + + video_url = content["video_url"] + url = video_url if isinstance(video_url, str) else video_url["url"] + media_items.append({"url": url, "type": "video"}) + + # 4. 
Text & Unknown Types + elif content_type == "text": + continue + else: + if self.verbose: + print(f"{self.log_prefix}: Ignored unknown content type '{content_type}'.", file=sys.stderr) + return media_items + + def _create_bitmap_from_bytes(self, media_bytes: bytes): + """ + Constructs an mtmd_bitmap structure from a raw byte buffer containing media data. + + Supported formats: + - Images (via stb_image): jpg, png, bmp, etc. + - Audio (via miniaudio): wav, mp3, flac. + - Video: depends on whether MTMD_VIDEO was enabled at build time. + + Note: + - Media types (Image vs. Audio) are auto-detected by the C++ backend using magic bytes. + - The underlying C++ helper function is thread-safe, making it suitable for concurrent preprocessing. + + Args: + media_bytes (bytes): The raw byte content of the media file. + + Returns: + bitmap: mtmd_bitmap * + video_ctx: mtmd_helper_video * or NULL + """ + if self.mtmd_ctx is None: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): mtmd context not initialized.") + + if not media_bytes: + raise ValueError(f"{self.log_prefix}(_create_bitmap_from_bytes): empty media bytes.") + + buf = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes) + + wrapper = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + self.mtmd_ctx, + buf, + len(media_bytes), + False, + ) + + if not wrapper.bitmap: + if wrapper.video_ctx: + self._mtmd_cpp.mtmd_helper_video_free(wrapper.video_ctx) + + raise ValueError( + f"{self.log_prefix}(_create_bitmap_from_bytes): " + "Failed to load media from bytes " + "(unsupported media format, corrupted data, or missing helper support)." 
def _process_mtmd_prompt(
    self,
    llama: llama_core.Llama,
    messages: List[llama_types.ChatCompletionRequestMessage],
    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
    add_generation_prompt: bool = True,
) -> Tuple[List[int], List[tuple], Any, List[Any]]:
    """
    Render the chat prompt, decode its media and tokenize both into MTMD chunks.

    Steps:
      1. Prepend ``DEFAULT_SYSTEM_MESSAGE`` when the first system message is missing
         or has empty content.
      2. Render ``self.chat_template`` and replace parser tags and media URLs with
         ``self.media_marker``.
      3. Decode every media item on a thread pool, keeping the original order.
      4. Call ``mtmd_tokenize`` on the text plus bitmaps.
      5. Walk the resulting chunks and build a flat id list in which every media
         chunk is represented by a negative pseudo-id derived from the CRC32 of the
         media URL, so identical media produce identical ids across calls.

    Args:
        llama: Model wrapper; ``n_threads`` bounds the decode pool and ``n_tokens``
            decides whether special tokens are added.
        messages: Chat messages to render.
        functions, function_call, tools, tool_choice: Forwarded to the template.
        add_generation_prompt: Forwarded to the template.

    Returns:
        full_prompt_ids: Text token ids interleaved with negative media pseudo-ids.
        chunk_token_spans: Tuples ``(start_idx, end_idx, chunk_ptr, chunk_type, media_id)``;
            ``media_id`` is ``None`` for text chunks.
        chunks: The ``mtmd_input_chunks`` pointer; the caller must free it.
        bitmap_cleanup: Bitmap pointers the caller must free after evaluation.

    Raises:
        ValueError: If chunk initialisation or tokenization fails.
        RuntimeError: If a media slot is still empty after decoding.
        TypeError: If a chunk has a type other than text, image or audio.
        Exception: Whatever a decode worker raised (the first one observed).

    On any failure every native object created so far is released before re-raising.
    """
    import concurrent.futures

    # 1. Inject the default system prompt if the user did not supply one
    system_prompt = next((msg["content"] for msg in messages if msg.get("role") == "system"), "")
    if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
        messages = [{"role": "system", "content": self.DEFAULT_SYSTEM_MESSAGE}] + messages

    media_items = self._get_media_items(messages)
    media_marker = self.media_marker

    # 2. Render the chat template, then substitute tags/URLs with the media marker
    text = self.chat_template.render(
        messages=messages,
        add_generation_prompt=add_generation_prompt,
        eos_token=self.mtmd_eos_token,
        bos_token=self.mtmd_bos_token,
        functions=functions,
        function_call=function_call,
        tools=tools,
        tool_choice=tool_choice,
        **getattr(self, 'extra_template_arguments', {})
    )

    for tag in self._chat_format_parser_tags:
        text = text.replace(tag, media_marker)

    for item in media_items:
        text = text.replace(item["url"], media_marker)

    if self.verbose:
        print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt length: {len(text)} chars, Media count: {len(media_items)}.", file=sys.stderr)
        print(f"{self.log_prefix}(_process_mtmd_prompt): Rendered prompt: {text}", file=sys.stderr)

    # 3. One slot per media item so results land in prompt order regardless of
    #    which worker finishes first.
    bitmaps = [None] * len(media_items)
    bitmap_cleanup = []
    video_cleanup = []
    chunks = None

    try:
        if media_items:
            def _create_bitmap_func(idx: int, item: dict):
                media_bytes = self.load_media(item["url"], item["type"])
                bitmap, video_ctx = self._create_bitmap_from_bytes(media_bytes)
                return idx, bitmap, video_ctx

            # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
            max_workers = max(1, min(llama.n_threads, len(media_items)))
            decode_error = None
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [executor.submit(_create_bitmap_func, i, item) for i, item in enumerate(media_items)]

                for future in concurrent.futures.as_completed(futures):
                    # Drain EVERY future even after a failure: workers that succeeded
                    # have allocated native bitmaps which must be recorded for cleanup,
                    # otherwise they leak when a sibling worker raises.
                    try:
                        idx, bitmap, video_ctx = future.result()
                    except Exception as exc:
                        if decode_error is None:
                            decode_error = exc
                        continue

                    bitmaps[idx] = bitmap
                    bitmap_cleanup.append(bitmap)

                    if video_ctx:
                        video_cleanup.append(video_ctx)

            if decode_error is not None:
                raise decode_error

            # Abort if any slot is still empty
            if any(b is None for b in bitmaps):
                raise RuntimeError(f"{self.log_prefix}(_create_bitmap_func): Failed to decode one or more media files.")
            if self.verbose:
                print(f"{self.log_prefix}(_create_bitmap_func with {max_workers} threads): {len(media_items)} bitmaps were successfully created.", file=sys.stderr)

        # 4. Initialize mtmd_input_chunks
        input_text = self._mtmd_cpp.mtmd_input_text()
        input_text.text = text.encode('utf-8')
        input_text.add_special = (llama.n_tokens == 0)
        input_text.parse_special = True

        chunks = self._mtmd_cpp.mtmd_input_chunks_init()
        if chunks is None:
            raise ValueError(f"{self.log_prefix}(mtmd_input_chunks_init): Failed to initialize mtmd_input_chunks.")

        # 5. Tokenize text and media together
        if len(bitmaps) > 0:
            bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps)
            result = self._mtmd_cpp.mtmd_tokenize(
                self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, len(bitmaps)
            )
        else:
            result = self._mtmd_cpp.mtmd_tokenize(
                self.mtmd_ctx, chunks, ctypes.byref(input_text), None, 0
            )

        if result != 0:
            raise ValueError(f"{self.log_prefix}(mtmd_tokenize): Unable to tokenize prompt, res = {result}.")

        # Video helper contexts are released as soon as tokenization has completed.
        for video_ctx in video_cleanup:
            self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
        video_cleanup.clear()

        # 6. Build the flat id list and the per-chunk spans
        full_prompt_ids = []
        chunk_token_spans = []
        current_idx = 0
        n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks)

        # Cursor over the user-supplied media, advanced once per media chunk
        media_items_count = len(media_items)
        media_items_cur = 0
        last_media_id = None

        for i in range(n_chunks):
            chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
            if chunk is None:
                continue
            chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)

            if self._is_text_chunk(chunk_type):
                n_tokens_out = ctypes.c_size_t()
                tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk, ctypes.byref(n_tokens_out))
                if tokens_ptr and n_tokens_out.value > 0:
                    tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
                    chunk_token_spans.append((current_idx, current_idx + len(tokens), chunk, chunk_type, None))
                    full_prompt_ids.extend(tokens)
                    current_idx += len(tokens)
            elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type):
                # The span length deliberately uses n_tokens (not n_pos) of the chunk.
                chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)

                if media_items_cur < media_items_count:
                    # The tokenizer only sees identical markers, so the identity of the
                    # media is injected here. crc32 is stable across runs (unlike hash());
                    # masking to 24 bits and offsetting by -100 yields an id in
                    # [-16_777_315, -100], which cannot collide with a non-negative
                    # text token id.
                    real_media_url = media_items[media_items_cur]["url"]
                    media_id = - (zlib.crc32(real_media_url.encode('utf-8')) & 0xFFFFFF) - 100
                    last_media_id = media_id
                    media_items_cur += 1
                elif last_media_id is not None:
                    # More media chunks than media items (e.g. one video marker that
                    # expanded into several image chunks): reuse the last id.
                    media_id = last_media_id
                else:
                    # Fallback pseudo-id when no media item was ever consumed
                    media_id = -314159

                if self.verbose:
                    print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ", file=sys.stderr)

                chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id))

                # Repeat the pseudo-id once per token the chunk occupies
                full_prompt_ids.extend([media_id] * chunk_n_tokens)
                current_idx += chunk_n_tokens
            else:
                raise TypeError(f"{self.log_prefix}(mtmd_input_chunk_get_type): Invalid chunk type, chunk_type = {chunk_type}.")

        return full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup

    except BaseException:
        # Release every native object created so far, then re-raise unchanged.
        # BaseException is used so an interrupt during a download does not leak either.
        if chunks is not None:
            self._mtmd_cpp.mtmd_input_chunks_free(chunks)
        for bitmap in bitmap_cleanup:
            self._mtmd_cpp.mtmd_bitmap_free(bitmap)
        for video_ctx in video_cleanup:
            self._mtmd_cpp.mtmd_helper_video_free(video_ctx)
        raise
Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], + ]: + # 1. Initialize mtmd context + self._init_mtmd_context(llama) + assert self.mtmd_ctx is not None + + # 2. Concurrent Preprocessing & Ledger Construction + full_prompt_ids, chunk_token_spans, chunks, bitmap_cleanup = self._process_mtmd_prompt( + llama=llama, + messages=messages, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, + add_generation_prompt=add_generation_prompt, + ) + + if self.verbose: + print(f"{self.log_prefix}(__call__): Prepared virtual token ledger of length {len(full_prompt_ids)}.", file=sys.stderr) + + try: + # 3. KV Cache Synchronization & State Rollback + # Compares the virtual ledger with physical history to prevent Cache Poisoning. + current_history = llama.input_ids[:llama.n_tokens].tolist() + longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose) + + if longest_prefix < llama.n_tokens: + if llama.is_hybrid and llama._hybrid_cache_mgr is not None: + if llama._hybrid_cache_mgr.max_checkpoints > 0: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid prefix mismatch (matched {longest_prefix}/{llama.n_tokens}). " + f"Searching for nearest checkpoint...", file=sys.stderr) + + best_ckpt = llama._hybrid_cache_mgr.find_best_checkpoint(full_prompt_ids, seq_id=0) + if best_ckpt and llama._hybrid_cache_mgr.restore_checkpoint(best_ckpt, seq_id=0): + llama.n_tokens = best_ckpt.pos + if self.verbose: + print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr) + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. 
Clearing hybrid cache entirely.", file=sys.stderr) + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr) + llama._hybrid_cache_mgr.clear() + llama._ctx.memory_clear(True) + llama.n_tokens = 0 + else: + if self.verbose: + print(f"{self.log_prefix}(__call__): Prefix mismatch. Truncating KV cache from {llama.n_tokens} to {longest_prefix}.", file=sys.stderr) + llama._ctx.memory_seq_rm(0, longest_prefix, -1) + llama.n_tokens = longest_prefix + + n_past = llama.n_tokens + + for start_idx, end_idx, chunk_ptr, chunk_type, media_id in chunk_token_spans: + # Skip previously matched chunks + if end_idx <= n_past: + continue + + if self._is_text_chunk(chunk_type): + unprocessed_start = max(start_idx, n_past) - start_idx + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(chunk_ptr, ctypes.byref(n_tokens_out)) + + if tokens_ptr and n_tokens_out.value > 0: + all_tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + tokens_to_eval = all_tokens[unprocessed_start:] + + if tokens_to_eval: + if self.verbose: + print(f"{self.log_prefix}(__call__): Evaluating TEXT chunk ({len(tokens_to_eval)} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + # Text evaluation delegates shift and chunking to native llama.eval + llama.eval(tokens_to_eval) + n_past = llama.n_tokens + + elif self._is_image_chunk(chunk_type) or self._is_audio_chunk(chunk_type): + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk_ptr) + + if self.verbose: + media_str = "IMAGE" if self._is_image_chunk(chunk_type) else "AUDIO" + print(f"{self.log_prefix}(__call__): Evaluating {media_str} chunk ({chunk_n_tokens} tokens) at pos {llama.n_tokens}...", file=sys.stderr) + + # Stage 5: Multimodal Physical OOM Defense + if n_past + chunk_n_tokens > llama.n_ctx(): + if not 
llama._ctx.memory_can_shift(): + raise RuntimeError( + f"{self.log_prefix}(__call__): Context Shift is explicitly disabled by the C++ backend " + f"(n_pos_per_embd > 1 or incompatible M-RoPE). " + f"Multimodal chunk exceeded context limit(currently n_ctx={llama._n_ctx}), " + f"You MUST increase n_ctx to fit the dialogue." + ) + else: + # Safely discard oldest tokens while preserving system prompts + n_discard = (n_past + chunk_n_tokens) - llama.n_ctx() + llama.n_batch + n_keep = min(llama.n_keep, n_past) + n_discard = min(n_discard, n_past - n_keep) + + if n_discard <= 0: + raise RuntimeError(f"{self.log_prefix}(__call__): Critical Overflow. Not enough unpinned tokens to discard for Context Shift.") + + if self.verbose: + print(f"{self.log_prefix}(__call__): OOM risk detected. Shifting multimodal context: keeping {n_keep}, discarding {n_discard}...", file=sys.stderr) + + # Execute physical memory shift + llama._ctx.memory_seq_rm(0, n_keep, n_keep + n_discard) + llama._ctx.memory_seq_add(0, n_keep + n_discard, n_past, -n_discard) + + # Shift python virtual array to match + remaining_len = n_past - (n_keep + n_discard) + if remaining_len > 0: + llama.input_ids[n_keep : n_keep + remaining_len] = llama.input_ids[n_keep + n_discard : n_past] + + n_past -= n_discard + llama.n_tokens = n_past + + # Execute C++ Multimodal Black-box Extraction + new_n_past = llama_cpp_lib.llama_pos(0) + result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( + self.mtmd_ctx, + llama._ctx.ctx, + chunk_ptr, + llama_cpp_lib.llama_pos(n_past), + llama_cpp_lib.llama_seq_id(0), + llama.n_batch, + True, # logits_last = True, drastically saves computational overhead + ctypes.byref(new_n_past) + ) + + if result != 0: + raise ValueError(f"{self.log_prefix}(mtmd_helper_eval_chunk_single): Media evaluation failed with error code {result}.") + + # Update Ledger with "Negative Reverse Vocabulary" IDs + llama.input_ids[n_past : new_n_past.value] = media_id + n_past = new_n_past.value + llama.n_tokens = 
n_past + + # Extract the final, perfectly synchronized prompt sequence + prompt = llama.input_ids[: llama.n_tokens].tolist() + + # End-of-Turn Checkpoint + # Anchors the state ONLY after the entire multi-modal turn is processed + if ( + llama.is_hybrid + and llama._hybrid_cache_mgr is not None + and llama._hybrid_cache_mgr.max_checkpoints > 0 + ): + if self.verbose: + print(f"{self.log_prefix}(__call__): [End-of-Turn Checkpoint] Anchoring full prompt state at pos {llama.n_tokens}.", file=sys.stderr) + + llama._hybrid_cache_mgr.save_checkpoint( + current_pos=llama.n_tokens, + tokens=prompt, + seq_id=0 + ) + finally: + # Cleanup chunks + if chunks is not None: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + chunks = None + # Cleanup bitmaps + if bitmap_cleanup: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + bitmap_cleanup.clear() + bitmap_array = None + + # Handle response format and tools (same as before) + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) + + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + tool = None + if ( + tool_choice is not None + and isinstance(tool_choice, dict) + and tools is not None + ): + name = tool_choice["function"]["name"] + tool = next((t for t in tools if t["function"]["name"] == name), None) + if tool is None: + raise ValueError(f"Tool choice '{name}' not found in tools.") + schema = tool["function"]["parameters"] + try: + # create 
grammar from json schema + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(schema), verbose=llama.verbose + ) + except Exception as e: + if llama.verbose: + print(str(e), file=sys.stderr) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + logprobs=top_logprobs if logprobs else None, + stream=stream, + stop=stop, + seed=seed, + max_tokens=max_tokens, + present_penalty=present_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + top_n_sigma=top_n_sigma, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + xtc_threshold=xtc_threshold, + xtc_probability=xtc_probability, + dry_multiplier=dry_multiplier, + dry_base=dry_base, + dry_allowed_length=dry_allowed_length, + dry_penalty_last_n=dry_penalty_last_n, + dry_seq_breakers=dry_seq_breakers, + adaptive_target=adaptive_target, + adaptive_decay=adaptive_decay, + use_infill=use_infill, + model=model, + logits_processor=logits_processor, + grammar=grammar, + logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + reasoning_start=reasoning_start, + reasoning_end=reasoning_end, + reasoning_budget_message=reasoning_budget_message, + reasoning_start_in_prompt=reasoning_start_in_prompt, + reasoning_start_max_tokens=reasoning_start_max_tokens, + ) + + if tool is not None: + tool_name = tool["function"]["name"] + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) + return _convert_completion_to_chat(completion_or_chunks, stream=stream) + + def load_media(self, media_url: str, media_type: str) -> bytes: + """ + Unified dispatcher for loading media payloads. + Routes the URL/URI to the specific image, audio, or video processor based on the media_type. 
@staticmethod
def detect_audio_format(audio_bytes: bytes) -> str:
    """
    Identify the audio container of ``audio_bytes`` from its leading magic bytes.

    Checks, in order:
      - WAV:  starts with ``RIFF`` and bytes 8..11 are ``WAVE`` (so other RIFF
        containers are not accepted as WAV),
      - MP3:  starts with ``ID3`` or with an 0xFF byte followed by a byte whose
        top three bits are set,
      - FLAC: starts with ``fLaC``.

    Returns:
        ``"wav"``, ``"mp3"`` or ``"flac"``.

    Raises:
        ValueError: If fewer than 12 bytes are supplied or no signature matches.
    """
    if len(audio_bytes) < 12:
        raise ValueError("Audio data is corrupted or too small (less than 12 bytes).")

    # RIFF container whose form type is WAVE
    if audio_bytes.startswith(b"RIFF") and audio_bytes[8:12] == b"WAVE":
        return "wav"

    # ID3 tag header
    if audio_bytes.startswith(b"ID3"):
        return "mp3"

    # 0xFF followed by a byte with its top three bits set (i.e. >= 0xE0)
    if audio_bytes[0] == 0xFF and audio_bytes[1] >= 0xE0:
        return "mp3"

    if audio_bytes.startswith(b"fLaC"):
        return "flac"

    raise ValueError(
        "Unsupported audio format detected via magic bytes. "
        "The underlying C++ miniaudio backend ONLY supports WAV, MP3, and FLAC."
    )
Handle data URI + if media_url.strip().startswith("data:"): + comma_pos = media_url.find(",") + if comma_pos == -1: + raise ValueError("Invalid data URI: missing comma separator") + + base64_data = media_url[comma_pos + 1:] + media_bytes = base64.b64decode(base64_data) + + # 2. Handle local file path + elif os.path.exists(media_url): + with open(media_url, "rb") as f: + media_bytes = f.read() + + # 3. Handle remote URL via HTTP/HTTPS + else: + req = urllib.request.Request( + media_url, + headers=MTMDChatHandler.DEFAULT_HTTP_HEADERS, + ) + try: + with urllib.request.urlopen(req, timeout=timeout) as f: + media_bytes = f.read() + except (URLError, HTTPError) as e: + raise ConnectionError(f"Failed to download {kind} from {media_url}: {e}") + + if not media_bytes: + raise ValueError(f"Empty {kind} data received") + + return media_bytes + + @staticmethod + def _load_image(image_url: str) -> bytes: + """ + Load an image from either a URL or a data URI and return it as JPEG bytes. + + Supports: + - Remote images via HTTP/HTTPS (with proper User-Agent) + - Data URIs (base64-encoded, e.g., data:image/png;base64,...) + - Images with alpha channel (PNG, WebP, etc.) → automatically composites on white/black background + - Any format that Pillow can open. See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html + + Returns: + JPEG-encoded bytes (quality=95) in RGB mode, suitable for most vision models. + """ + # 1. Load image bytes from image_url + image_bytes = MTMDChatHandler._load_bytes( + image_url, + timeout=15, + kind="image", + ) + + # 2. Check if image_bytes is empty. + if not image_bytes: + raise ValueError("Empty image data received") + + # 3. Open image with Pillow + try: + from PIL import Image, ImageStat + except ImportError: + raise ImportError("Pillow is required for image processing. Install with: pip install pillow") + + import io + image = Image.open(io.BytesIO(image_bytes)) + + # 4. Handle transparency (RGBA, LA, P with transparency, etc.) 
@classmethod
def from_pretrained(
    cls,
    repo_id: str,
    filename: Optional[str],
    local_dir: Optional[Union[str, os.PathLike[str]]] = None,
    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
    cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
    **kwargs: Any,
) -> "MTMDChatHandler":
    """
    Download an mmproj file from the Hugging Face Hub and build a handler from it.

    The entries returned by ``HfFileSystem.ls(repo_id)`` are matched against
    ``filename`` with ``fnmatch``; exactly one entry must match.

    Args:
        repo_id: Hub repository id, validated with ``validate_repo_id``.
        filename: File name or glob pattern selecting the file to download.
        local_dir: Optional target directory, forwarded to ``hf_hub_download``.
        local_dir_use_symlinks: Forwarded to ``hf_hub_download``.
        cache_dir: Optional cache directory, forwarded to ``hf_hub_download``.
        **kwargs: Forwarded to the class constructor.

    Returns:
        ``cls(mmproj_path=<downloaded path>, **kwargs)``.

    Raises:
        ImportError: If ``huggingface_hub`` is not installed.
        ValueError: If ``filename`` is ``None``, or if zero or several entries match.
    """
    import fnmatch
    from pathlib import Path

    try:
        from huggingface_hub import hf_hub_download, HfFileSystem  # type: ignore
        from huggingface_hub.utils import validate_repo_id  # type: ignore
    except ImportError:
        raise ImportError(
            f"{cls.__name__}.from_pretrained requires the huggingface_hub package. "
            "You can install it with `pip install --upgrade huggingface_hub`."
        )

    validate_repo_id(repo_id)

    if filename is None:
        # fnmatch would otherwise fail with an opaque TypeError.
        raise ValueError("filename must be a file name or glob pattern, not None")

    hffs = HfFileSystem()

    files = [
        file["name"] if isinstance(file, dict) else file
        for file in hffs.ls(repo_id)  # type: ignore
    ]

    # Strip the leading repo id so patterns are matched against repo-relative paths
    file_list: List[str] = [str(Path(file).relative_to(repo_id)) for file in files]

    matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)]

    if len(matching_files) == 0:
        raise ValueError(
            f"No file found in {repo_id} that match {filename}\n\n"
            f"Available Files:\n{json.dumps(file_list)}"
        )

    if len(matching_files) > 1:
        raise ValueError(
            f"Multiple files found in {repo_id} matching {filename}\n\n"
            f"Available Files:\n{json.dumps(file_list)}"
        )

    (matching_file,) = matching_files

    matched_path = Path(matching_file)
    # A file at the repo root has parent "."; pass no subfolder in that case.
    parent = matched_path.parent.as_posix()
    subfolder = None if parent == "." else parent

    # hf_hub_download returns the local path of the file (inside local_dir when
    # given, otherwise inside the cache), so use it directly instead of
    # re-deriving the location by hand, which would drop the subfolder.
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=matched_path.name,
        subfolder=subfolder,
        local_dir=cast(Union[str, Path, None], local_dir),
        local_dir_use_symlinks=local_dir_use_symlinks,
        cache_dir=cast(Union[str, Path, None], cache_dir),
    )

    return cls(
        mmproj_path=model_path,
        **kwargs,
    )
class GenericMTMDChatHandler(MTMDChatHandler):
    """
    MTMD chat handler driven by a caller-supplied chat template string.

    The template is stored in ``chat_format``. On every call the handler records
    which of ``KNOWN_MEDIA_TAGS`` occur in that template in
    ``_chat_format_parser_tags`` before delegating to the parent implementation.
    """

    # Media placeholder tags looked for in the chat template
    KNOWN_MEDIA_TAGS = [
        "<|image_pad|>",
        "<|audio_pad|>",
        "<|video_pad|>",
        "<|image|>",
        "<|audio|>",
        "<|video|>",
        "[IMG]"
    ]

    def __init__(
        self,
        chat_format: str,
        mmproj_path: str,
        verbose: bool = True,
        **kwargs
    ) -> None:
        """
        Store the template, optionally echo it, then initialise the parent handler.

        Raises:
            ValueError: If ``chat_format`` is ``None``.
        """
        self.chat_format = chat_format
        if chat_format is None:
            raise ValueError("Failed to get model chat template automatically.")

        self.verbose = verbose
        if verbose:
            print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)

        super().__init__(mmproj_path = mmproj_path, verbose = verbose, **kwargs)

    def __call__(self, **kwargs):
        """Record the media tags present in the template, then run the parent ``__call__``."""
        present_tags = []
        for tag in self.KNOWN_MEDIA_TAGS:
            if tag in self.chat_format:
                present_tags.append(tag)
        self._chat_format_parser_tags = present_tags

        if self.verbose:
            print(f"{self.log_prefix} - Start processing")

        # Everything else is handled by the parent implementation
        return super().__call__(**kwargs)
class MoondreamChatHandler(MTMDChatHandler):
    """Chat handler whose template renders a "Question: ... / Answer:" transcript.

    Per user message the template emits every image reference (each followed
    by a blank line) and then every text part as ``Question: <text>``.
    Assistant messages are emitted as ``Answer:<content>``. When
    ``add_generation_prompt`` is set the prompt ends with ``Answer:``.
    Messages with any other role produce no output.
    """

    CHAT_FORMAT = (
        "{% for message in messages %}"
        # ---- user turn ----
        "{% if message.role == 'user' %}"
        # List-style content: image references first ...
        "{% if message.content is iterable %}"
        "{% for content in message.content %}{% if content.type == 'image_url' %}"
        "{% if content.image_url is string %}{{ content.image_url }}\n\n{% endif %}"
        "{% if content.image_url is mapping %}{{ content.image_url.url }}\n\n{% endif %}"
        "{% endif %}{% endfor %}"
        # ... then the text parts, each as its own question.
        "{% for content in message.content %}{% if content.type == 'text' %}"
        "Question: {{ content.text }}\n\n"
        "{% endif %}{% endfor %}"
        "{% endif %}"
        # Plain-string content is a single question.
        "{% if message.content is string %}Question: {{ message.content }}\n\n{% endif %}"
        "{% endif %}"
        # ---- assistant turn ----
        "{% if message.role == 'assistant' %}Answer:{{ message.content }}\n\n{% endif %}"
        "{% endfor %}"
        # ---- generation prompt ----
        "{% if add_generation_prompt %}Answer:{% endif %}"
    )
class Llama3VisionAlphaChatHandler(MTMDChatHandler):
    """Chat handler whose template uses ``<|start_header_id|>`` / ``<|eot_id|>`` framing.

    A user message renders as
    ``<|start_header_id|>user<|end_header_id|>\\n\\n`` followed by its image
    references, then its text parts, then ``<|eot_id|>``. An assistant message
    renders the same way with the ``assistant`` header and its content. When
    ``add_generation_prompt`` is set, an open assistant header is appended.
    """

    # NOTE(review): "<|start_header_id|>" and "<|eot_id|>" are emitted for
    # every message, but only the 'user' and 'assistant' branches write a role
    # name and content. A message with any other role (e.g. 'system') is
    # therefore rendered as a bare "<|start_header_id|><|eot_id|>" -- confirm
    # whether such messages are meant to be supported.
    CHAT_FORMAT = (
        "{% for message in messages %}"
        "<|start_header_id|>"
        # ---- user turn ----
        "{% if message.role == 'user' %}"
        "user<|end_header_id|>\n\n"
        # List-style content: image references first ...
        "{% if message.content is iterable %}"
        "{% for content in message.content %}{% if content.type == 'image_url' %}"
        "{% if content.image_url is string %}{{ content.image_url }}{% endif %}"
        "{% if content.image_url is mapping %}{{ content.image_url.url }}{% endif %}"
        "{% endif %}{% endfor %}"
        # ... then the text parts.
        "{% for content in message.content %}{% if content.type == 'text' %}"
        "{{ content.text }}"
        "{% endif %}{% endfor %}"
        "{% endif %}"
        # Plain-string content.
        "{% if message.content is string %}{{ message.content }}{% endif %}"
        "{% endif %}"
        # ---- assistant turn ----
        "{% if message.role == 'assistant' %}"
        "assistant<|end_header_id|>\n\n"
        "{{ message.content }}"
        "{% endif %}"
        "<|eot_id|>"
        "{% endfor %}"
        # ---- generation prompt ----
        "{% if add_generation_prompt %}"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        "{% endif %}"
    )


# Shorter alias for the handler class above.
Llama3VisionAlpha = Llama3VisionAlphaChatHandler
+ """ + + # Model specific control tokens + MINICPMV_BOS_TOKEN = "<|im_start|>" + MINICPMV_EOS_TOKEN = "<|im_end|>" + MINICPMV_PAD_TOKEN = "<|endoftext|>" + + # Image placeholder tags + MINICPMV_IMAGE_START_TOKEN = "" + MINICPMV_IMAGE_END_TOKEN = "" + MINICPMV_IMAGE_ID_START_TOKEN = "" + MINICPMV_IMAGE_ID_END_TOKEN = "" + + CHAT_FORMAT = ( + # --- 1. First System Message & Tools Definitions --- + "{%- if tools %}" + "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}" + "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}" + "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}" + "{{- 'You are provided with function signatures within XML tags:\\n' }}" + "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}" + "{{- '\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\"name\": , \"arguments\": }\\n" + MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- elif messages[0].role == 'system' %}" + "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}" + "{%- endif %}" + + # --- 2. 
Message Stream Processing --- + "{% set image_count = namespace(value=0) %}" + "{%- for message in messages %}" + # --- Unified Role Handling (User, Assistant, and subsequent Systems) --- + "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}" + "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}" + + "{%- set content = message.content %}" + "{%- if content is not string %}" + "{%- set ns = namespace(content_str='') %}" + "{%- for item in content %}" + # --- Explicit image_url type and value checking --- + "{%- if item.type == 'image_url' %}" + "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}" + "{%- set image_count.value = image_count.value + 1 %}" + # Format: N: IMAGE_URL + "{%- set ns.content_str = ns.content_str + '' + (image_count.value | string) + ': ' + image_url + '' %}" + "{%- elif item.type == 'text' %}" + "{%- set ns.content_str = ns.content_str + item.text %}" + "{%- endif %}" + "{%- endfor %}" + "{%- set content = ns.content_str %}" + "{%- endif %}" + + "{{- content -}}" + + # Append tool_calls to assistant messages if they exist + "{%- if message.role == 'assistant' and message.tool_calls %}" + "{%- for tool_call in message.tool_calls %}" + "{%- set tc = tool_call.function if tool_call.function else tool_call %}" + "{{- '\\n\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}" + "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}" + "{{- '}\\n' }}" + "{%- endfor %}" + "{%- endif %}" + "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}" + + # --- Specialized Tool Response Handling --- + # Group consecutive tool responses under a single user-like block + "{%- elif message.role == 'tool' %}" + "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}" + "{{- '" + MINICPMV_BOS_TOKEN + "user' }}" + "{%- endif %}" + "{{- '\\n\\n' + message.content + '\\n' }}" + "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}" + "{{- '" + 
def __init__(self, enable_thinking: bool = True, **kwargs):
    """
    Set up the MiniCPM-V 4.5 handler.

    Args:
        enable_thinking (bool): Stored on the instance and injected into the
            chat template as `enable_thinking` on every call.
        **kwargs: Passed through unchanged to the base MTMDChatHandler.
    """
    self.enable_thinking = enable_thinking
    super().__init__(**kwargs)

def __call__(self, **kwargs):
    """
    Prepare template arguments and stop tokens, then run the base handler.

    Any `stop` value supplied by the caller is replaced by the handler's own
    EOS/PAD tokens. `kwargs['llama']` is required (KeyError otherwise).
    """
    thinking = self.enable_thinking

    # Expose the thinking switch to the Jinja template.
    self.extra_template_arguments["enable_thinking"] = thinking

    # Stop generation on the model's end-of-turn and padding tokens.
    kwargs.update(stop=[self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN])

    llama = kwargs['llama']

    # Zero the token buffer when the llama object has one.
    # NOTE(review): presumably this discards stale tokens from a previous
    # run -- confirm the intent against the Llama class.
    if hasattr(llama, 'input_ids'):
        llama.input_ids.fill(0)

    if self.verbose:
        print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")

    return super().__call__(**kwargs)
+ """ + + # Core tokens + MINICPM_BOS_TOKEN = "<|im_start|>" + MINICPM_EOS_TOKEN = "<|im_end|>" + MINICPM_PAD_TOKEN = "<|endoftext|>" + + # Vision tokens + MINICPM_VISION_BOS_TOKEN = "<|vision_start|>" + MINICPM_VISION_EOS_TOKEN = "<|vision_end|>" + MINICPM_IMAGE_TOKEN = "<|image_pad|>" + MINICPM_VIDEO_TOKEN = "<|video_pad|>" + + CHAT_FORMAT = ( + "{%- if enable_thinking is not defined -%}\n" + " {%- set enable_thinking = false -%}\n" + "{%- endif -%}\n" + "{%- macro render_content(content, is_system_content=false) -%}\n" + " {%- if content is string -%}\n" + " {{- content -}}\n" + " {%- elif content is iterable and content is not mapping -%}\n" + " {%- set ns = namespace(parts=[]) -%}\n" + " {%- for item in content -%}\n" + " {%- if 'image' in item or 'image_url' in item or item.type == 'image' -%}\n" + " {%- if is_system_content -%}\n" + " {{- raise_exception('System message cannot contain images.') -}}\n" + " {%- endif -%}\n" + " {%- set url_val = '' -%}\n" + " {%- if item.type == 'image_url' -%}\n" + " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" + " {%- endif -%}\n" + " {%- set ns.parts = ns.parts + ['<|image_pad|>' + url_val] -%}\n" + # " {%- elif 'video' in item or 'video_url' in item or item.type == 'video' -%}\n" + # " {%- if is_system_content -%}\n" + # " {{- raise_exception('System message cannot contain videos.') -}}\n" + # " {%- endif -%}\n" + # " {%- set url_val = '' -%}\n" + # " {%- if item.type == 'video_url' -%}\n" + # " {%- set url_val = item.video_url if item.video_url is string else item.video_url.url -%}\n" + # " {%- endif -%}\n" + # " {%- set ns.parts = ns.parts + ['<|video_pad|>' + url_val] -%}\n" + " {%- elif 'text' in item -%}\n" + " {%- set ns.parts = ns.parts + [item.text] -%}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected item type in content.') -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- ns.parts | join('\\n') -}}\n" + " {%- elif content is none or content is 
undefined -%}\n" + " {{- '' -}}\n" + " {%- else -%}\n" + " {{- raise_exception('Unexpected content type.') -}}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "{%- if not messages %}\n" + " {{- raise_exception('No messages provided.') }}\n" + "{%- endif %}\n" + "{%- if tools and tools is iterable and tools is not mapping %}\n" + " {{- '<|im_start|>system\\n' }}\n" + " {{- '# Tools\\n\\nYou have access to the following functions:\\n\\n' }}\n" + " {%- for tool in tools %}\n" + " {{- '\\n' }}\n" + " {{- tool | tojson }}\n" + " {%- endfor %}\n" + " {{- '\\n' }}\n" + " {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n\\n\\n\\nvalue_1\\n\\n\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n\\n\\n\\n\\n\\nReminder:\\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n' }}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {%- if content %}\n" + " {{- '\\n\\n' + content }}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {{- '<|im_end|>\\n' }}\n" + "{%- else %}\n" + " {%- if messages[0].role == 'system' %}\n" + " {%- set content = render_content(messages[0].content, true)|trim %}\n" + " {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n" + " {%- endif %}\n" + "{%- endif %}\n" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n" + "{%- for message in messages[::-1] %}\n" + " {%- set index = (messages|length - 1) - loop.index0 %}\n" + " {%- if ns.multi_step_tool and message.role == 'user' %}\n" + " {%- set content = 
render_content(message.content)|trim %}\n" + " {%- if not(content.startswith('') and content.endswith('')) %}\n" + " {%- set ns.multi_step_tool = false %}\n" + " {%- set ns.last_query_index = index %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + "{%- endfor %}\n" + "{%- if ns.multi_step_tool %}\n" + " {{- raise_exception('No user query found in messages.') }}\n" + "{%- endif %}\n" + "{%- for message in messages %}\n" + " {%- set content = render_content(message.content)|trim %}\n" + " {%- if message.role == 'system' %}\n" + " {%- if not loop.first %}\n" + " {{- raise_exception('System message must be at the beginning.') }}\n" + " {%- endif %}\n" + " {%- elif message.role == 'user' %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n" + " {%- elif message.role == 'assistant' %}\n" + " {%- set reasoning_content = '' %}\n" + " {%- if message.reasoning_content is string %}\n" + " {%- set reasoning_content = message.reasoning_content %}\n" + " {%- else %}\n" + " {%- if '' in content %}\n" + " {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n" + " {%- set content = content.split('')[-1].lstrip('\\n') %}\n" + " {%- endif %}\n" + " {%- endif %}\n" + " {%- set reasoning_content = reasoning_content|trim %}\n" + " {%- if loop.index0 > ns.last_query_index %}\n" + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n\\n' + content }}\n" + " {%- else %}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content }}\n" + " {%- endif %}\n" + " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n" + " {%- for tool_call in message.tool_calls %}\n" + " {%- if tool_call.function is defined %}\n" + " {%- set tool_call = tool_call.function %}\n" + " {%- endif %}\n" + " {%- if loop.first %}\n" + " {%- if content|trim %}\n" + " {{- '\\n\\n\\n\\n' }}\n" + " {%- else %}\n" + " {{- '\\n\\n' }}\n" + " {%- endif %}\n" + " {%- else %}\n" + " 
def __init__(self, enable_thinking: bool = True, **kwargs):
    """
    Initializes the MiniCPM-V-4.6 Handler.

    Args:
        enable_thinking (bool): Value injected into the chat template as the
            `enable_thinking` variable on every call. Defaults to True.
        **kwargs: Additional arguments for the base MTMDChatHandler.
    """
    # NOTE(review): this docstring previously claimed a default of False.
    # The chat template does fall back to `false`, but only when the variable
    # is undefined, and __call__ below always sets it. The effective default
    # is therefore the True in this signature. It is left unchanged to keep
    # the interface stable -- confirm which default is intended.
    self.enable_thinking = enable_thinking
    super().__init__(**kwargs)

def __call__(self, **kwargs):
    """
    Inject `enable_thinking`, force the handler's stop tokens, and delegate.

    Any `stop` value supplied by the caller is replaced by the handler's own
    PAD/EOS tokens before the base MTMDChatHandler is invoked.
    """
    # Inject the thinking variable into the Jinja environment
    self.extra_template_arguments["enable_thinking"] = self.enable_thinking

    # MiniCPM uses standard <|im_end|> ChatML stop formatting
    kwargs['stop'] = [self.MINICPM_PAD_TOKEN, self.MINICPM_EOS_TOKEN]

    if self.verbose:
        print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")

    return super().__call__(**kwargs)
trim }}" + "{% endif %}" + "{% endfor %}" + "{% else %}" + "{{ raise_exception('Invalid content type') }}" + "{% endif %}" + + "\n" + "{% endfor %}" + + "{% if add_generation_prompt %}" + "model\n" + "{% endif %}" + ) + + +class Gemma4ChatHandler(MTMDChatHandler): + """ + Handler for Gemma 4 models. + + Note on `enable_thinking`: + The `enable_thinking` toggle is currently ONLY supported by Gemma4 31B and 26BA4B models. + It is NOT supported by Gemma4 E2B and E4B models. + + [Important Note for Audio Processing!] + It is recommended to use BF16 mmproj for Gemma4 E2B and E4B models. + Other quantizations are known to have degraded performance; + ref comment: https://github.com/ggml-org/llama.cpp/pull/21421#issuecomment-4230306463 + """ + + # The special token in Gemma 4 + GEMMA4_BOI_TOKEN = "<|image>" + GEMMA4_EOI_TOKEN = "" + GEMMA4_BOA_TOKEN = "<|audio>" + GEMMA4_EOA_TOKEN = "" + GEMMA4_BOS_TOKEN = "" + GEMMA4_EOS_TOKEN = "" + GEMMA4_SOT_TOKEN = "<|turn>" + GEMMA4_EOT_TOKEN = "" + GEMMA4_SOC_TOKEN = "<|channel>" + GEMMA4_EOC_TOKEN = "" + GEMMA4_STC_TOKEN = "<|tool_call>" + GEMMA4_ETC_TOKEN = "" + GEMMA4_STD_TOKEN = "<|tool>" + GEMMA4_ETD_TOKEN = "" + GEMMA4_STR_TOKEN = "<|tool_response>" + GEMMA4_ETR_TOKEN = "" + + CHAT_FORMAT = ( + "{%- macro format_parameters(properties, required, filter_keys=false) -%}\n" + " {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}\n" + " {%- set ns = namespace(found_first=false) -%}\n" + " {%- for key, value in properties | dictsort -%}\n" + " {%- set add_comma = false -%}\n" + " {%- if not filter_keys or key not in standard_keys -%}\n" + " {%- if ns.found_first %},{% endif -%}\n" + " {%- set ns.found_first = true -%}\n" + " {{ key }}:{\n" + " {%- if value['description'] -%}\n" + " description:<|\"|>{{ value['description'] }}<|\"|>\n" + " {%- set add_comma = true -%}\n" + " {%- endif -%}\n" + " {%- if value['type'] | upper == 'STRING' -%}\n" + " {%- if value['enum'] -%}\n" + " {%- if add_comma 
%},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " enum:{{ format_argument(value['enum']) }}\n" + " {%- endif -%}\n" + " {%- elif value['type'] | upper == 'ARRAY' -%}\n" + " {%- if value['items'] is mapping and value['items'] -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " items:{\n" + " {%- set ns_items = namespace(found_first=false) -%}\n" + " {%- for item_key, item_value in value['items'] | dictsort -%}\n" + " {%- if item_value is not none -%}\n" + " {%- if ns_items.found_first %},{% endif -%}\n" + " {%- set ns_items.found_first = true -%}\n" + " {%- if item_key == 'properties' -%}\n" + " properties:{\n" + " {%- if item_value is mapping -%}\n" + " {{- format_parameters(item_value, value['items']['required'] | default([])) -}}\n" + " {%- endif -%}\n" + " }\n" + " {%- elif item_key == 'required' -%}\n" + " required:[\n" + " {%- for req_item in item_value -%}\n" + " <|\"|>{{- req_item -}}<|\"|>\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " ]\n" + " {%- elif item_key == 'type' -%}\n" + " {%- if item_value is string -%}\n" + " type:{{ format_argument(item_value | upper) }}\n" + " {%- else -%}\n" + " type:{{ format_argument(item_value | map('upper') | list) }}\n" + " {%- endif -%}\n" + " {%- else -%}\n" + " {{ item_key }}:{{ format_argument(item_value) }}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " }\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if value['nullable'] %}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " nullable:true\n" + " {%- endif -%}\n" + " {%- if value['type'] | upper == 'OBJECT' -%}\n" + " {%- if value['properties'] is defined and value['properties'] is mapping -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " properties:{\n" + " {{- format_parameters(value['properties'], value['required'] | default([])) -}}\n" + " }\n" + " {%- elif value is mapping 
-%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " properties:{\n" + " {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}\n" + " }\n" + " {%- endif -%}\n" + " {%- if value['required'] -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " required:[\n" + " {%- for item in value['required'] | default([]) -%}\n" + " <|\"|>{{- item -}}<|\"|>\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " ]\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}\n" + " type:<|\"|>{{ value['type'] | upper }}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + "{%- endmacro -%}\n" + "{%- macro format_function_declaration(tool_data) -%}\n" + " declaration:{{- tool_data['function']['name'] -}}{description:<|\"|>{{- tool_data['function']['description'] -}}<|\"|>\n" + " {%- set params = tool_data['function']['parameters'] -%}\n" + " {%- if params -%}\n" + " ,parameters:{\n" + " {%- if params.get('properties') -%}\n" + " properties:{ {{- format_parameters(params['properties'], params['required']) -}} },\n" + " {%- endif -%}\n" + " {%- if params.get('required') -%}\n" + " required:[\n" + " {%- for item in params['required'] -%}\n" + " <|\"|>{{- item -}}<|\"|>\n" + " {{- ',' if not loop.last -}}\n" + " {%- endfor -%}\n" + " ],\n" + " {%- endif -%}\n" + " {%- if params.get('type') -%}\n" + " type:<|\"|>{{- params['type'] | upper -}}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if 'response' in tool_data['function'] -%}\n" + " {%- set response_declaration = tool_data['function']['response'] -%}\n" + " ,response:{\n" + " {%- if response_declaration['description'] -%}\n" + " description:<|\"|>{{- response_declaration['description'] -}}<|\"|>,\n" + " {%- endif -%}\n" + " {%- if response_declaration['type'] | upper == 'OBJECT' -%}\n" + " type:<|\"|>{{- 
response_declaration['type'] | upper -}}<|\"|>}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " }\n" + "{%- endmacro -%}\n" + "{%- macro format_argument(argument, escape_keys=True) -%}\n" + " {%- if argument is string -%}\n" + " {{- '<|\"|>' + argument + '<|\"|>' -}}\n" + " {%- elif argument is boolean -%}\n" + " {{- 'true' if argument else 'false' -}}\n" + " {%- elif argument is mapping -%}\n" + " {{- '{' -}}\n" + " {%- set ns = namespace(found_first=false) -%}\n" + " {%- for key, value in argument | dictsort -%}\n" + " {%- if ns.found_first %},{% endif -%}\n" + " {%- set ns.found_first = true -%}\n" + " {%- if escape_keys -%}\n" + " {{- '<|\"|>' + key + '<|\"|>' -}}\n" + " {%- else -%}\n" + " {{- key -}}\n" + " {%- endif -%}\n" + " :{{- format_argument(value, escape_keys=escape_keys) -}}\n" + " {%- endfor -%}\n" + " {{- '}' -}}\n" + " {%- elif argument is sequence -%}\n" + " {{- '[' -}}\n" + " {%- for item in argument -%}\n" + " {{- format_argument(item, escape_keys=escape_keys) -}}\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " {{- ']' -}}\n" + " {%- else -%}\n" + " {{- argument -}}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "{%- macro strip_thinking(text) -%}\n" + " {%- set ns = namespace(result='') -%}\n" + " {%- for part in text.split('') -%}\n" + " {%- if '<|channel>' in part -%}\n" + " {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}\n" + " {%- else -%}\n" + " {%- set ns.result = ns.result + part -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- ns.result | trim -}}\n" + "{%- endmacro -%}\n" + "\n" + "{%- macro format_tool_response_block(tool_name, response) -%}\n" + " {{- '<|tool_response>' -}}\n" + " {%- if response is mapping -%}\n" + " {{- 'response:' + tool_name + '{' -}}\n" + " {%- for key, value in response | dictsort -%}\n" + " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" + " {%- if not loop.last %},{% endif -%}\n" + " {%- endfor -%}\n" + " {{- '}' -}}\n" + " {%- else -%}\n" 
+ " {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n" + " {%- endif -%}\n" + " {{- '' -}}\n" + "{%- endmacro -%}\n" + "\n" + "{%- set ns = namespace(prev_message_type=None) -%}\n" + "{%- set loop_messages = messages -%}\n" + "{{- bos_token -}}\n" + "{#- Handle System/Tool Definitions Block -#}\n" + "{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}\n" + " {{- '<|turn>system\\n' -}}\n" + " {#- Inject Thinking token at the very top of the FIRST system turn -#}\n" + " {%- if enable_thinking is defined and enable_thinking -%}\n" + " {{- '<|think|>\\n' -}}\n" + " {%- set ns.prev_message_type = 'think' -%}\n" + " {%- endif -%}\n" + " {%- if messages[0]['role'] in ['system', 'developer'] -%}\n" + " {%- if messages[0]['content'] is string -%}\n" + " {{- messages[0]['content'] | trim -}}\n" + " {%- elif messages[0]['content'] is sequence -%}\n" + " {%- for item in messages[0]['content'] -%}\n" + " {{- item['text'] | trim + ' '-}}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- set loop_messages = messages[1:] -%}\n" + " {%- endif -%}\n" + " {%- if tools -%}\n" + " {%- for tool in tools %}\n" + " {{- '<|tool>' -}}\n" + " {{- format_function_declaration(tool) | trim -}}\n" + " {{- '' -}}\n" + " {%- endfor %}\n" + " {%- set ns.prev_message_type = 'tool' -%}\n" + " {%- endif -%}\n" + " {{- '\\n' -}}\n" + "{%- endif %}\n" + "\n" + "{#- Pre-scan: find last user message index for reasoning guard -#}\n" + "{%- set ns_turn = namespace(last_user_idx=-1) -%}\n" + "{%- for i in range(loop_messages | length) -%}\n" + " {%- if loop_messages[i]['role'] == 'user' -%}\n" + " {%- set ns_turn.last_user_idx = i -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{#- Loop through messages -#}\n" + "{%- for message in loop_messages -%}\n" + " {%- if message['role'] != 'tool' -%}\n" + " {%- set ns.prev_message_type = None -%}\n" + " {%- set role = 'model' if 
message['role'] == 'assistant' else message['role'] -%}\n" + " {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n" + " {%- set prev_nt = namespace(role=None, found=false) -%}\n" + " {%- if loop.index0 > 0 -%}\n" + " {%- for j in range(loop.index0 - 1, -1, -1) -%}\n" + " {%- if not prev_nt.found -%}\n" + " {%- if loop_messages[j]['role'] != 'tool' -%}\n" + " {%- set prev_nt.role = loop_messages[j]['role'] -%}\n" + " {%- set prev_nt.found = true -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n" + " {%- if not continue_same_model_turn -%}\n" + " {{- '<|turn>' + role + '\\n' }}\n" + " {%- endif -%}\n" + "\n" + " {#- Render reasoning/reasoning_content as thinking channel -#}\n" + " {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n" + " {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n" + " {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n" + " {%- endif -%}\n" + "\n" + " {%- if message.get('tool_calls') -%}\n" + " {%- for tool_call in message['tool_calls'] -%}\n" + " {%- set function = tool_call['function'] -%}\n" + " {{- '<|tool_call>call:' + function['name'] + '{' -}}\n" + " {%- if function['arguments'] is mapping -%}\n" + " {%- set ns_args = namespace(found_first=false) -%}\n" + " {%- for key, value in function['arguments'] | dictsort -%}\n" + " {%- if ns_args.found_first %},{% endif -%}\n" + " {%- set ns_args.found_first = true -%}\n" + " {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n" + " {%- endfor -%}\n" + " {%- elif function['arguments'] is string -%}\n" + " {{- function['arguments'] -}}\n" + " {%- endif -%}\n" + " {{- '}' -}}\n" + " {%- endfor -%}\n" + " {%- set ns.prev_message_type = 'tool_call' -%}\n" + " {%- endif -%}\n" + "\n" + " {%- set ns_tr_out = 
namespace(flag=false) -%}\n" + " {%- if message.get('tool_responses') -%}\n" + " {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n" + " {%- for tool_response in message['tool_responses'] -%}\n" + " {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n" + " {%- set ns_tr_out.flag = true -%}\n" + " {%- set ns.prev_message_type = 'tool_response' -%}\n" + " {%- endfor -%}\n" + " {%- elif message.get('tool_calls') -%}\n" + " {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n" + " {%- set ns_tool_scan = namespace(stopped=false) -%}\n" + " {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n" + " {%- if ns_tool_scan.stopped -%}\n" + " {%- elif loop_messages[k]['role'] != 'tool' -%}\n" + " {%- set ns_tool_scan.stopped = true -%}\n" + " {%- else -%}\n" + " {%- set follow = loop_messages[k] -%}\n" + " {#- Resolve tool_call_id to function name -#}\n" + " {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n" + " {%- for tc in message['tool_calls'] -%}\n" + " {%- if tc.get('id') == follow.get('tool_call_id') -%}\n" + " {%- set ns_tname.name = tc['function']['name'] -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {#- Handle content as string or content-parts array -#}\n" + " {%- set tool_body = follow.get('content') -%}\n" + " {%- if tool_body is string -%}\n" + " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" + " {%- elif tool_body is sequence and tool_body is not string -%}\n" + " {%- set ns_txt = namespace(s='') -%}\n" + " {%- for part in tool_body -%}\n" + " {%- if part.get('type') == 'text' -%}\n" + " {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n" + " {%- for part in tool_body -%}\n" + " {%- if part.get('type') == 'image_url' -%}\n" + " {%- set url_val = 
part['image_url'] if part['image_url'] is string else part['image_url']['url'] -%}\n" + " {{- '<|image|>' + url_val -}}\n" + " {%- elif part.get('type') in ['audio_url', 'input_audio'] -%}\n" + " {%- if part.get('type') == 'audio_url' -%}\n" + " {%- set audio_val = part['audio_url'] if part['audio_url'] is string else part['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif part.get('type') == 'input_audio' -%}\n" + " {%- set audio_val = part['input_audio'] if part['input_audio'] is string else ('data:audio/' + part['input_audio']['format'] + ';base64,' + part['input_audio']['data']) -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- endif -%}\n" + # " {%- elif part.get('type') == 'video_url' -%}\n" + # " {%- set video_val = part['video_url'] if part['video_url'] is string else part['video_url']['url'] -%}\n" + # " {{- '<|video|>' + video_val -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- else -%}\n" + " {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n" + " {%- endif -%}\n" + " {%- set ns_tr_out.flag = true -%}\n" + " {%- set ns.prev_message_type = 'tool_response' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "\n" + " {%- set captured_content -%}\n" + " {%- if message['content'] is string -%}\n" + " {%- if role == 'model' -%}\n" + " {{- strip_thinking(message['content']) -}}\n" + " {%- else -%}\n" + " {{- message['content'] | trim -}}\n" + " {%- endif -%}\n" + " {%- elif message['content'] is sequence -%}\n" + " {%- for item in message['content'] -%}\n" + " {%- if item['type'] == 'text' -%}\n" + " {%- if role == 'model' -%}\n" + " {{- strip_thinking(item['text']) -}}\n" + " {%- else -%}\n" + " {{- item['text'] | trim -}}\n" + " {%- endif -%}\n" + " {%- elif item['type'] == 'image_url' -%}\n" + " {%- set url_val = item['image_url'] if item['image_url'] is string else item['image_url']['url'] -%}\n" + " {{- '<|image|>' + url_val -}}\n" + " {%- set ns.prev_message_type = 'image' -%}\n" + 
" {%- elif item['type'] in ['audio_url', 'input_audio'] -%}\n" + " {%- if item['type'] == 'audio_url' -%}\n" + " {%- set audio_val = item['audio_url'] if item['audio_url'] is string else item['audio_url']['url'] -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- elif item['type'] == 'input_audio' -%}\n" + " {%- set audio_val = item['input_audio'] if item['input_audio'] is string else ('data:audio/' + item['input_audio']['format'] + ';base64,' + item['input_audio']['data']) -%}\n" + " {{- '<|audio|>' + audio_val -}}\n" + " {%- endif -%}\n" + " {%- set ns.prev_message_type = 'audio' -%}\n" + " {%- endif -%}\n" + # " {%- elif item['type'] == 'video_url' -%}\n" + # " {%- set video_val = item['video_url'] if item['video_url'] is string else item['video_url']['url'] -%}\n" + # " {{- '<|video|>' + video_val -}}\n" + # " {%- set ns.prev_message_type = 'video' -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- endset -%}\n" + "\n" + " {{- captured_content -}}\n" + " {%- set has_content = captured_content | trim | length > 0 -%}\n" + "\n" + " {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}\n" + " {{- '<|tool_response>' -}}\n" + " {%- elif not (ns_tr_out.flag and not has_content) -%}\n" + " {{- '\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- if add_generation_prompt -%}\n" + " {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}\n" + " {{- '<|turn>model\\n' -}}\n" + " {%- if not enable_thinking | default(false) -%}\n" + " {{- '<|channel>thought\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endif -%}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the Gemma 4 Handler. + + Args: + enable_thinking (bool): Controls whether the <|think|> tag is injected and + manages <|channel>thought behavior. + Note: ONLY supported on Gemma4 31B and 26BA4B models. + NOT supported on Gemma4 E2B and E4B models. 
def __call__(self, **kwargs):
    """Prepare a Gemma 4 request and hand it to the parent chat handler.

    The ``enable_thinking`` flag is published to the template arguments
    (the chat template branches on it), and any caller-supplied ``stop``
    value is replaced by the handler's three Gemma 4 terminator tokens.

    Args:
        **kwargs: Chat-completion arguments, forwarded unchanged except
            for ``stop``.

    Returns:
        Whatever the parent ``__call__`` returns.
    """
    # Make the flag visible to the Jinja template.
    self.extra_template_arguments["enable_thinking"] = self.enable_thinking

    # generation_config.json: "eos_token_id": [1, 106, 50] -- one stop
    # string per end-of-generation id.
    terminators = [
        self.GEMMA4_EOS_TOKEN,
        self.GEMMA4_EOT_TOKEN,
        self.GEMMA4_STR_TOKEN,
    ]
    kwargs['stop'] = terminators

    if self.verbose:
        print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")

    return super().__call__(**kwargs)
self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN + # https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking/blob/main/generation_config.json + stop_tokens = [self.GLM41V_EOS_TOKEN, "<|user|>", "<|observation|>", ""] # Stop token patch + kwargs['stop'] = stop_tokens + + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix} - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + + +class GLM46VChatHandler(MTMDChatHandler): + GLM46V_EOS_TOKEN = "<|endoftext|>" + GLM46V_PAD_TOKEN = "<|endoftext|>" + GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>" + GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>" + + CHAT_FORMAT = ( + "[gMASK]" + "{%- if tools -%}" + "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n" + "You are provided with function signatures within XML tags:\n\n" + "{%- for tool in tools -%}" + "{{ tool | tojson(ensure_ascii=False) }}\n" + "{%- endfor -%}" + "\n\nFor each function call, output the function name and arguments within the following XML format:\n" + "{function-name}\n{arg-key-1}\n{arg-value-1}\n...\n" + "{%- endif -%}" + + "{%- for m in messages -%}" + "{%- if m.role == 'system' -%}" + "<|system|>\n{{ m.content }}" + "{%- elif m.role == 'user' -%}" + "<|user|>\n" + "{%- if m.content is string -%}" + "{{ m.content }}" + "{%- else -%}" + "{%- for item in m.content -%}" + "{%- if item.type == 'image_url' or 'image_url' in item -%}" + "<|begin_of_image|>" + "{%- if item.image_url is string -%}" + "{{- item.image_url -}}" + "{%- else -%}" + "{{- item.image_url.url -}}" + "{%- endif -%}" + "<|end_of_image|>" + "{%- elif item.type == 'text' -%}" + "{{ item.text }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + # If enable_thinking is disabled, insert `/nothink` according to the source code logic. 
def __call__(self, **kwargs):
    """Prepare a GLM-4.6V request and delegate to the parent handler.

    Exposes ``enable_thinking`` and the EOS token to the chat template,
    overrides ``stop`` with the GLM-4.6V terminators, and clears the
    model's ``input_ids`` buffer when the model object has one.

    Args:
        **kwargs: Chat-completion arguments; must contain ``llama``.

    Returns:
        Whatever the parent ``__call__`` returns.

    Raises:
        KeyError: If ``llama`` is missing from ``kwargs``.
    """
    # Both values are referenced by name inside CHAT_FORMAT.
    self.extra_template_arguments["enable_thinking"] = self.enable_thinking
    self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN

    # Stop token patch, see
    # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json
    stop_strings = [
        self.GLM46V_EOS_TOKEN,
        "<|user|>",
        "<|observation|>",
        "<|code_middle|>",
    ]
    kwargs['stop'] = stop_strings

    model = kwargs['llama']
    if hasattr(model, 'input_ids'):
        # Overwrite the previous request's token ids with zeros.
        model.input_ids.fill(0)

    if self.verbose:
        print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")

    return super().__call__(**kwargs)
+ + """ + GRANITE_BOS_TOKEN = "<|start_of_role|>" + GRANITE_EOS_TOKEN = "<|end_of_text|>" + GRANITE_PAD_TOKEN = "<|end_of_text|>" + GRANITE_IMAGE_TOKEN = "" + + CHAT_FORMAT = ( + "{%- for message in messages -%}" + "{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}}" + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + "{%- for part in message['content'] -%}" + "{%- if part['type'] == 'text' -%}" + "{{- part['text'] -}}" + "{%- elif part['type'] == 'image_url' -%}" + "{%- if part.image_url is string -%}" + "{{- part.image_url -}}" + "{%- else -%}" + "{{- part.image_url.url -}}" + "{%- endif -%}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- '<|end_of_text|>\n' -}}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{- '<|start_of_role|>assistant' -}}" + # Support the 'controls' parameter if present in generation arguments + "{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%}" + "{{- '<|end_of_role|>' -}}" + "{%- endif -%}" + ) + + def __init__(self, controls: dict = None, **kwargs): + """ + Granite-Docling Handler + Args: + controls (dict, optional): Operational parameters passed to the assistant role. + + The 'controls' parameter is used to guide the model's behavior or output format. 
def __call__(self, **kwargs):
    """Prepare a Granite-Docling request and delegate to the parent handler.

    Publishes ``controls`` to the chat template, disables the default
    system message for this instance, forces ``stop`` to the Granite EOS
    token, and clears the model's ``input_ids`` buffer when present.

    Args:
        **kwargs: Chat-completion arguments; must contain ``llama``.

    Returns:
        Whatever the parent ``__call__`` returns.

    Raises:
        KeyError: If ``llama`` is missing from ``kwargs``.
    """
    # The generation prompt serialises `controls` when it is truthy.
    self.extra_template_arguments["controls"] = self.controls
    # Set on the instance, so the class-level default is left untouched.
    self.DEFAULT_SYSTEM_MESSAGE = None

    kwargs['stop'] = [self.GRANITE_EOS_TOKEN]

    model = kwargs['llama']
    buffer_present = hasattr(model, 'input_ids')
    if buffer_present:
        model.input_ids.fill(0)

    if self.verbose:
        print(f"{self.log_prefix} - Start processing")

    return super().__call__(**kwargs)
def __call__(self, **kwargs):
    """Clear the model's token buffer and run the parent chat handler.

    No template arguments or stop tokens are set here; the only work done
    before delegating is zero-filling ``llama.input_ids`` when the model
    object exposes that attribute.

    Args:
        **kwargs: Chat-completion arguments; must contain ``llama``.

    Returns:
        Whatever the parent ``__call__`` returns.

    Raises:
        KeyError: If ``llama`` is missing from ``kwargs``.
    """
    model = kwargs['llama']
    if hasattr(model, 'input_ids'):
        # Overwrite whatever the previous request left in the buffer.
        model.input_ids.fill(0)

    if self.verbose:
        print(f"{self.log_prefix} - Start processing")

    result = super().__call__(**kwargs)
    return result
def __init__(self, keep_past_thinking: bool = False, **kwargs):
    """Create an LFM2.5-VL handler.

    Args:
        keep_past_thinking: Stored on the instance and later passed to the
            chat template by ``__call__``. The template only trims earlier
            assistant turns when this is false.
        **kwargs: Forwarded untouched to the parent initialiser.
    """
    # Assigned before delegating, exactly as the parent call order requires
    # nothing from it but later `__call__` invocations do.
    self.keep_past_thinking = keep_past_thinking
    super().__init__(**kwargs)
Please reset it to between 64 and 256.") + self.image_min_tokens = -1 + + self.extra_template_arguments["keep_past_thinking"] = self.keep_past_thinking + + kwargs['stop'] = [self.LFM25VL_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(keep_past_thinking={self.keep_past_thinking}) - Start processing") + return super().__call__(**kwargs) + + +class PaddleOCRChatHandler(MTMDChatHandler): + """ + Handler for PaddleOCR 1.5/1.6 multimodal models. + """ + + PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>" + PADDLEOCR_BOS_TOKEN = "" + PADDLEOCR_EOS_TOKEN = "" + PADDLEOCR_SEP_TOKEN = "<|end_of_sentence|>" + PADDLEOCR_IMAGE_BOS_TOKEN = "<|IMAGE_START|>" + PADDLEOCR_IMAGE_EOS_TOKEN = "<|IMAGE_END|>" + + CHAT_FORMAT = ( + "{%- if not add_generation_prompt is defined -%}{%- set add_generation_prompt = true -%}{%- endif -%}" + "{%- if not cls_token is defined -%}{%- set cls_token = '" + PADDLEOCR_CLS_TOKEN + "' -%}{%- endif -%}" + "{%- if not eos_token is defined -%}{%- set eos_token = '" + PADDLEOCR_EOS_TOKEN + "' -%}{%- endif -%}" + + "{{- cls_token -}}" + "{%- for message in messages -%}" + "{%- if message['role'] == 'user' -%}" + "{{- 'User: ' -}}" + + # Robust parsing: Check if content is string or list + "{%- if message['content'] is string -%}" + "{{- message['content'] -}}" + "{%- else -%}" + # Pass 1: Render all images first + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'image_url' and 'image_url' in content -%}" + "{{- '<|IMAGE_START|>' -}}" + "{%- if content.image_url is string -%}" + "{{- content.image_url -}}" + "{%- else -%}" + "{{- content.image_url.url -}}" + "{%- endif -%}" + "{{- '<|IMAGE_END|>' -}}" + "{%- endif -%}" + "{%- endfor -%}" + + # Pass 2: Render all text second + "{%- for content in message['content'] -%}" + "{%- if content['type'] == 'text' -%}" + "{{- content['text'] -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{{- '\\n' -}}" + + "{%- elif message['role'] == 'assistant' -%}" + "{{- 
def __call__(self, **kwargs):
    """Prepare a PaddleOCR request and delegate to the parent handler.

    Replaces ``stop`` with the single PaddleOCR EOS token, clears the
    model's ``input_ids`` buffer when it exists, then forwards the call.

    Args:
        **kwargs: Chat-completion arguments; must contain ``llama``.

    Returns:
        Whatever the parent ``__call__`` returns.

    Raises:
        KeyError: If ``llama`` is missing from ``kwargs``.
    """
    # The template closes assistant turns with this token, so generation
    # should halt on it as well.
    eos_only = [self.PADDLEOCR_EOS_TOKEN]
    kwargs['stop'] = eos_only

    model = kwargs['llama']
    if hasattr(model, 'input_ids'):
        model.input_ids.fill(0)

    if self.verbose:
        print(f"{self.log_prefix} - Start processing")

    return super().__call__(**kwargs)
def __call__(self, **kwargs):
    """Prepare a Qwen2.5-VL request and delegate to the parent handler.

    Overrides ``stop`` with the handler's EOS and PAD tokens (in that
    order), clears the model's ``input_ids`` buffer when present, and
    forwards everything to the parent implementation.

    Args:
        **kwargs: Chat-completion arguments; must contain ``llama``.

    Returns:
        Whatever the parent ``__call__`` returns.

    Raises:
        KeyError: If ``llama`` is missing from ``kwargs``.
    """
    kwargs.update(stop=[self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN])

    model = kwargs['llama']
    if hasattr(model, 'input_ids'):
        # Zero the buffer so no ids from an earlier request remain.
        model.input_ids.fill(0)

    if self.verbose:
        print(f"{self.log_prefix} - Start processing")

    # The parent implementation does the actual rendering and evaluation.
    return super().__call__(**kwargs)
+ """ + + QWEN3_ASR_BOS_TOKEN = "<|im_start|>" + QWEN3_ASR_PAD_TOKEN = "<|endoftext|>" + QWEN3_ASR_EOS_TOKEN = "<|im_end|>" + + + QWEN3_ASR_AUDIO_BOS_TOKEN = "<|audio_start|>" + QWEN3_ASR_AUDIO_PAD_TOKEN = "<|audio_pad|>" + QWEN3_ASR_AUDIO_EOS_TOKEN = "<|audio_end|>" + + CHAT_FORMAT = ( + "{%- set ns = namespace(system_text='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.role == 'system' -%}\n" + " {%- if m.content is string -%}\n" + " {%- set ns.system_text = ns.system_text + m.content -%}\n" + " {%- else -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'text' and (c.text is defined) -%}\n" + " {%- set ns.system_text = ns.system_text + c.text -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- set ns2 = namespace(audio_tokens='') -%}\n" + "{%- for m in messages -%}\n" + " {%- if m.content is not string -%}\n" + " {%- for c in m.content -%}\n" + " {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) or c.type == 'input_audio' -%}\n" + " {#- MTMD Audio Injection -#}\n" + " {%- set audio_val = '' -%}\n" + " {%- if c.type == 'audio_url' or 'audio_url' in c -%}\n" + " {%- set audio_val = c.audio_url if c.audio_url is string else c.audio_url.url -%}\n" + " {%- elif c.type == 'input_audio' or 'input_audio' in c -%}\n" + " {%- set audio_val = c.input_audio if c.input_audio is string else ('data:audio/' + c.input_audio.format + ';base64,' + c.input_audio.data) -%}\n" + " {%- endif -%}\n" + " {%- set ns2.audio_tokens = ns2.audio_tokens + '<|audio_start|><|audio_pad|>' + audio_val + '<|audio_end|>' -%}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n" + "{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n' -}}\n" + "{%- endif 
def __call__(self, **kwargs):
    """Prepare a Qwen3-ASR request and delegate to the parent handler.

    Overrides ``stop`` with the text terminators ``<|endoftext|>`` and
    ``<|im_end|>``, clears the model's ``input_ids`` buffer when present,
    and forwards everything to the parent implementation.

    Args:
        **kwargs: Chat-completion arguments; must contain ``llama``.

    Returns:
        Whatever the parent ``__call__`` returns.

    Raises:
        KeyError: If ``llama`` is missing from ``kwargs``.
    """
    # Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as the
    # stop tokens. The previous code passed the *audio* wrapper markers
    # (QWEN3_ASR_AUDIO_PAD_TOKEN / QWEN3_ASR_AUDIO_EOS_TOKEN) instead. Those
    # only delimit audio input in the prompt, so the end-of-turn token was
    # never in the stop list.
    kwargs['stop'] = [self.QWEN3_ASR_PAD_TOKEN, self.QWEN3_ASR_EOS_TOKEN]

    llama = kwargs['llama']

    if hasattr(llama, 'input_ids'):
        # Overwrite the previous request's token ids with zeros.
        llama.input_ids.fill(0)

    if self.verbose:
        print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)")

    return super().__call__(**kwargs)
-}}" + "{%- endif -%}" + "{%- endif -%}" + "{%- if tools -%}" + "{{- '\n\n' -}}" + "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' -}}" + "{%- for tool in tools -%}" + "{{- '\n' -}}" + "{{- tool | tojson -}}" + "{%- endfor -%}" + "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\nRESPONSE FOR THE USER HERE\n\n{\"name\": , \"arguments\": }\n' -}}" + "{%- endif -%}" + "{{- '<|im_end|>\n' -}}" + "{%- set image_count = namespace(value=0) -%}" + #"{%- set video_count = namespace(value=0) -%}" + "{%- for message in messages -%}" + "{%- if message.role == 'tool' -%}" + "{{- '<|im_start|>user\n\n' -}}" + "{%- elif message.role != 'system' -%}" + "{{- '<|im_start|>' + message.role + '\n' -}}" + "{%- endif -%}" + "{%- if message.content is string and message.role != 'system' -%}" + "{{- message.content -}}" + "{%- elif message.role != 'system' -%}" + "{%- for content in message.content -%}" + "{%- if 'image_url' in content -%}" + "{%- set image_count.value = image_count.value + 1 -%}" + "{%- if add_vision_id -%}" + "{{- 'Picture ' -}}" + "{{- image_count.value | string -}}" + "{{- ': ' -}}" + "{%- endif -%}" + "{{- '<|vision_start|>' -}}" + "{%- if content.image_url is string -%}" + "{{- content.image_url -}}" + "{%- else -%}" + "{{- content.image_url.url -}}" + "{%- endif -%}" + "{{- '<|vision_end|>' -}}" + "{%- endif -%}" + # Video not supported yet + "{%- if 'text' in content -%}" + "{{- content.text -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- if message.role == 'assistant' -%}" + "{%- if message.tool_calls -%}" + "{%- for tool_call in message.tool_calls -%}" + "{%- if (loop.first and message.content) or (not loop.first) -%}" + "{{- '\n' -}}" + "{%- endif -%}" + "{%- if tool_call.function 
def __init__(
    self,
    force_reasoning: bool = False,
    add_vision_id: bool = True,
    **kwargs,
):
    """Create a Qwen3-VL handler and register its template switches.

    Parameters:
    - force_reasoning (bool):
        - True: The chat template adds a reasoning opener after the
          assistant generation prompt.
        - False (default): Nothing extra is added.
    - add_vision_id (bool):
        - True (default): Each image is prefixed with ``Picture N: ``.
          Recommended for multi-image prompts.
        - False: Images are not labelled, which saves tokens for
          single-image prompts.
    - **kwargs: Forwarded untouched to the parent initialiser.
    """
    # The parent initialiser runs first; the template-argument mapping is
    # only written to afterwards.
    super().__init__(**kwargs)
    self.force_reasoning = force_reasoning

    template_switches = (
        ("force_reasoning", force_reasoning),
        ("add_vision_id", add_vision_id),
    )
    for option, value in template_switches:
        self.extra_template_arguments[option] = value
+ """ + CHAT_FORMAT = ( + "{%- set image_count = namespace(value=0) -%}" + "{%- set video_count = namespace(value=0) -%}" + "{%- macro render_content(content, do_vision_count, is_system_content=false) -%}" + " {%- if content is string -%}" + " {{- content -}}" + " {%- elif content is iterable and content is not mapping -%}" + " {%- for item in content -%}" + " {%- if 'image_url' in item or item.type == 'image_url' -%}" + " {%- if is_system_content -%}" + " {{- raise_exception('System message cannot contain images.') -}}" + " {%- endif -%}" + " {%- if do_vision_count -%}" + " {%- set image_count.value = image_count.value + 1 -%}" + " {%- endif -%}" + " {%- if add_vision_id -%}" + " {{- 'Picture ' -}}" + " {{- image_count.value | string -}}" + " {{- ': ' -}}" + " {%- endif -%}" + " {{- '<|vision_start|>' -}}" + " {%- if item.image_url is string -%}" + " {{- item.image_url -}}" + " {%- else -%}" + " {{- item.image_url.url -}}" + " {%- endif -%}" + " {{- '<|vision_end|>' -}}" + " {%- elif 'video' in item -%}" + " {{- raise_exception('llama.cpp does not currently support video.') -}}" # Video not supported, raise exception + " {%- if is_system_content -%}" + " {{- raise_exception('System message cannot contain videos.') -}}" + " {%- endif -%}" + " {%- if do_vision_count -%}" + " {%- set video_count.value = video_count.value + 1 -%}" + " {%- endif -%}" + " {%- if add_vision_id -%}" + " {{- 'Video ' ~ video_count.value ~ ': ' -}}" + " {%- endif -%}" + " {{- '<|vision_start|>' -}}" + " {{- item.video -}}" + " {{- '<|vision_end|>' -}}" + " {%- elif 'text' in item -%}" + " {{- item.text -}}" + " {%- else -%}" + " {{- raise_exception('Unexpected item type in content.') -}}" + " {%- endif -%}" + " {%- endfor -%}" + " {%- elif content is none or content is undefined -%}" + " {{- '' -}}" + " {%- else -%}" + " {{- raise_exception('Unexpected content type.') -}}" + " {%- endif -%}" + "{%- endmacro -%}" + "{%- if not messages -%}" + " {{- raise_exception('No messages provided.') 
-}}" + "{%- endif -%}" + "{%- if tools and tools is iterable and tools is not mapping -%}" + " {{- '<|im_start|>system\n' -}}" + " {{- '# Tools\n\nYou have access to the following functions:\n\n' -}}" + " {%- for tool in tools -%}" + " {{- '\n' -}}" + " {{- tool | tojson -}}" + " {%- endfor -%}" + " {{- '\n' -}}" + " {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' -}}" + " {%- if messages[0].role == 'system' -%}" + " {%- set content = render_content(messages[0].content, false, true) | trim -%}" + " {%- if content -%}" + " {{- '\n\n' + content -}}" + " {%- endif -%}" + " {%- endif -%}" + " {{- '<|im_end|>\n' -}}" + "{%- elif messages[0].role == 'system' -%}" + " {%- set content = render_content(messages[0].content, false, true) -%}" + " {{- '<|im_start|>system\n' + content + '<|im_end|>\n' -}}" + "{%- endif -%}" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages | length - 1) -%}" + "{%- for message in messages[::-1] -%}" + " {%- set index = messages | length - 1 - loop.index0 -%}" + " {%- if ns.multi_step_tool and message.role == 'user' -%}" + " {%- set content = render_content(message.content, false) | trim -%}" + " {%- if not (content.startswith('') and content.endswith('')) -%}" + " {%- set ns.multi_step_tool = false -%}" + " {%- set ns.last_query_index = index -%}" + " {%- endif -%}" + " {%- endif -%}" + "{%- endfor -%}" + "{%- if ns.multi_step_tool -%}" + " {{- 
raise_exception('No user query found in messages.') -}}" + "{%- endif -%}" + "{%- for message in messages -%}" + " {%- set content = render_content(message.content, true) | trim -%}" + " {%- if message.role == 'system' -%}" + " {%- if not loop.first -%}" + " {{- raise_exception('System message must be at the beginning.') -}}" + " {%- endif -%}" + " {%- elif message.role == 'user' -%}" + " {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' -}}" + " {%- elif message.role == 'assistant' -%}" + " {%- set reasoning_content = '' -%}" + " {%- if message.reasoning_content is string -%}" + " {%- set reasoning_content = message.reasoning_content -%}" + " {%- elif '' in content -%}" + " {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') -%}" + " {%- set content = content.split('')[-1].lstrip('\n') -%}" + " {%- endif -%}" + " {%- set reasoning_content = reasoning_content | trim -%}" + " {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) -%}" + " {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content -}}" + " {%- else -%}" + " {{- '<|im_start|>' + message.role + '\n' + content -}}" + " {%- endif -%}" + " {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping -%}" + " {%- for tool_call in message.tool_calls -%}" + " {%- if tool_call.function is defined -%}" + " {%- set tool_call = tool_call.function -%}" + " {%- endif -%}" + " {%- if loop.first -%}" + " {%- if content | trim -%}" + " {{- '\n\n\n\n' -}}" + " {%- else -%}" + " {{- '\n\n' -}}" + " {%- endif -%}" + " {%- else -%}" + " {{- '\n\n\n' -}}" + " {%- endif -%}" + " {%- if tool_call.arguments is defined -%}" + " {%- for (args_name, args_value) in tool_call.arguments | items -%}" + " {{- '\n' -}}" + " {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}" + " {{- args_value -}}" + " {{- '\n' -}}" + 
" {%- endfor -%}" + " {%- endif -%}" + " {{- '\n' -}}" + " {%- endfor -%}" + " {%- endif -%}" + " {{- '<|im_end|>\n' -}}" + " {%- elif message.role == 'tool' -%}" + " {%- if loop.previtem and loop.previtem.role != 'tool' -%}" + " {{- '<|im_start|>user' -}}" + " {%- endif -%}" + " {{- '\n\n' -}}" + " {{- content -}}" + " {{- '\n' -}}" + " {%- if not loop.last and loop.nextitem.role != 'tool' -%}" + " {{- '<|im_end|>\n' -}}" + " {%- elif loop.last -%}" + " {{- '<|im_end|>\n' -}}" + " {%- endif -%}" + " {%- else -%}" + " {{- raise_exception('Unexpected message role.') -}}" + " {%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + " {{- '<|im_start|>assistant\n' -}}" + " {%- if enable_thinking is defined and enable_thinking is false -%}" + " {{- '\n\n\n\n' -}}" + " {%- else -%}" + " {{- '\n' -}}" + " {%- endif -%}" + "{%- endif -%}" + ) + + def __init__( + self, + add_vision_id: bool = True, + enable_thinking: bool = True, + preserve_thinking: bool = False, + **kwargs, + ): + """ + Parameters: + - add_vision_id (bool): + - True (default): Count all the images. Recommended for multi-image. + - False: Doesn't count the images. Can save tokens with single-image. + - enable_thinking (bool): + - True (default): Enables reasoning for better results. + - False: Disables reasoning for faster results. + - preserve_thinking (bool): + - True: Keeps reasoning process for ALL historical conversational turns. + - False (default): Only keeps for the latest assistant reply to save tokens. 
+ """ + super().__init__(**kwargs) + self.enable_thinking = enable_thinking + self.preserve_thinking = preserve_thinking + self.extra_template_arguments["add_vision_id"] = add_vision_id + self.extra_template_arguments["enable_thinking"] = enable_thinking + self.extra_template_arguments["preserve_thinking"] = preserve_thinking + + def __call__(self, **kwargs): + llama = kwargs['llama'] + + if hasattr(llama, 'input_ids'): + llama.input_ids.fill(0) + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}, preserve_thinking={self.preserve_thinking}) - Start processing") + + # Use parent implementation + return super().__call__(**kwargs) + + +class Step3VLChatHandler(MTMDChatHandler): + """ + Handler for Step3-VL models. + """ + + STEP3VL_BOS_TOKEN = "<|im_start|>" + STEP3VL_EOS_TOKEN = "<|im_end|>" + STEP3VL_PAD_TOKEN = "<|endoftext|>" + STEP3VL_IMAGE_TOKEN = "" + + CHAT_FORMAT = ( + "{%- macro render_content(content) -%}\n" + " {%- if content is none -%}{{- '' -}}\n" + " {%- elif content is string -%}{{- content -}}\n" + " {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n" + " {%- elif content is iterable -%}\n" + " {%- for item in content -%}\n" + " {%- if item.type == 'text' -%}\n" + " {{- item['value'] if 'value' in item else item['text'] -}}\n" + " {%- elif item.type in ['image', 'image_url'] -%}\n" + " {%- set url_val = '' -%}\n" + " {%- if item.image_url -%}\n" + " {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n" + " {%- endif -%}\n" + " {{- '' + url_val -}}\n" + " {%- endif -%}\n" + " {%- endfor -%}\n" + " {%- endif -%}\n" + "{%- endmacro -%}\n" + "\n" + "{%- if tools -%}\n" + " {{- '<|im_start|>system\\n' -}}\n" + " {%- if messages[0].role == 'system' -%}\n" + " {{- render_content(messages[0].content) + '\\n\\n' -}}\n" + " {%- endif -%}\n" + " {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou 
are provided with function signatures within XML tags:\\n' -}}\n" + " {%- for tool in tools -%}\n" + " {{- '\\n' -}}\n" + " {{- tool | tojson -}}\n" + " {%- endfor -%}\n" + " {{- '\\n\\n\\nAlways adhere to this exact format for tool use:\\n\\n\\n{\"name\": , \"arguments\": }\\n\\n{additional_tool_calls}\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within XML tags.\\n- `` must be an exact match to one of the available tools.\\n- `` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n" + "{%- else -%}\n" + " {%- if messages[0].role == 'system' -%}\n" + " {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n" + " {%- endif -%}\n" + "{%- endif -%}\n" + "\n" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n" + "{%- for message in messages[::-1] -%}\n" + " {%- set index = (messages|length - 1) - loop.index0 -%}\n" + " {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('') and render_content(message.content).endswith('')) -%}\n" + " {%- set ns.multi_step_tool = false -%}\n" + " {%- set ns.last_query_index = index -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "\n" + "{%- for message in messages -%}\n" + " {%- set content = render_content(message.content) -%}\n" + " {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n" + " {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n" + " {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n" + " {%- elif message.role == 'assistant' -%}\n" + " {%- if message.reasoning_content is string -%}\n" + " {%- set reasoning_content = render_content(message.reasoning_content) -%}\n" + " {%- else -%}\n" + " {%- if '' in content -%}\n" + 
" {%- set reasoning_content = content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') -%}\n" + " {%- set content = content.split('')[-1].lstrip('\\n') -%}\n" + " {%- else -%}\n" + " {%- set reasoning_content = '' -%}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + " {%- if loop.index0 > ns.last_query_index -%}\n" + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content + '\\n\\n' + content -}}\n" + " {%- else -%}\n" + " {{- '<|im_start|>' + message.role + '\\n' + content -}}\n" + " {%- endif -%}\n" + " {%- if message.tool_calls -%}\n" + " {{- '\\n' -}}\n" + " {%- for tool_call in message.tool_calls -%}\n" + " {{- '\\n' -}}\n" + " {%- if tool_call.function -%}\n" + " {%- set tool_call = tool_call.function -%}\n" + " {%- endif -%}\n" + " {{- '\\n{\"name\": \"' -}}\n" + " {{- tool_call.name -}}\n" + " {{- '\", \"arguments\": ' -}}\n" + " {%- if tool_call.arguments is string -%}\n" + " {{- tool_call.arguments -}}\n" + " {%- else -%}\n" + " {{- tool_call.arguments | tojson -}}\n" + " {%- endif -%}\n" + " {{- '}\\n' -}}\n" + " {%- endfor -%}\n" + " {{- '\\n' -}}\n" + " {%- endif -%}\n" + " {{- '<|im_end|>\\n' -}}\n" + " {%- elif message.role == 'tool' -%}\n" + " {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n" + " {{- '<|im_start|>tool_response' -}}\n" + " {%- endif -%}\n" + " {{- '\\n\\n' -}}\n" + " {{- content -}}\n" + " {{- '\\n' -}}\n" + " {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n" + " {{- '<|im_end|>\\n' -}}\n" + " {%- endif -%}\n" + " {%- endif -%}\n" + "{%- endfor -%}\n" + "{%- if add_generation_prompt -%}\n" + " {{- '<|im_start|>assistant\\n\\n\\n\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n' -}}\n" + "{%- endif -%}\n" + ) + + def __init__(self, enable_thinking: bool = True, **kwargs): + """ + Initializes the Step3-VL Handler. + + Args: + enable_thinking (bool): If False, injects an empty block to bypass reasoning. 
+ """ + self.enable_thinking = enable_thinking + super().__init__(**kwargs) + + def __call__(self, **kwargs): + # Pass thinking toggle into Jinja + self.extra_template_arguments["enable_thinking"] = self.enable_thinking + + # Step3 uses standard <|im_end|> ChatML stop formatting + kwargs['stop'] = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN] + + if self.verbose: + print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing") + + return super().__call__(**kwargs) From d84b0c21fa4a131df5c17ddd1b2447929dc1973f Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 22:08:32 +0800 Subject: [PATCH 505/518] fix(model): handle missing chat templates - Update LlamaModel.model_chat_template() to return Optional[str] and accept name=None for the default model chat template. - llama_model_chat_template() may return nullptr when no chat template is available. Handle that case explicitly instead of decoding a null pointer, and return None so callers can apply their own fallback logic. Signed-off-by: JamePeng --- llama_cpp/_internals.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 434921e6bd..91befb2247 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -152,12 +152,17 @@ def model_size(self) -> int: """ return llama_cpp.llama_model_size(self.model) - def model_chat_template(self, name: bytes) -> str: + def model_chat_template(self, name: Optional[bytes] = None) -> Optional[str]: """ - Get the default chat template. Returns nullptr if not available - If name is NULL, returns the default chat template + Get a chat template from the model. + + If name is None, returns the default chat template. + Returns None if no chat template is available. 
""" - return llama_cpp.llama_model_chat_template(self.model, name).decode("utf-8") + template = llama_cpp.llama_model_chat_template(self.model, name) + if template is None: + return None + return template.decode("utf-8") def n_params(self) -> int: """ From c9745316d748cec408b07c2f3a43fd97fa921e73 Mon Sep 17 00:00:00 2001 From: JamePeng Date: Sun, 14 Jun 2026 23:43:33 +0800 Subject: [PATCH 506/518] feat(mtmd): enhance generic chat template support - Enhance GenericMTMDChatHandler to better support model-provided chat templates. - Allow the generic handler to accept an optional named chat template, load it from the model at call time via llama_model_chat_template(), fall back to the model's default chat template, and finally use the built-in MTMD CHAT_FORMAT when no model template is available. - Also expand the generic media placeholder list for common multimodal templates and document the handler as a template-driven MTMD implementation. This prepares the generic path for a later render-driven placeholder replacement pass. 
Signed-off-by: JamePeng --- llama_cpp/llama.py | 2 + llama_cpp/llama_multimodal.py | 118 +++++++++++++++++++++++++++++++--- 2 files changed, 110 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index dbc60eaf76..b6a2c8d5a7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -174,6 +174,7 @@ def __init__( log_filters: Optional[Sequence[str]] = None, log_filters_case_sensitive: bool = True, # Extra Params + chat_template_name: Optional[str] = None, chat_handler_kwargs: Dict[str, Any] = {}, **kwargs, # type: ignore ): @@ -721,6 +722,7 @@ def __init__( chat_format = self.metadata.get("tokenizer.chat_template", None), mmproj_path = mmproj_path, verbose = self.verbose, + chat_template_name=chat_template_name, **chat_handler_kwargs ) diff --git a/llama_cpp/llama_multimodal.py b/llama_cpp/llama_multimodal.py index a055869543..a0f7e594e4 100644 --- a/llama_cpp/llama_multimodal.py +++ b/llama_cpp/llama_multimodal.py @@ -91,6 +91,8 @@ class MTMDChatHandler: "{% endif %}" ) + KNOWN_MEDIA_TAGS: List[str] = [] + def __init__( self, mmproj_path: Optional[str] = None, @@ -1189,41 +1191,137 @@ def from_pretrained( **kwargs, ) -# Experiments are not recommended for this purpose at this time. +# Generic template-driven MTMD handler. class GenericMTMDChatHandler(MTMDChatHandler): + """ + Generic MTMD chat handler backed by the model-provided chat template. + + This handler is intentionally template-driven. It renders the model's + tokenizer.chat_template first, then normalizes rendered media URLs or + placeholder tokens into MTMD media markers before tokenization. + + It is designed for model templates that emit media placeholders such as + <|image_pad|>, <|image|>, , [IMG], or Kimi-style <|media_pad|>. + Model-specific handlers may still be preferable when a model requires + special stop tokens, generation flags, or non-standard template arguments. + """ + KNOWN_MEDIA_TAGS = [ + # Pad placeholders inside model-specific wrappers. 
"<|image_pad|>", "<|audio_pad|>", "<|video_pad|>", + + # Direct placeholders inside Gemma/Llama/GLM-style wrappers. "<|image|>", "<|audio|>", "<|video|>", - "[IMG]" + + # LLaVA / LFM / Mistral-style placeholders. + "", + "