From 65b50ca36af4a00096267ef7b400f82933c33b9d Mon Sep 17 00:00:00 2001 From: Andrei Date: Fri, 12 Jun 2026 08:17:04 -0700 Subject: [PATCH 01/15] feat: update llama.cpp to ggml-org/llama.cpp@3e7bd4f39 (#2298) --- CHANGELOG.md | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c4c744d9..46c57c5d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291 -- feat: update llama.cpp to ggml-org/llama.cpp@ac4cddeb0 +- feat: update llama.cpp to ggml-org/llama.cpp@3e7bd4f39 - fix(example): support multi-step Responses tool streaming by @abetlen in #2288 - fix(ci): Repair Linux accelerator wheels for manylinux publishing diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ac4cddeb0..3e7bd4f39 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ac4cddeb0dbd778f650bf568f6f08344a06abe3a +Subproject commit 3e7bd4f39ac59167f82103e1fc22dc4585c489d3 From 565d3c5c1c13a6255a81da09d2cc9f7b3df75c2a Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 13 Jun 2026 10:43:21 -0700 Subject: [PATCH 02/15] feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676 (#2300) --- CHANGELOG.md | 2 +- llama_cpp/mtmd_cpp.py | 56 ++++++++++++++++++++++++++++++++++++++++++- vendor/llama.cpp | 2 +- 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 46c57c5d9..084865cd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291 -- feat: update llama.cpp to ggml-org/llama.cpp@3e7bd4f39 +- feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676 - fix(example): support multi-step Responses tool streaming by @abetlen in #2288 - fix(ci): Repair Linux accelerator wheels for manylinux publishing diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 919cefb35..46eb2c879 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -76,6 +76,9 @@ mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) mtmd_input_chunks_p_ctypes = c_void_p +mtmd_batch_p = NewType("mtmd_batch_p", int) +mtmd_batch_p_ctypes = c_void_p + # Enums MTMD_INPUT_CHUNK_TYPE_TEXT = 0 MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 @@ -102,6 +105,7 @@ class mtmd_context_params(Structure): image_max_tokens: int cb_eval: llama_cpp.ggml_backend_sched_eval_callback cb_eval_user_data: c_void_p + batch_max_tokens: int _fields_ = [ ("use_gpu", c_bool), @@ -115,6 +119,7 @@ class mtmd_context_params(Structure): ("image_max_tokens", c_int), ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback), ("cb_eval_user_data", c_void_p), + ("batch_max_tokens", c_int), ] @@ -596,7 +601,7 @@ def mtmd_image_tokens_get_decoder_pos( c_int, ) def mtmd_encode(ctx: mtmd_context_p, image_tokens: mtmd_image_tokens_p, /) -> int: - """Run an MTMD encode pass for image tokens.""" + """Run a deprecated MTMD encode pass for image tokens.""" ... @@ -618,6 +623,55 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float ... +# MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx); +@ctypes_function("mtmd_batch_init", [mtmd_context_p_ctypes], mtmd_batch_p_ctypes) +def mtmd_batch_init(ctx: mtmd_context_p, /) -> Optional[mtmd_batch_p]: + """Initialize an MTMD media chunk batch for a context.""" + ... + + +# MTMD_API void mtmd_batch_free(mtmd_batch * batch); +@ctypes_function("mtmd_batch_free", [mtmd_batch_p_ctypes], None) +def mtmd_batch_free(batch: mtmd_batch_p, /): ... + + +# MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_batch_add_chunk", + [mtmd_batch_p_ctypes, mtmd_input_chunk_p_ctypes], + c_int, +) +def mtmd_batch_add_chunk( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> int: + """Add a media chunk to an MTMD batch.""" + ... + + +# MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch); +@ctypes_function("mtmd_batch_encode", [mtmd_batch_p_ctypes], c_int) +def mtmd_batch_encode(batch: mtmd_batch_p, /) -> int: + """Run an MTMD encode pass for all chunks in a batch.""" + ... + + +# MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_batch_get_output_embd", + [mtmd_batch_p_ctypes, mtmd_input_chunk_p_ctypes], + POINTER(c_float), +) +def mtmd_batch_get_output_embd( + batch: mtmd_batch_p, + chunk: mtmd_input_chunk_p, + /, +) -> Optional[CtypesArray[c_float]]: + """Get output embeddings for a chunk from the last batch encode pass.""" + ... + + # MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); @ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps) def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 3e7bd4f39..f05cf4676 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 3e7bd4f39ac59167f82103e1fc22dc4585c489d3 +Subproject commit f05cf4676af46c2f017c0e6ba25b6e20204f700e From a52702fce21d736d2de0c7c2f5325e9408fa8374 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 13 Jun 2026 11:16:38 -0700 Subject: [PATCH 03/15] feat(example): use MTMD batch encoding (#2301) --- CHANGELOG.md | 1 + examples/server/README.md | 10 +- examples/server/server.py | 246 +++++++++++++++++++++++++++----------- 3 files changed, 182 insertions(+), 75 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 084865cd0..905245e8a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat(example): use MTMD batch encoding by @abetlen in #2301 - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291 - feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676 - fix(example): support multi-step Responses tool streaming by @abetlen in #2288 diff --git a/examples/server/README.md b/examples/server/README.md index ff04374fc..b2819a244 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -291,7 +291,7 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha ## Multimodal `model.mtmd` -`model.mtmd` loads a llama.cpp multimodal projector and enables OpenAI-style image and audio content parts. +`model.mtmd` loads a llama.cpp multimodal projector and enables OpenAI-style image, audio, and video content parts. ```json { @@ -305,8 +305,10 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha "path": ".cache/mtmd-embeddings", "max_bytes": 1073741824 }, + "batch_max_tokens": 1024, "image_max_bytes": 20971520, "audio_max_bytes": 104857600, + "video_max_bytes": 536870912, "image_timeout_seconds": 10.0 } } @@ -317,11 +319,13 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha | --- | --- | | `mmproj_path` | Local multimodal projector path. | | `mmproj_from_pretrained` | Hugging Face projector source. | -| `embedding_cache.path` | Directory for cached image and audio embeddings. | +| `embedding_cache.path` | Directory for cached image, audio, and video embeddings. | | `embedding_cache.max_bytes` | Maximum embedding cache size. | +| `batch_max_tokens` | Maximum number of media output tokens per MTMD projector-side encode batch. | | `image_max_bytes` | Maximum image payload size. | | `audio_max_bytes` | Maximum audio payload size. | -| `image_timeout_seconds` | Timeout for remote image and audio URL fetches. | +| `video_max_bytes` | Maximum video payload size. | +| `image_timeout_seconds` | Timeout for remote image, audio, and video URL fetches. | Send image inputs with OpenAI chat content parts. diff --git a/examples/server/server.py b/examples/server/server.py index e8034a214..16f8c9f7e 100644 --- a/examples/server/server.py +++ b/examples/server/server.py @@ -3223,6 +3223,7 @@ class MTMDOptions(BaseModel): embedding_cache: Optional["ConfigFile.MTMDEmbeddingCacheOptions"] = None allowed_media_domains: Optional[List[str]] = None allowed_local_media_path: Optional[str] = None + batch_max_tokens: int = Field(default=1024, ge=1) image_max_bytes: int = Field(default=20 * 1024 * 1024, ge=1) audio_max_bytes: int = Field(default=100 * 1024 * 1024, ge=1) video_max_bytes: int = Field(default=512 * 1024 * 1024, ge=1) @@ -10410,6 +10411,21 @@ class MTMDLoadedMedia: class MTMDProcessor: + @dataclass + class MediaChunk: + kind: Literal["image", "audio", "video"] + key: str + chunk: Any + n_tokens: int + decode_n_pos: int + non_causal: bool + embeddings: Optional[np.ndarray] = None + + @dataclass + class ParsedChunk: + text_tokens: Optional[List[int]] = None + media: Optional["MTMDProcessor.MediaChunk"] = None + def __init__( self, *, @@ -10422,6 +10438,7 @@ def __init__( n_ubatch: int, n_threads_batch: int, mmproj_path: str, + batch_max_tokens: int, embedding_cache: Optional[MTMDEmbeddingCache], allowed_media_domains: Optional[List[str]], allowed_local_media_path: Optional[str], @@ -10437,6 +10454,7 @@ def __init__( self.n_ubatch = n_ubatch self.mmproj_path = mmproj_path self.embedding_cache = embedding_cache + self.batch_max_tokens = batch_max_tokens self.model_fingerprint = MTMDEmbeddingCache.fingerprint_file(model_path) self.mmproj_fingerprint = MTMDEmbeddingCache.fingerprint_file(mmproj_path) self.allowed_media_domains = ( @@ -10456,6 +10474,7 @@ def __init__( self.lock = threading.Lock() params = mtmd_cpp.mtmd_context_params_default() params.n_threads = max(1, n_threads_batch) + params.batch_max_tokens = batch_max_tokens self.ctx = mtmd_cpp.mtmd_init_from_file( mmproj_path.encode("utf-8"), llama_model, @@ -10705,37 +10724,91 @@ def _media_identity_tokens( tokens.append(-1 - (int.from_bytes(digest[:4], "little") & 0x3FFFFFFF)) return tokens - def _encode_media_chunk( - self, - *, - kind: Literal["image", "audio", "video"], - key: str, - chunk: Any, - ) -> np.ndarray: - n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)) - if self.embedding_cache is not None: - cached = self.embedding_cache.load(key) - if ( - cached is not None - and cached.embeddings.shape == (n_tokens, self.n_embd_inp) - ): - return cached.embeddings - result = int(mtmd_cpp.mtmd_encode_chunk(self.ctx, chunk)) - if result != 0: - raise CompletionRequestValidationError( - f"failed to encode {kind} chunk: error code {result}" - ) - output = mtmd_cpp.mtmd_get_output_embd(self.ctx) - if output is None: - raise CompletionRequestValidationError(f"MTMD {kind} encoder returned no embeddings") + def _embeddings_from_pointer(self, output: Any, n_tokens: int) -> np.ndarray: flat = np.ctypeslib.as_array(output, shape=(n_tokens * self.n_embd_inp,)) - embeddings = np.array(flat, dtype=np.float32, copy=True).reshape( + return np.array(flat, dtype=np.float32, copy=True).reshape( n_tokens, self.n_embd_inp, ) - if self.embedding_cache is not None: - self.embedding_cache.save(key, embeddings) - return embeddings + + def _load_cached_media_chunk(self, media_chunk: "MTMDProcessor.MediaChunk") -> bool: + if self.embedding_cache is None: + return False + cached = self.embedding_cache.load(media_chunk.key) + if cached is None or cached.embeddings.shape != ( + media_chunk.n_tokens, + self.n_embd_inp, + ): + return False + media_chunk.embeddings = cached.embeddings + return True + + def _save_media_chunk(self, media_chunk: "MTMDProcessor.MediaChunk") -> None: + if self.embedding_cache is None or media_chunk.embeddings is None: + return + self.embedding_cache.save(media_chunk.key, media_chunk.embeddings) + + def _encode_media_batch( + self, + media_chunks: Sequence["MTMDProcessor.MediaChunk"], + start_index: int, + ) -> int: + batch = mtmd_cpp.mtmd_batch_init(self.ctx) + if batch is None: + raise CompletionRequestValidationError("failed to create MTMD media batch") + try: + first = media_chunks[start_index] + result = int(mtmd_cpp.mtmd_batch_add_chunk(batch, first.chunk)) + if result != 0: + raise CompletionRequestValidationError( + f"failed to add {first.kind} chunk to MTMD batch: error code {result}" + ) + group = [first] + next_index = start_index + 1 + while next_index < len(media_chunks): + candidate = media_chunks[next_index] + result = int(mtmd_cpp.mtmd_batch_add_chunk(batch, candidate.chunk)) + if result == 0: + group.append(candidate) + next_index += 1 + continue + if result in {2, 3}: + break + raise CompletionRequestValidationError( + f"failed to add {candidate.kind} chunk to MTMD batch: error code {result}" + ) + result = int(mtmd_cpp.mtmd_batch_encode(batch)) + if result != 0: + raise CompletionRequestValidationError( + f"failed to encode MTMD media batch: error code {result}" + ) + for media_chunk in group: + output = mtmd_cpp.mtmd_batch_get_output_embd(batch, media_chunk.chunk) + if output is None: + raise CompletionRequestValidationError( + f"MTMD {media_chunk.kind} encoder returned no embeddings" + ) + media_chunk.embeddings = self._embeddings_from_pointer( + output, + media_chunk.n_tokens, + ) + self._save_media_chunk(media_chunk) + return len(group) + finally: + mtmd_cpp.mtmd_batch_free(batch) + + def _encode_media_chunks( + self, + media_chunks: Sequence["MTMDProcessor.MediaChunk"], + ) -> None: + uncached = [ + media_chunk + for media_chunk in media_chunks + if not self._load_cached_media_chunk(media_chunk) + ] + index = 0 + while index < len(uncached): + index += self._encode_media_batch(uncached, index) def _positions_for_chunk(self, chunk: Any, start_pos: int) -> np.ndarray: n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)) @@ -10858,12 +10931,8 @@ def _build_prompt_plan_locked( raise CompletionRequestValidationError( f"failed to tokenize MTMD prompt: error code {result}" ) - segments: List[PromptSegment] = [] - identity_tokens: List[int] = [] - text_tokens: List[int] = [] - text_token_index_by_pos: Dict[int, int] = {} - identity_pos = 0 - decode_pos = 0 + parsed_chunks: List[MTMDProcessor.ParsedChunk] = [] + media_chunks: List[MTMDProcessor.MediaChunk] = [] video_index = 0 used_media_keys = set() n_chunks = int(mtmd_cpp.mtmd_input_chunks_size(chunks)) @@ -10884,24 +10953,9 @@ def _build_prompt_plan_locked( else [] ) if tokens: - start_pos = identity_pos - segments.append( - PromptSegment( - kind="text", - start_pos=start_pos, - n_pos=len(tokens), - identity_tokens=list(tokens), - decode_start_pos=decode_pos, - decode_n_pos=len(tokens), - text_tokens=list(tokens), - ) + parsed_chunks.append( + MTMDProcessor.ParsedChunk(text_tokens=tokens) ) - for offset, token in enumerate(tokens): - text_token_index_by_pos[start_pos + offset] = len(text_tokens) - text_tokens.append(token) - identity_tokens.extend(tokens) - identity_pos += len(tokens) - decode_pos += len(tokens) continue if chunk_type == mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE: chunk_kind: Literal["image", "audio"] = "image" @@ -10951,37 +11005,84 @@ def _build_prompt_plan_locked( decode_n_pos = int(mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk)) if decode_n_pos <= 0: raise CompletionRequestValidationError("MTMD media chunk has no decoder positions") - embeddings = self._encode_media_chunk(kind=kind, key=key, chunk=chunk) - n_tokens = int(embeddings.shape[0]) + n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)) if n_tokens <= 0: - raise CompletionRequestValidationError("MTMD media chunk has no embeddings") + raise CompletionRequestValidationError("MTMD media chunk has no embedding tokens") non_causal = bool(mtmd_cpp.mtmd_decode_use_non_causal(self.ctx, chunk)) - segment_identity = self._media_identity_tokens(kind, key, n_tokens) - positions = self._positions_for_chunk(chunk, decode_pos) - segment = PromptSegment( + media_chunk = MTMDProcessor.MediaChunk( kind=kind, - start_pos=identity_pos, - n_pos=n_tokens, - identity_tokens=segment_identity, - decode_start_pos=decode_pos, + key=key, + chunk=chunk, + n_tokens=n_tokens, decode_n_pos=decode_n_pos, - media=PromptSegment.Media( - embeddings=embeddings, - positions=positions, - non_causal=non_causal, - ), + non_causal=non_causal, ) - if non_causal and embeddings.shape[0] > min(self.n_batch, self.n_ubatch): + parsed_chunks.append(MTMDProcessor.ParsedChunk(media=media_chunk)) + media_chunks.append(media_chunk) + if used_media_keys != {media.key for media in loaded_media}: + raise CompletionRequestValidationError("not all media inputs were consumed by MTMD") + self._encode_media_chunks(media_chunks) + segments: List[PromptSegment] = [] + identity_tokens: List[int] = [] + text_tokens: List[int] = [] + text_token_index_by_pos: Dict[int, int] = {} + identity_pos = 0 + decode_pos = 0 + for parsed_chunk in parsed_chunks: + if parsed_chunk.text_tokens is not None: + tokens = parsed_chunk.text_tokens + start_pos = identity_pos + segments.append( + PromptSegment( + kind="text", + start_pos=start_pos, + n_pos=len(tokens), + identity_tokens=list(tokens), + decode_start_pos=decode_pos, + decode_n_pos=len(tokens), + text_tokens=list(tokens), + ) + ) + for offset, token in enumerate(tokens): + text_token_index_by_pos[start_pos + offset] = len(text_tokens) + text_tokens.append(token) + identity_tokens.extend(tokens) + identity_pos += len(tokens) + decode_pos += len(tokens) + continue + media_chunk = parsed_chunk.media + if media_chunk is None or media_chunk.embeddings is None: + raise CompletionRequestValidationError("MTMD media chunk has no embeddings") + embeddings = media_chunk.embeddings + if media_chunk.non_causal and embeddings.shape[0] > min(self.n_batch, self.n_ubatch): raise CompletionRequestValidationError( - f"non-causal {kind} embedding chunk exceeds model batch limits; " + f"non-causal {media_chunk.kind} embedding chunk exceeds model batch limits; " "increase n_batch and n_ubatch" ) - segments.append(segment) + segment_identity = self._media_identity_tokens( + media_chunk.kind, + media_chunk.key, + media_chunk.n_tokens, + ) + positions = self._positions_for_chunk(media_chunk.chunk, decode_pos) + segments.append( + PromptSegment( + kind=media_chunk.kind, + start_pos=identity_pos, + n_pos=media_chunk.n_tokens, + identity_tokens=segment_identity, + decode_start_pos=decode_pos, + decode_n_pos=media_chunk.decode_n_pos, + media=PromptSegment.Media( + embeddings=embeddings, + positions=positions, + non_causal=media_chunk.non_causal, + ), + ) + ) identity_tokens.extend(segment_identity) - identity_pos += n_tokens - decode_pos += decode_n_pos - if used_media_keys != {media.key for media in loaded_media}: - raise CompletionRequestValidationError("not all media inputs were consumed by MTMD") + identity_pos += media_chunk.n_tokens + decode_pos += media_chunk.decode_n_pos return PromptPlan( text=prompt, generation_prompt=generation_prompt, @@ -16211,6 +16312,7 @@ def main() -> None: n_ubatch=model.n_ubatch, n_threads_batch=model.n_threads_batch, mmproj_path=mmproj_path, + batch_max_tokens=config.model.mtmd.batch_max_tokens, embedding_cache=embedding_cache, allowed_media_domains=config.model.mtmd.allowed_media_domains, allowed_local_media_path=config.model.mtmd.allowed_local_media_path, From ddc0d15bde47a2dd7ae03546447ae73dc681a698 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 13 Jun 2026 12:36:30 -0700 Subject: [PATCH 04/15] chore: bump version to 0.3.29 (#2302) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 905245e8a..56c5ffb55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.29] + - feat(example): use MTMD batch encoding by @abetlen in #2301 - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291 - feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 13668893f..42f807ef6 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.28" +__version__ = "0.3.29" From e8070920c165b678ed0fb5255fbcc7e81bf8f5db Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 13 Jun 2026 13:19:32 -0700 Subject: [PATCH 05/15] fix(ci): skip mtmd CLI wrappers in package builds (#2303) --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b2744cdc..623ab2162 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,15 @@ if (LLAMA_BUILD) # Building llava add_subdirectory(vendor/llama.cpp/tools/mtmd) + # The Python package only ships mtmd as a shared library. + # Upstream mtmd also defines CLI compatibility wrappers, but those are + # not installed here and can fail to link in minimal Docker toolchains. + foreach(target llama-llava-cli llama-gemma3-cli llama-minicpmv-cli llama-qwen2vl-cli llama-mtmd-debug) + if (TARGET ${target}) + set_target_properties(${target} PROPERTIES EXCLUDE_FROM_ALL TRUE) + endif() + endforeach() + if (WIN32) set_target_properties(mtmd PROPERTIES CUDA_ARCHITECTURES OFF) endif() From 3850aff7afa720aa9f492b788720e46ae117c0cd Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 13 Jun 2026 18:04:40 -0700 Subject: [PATCH 06/15] fix(ci): use C++ compiler for Docker builds (#2304) --- CMakeLists.txt | 2 +- docker/simple/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 623ab2162..0474863a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,7 +179,7 @@ if (LLAMA_BUILD) # The Python package only ships mtmd as a shared library. # Upstream mtmd also defines CLI compatibility wrappers, but those are # not installed here and can fail to link in minimal Docker toolchains. - foreach(target llama-llava-cli llama-gemma3-cli llama-minicpmv-cli llama-qwen2vl-cli llama-mtmd-debug) + foreach(target llama-llava-cli llama-gemma3-cli llama-minicpmv-cli llama-qwen2vl-cli llama-mtmd-cli llama-mtmd-debug) if (TARGET ${target}) set_target_properties(${target} PROPERTIES EXCLUDE_FROM_ALL TRUE) endif() diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index bad4f456f..22b2335a3 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -27,7 +27,7 @@ RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context -RUN CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose +RUN CC=gcc CXX=g++ CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose # Set environment variable for the host ENV HOST=0.0.0.0 From 541b08cca566fbfb686287bfbbcfc6d4087e2c8a Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 15 Jun 2026 00:55:48 -0700 Subject: [PATCH 07/15] feat: update llama.cpp to ggml-org/llama.cpp@6e9007ae6 (#2307) --- CHANGELOG.md | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56c5ffb55..5804e0208 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: update llama.cpp to ggml-org/llama.cpp@6e9007ae6 + ## [0.3.29] - feat(example): use MTMD batch encoding by @abetlen in #2301 diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f05cf4676..6e9007ae6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f05cf4676af46c2f017c0e6ba25b6e20204f700e +Subproject commit 6e9007ae61f4e994c27484759caac6ef2aa32b30 From 824565a96bf1580266cd41f73263dec2cd13a9a7 Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 15 Jun 2026 01:10:49 -0700 Subject: [PATCH 08/15] feat: update llama.cpp to 6eab47181 (#2308) --- CHANGELOG.md | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5804e0208..0b2b5eb31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: update llama.cpp to ggml-org/llama.cpp@6e9007ae6 +- feat: update llama.cpp to ggml-org/llama.cpp@6eab47181 ## [0.3.29] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6e9007ae6..6eab47181 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 6e9007ae61f4e994c27484759caac6ef2aa32b30 +Subproject commit 6eab47181cbd3532c88a105682b81b4729ab809b From 822146b7cdb710e064462a1939e489bb4d330df2 Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 15 Jun 2026 23:45:52 -0700 Subject: [PATCH 09/15] feat: update llama.cpp to e3a74b299 (#2310) --- CHANGELOG.md | 2 +- llama_cpp/mtmd_cpp.py | 14 +++++++++++++- vendor/llama.cpp | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b2b5eb31..2cc78bfc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: update llama.cpp to ggml-org/llama.cpp@6eab47181 +- feat: update llama.cpp to ggml-org/llama.cpp@e3a74b299 ## [0.3.29] diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 46eb2c879..78f068aa9 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -169,6 +169,12 @@ class mtmd_caps(Structure): POINTER(c_char_p), ) +mtmd_helper_post_decode_callback = CFUNCTYPE( + c_int, + llama_cpp.llama_batch, + c_void_p, +) + class mtmd_helper_bitmap_wrapper(Structure): """Bitmap wrapper returned by MTMD helper media loaders.""" @@ -860,7 +866,9 @@ def mtmd_helper_eval_chunk_single( # llama_pos n_past, # llama_seq_id seq_id, # int32_t n_batch, -# llama_pos * new_n_past); +# llama_pos * new_n_past, +# mtmd_helper_post_decode_callback callback, +# void * user_data); @ctypes_function( "mtmd_helper_decode_image_chunk", [ @@ -872,6 +880,8 @@ def mtmd_helper_eval_chunk_single( llama_cpp.llama_seq_id, c_int, POINTER(llama_cpp.llama_pos), + mtmd_helper_post_decode_callback, + c_void_p, ], c_int, ) @@ -884,6 +894,8 @@ def mtmd_helper_decode_image_chunk( seq_id: llama_cpp.llama_seq_id, n_batch: Union[c_int, int], new_n_past: "_Pointer[llama_cpp.llama_pos]", + callback: Optional[mtmd_helper_post_decode_callback], + user_data: c_void_p, /, ) -> int: """Decode a pre-encoded image chunk.""" diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6eab47181..e3a74b299 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 6eab47181cbd3532c88a105682b81b4729ab809b +Subproject commit e3a74b299085cd00013804f7fca2e03441b2da20 From a8042331fbea6121f56fa6d0e1ddb764c790a364 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 16 Jun 2026 00:14:32 -0700 Subject: [PATCH 10/15] feat: add Pyodide wheel support (#2309) * feat: update llama.cpp to 6eab47181 * feat: add Pyodide wheel support * docs: fix Pyodide changelog entry * feat: enable mtmd for emscripten --- .github/workflows/build-and-release.yaml | 33 +++++++++++++++- CHANGELOG.md | 1 + CMakeLists.txt | 50 ++++++++++++++++++++---- llama_cpp/_ctypes_extensions.py | 37 +++++++++++++++++- 4 files changed, 111 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 4ae37b174..c931ead34 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -139,6 +139,37 @@ jobs: name: wheels_riscv64 path: ./wheelhouse/*.whl + build_wheels_pyodide: + name: Build Pyodide wheel + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + submodules: "recursive" + + - uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Build wheel + uses: pypa/cibuildwheel@v4.1.0 + env: + CIBW_PLATFORM: "pyodide" + CIBW_BUILD: "cp314-pyodide_wasm32" + CIBW_BUILD_VERBOSITY: "1" + CIBW_REPAIR_WHEEL_COMMAND: "" + CIBW_BEFORE_TEST: "curl -L --fail --retry 3 -o /tmp/stories260K.gguf https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf" + CIBW_TEST_COMMAND: "python -c \"import llama_cpp.mtmd_cpp as mtmd; from llama_cpp import Llama; print('mtmd marker', mtmd.mtmd_default_marker().decode()); llm = Llama(model_path='/tmp/stories260K.gguf', n_ctx=64, n_batch=8, n_threads=1, verbose=False); print('loaded', llm.n_vocab(), llm.n_ctx()); print('generated', llm('Once upon a', max_tokens=1, temperature=0)['choices'][0]['text'])\"" + CMAKE_ARGS: "-DLLAMA_WASM_MEM64=OFF -DEMSCRIPTEN_SYSTEM_PROCESSOR=wasm32 -DGGML_NATIVE=OFF -DGGML_OPENMP=OFF -DGGML_METAL=OFF -DGGML_BLAS=OFF -DGGML_CUDA=OFF -DGGML_HIP=OFF -DGGML_VULKAN=OFF -DGGML_OPENCL=OFF -DGGML_RPC=OFF -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TOOLS=OFF -DLLAMA_BUILD_SERVER=OFF" + with: + output-dir: wheelhouse + + - name: Upload wheels as artifacts + uses: actions/upload-artifact@v7 + with: + name: wheels_pyodide + path: ./wheelhouse/*.whl + build_sdist: name: Build source distribution runs-on: ubuntu-latest @@ -183,7 +214,7 @@ jobs: release: name: Release - needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist] + needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_wheels_pyodide, build_sdist] if: startsWith(github.ref, 'refs/tags/') runs-on: ubuntu-latest diff --git a/CHANGELOG.md b/CHANGELOG.md index 2cc78bfc7..59307cec2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat: update llama.cpp to ggml-org/llama.cpp@e3a74b299 +- feat: add Pyodide wheel support by @abetlen in #2309 ## [0.3.29] diff --git a/CMakeLists.txt b/CMakeLists.txt index 0474863a4..5feaaca5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,14 +10,22 @@ function(llama_cpp_python_install_target target) return() endif() - install( - TARGETS ${target} - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib - ) + if(EMSCRIPTEN) + set_target_properties(${target} PROPERTIES + OUTPUT_NAME "${target}.cpython-00-wasm32-emscripten" + ) + endif() + + if(NOT EMSCRIPTEN) + install( + TARGETS ${target} + LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + ) + endif() install( TARGETS ${target} LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib @@ -65,6 +73,32 @@ if (LLAMA_BUILD) # Disable building curl support set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) + if (EMSCRIPTEN) + if (DEFINED EMSCRIPTEN_SYSTEM_PROCESSOR) + set(CMAKE_SYSTEM_PROCESSOR ${EMSCRIPTEN_SYSTEM_PROCESSOR} CACHE STRING "Target processor" FORCE) + else() + set(CMAKE_SYSTEM_PROCESSOR wasm32 CACHE STRING "Target processor" FORCE) + endif() + + set(LLAMA_WASM_MEM64 OFF CACHE BOOL "llama.cpp: enable wasm64 memory" FORCE) + set(GGML_NATIVE OFF CACHE BOOL "ggml: enable -march=native" FORCE) + set(GGML_OPENMP OFF CACHE BOOL "ggml: use OpenMP" FORCE) + set(GGML_METAL OFF CACHE BOOL "ggml: use Metal" FORCE) + set(GGML_BLAS OFF CACHE BOOL "ggml: use BLAS" FORCE) + set(GGML_CUDA OFF CACHE BOOL "ggml: use CUDA" FORCE) + set(GGML_HIP OFF CACHE BOOL "ggml: use HIP" FORCE) + set(GGML_VULKAN OFF CACHE BOOL "ggml: use Vulkan" FORCE) + set(GGML_OPENCL OFF CACHE BOOL "ggml: use OpenCL" FORCE) + set(GGML_RPC OFF CACHE BOOL "ggml: use RPC" FORCE) + + # Pyodide auto-loads side modules from top-level site-packages/lib + # before Python imports run, so keep upstream installs package-local. + set(CMAKE_INSTALL_BINDIR llama_cpp/lib CACHE PATH "Install binaries" FORCE) + set(CMAKE_INSTALL_INCLUDEDIR llama_cpp/include CACHE PATH "Install headers" FORCE) + set(CMAKE_INSTALL_LIBDIR llama_cpp/lib CACHE PATH "Install libraries" FORCE) + set(LLAMA_BUILD_COMMON OFF CACHE BOOL "Build llama.cpp common library" FORCE) + endif() + # Architecture detection and settings for Apple platforms if (APPLE) # Get the target architecture diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index e88ed387d..02cee8a88 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -19,6 +19,9 @@ from typing_extensions import TypeAlias +_EMSCRIPTEN_SIDE_MODULE_SUFFIX = ".cpython-00-wasm32-emscripten.so" + + # Load the library def load_shared_library(lib_base_name: str, base_path: pathlib.Path): """Platform independent shared library loader""" @@ -26,7 +29,12 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path): # for llamacpp) and "llama" (default name for this repo) lib_paths: List[pathlib.Path] = [] # Determine the file extension based on the platform - if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): + if sys.platform == "emscripten": + # Use a CPython-style tag that Pyodide skips during package auto-load. + lib_paths += [ + base_path / f"lib{lib_base_name}{_EMSCRIPTEN_SIDE_MODULE_SUFFIX}", + ] + elif sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): lib_paths += [ base_path / f"lib{lib_base_name}.so", ] @@ -60,6 +68,33 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path): os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib")) cdll_args["winmode"] = ctypes.RTLD_GLOBAL + if sys.platform == "emscripten": + cdll_args["mode"] = ctypes.RTLD_GLOBAL + lib_dir = str(base_path) + ld_library_path = os.environ.get("LD_LIBRARY_PATH", "") + if lib_dir not in ld_library_path.split(os.pathsep): + os.environ["LD_LIBRARY_PATH"] = ( + lib_dir + if not ld_library_path + else f"{lib_dir}{os.pathsep}{ld_library_path}" + ) + + emscripten_dependencies = { + "llama": ("ggml-base", "ggml-cpu", "ggml"), + "mtmd": ("ggml-base", "ggml-cpu", "ggml", "llama"), + } + for dependency in emscripten_dependencies.get(lib_base_name, ()): + dependency_path = ( + base_path / f"lib{dependency}{_EMSCRIPTEN_SIDE_MODULE_SUFFIX}" + ) + if dependency_path.exists(): + try: + ctypes.CDLL(str(dependency_path), **cdll_args) # type: ignore + except Exception as e: + raise RuntimeError( + f"Failed to load shared library '{dependency_path}': {e}" + ) + # Try to load the shared library, handling potential errors for lib_path in lib_paths: if lib_path.exists(): From ddb6a05848a550bd7383ab7e2341f76ec7e46af9 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 16 Jun 2026 00:47:08 -0700 Subject: [PATCH 11/15] chore: bump version to 0.3.30 (#2311) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59307cec2..5b337ad92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.30] + - feat: update llama.cpp to ggml-org/llama.cpp@e3a74b299 - feat: add Pyodide wheel support by @abetlen in #2309 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 42f807ef6..b72459f65 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.29" +__version__ = "0.3.30" From 7440aaa3d3a793826b233e7334ea1f1ab1606bda Mon Sep 17 00:00:00 2001 From: Andrei Date: Fri, 19 Jun 2026 18:05:37 -0700 Subject: [PATCH 12/15] feat: update llama.cpp to f449e0553 (#2312) --- CHANGELOG.md | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b337ad92..a9371d342 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: update llama.cpp to ggml-org/llama.cpp@f449e0553 + ## [0.3.30] - feat: update llama.cpp to ggml-org/llama.cpp@e3a74b299 diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e3a74b299..f449e0553 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e3a74b299085cd00013804f7fca2e03441b2da20 +Subproject commit f449e0553708b895adbd94a301431cef691f632d From b11fe078898ef2e385c7d78563a300a8ab73cd27 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 20 Jun 2026 01:43:22 -0700 Subject: [PATCH 13/15] chore: bump version to 0.3.31 (#2317) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9371d342..d6d33dbbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.31] + - feat: update llama.cpp to ggml-org/llama.cpp@f449e0553 ## [0.3.30] diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index b72459f65..ed3c342f2 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.30" +__version__ = "0.3.31" From 9be3cd135bb87ef5c97662c8e60f5ec9689e94e5 Mon Sep 17 00:00:00 2001 From: Ankur Kaul Date: Mon, 22 Jun 2026 11:38:18 +0530 Subject: [PATCH 14/15] fix: preserve recurrent/hybrid model state when the full prompt is already cached (#2306) Co-authored-by: Ankur Kaul --- CHANGELOG.md | 2 + llama_cpp/llama.py | 49 +++++--- tests/test_llama.py | 288 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 323 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6d33dbbe..b1d5fb880 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix: preserve recurrent/hybrid model state when the full prompt is already cached by @allthatido and @abetlen in #2306 + ## [0.3.31] - feat: update llama.cpp to ggml-org/llama.cpp@f449e0553 diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4a09b55ee..b5bffd46b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -471,6 +471,8 @@ def free_lora_adapter(): self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) self.n_tokens = 0 + # Restored or truncated state must decode before sampling. + self._requires_eval = True self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) self.scores: npt.NDArray[np.single] = np.ndarray( (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single @@ -647,6 +649,7 @@ def set_seed(self, seed: int): def reset(self): """Reset the model state.""" self.n_tokens = 0 + self._requires_eval = True if self._is_recurrent or self._is_hybrid: mem = llama_cpp.llama_get_memory(self._ctx.ctx) @@ -689,6 +692,7 @@ def eval(self, tokens: Sequence[int]): pass # Update n_tokens self.n_tokens += n_tokens + self._requires_eval = False def _init_sampler( self, @@ -900,41 +904,53 @@ def generate( grammar=grammar, ) + tokens = list(tokens) + # Check for kv cache prefix match if reset and self.n_tokens > 0: longest_prefix = 0 - for a, b in zip(self._input_ids, tokens[:-1]): + for a, b in zip(self._input_ids, tokens): if a == b: longest_prefix += 1 else: break - # Recurrent and hybrid models cannot rewind state; reset if needed - if ( - self._is_recurrent or self._is_hybrid - ) and longest_prefix < self.n_tokens: - longest_prefix = 0 - reset = True + prompt_consumed = longest_prefix == len(tokens) + exact_prompt_cached = self.n_tokens == len(tokens) and prompt_consumed + + # Exact cache hits can sample immediately only when the current + # logits were produced by a live decode, not restored state. + if exact_prompt_cached and not self._requires_eval: + reset = False + tokens = [] + reuse_prefix = 0 if self.verbose: print( - "Llama.generate: recurrent/hybrid model requires full state reset", + "Llama.generate: full prompt already cached, skipping reset", file=sys.stderr, ) - - if longest_prefix > 0: - if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1): + else: + # If there is no suffix to decode, replay one token to refresh + # logits after truncating to a valid prefix. + reuse_prefix = longest_prefix - 1 if prompt_consumed else longest_prefix + + # Prefix hits can reuse memory because the suffix decode refreshes + # logits before sampling. + if reuse_prefix > 0: + if self._ctx.kv_cache_seq_rm(-1, reuse_prefix, -1): reset = False - tokens = tokens[longest_prefix:] - self.n_tokens = longest_prefix + tokens = tokens[reuse_prefix:] + self.n_tokens = reuse_prefix + self._requires_eval = True if self.verbose: print( - f"Llama.generate: {longest_prefix} prefix-match hit, " + f"Llama.generate: {reuse_prefix} prefix-match hit, " f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr, ) elif self.verbose: print( - f"Llama.generate: {longest_prefix} prefix-match found " + f"Llama.generate: {reuse_prefix} prefix-match found " f"but partial kv removal not supported, re-evaluating full prompt", file=sys.stderr, ) @@ -948,7 +964,6 @@ def generate( # grammar.reset() sample_idx = self.n_tokens + len(tokens) - 1 - tokens = list(tokens) # Eval and sample while True: @@ -988,6 +1003,7 @@ def generate( if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]: self.n_tokens = sample_idx self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1) + self._requires_eval = True break if self.draft_model is not None: @@ -2217,6 +2233,7 @@ def load_state(self, state: LlamaState) -> None: rest[rest > 0] = 0.0 self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens + self._requires_eval = True self._seed = state.seed state_size = state.llama_state_size LLamaStateArrayType = ctypes.c_uint8 * state_size diff --git a/tests/test_llama.py b/tests/test_llama.py index 336d6a612..70fce12d8 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -1,4 +1,5 @@ import ctypes +import itertools import multiprocessing import numpy as np @@ -64,6 +65,14 @@ def llama_cpp_model_path(): return model_path +@pytest.fixture +def llama_cpp_transformer_model_path(): + repo_id = "ggml-org/models" + filename = "tinyllamas/stories15M-q4_0.gguf" + model_path = hf_hub_download(repo_id, filename) + return model_path + + @pytest.fixture def llama_cpp_embedding_model_path(): repo_id = "CompendiumLabs/bge-small-en-v1.5-gguf" @@ -339,6 +348,285 @@ def test_hybrid_model_prompt_cache_reset(llama_cpp_hybrid_model_path): ) +def _create_test_model(model_path): + return llama_cpp.Llama( + model_path, + n_ctx=64, + n_batch=64, + n_ubatch=64, + n_threads=multiprocessing.cpu_count(), + n_threads_batch=multiprocessing.cpu_count(), + logits_all=False, + verbose=False, + ) + + +def _generate_test_tokens(model, tokens, max_tokens=3): + return list( + itertools.islice( + model.generate( + tokens, + temp=0.0, + ), + max_tokens, + ) + ) + + +MODEL_CACHE_CASES = ( + ("llama_cpp_transformer_model_path", False, False), + ("llama_cpp_recurrent_model_path", True, False), + ("llama_cpp_hybrid_model_path", False, True), +) + +RESTORED_CACHE_CASES = MODEL_CACHE_CASES + + +def _eval_alternate_same_length_prompt(model, tokens, expected_next_token): + replacement_tokens = ( + model.token_eos(), + model.token_nl(), + 0, + 1, + 2, + model.n_vocab() - 1, + ) + + for replacement_token in replacement_tokens: + alternate_tokens = list(tokens) + alternate_tokens[-1] = replacement_token + if alternate_tokens == tokens: + continue + + model.reset() + model.eval(alternate_tokens) + if model.sample(temp=0.0, idx=len(tokens) - 1) != expected_next_token: + return + + raise AssertionError("failed to find an alternate same-length prompt") + + +def _assert_exact_cached_prompt_reuse_matches_fresh( + model_path, + *, + is_recurrent: bool, + is_hybrid: bool, +): + prompt = "The quick brown fox" + fresh = _create_test_model(model_path) + tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True) + + assert fresh._is_recurrent is is_recurrent + assert fresh._is_hybrid is is_hybrid + + expected_tokens = _generate_test_tokens(fresh, tokens) + + cached = _create_test_model(model_path) + assert cached._is_recurrent is is_recurrent + assert cached._is_hybrid is is_hybrid + + cached.eval(tokens) + assert cached.n_tokens == len(tokens) + assert cached.input_ids[: cached.n_tokens].tolist() == tokens + assert cached.sample(temp=0.0, idx=len(tokens) - 1) == expected_tokens[0] + + reset_calls = 0 + original_reset = cached.reset + + def reset_tracker(): + nonlocal reset_calls + reset_calls += 1 + original_reset() + + cached.reset = reset_tracker + + cached_tokens = _generate_test_tokens(cached, tokens) + assert reset_calls == 0 + assert cached_tokens == expected_tokens + assert cached.n_tokens == len(tokens) + len(cached_tokens) - 1 + + +def _assert_loaded_exact_cached_prompt_reuse_matches_fresh( + model_path, + *, + is_recurrent: bool, + is_hybrid: bool, +): + prompt = "The quick brown fox" + fresh = _create_test_model(model_path) + tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True) + expected_tokens = _generate_test_tokens(fresh, tokens) + + source = _create_test_model(model_path) + assert source._is_recurrent is is_recurrent + assert source._is_hybrid is is_hybrid + + source.eval(tokens) + state = source.save_state() + + loaded = _create_test_model(model_path) + assert loaded._is_recurrent is is_recurrent + assert loaded._is_hybrid is is_hybrid + + _eval_alternate_same_length_prompt( + loaded, + tokens, + expected_tokens[0], + ) + loaded.load_state(state) + + assert loaded.n_tokens == len(tokens) + assert loaded.input_ids[: loaded.n_tokens].tolist() == tokens + + loaded_tokens = _generate_test_tokens(loaded, tokens) + assert loaded_tokens == expected_tokens + assert loaded.n_tokens == len(tokens) + len(loaded_tokens) - 1 + + +def _assert_ram_cache_exact_prompt_hit_matches_fresh( + model_path, + *, + is_recurrent: bool, + is_hybrid: bool, +): + prompt = "The quick brown fox" + fresh = _create_test_model(model_path) + tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True) + expected = fresh.create_completion( + tokens, + max_tokens=1, + temperature=0.0, + seed=1337, + ) + + cache = llama_cpp.LlamaRAMCache() + writer = _create_test_model(model_path) + writer.set_cache(cache) + writer.create_completion( + tokens, + max_tokens=1, + temperature=0.0, + seed=1337, + ) + + cached = _create_test_model(model_path) + assert cached._is_recurrent is is_recurrent + assert cached._is_hybrid is is_hybrid + cached.set_cache(cache) + + load_state_calls = 0 + original_load_state = cached.load_state + + def load_state_tracker(state): + nonlocal load_state_calls + load_state_calls += 1 + original_load_state(state) + + cached.load_state = load_state_tracker + + actual = cached.create_completion( + tokens, + max_tokens=1, + temperature=0.0, + seed=1337, + ) + + assert load_state_calls == 1 + assert actual["choices"][0]["text"] == expected["choices"][0]["text"] + assert ( + actual["usage"]["completion_tokens"] == expected["usage"]["completion_tokens"] + ) + + +def _assert_shorter_prompt_prefix_reuse_matches_fresh( + model_path, + *, + is_recurrent: bool, + is_hybrid: bool, +): + prompt = "The quick brown fox" + history = " jumps over the lazy dog" + fresh = _create_test_model(model_path) + tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True) + history_tokens = fresh.tokenize(history.encode(), add_bos=False, special=True) + expected_tokens = _generate_test_tokens(fresh, tokens) + + cached = _create_test_model(model_path) + assert cached._is_recurrent is is_recurrent + assert cached._is_hybrid is is_hybrid + + cached.eval(tokens + history_tokens) + assert cached.n_tokens > len(tokens) + assert cached.input_ids[: len(tokens)].tolist() == tokens + + cached_tokens = _generate_test_tokens(cached, tokens) + assert cached_tokens == expected_tokens + + +@pytest.mark.parametrize( + ("model_path_fixture", "is_recurrent", "is_hybrid"), MODEL_CACHE_CASES +) +def test_exact_cached_prompt_reuse_matches_fresh( + request, + model_path_fixture, + is_recurrent, + is_hybrid, +): + _assert_exact_cached_prompt_reuse_matches_fresh( + request.getfixturevalue(model_path_fixture), + is_recurrent=is_recurrent, + is_hybrid=is_hybrid, + ) + + +@pytest.mark.parametrize( + ("model_path_fixture", "is_recurrent", "is_hybrid"), RESTORED_CACHE_CASES +) +def test_loaded_exact_cached_prompt_reuse_matches_fresh( + request, + model_path_fixture, + is_recurrent, + is_hybrid, +): + _assert_loaded_exact_cached_prompt_reuse_matches_fresh( + request.getfixturevalue(model_path_fixture), + is_recurrent=is_recurrent, + is_hybrid=is_hybrid, + ) + + +@pytest.mark.parametrize( + ("model_path_fixture", "is_recurrent", "is_hybrid"), RESTORED_CACHE_CASES +) +def test_ram_cache_exact_prompt_hit_matches_fresh( + request, + model_path_fixture, + is_recurrent, + is_hybrid, +): + _assert_ram_cache_exact_prompt_hit_matches_fresh( + request.getfixturevalue(model_path_fixture), + is_recurrent=is_recurrent, + is_hybrid=is_hybrid, + ) + + +@pytest.mark.parametrize( + ("model_path_fixture", "is_recurrent", "is_hybrid"), MODEL_CACHE_CASES +) +def test_shorter_prompt_prefix_reuse_matches_fresh( + request, + model_path_fixture, + is_recurrent, + is_hybrid, +): + _assert_shorter_prompt_prefix_reuse_matches_fresh( + request.getfixturevalue(model_path_fixture), + is_recurrent=is_recurrent, + is_hybrid=is_hybrid, + ) + + def test_real_llama_embeddings(llama_cpp_embedding_model_path): model = llama_cpp.Llama( llama_cpp_embedding_model_path, From 4bee85b352ec4aa7034dc13c3d80688805e47d63 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 23 Jun 2026 07:56:15 -0700 Subject: [PATCH 15/15] feat: update llama.cpp to 92e854ab8 (#2318) --- CHANGELOG.md | 1 + llama_cpp/llama_cpp.py | 5 +++++ llama_cpp/llama_cpp_ext.py | 19 +++++++++++++++++++ llama_cpp/mtmd_cpp.py | 7 +++++++ vendor/llama.cpp | 2 +- 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1d5fb880..925e941d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: update llama.cpp to ggml-org/llama.cpp@92e854ab8 - fix: preserve recurrent/hybrid model state when the full prompt is already cached by @allthatido and @abetlen in #2306 ## [0.3.31] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 21f85c81c..176709d96 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1744,6 +1744,11 @@ def llama_model_n_embd_out(model: llama_model_p, /) -> int: def llama_model_n_layer(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model); +@ctypes_function("llama_model_n_layer_nextn", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_layer_nextn(model: llama_model_p, /) -> int: ... + + # LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_head(model: llama_model_p, /) -> int: ... diff --git a/llama_cpp/llama_cpp_ext.py b/llama_cpp/llama_cpp_ext.py index 284811086..a4b424eb6 100644 --- a/llama_cpp/llama_cpp_ext.py +++ b/llama_cpp/llama_cpp_ext.py @@ -62,6 +62,25 @@ def llama_set_embeddings_nextn( ... +# LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset); +@_ctypes_function_from_names( + ( + "llama_set_nextn_layer_offset", + "_Z28llama_set_nextn_layer_offsetP13llama_contexti", + "?llama_set_nextn_layer_offset@@YAXPEAUllama_context@@H@Z", + ), + [llama_cpp.llama_context_p_ctypes, ctypes.c_int32], + None, +) +def llama_set_nextn_layer_offset( + ctx: llama_cpp.llama_context_p, + offset: Union[ctypes.c_int32, int], + /, +): + """Select which appended NextN block the decoder MTP graph runs.""" + ... + + # LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); @_ctypes_function_from_names( ( diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 78f068aa9..35357a327 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -20,6 +20,7 @@ ) import pathlib from typing import ( + Callable, Union, NewType, Optional, @@ -84,6 +85,8 @@ MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 +mtmd_progress_callback = CFUNCTYPE(c_bool, c_float, c_void_p) + # Structures class mtmd_context_params(Structure): @@ -106,6 +109,8 @@ class mtmd_context_params(Structure): cb_eval: llama_cpp.ggml_backend_sched_eval_callback cb_eval_user_data: c_void_p batch_max_tokens: int + progress_callback: Callable[[float, c_void_p], bool] + progress_callback_user_data: c_void_p _fields_ = [ ("use_gpu", c_bool), @@ -120,6 +125,8 @@ class mtmd_context_params(Structure): ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback), ("cb_eval_user_data", c_void_p), ("batch_max_tokens", c_int), + ("progress_callback", mtmd_progress_callback), + ("progress_callback_user_data", c_void_p), ] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f449e0553..92e854ab8 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f449e0553708b895adbd94a301431cef691f632d +Subproject commit 92e854ab836254bb7f2eb49babd5613474bdb700