From 65b50ca36af4a00096267ef7b400f82933c33b9d Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Fri, 12 Jun 2026 08:17:04 -0700
Subject: [PATCH 01/15] feat: update llama.cpp to ggml-org/llama.cpp@3e7bd4f39
 (#2298)

---
 CHANGELOG.md     | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5c4c744d9..46c57c5d9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291
-- feat: update llama.cpp to ggml-org/llama.cpp@ac4cddeb0
+- feat: update llama.cpp to ggml-org/llama.cpp@3e7bd4f39
 - fix(example): support multi-step Responses tool streaming by @abetlen in #2288
 - fix(ci): Repair Linux accelerator wheels for manylinux publishing
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index ac4cddeb0..3e7bd4f39 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit ac4cddeb0dbd778f650bf568f6f08344a06abe3a
+Subproject commit 3e7bd4f39ac59167f82103e1fc22dc4585c489d3

From 565d3c5c1c13a6255a81da09d2cc9f7b3df75c2a Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 13 Jun 2026 10:43:21 -0700
Subject: [PATCH 02/15] feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676
 (#2300)

---
 CHANGELOG.md          |  2 +-
 llama_cpp/mtmd_cpp.py | 56 ++++++++++++++++++++++++++++++++++++++++++-
 vendor/llama.cpp      |  2 +-
 3 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 46c57c5d9..084865cd0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291
-- feat: update llama.cpp to ggml-org/llama.cpp@3e7bd4f39
+- feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676
 - fix(example): support multi-step Responses tool streaming by @abetlen in #2288
 - fix(ci): Repair Linux accelerator wheels for manylinux publishing
 
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 919cefb35..46eb2c879 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -76,6 +76,9 @@
 mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int)
 mtmd_input_chunks_p_ctypes = c_void_p
 
+mtmd_batch_p = NewType("mtmd_batch_p", int)
+mtmd_batch_p_ctypes = c_void_p
+
 # Enums
 MTMD_INPUT_CHUNK_TYPE_TEXT = 0
 MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
@@ -102,6 +105,7 @@ class mtmd_context_params(Structure):
         image_max_tokens: int
         cb_eval: llama_cpp.ggml_backend_sched_eval_callback
         cb_eval_user_data: c_void_p
+        batch_max_tokens: int
 
     _fields_ = [
         ("use_gpu", c_bool),
@@ -115,6 +119,7 @@ class mtmd_context_params(Structure):
         ("image_max_tokens", c_int),
         ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback),
         ("cb_eval_user_data", c_void_p),
+        ("batch_max_tokens", c_int),
     ]
 
 
@@ -596,7 +601,7 @@ def mtmd_image_tokens_get_decoder_pos(
     c_int,
 )
 def mtmd_encode(ctx: mtmd_context_p, image_tokens: mtmd_image_tokens_p, /) -> int:
-    """Run an MTMD encode pass for image tokens."""
+    """Run a deprecated MTMD encode pass for image tokens."""
     ...
 
 
@@ -618,6 +623,55 @@ def mtmd_get_output_embd(ctx: mtmd_context_p, /) -> Optional[CtypesArray[c_float
     ...
 
 
+# MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx);
+@ctypes_function("mtmd_batch_init", [mtmd_context_p_ctypes], mtmd_batch_p_ctypes)
+def mtmd_batch_init(ctx: mtmd_context_p, /) -> Optional[mtmd_batch_p]:
+    """Initialize an MTMD media chunk batch for a context."""
+    ...
+
+
+# MTMD_API void mtmd_batch_free(mtmd_batch * batch);
+@ctypes_function("mtmd_batch_free", [mtmd_batch_p_ctypes], None)
+def mtmd_batch_free(batch: mtmd_batch_p, /): ...  # Free an MTMD batch; binding for the C function of the same name, returns nothing.
+
+
+# MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_batch_add_chunk",
+    [mtmd_batch_p_ctypes, mtmd_input_chunk_p_ctypes],
+    c_int,
+)
+def mtmd_batch_add_chunk(
+    batch: mtmd_batch_p,
+    chunk: mtmd_input_chunk_p,
+    /,
+) -> int:
+    """Add a media chunk to an MTMD batch."""
+    ...
+
+
+# MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch);
+@ctypes_function("mtmd_batch_encode", [mtmd_batch_p_ctypes], c_int)
+def mtmd_batch_encode(batch: mtmd_batch_p, /) -> int:
+    """Run an MTMD encode pass for all chunks in a batch."""
+    ...
+
+
+# MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_batch_get_output_embd",
+    [mtmd_batch_p_ctypes, mtmd_input_chunk_p_ctypes],
+    POINTER(c_float),
+)
+def mtmd_batch_get_output_embd(
+    batch: mtmd_batch_p,
+    chunk: mtmd_input_chunk_p,
+    /,
+) -> Optional[CtypesArray[c_float]]:
+    """Get output embeddings for a chunk from the last batch encode pass."""
+    ...
+
+
 # MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
 @ctypes_function("mtmd_get_cap_from_file", [c_char_p], mtmd_caps)
 def mtmd_get_cap_from_file(mmproj_fname: bytes, /) -> mtmd_caps:
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 3e7bd4f39..f05cf4676 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 3e7bd4f39ac59167f82103e1fc22dc4585c489d3
+Subproject commit f05cf4676af46c2f017c0e6ba25b6e20204f700e

From a52702fce21d736d2de0c7c2f5325e9408fa8374 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 13 Jun 2026 11:16:38 -0700
Subject: [PATCH 03/15] feat(example): use MTMD batch encoding (#2301)

---
 CHANGELOG.md              |   1 +
 examples/server/README.md |  10 +-
 examples/server/server.py | 246 +++++++++++++++++++++++++++-----------
 3 files changed, 182 insertions(+), 75 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 084865cd0..905245e8a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat(example): use MTMD batch encoding by @abetlen in #2301
 - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291
 - feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676
 - fix(example): support multi-step Responses tool streaming by @abetlen in #2288
diff --git a/examples/server/README.md b/examples/server/README.md
index ff04374fc..b2819a244 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -291,7 +291,7 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha
 
 ## Multimodal `model.mtmd`
 
-`model.mtmd` loads a llama.cpp multimodal projector and enables OpenAI-style image and audio content parts.
+`model.mtmd` loads a llama.cpp multimodal projector and enables OpenAI-style image, audio, and video content parts.
 
 ```json
 {
@@ -305,8 +305,10 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha
         "path": ".cache/mtmd-embeddings",
         "max_bytes": 1073741824
       },
+      "batch_max_tokens": 1024,
       "image_max_bytes": 20971520,
       "audio_max_bytes": 104857600,
+      "video_max_bytes": 536870912,
       "image_timeout_seconds": 10.0
     }
   }
@@ -317,11 +319,13 @@ See [Hugging Face response parsing](https://huggingface.co/docs/transformers/cha
 | --- | --- |
 | `mmproj_path` | Local multimodal projector path. |
 | `mmproj_from_pretrained` | Hugging Face projector source. |
-| `embedding_cache.path` | Directory for cached image and audio embeddings. |
+| `embedding_cache.path` | Directory for cached image, audio, and video embeddings. |
 | `embedding_cache.max_bytes` | Maximum embedding cache size. |
+| `batch_max_tokens` | Maximum number of media output tokens per MTMD projector-side encode batch. |
 | `image_max_bytes` | Maximum image payload size. |
 | `audio_max_bytes` | Maximum audio payload size. |
-| `image_timeout_seconds` | Timeout for remote image and audio URL fetches. |
+| `video_max_bytes` | Maximum video payload size. |
+| `image_timeout_seconds` | Timeout for remote image, audio, and video URL fetches. |
 
 Send image inputs with OpenAI chat content parts.
 
diff --git a/examples/server/server.py b/examples/server/server.py
index e8034a214..16f8c9f7e 100644
--- a/examples/server/server.py
+++ b/examples/server/server.py
@@ -3223,6 +3223,7 @@ class MTMDOptions(BaseModel):
         embedding_cache: Optional["ConfigFile.MTMDEmbeddingCacheOptions"] = None
         allowed_media_domains: Optional[List[str]] = None
         allowed_local_media_path: Optional[str] = None
+        batch_max_tokens: int = Field(default=1024, ge=1)
         image_max_bytes: int = Field(default=20 * 1024 * 1024, ge=1)
         audio_max_bytes: int = Field(default=100 * 1024 * 1024, ge=1)
         video_max_bytes: int = Field(default=512 * 1024 * 1024, ge=1)
@@ -10410,6 +10411,21 @@ class MTMDLoadedMedia:
 
 
 class MTMDProcessor:
+    @dataclass
+    class MediaChunk:  # One media chunk from the tokenized prompt, before and after its embeddings are available.
+        kind: Literal["image", "audio", "video"]
+        key: str  # Embedding-cache key; also fed to _media_identity_tokens.
+        chunk: Any  # MTMD input chunk handle passed to the mtmd_cpp batch calls.
+        n_tokens: int  # From mtmd_input_chunk_get_n_tokens; expected number of embedding rows.
+        decode_n_pos: int  # From mtmd_input_chunk_get_n_pos; how far decode_pos advances for this chunk.
+        non_causal: bool  # From mtmd_decode_use_non_causal for this chunk.
+        embeddings: Optional[np.ndarray] = None  # Set from the embedding cache or after batch encoding.
+
+    @dataclass
+    class ParsedChunk:  # One prompt chunk kept in prompt order: either text tokens or a media chunk.
+        text_tokens: Optional[List[int]] = None  # Set for text chunks; checked first when segments are built.
+        media: Optional["MTMDProcessor.MediaChunk"] = None  # Set for media chunks.
+
     def __init__(
         self,
         *,
@@ -10422,6 +10438,7 @@ def __init__(
         n_ubatch: int,
         n_threads_batch: int,
         mmproj_path: str,
+        batch_max_tokens: int,
         embedding_cache: Optional[MTMDEmbeddingCache],
         allowed_media_domains: Optional[List[str]],
         allowed_local_media_path: Optional[str],
@@ -10437,6 +10454,7 @@ def __init__(
         self.n_ubatch = n_ubatch
         self.mmproj_path = mmproj_path
         self.embedding_cache = embedding_cache
+        self.batch_max_tokens = batch_max_tokens
         self.model_fingerprint = MTMDEmbeddingCache.fingerprint_file(model_path)
         self.mmproj_fingerprint = MTMDEmbeddingCache.fingerprint_file(mmproj_path)
         self.allowed_media_domains = (
@@ -10456,6 +10474,7 @@ def __init__(
         self.lock = threading.Lock()
         params = mtmd_cpp.mtmd_context_params_default()
         params.n_threads = max(1, n_threads_batch)
+        params.batch_max_tokens = batch_max_tokens
         self.ctx = mtmd_cpp.mtmd_init_from_file(
             mmproj_path.encode("utf-8"),
             llama_model,
@@ -10705,37 +10724,91 @@ def _media_identity_tokens(
             tokens.append(-1 - (int.from_bytes(digest[:4], "little") & 0x3FFFFFFF))
         return tokens
 
-    def _encode_media_chunk(
-        self,
-        *,
-        kind: Literal["image", "audio", "video"],
-        key: str,
-        chunk: Any,
-    ) -> np.ndarray:
-        n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk))
-        if self.embedding_cache is not None:
-            cached = self.embedding_cache.load(key)
-            if (
-                cached is not None
-                and cached.embeddings.shape == (n_tokens, self.n_embd_inp)
-            ):
-                return cached.embeddings
-        result = int(mtmd_cpp.mtmd_encode_chunk(self.ctx, chunk))
-        if result != 0:
-            raise CompletionRequestValidationError(
-                f"failed to encode {kind} chunk: error code {result}"
-            )
-        output = mtmd_cpp.mtmd_get_output_embd(self.ctx)
-        if output is None:
-            raise CompletionRequestValidationError(f"MTMD {kind} encoder returned no embeddings")
+    def _embeddings_from_pointer(self, output: Any, n_tokens: int) -> np.ndarray:
         flat = np.ctypeslib.as_array(output, shape=(n_tokens * self.n_embd_inp,))
-        embeddings = np.array(flat, dtype=np.float32, copy=True).reshape(
+        return np.array(flat, dtype=np.float32, copy=True).reshape(
             n_tokens,
             self.n_embd_inp,
         )
-        if self.embedding_cache is not None:
-            self.embedding_cache.save(key, embeddings)
-        return embeddings
+
+    def _load_cached_media_chunk(self, media_chunk: "MTMDProcessor.MediaChunk") -> bool:  # True when embeddings were filled from the cache.
+        if self.embedding_cache is None:
+            return False
+        cached = self.embedding_cache.load(media_chunk.key)
+        if cached is None or cached.embeddings.shape != (  # Cache miss, or the entry's shape does not match this chunk.
+            media_chunk.n_tokens,
+            self.n_embd_inp,
+        ):
+            return False
+        media_chunk.embeddings = cached.embeddings
+        return True
+
+    def _save_media_chunk(self, media_chunk: "MTMDProcessor.MediaChunk") -> None:  # Store the chunk's embeddings under its cache key.
+        if self.embedding_cache is None or media_chunk.embeddings is None:  # Nothing to do without a cache or without embeddings.
+            return
+        self.embedding_cache.save(media_chunk.key, media_chunk.embeddings)
+
+    def _encode_media_batch(  # Encode one MTMD batch of consecutive chunks beginning at start_index.
+        self,
+        media_chunks: Sequence["MTMDProcessor.MediaChunk"],
+        start_index: int,
+    ) -> int:  # Returns how many chunks were encoded (always >= 1) so the caller can advance.
+        batch = mtmd_cpp.mtmd_batch_init(self.ctx)
+        if batch is None:
+            raise CompletionRequestValidationError("failed to create MTMD media batch")
+        try:
+            first = media_chunks[start_index]
+            result = int(mtmd_cpp.mtmd_batch_add_chunk(batch, first.chunk))
+            if result != 0:  # Any failure on the first chunk is fatal; the batch would otherwise be empty.
+                raise CompletionRequestValidationError(
+                    f"failed to add {first.kind} chunk to MTMD batch: error code {result}"
+                )
+            group = [first]
+            next_index = start_index + 1
+            while next_index < len(media_chunks):
+                candidate = media_chunks[next_index]
+                result = int(mtmd_cpp.mtmd_batch_add_chunk(batch, candidate.chunk))
+                if result == 0:
+                    group.append(candidate)
+                    next_index += 1
+                    continue
+                if result in {2, 3}:  # NOTE(review): presumably "does not fit in this batch" codes; confirm against mtmd.h and name them.
+                    break  # Leave this candidate for the next call.
+                raise CompletionRequestValidationError(
+                    f"failed to add {candidate.kind} chunk to MTMD batch: error code {result}"
+                )
+            result = int(mtmd_cpp.mtmd_batch_encode(batch))
+            if result != 0:
+                raise CompletionRequestValidationError(
+                    f"failed to encode MTMD media batch: error code {result}"
+                )
+            for media_chunk in group:
+                output = mtmd_cpp.mtmd_batch_get_output_embd(batch, media_chunk.chunk)
+                if not output:  # A NULL POINTER(c_float) result is a falsy pointer object, not None.
+                    raise CompletionRequestValidationError(
+                        f"MTMD {media_chunk.kind} encoder returned no embeddings"
+                    )
+                media_chunk.embeddings = self._embeddings_from_pointer(  # Copied out before the batch is freed in `finally`.
+                    output,
+                    media_chunk.n_tokens,
+                )
+                self._save_media_chunk(media_chunk)
+            return len(group)
+        finally:
+            mtmd_cpp.mtmd_batch_free(batch)
+
+    def _encode_media_chunks(  # Fill embeddings for every chunk: try the cache first, then batch-encode the rest.
+        self,
+        media_chunks: Sequence["MTMDProcessor.MediaChunk"],
+    ) -> None:
+        uncached = [
+            media_chunk
+            for media_chunk in media_chunks
+            if not self._load_cached_media_chunk(media_chunk)  # Side effect: cache hits get their embeddings set here.
+        ]
+        index = 0
+        while index < len(uncached):
+            index += self._encode_media_batch(uncached, index)  # Advances by the number of chunks that batch encoded.
 
     def _positions_for_chunk(self, chunk: Any, start_pos: int) -> np.ndarray:
         n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk))
@@ -10858,12 +10931,8 @@ def _build_prompt_plan_locked(
                 raise CompletionRequestValidationError(
                     f"failed to tokenize MTMD prompt: error code {result}"
                 )
-            segments: List[PromptSegment] = []
-            identity_tokens: List[int] = []
-            text_tokens: List[int] = []
-            text_token_index_by_pos: Dict[int, int] = {}
-            identity_pos = 0
-            decode_pos = 0
+            parsed_chunks: List[MTMDProcessor.ParsedChunk] = []
+            media_chunks: List[MTMDProcessor.MediaChunk] = []
             video_index = 0
             used_media_keys = set()
             n_chunks = int(mtmd_cpp.mtmd_input_chunks_size(chunks))
@@ -10884,24 +10953,9 @@ def _build_prompt_plan_locked(
                         else []
                     )
                     if tokens:
-                        start_pos = identity_pos
-                        segments.append(
-                            PromptSegment(
-                                kind="text",
-                                start_pos=start_pos,
-                                n_pos=len(tokens),
-                                identity_tokens=list(tokens),
-                                decode_start_pos=decode_pos,
-                                decode_n_pos=len(tokens),
-                                text_tokens=list(tokens),
-                            )
+                        parsed_chunks.append(
+                            MTMDProcessor.ParsedChunk(text_tokens=tokens)
                         )
-                        for offset, token in enumerate(tokens):
-                            text_token_index_by_pos[start_pos + offset] = len(text_tokens)
-                            text_tokens.append(token)
-                        identity_tokens.extend(tokens)
-                        identity_pos += len(tokens)
-                        decode_pos += len(tokens)
                     continue
                 if chunk_type == mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE:
                     chunk_kind: Literal["image", "audio"] = "image"
@@ -10951,37 +11005,84 @@ def _build_prompt_plan_locked(
                 decode_n_pos = int(mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk))
                 if decode_n_pos <= 0:
                     raise CompletionRequestValidationError("MTMD media chunk has no decoder positions")
-                embeddings = self._encode_media_chunk(kind=kind, key=key, chunk=chunk)
-                n_tokens = int(embeddings.shape[0])
+                n_tokens = int(mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk))
                 if n_tokens <= 0:
-                    raise CompletionRequestValidationError("MTMD media chunk has no embeddings")
+                    raise CompletionRequestValidationError("MTMD media chunk has no embedding tokens")
                 non_causal = bool(mtmd_cpp.mtmd_decode_use_non_causal(self.ctx, chunk))
-                segment_identity = self._media_identity_tokens(kind, key, n_tokens)
-                positions = self._positions_for_chunk(chunk, decode_pos)
-                segment = PromptSegment(
+                media_chunk = MTMDProcessor.MediaChunk(
                     kind=kind,
-                    start_pos=identity_pos,
-                    n_pos=n_tokens,
-                    identity_tokens=segment_identity,
-                    decode_start_pos=decode_pos,
+                    key=key,
+                    chunk=chunk,
+                    n_tokens=n_tokens,
                     decode_n_pos=decode_n_pos,
-                    media=PromptSegment.Media(
-                        embeddings=embeddings,
-                        positions=positions,
-                        non_causal=non_causal,
-                    ),
+                    non_causal=non_causal,
                 )
-                if non_causal and embeddings.shape[0] > min(self.n_batch, self.n_ubatch):
+                parsed_chunks.append(MTMDProcessor.ParsedChunk(media=media_chunk))
+                media_chunks.append(media_chunk)
+            if used_media_keys != {media.key for media in loaded_media}:
+                raise CompletionRequestValidationError("not all media inputs were consumed by MTMD")
+            self._encode_media_chunks(media_chunks)
+            segments: List[PromptSegment] = []
+            identity_tokens: List[int] = []
+            text_tokens: List[int] = []
+            text_token_index_by_pos: Dict[int, int] = {}
+            identity_pos = 0
+            decode_pos = 0
+            for parsed_chunk in parsed_chunks:
+                if parsed_chunk.text_tokens is not None:
+                    tokens = parsed_chunk.text_tokens
+                    start_pos = identity_pos
+                    segments.append(
+                        PromptSegment(
+                            kind="text",
+                            start_pos=start_pos,
+                            n_pos=len(tokens),
+                            identity_tokens=list(tokens),
+                            decode_start_pos=decode_pos,
+                            decode_n_pos=len(tokens),
+                            text_tokens=list(tokens),
+                        )
+                    )
+                    for offset, token in enumerate(tokens):
+                        text_token_index_by_pos[start_pos + offset] = len(text_tokens)
+                        text_tokens.append(token)
+                    identity_tokens.extend(tokens)
+                    identity_pos += len(tokens)
+                    decode_pos += len(tokens)
+                    continue
+                media_chunk = parsed_chunk.media
+                if media_chunk is None or media_chunk.embeddings is None:
+                    raise CompletionRequestValidationError("MTMD media chunk has no embeddings")
+                embeddings = media_chunk.embeddings
+                if media_chunk.non_causal and embeddings.shape[0] > min(self.n_batch, self.n_ubatch):
                     raise CompletionRequestValidationError(
-                        f"non-causal {kind} embedding chunk exceeds model batch limits; "
+                        f"non-causal {media_chunk.kind} embedding chunk exceeds model batch limits; "
                         "increase n_batch and n_ubatch"
                     )
-                segments.append(segment)
+                segment_identity = self._media_identity_tokens(
+                    media_chunk.kind,
+                    media_chunk.key,
+                    media_chunk.n_tokens,
+                )
+                positions = self._positions_for_chunk(media_chunk.chunk, decode_pos)
+                segments.append(
+                    PromptSegment(
+                        kind=media_chunk.kind,
+                        start_pos=identity_pos,
+                        n_pos=media_chunk.n_tokens,
+                        identity_tokens=segment_identity,
+                        decode_start_pos=decode_pos,
+                        decode_n_pos=media_chunk.decode_n_pos,
+                        media=PromptSegment.Media(
+                            embeddings=embeddings,
+                            positions=positions,
+                            non_causal=media_chunk.non_causal,
+                        ),
+                    )
+                )
                 identity_tokens.extend(segment_identity)
-                identity_pos += n_tokens
-                decode_pos += decode_n_pos
-            if used_media_keys != {media.key for media in loaded_media}:
-                raise CompletionRequestValidationError("not all media inputs were consumed by MTMD")
+                identity_pos += media_chunk.n_tokens
+                decode_pos += media_chunk.decode_n_pos
             return PromptPlan(
                 text=prompt,
                 generation_prompt=generation_prompt,
@@ -16211,6 +16312,7 @@ def main() -> None:
             n_ubatch=model.n_ubatch,
             n_threads_batch=model.n_threads_batch,
             mmproj_path=mmproj_path,
+            batch_max_tokens=config.model.mtmd.batch_max_tokens,
             embedding_cache=embedding_cache,
             allowed_media_domains=config.model.mtmd.allowed_media_domains,
             allowed_local_media_path=config.model.mtmd.allowed_local_media_path,

From ddc0d15bde47a2dd7ae03546447ae73dc681a698 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 13 Jun 2026 12:36:30 -0700
Subject: [PATCH 04/15] chore: bump version to 0.3.29 (#2302)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 905245e8a..56c5ffb55 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.29]
+
 - feat(example): use MTMD batch encoding by @abetlen in #2301
 - feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291
 - feat: update llama.cpp to ggml-org/llama.cpp@f05cf4676
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 13668893f..42f807ef6 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.28"
+__version__ = "0.3.29"

From e8070920c165b678ed0fb5255fbcc7e81bf8f5db Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 13 Jun 2026 13:19:32 -0700
Subject: [PATCH 05/15] fix(ci): skip mtmd CLI wrappers in package builds
 (#2303)

---
 CMakeLists.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b2744cdc..623ab2162 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -176,6 +176,15 @@ if (LLAMA_BUILD)
         # Building llava
         add_subdirectory(vendor/llama.cpp/tools/mtmd)
 
+        # The Python package only ships mtmd as a shared library.
+        # Upstream mtmd also defines CLI compatibility wrappers, but those are
+        # not installed here and can fail to link in minimal Docker toolchains.
+        foreach(target llama-llava-cli llama-gemma3-cli llama-minicpmv-cli llama-qwen2vl-cli llama-mtmd-debug)
+            if (TARGET ${target})
+                set_target_properties(${target} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+            endif()
+        endforeach()
+
         if (WIN32)
             set_target_properties(mtmd PROPERTIES CUDA_ARCHITECTURES OFF)
         endif()

From 3850aff7afa720aa9f492b788720e46ae117c0cd Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 13 Jun 2026 18:04:40 -0700
Subject: [PATCH 06/15] fix(ci): use C++ compiler for Docker builds (#2304)

---
 CMakeLists.txt           | 2 +-
 docker/simple/Dockerfile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 623ab2162..0474863a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -179,7 +179,7 @@ if (LLAMA_BUILD)
         # The Python package only ships mtmd as a shared library.
         # Upstream mtmd also defines CLI compatibility wrappers, but those are
         # not installed here and can fail to link in minimal Docker toolchains.
-        foreach(target llama-llava-cli llama-gemma3-cli llama-minicpmv-cli llama-qwen2vl-cli llama-mtmd-debug)
+        foreach(target llama-llava-cli llama-gemma3-cli llama-minicpmv-cli llama-qwen2vl-cli llama-mtmd-cli llama-mtmd-debug)
             if (TARGET ${target})
                 set_target_properties(${target} PROPERTIES EXCLUDE_FROM_ALL TRUE)
             endif()
diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile
index bad4f456f..22b2335a3 100644
--- a/docker/simple/Dockerfile
+++ b/docker/simple/Dockerfile
@@ -27,7 +27,7 @@ RUN python3 -m pip install --upgrade pip
 
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 
-RUN CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose
+RUN CC=gcc CXX=g++ CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose
 
 # Set environment variable for the host
 ENV HOST=0.0.0.0

From 541b08cca566fbfb686287bfbbcfc6d4087e2c8a Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 15 Jun 2026 00:55:48 -0700
Subject: [PATCH 07/15] feat: update llama.cpp to ggml-org/llama.cpp@6e9007ae6
 (#2307)

---
 CHANGELOG.md     | 2 ++
 vendor/llama.cpp | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 56c5ffb55..5804e0208 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: update llama.cpp to ggml-org/llama.cpp@6e9007ae6
+
 ## [0.3.29]
 
 - feat(example): use MTMD batch encoding by @abetlen in #2301
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index f05cf4676..6e9007ae6 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit f05cf4676af46c2f017c0e6ba25b6e20204f700e
+Subproject commit 6e9007ae61f4e994c27484759caac6ef2aa32b30

From 824565a96bf1580266cd41f73263dec2cd13a9a7 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 15 Jun 2026 01:10:49 -0700
Subject: [PATCH 08/15] feat: update llama.cpp to 6eab47181 (#2308)

---
 CHANGELOG.md     | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5804e0208..0b2b5eb31 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: update llama.cpp to ggml-org/llama.cpp@6e9007ae6
+- feat: update llama.cpp to ggml-org/llama.cpp@6eab47181
 
 ## [0.3.29]
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 6e9007ae6..6eab47181 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 6e9007ae61f4e994c27484759caac6ef2aa32b30
+Subproject commit 6eab47181cbd3532c88a105682b81b4729ab809b

From 822146b7cdb710e064462a1939e489bb4d330df2 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 15 Jun 2026 23:45:52 -0700
Subject: [PATCH 09/15] feat: update llama.cpp to e3a74b299 (#2310)

---
 CHANGELOG.md          |  2 +-
 llama_cpp/mtmd_cpp.py | 14 +++++++++++++-
 vendor/llama.cpp      |  2 +-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b2b5eb31..2cc78bfc7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: update llama.cpp to ggml-org/llama.cpp@6eab47181
+- feat: update llama.cpp to ggml-org/llama.cpp@e3a74b299
 
 ## [0.3.29]
 
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 46eb2c879..78f068aa9 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -169,6 +169,12 @@ class mtmd_caps(Structure):
     POINTER(c_char_p),
 )
 
+mtmd_helper_post_decode_callback = CFUNCTYPE(  # Callback type used for mtmd_helper_decode_image_chunk's `callback` argument.
+    c_int,  # Result type (first CFUNCTYPE argument).
+    llama_cpp.llama_batch,  # First callback argument.
+    c_void_p,  # Second callback argument; presumably the user_data pointer - confirm against mtmd-helper.h.
+)
+
 
 class mtmd_helper_bitmap_wrapper(Structure):
     """Bitmap wrapper returned by MTMD helper media loaders."""
@@ -860,7 +866,9 @@ def mtmd_helper_eval_chunk_single(
 #                                                 llama_pos n_past,
 #                                                 llama_seq_id seq_id,
 #                                                 int32_t n_batch,
-#                                                 llama_pos * new_n_past);
+#                                                 llama_pos * new_n_past,
+#                                                 mtmd_helper_post_decode_callback callback,
+#                                                 void * user_data);
 @ctypes_function(
     "mtmd_helper_decode_image_chunk",
     [
@@ -872,6 +880,8 @@ def mtmd_helper_eval_chunk_single(
         llama_cpp.llama_seq_id,
         c_int,
         POINTER(llama_cpp.llama_pos),
+        mtmd_helper_post_decode_callback,
+        c_void_p,
     ],
     c_int,
 )
@@ -884,6 +894,8 @@ def mtmd_helper_decode_image_chunk(
     seq_id: llama_cpp.llama_seq_id,
     n_batch: Union[c_int, int],
     new_n_past: "_Pointer[llama_cpp.llama_pos]",
+    callback: Optional[mtmd_helper_post_decode_callback],
+    user_data: c_void_p,
     /,
 ) -> int:
     """Decode a pre-encoded image chunk."""
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 6eab47181..e3a74b299 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 6eab47181cbd3532c88a105682b81b4729ab809b
+Subproject commit e3a74b299085cd00013804f7fca2e03441b2da20

From a8042331fbea6121f56fa6d0e1ddb764c790a364 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Tue, 16 Jun 2026 00:14:32 -0700
Subject: [PATCH 10/15] feat: add Pyodide wheel support (#2309)

* feat: update llama.cpp to 6eab47181

* feat: add Pyodide wheel support

* docs: fix Pyodide changelog entry

* feat: enable mtmd for emscripten
---
 .github/workflows/build-and-release.yaml | 33 +++++++++++++++-
 CHANGELOG.md                             |  1 +
 CMakeLists.txt                           | 50 ++++++++++++++++++++----
 llama_cpp/_ctypes_extensions.py          | 37 +++++++++++++++++-
 4 files changed, 111 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 4ae37b174..c931ead34 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -139,6 +139,37 @@ jobs:
           name: wheels_riscv64
           path: ./wheelhouse/*.whl
 
+  build_wheels_pyodide:
+    name: Build Pyodide wheel
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          submodules: "recursive"
+
+      - uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+
+      - name: Build wheel
+        uses: pypa/cibuildwheel@v4.1.0
+        env:
+          CIBW_PLATFORM: "pyodide"
+          CIBW_BUILD: "cp314-pyodide_wasm32"
+          CIBW_BUILD_VERBOSITY: "1"
+          CIBW_REPAIR_WHEEL_COMMAND: ""
+          CIBW_BEFORE_TEST: "curl -L --fail --retry 3 -o /tmp/stories260K.gguf https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf"
+          CIBW_TEST_COMMAND: "python -c \"import llama_cpp.mtmd_cpp as mtmd; from llama_cpp import Llama; print('mtmd marker', mtmd.mtmd_default_marker().decode()); llm = Llama(model_path='/tmp/stories260K.gguf', n_ctx=64, n_batch=8, n_threads=1, verbose=False); print('loaded', llm.n_vocab(), llm.n_ctx()); print('generated', llm('Once upon a', max_tokens=1, temperature=0)['choices'][0]['text'])\""
+          CMAKE_ARGS: "-DLLAMA_WASM_MEM64=OFF -DEMSCRIPTEN_SYSTEM_PROCESSOR=wasm32 -DGGML_NATIVE=OFF -DGGML_OPENMP=OFF -DGGML_METAL=OFF -DGGML_BLAS=OFF -DGGML_CUDA=OFF -DGGML_HIP=OFF -DGGML_VULKAN=OFF -DGGML_OPENCL=OFF -DGGML_RPC=OFF -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TOOLS=OFF -DLLAMA_BUILD_SERVER=OFF"
+        with:
+          output-dir: wheelhouse
+
+      - name: Upload wheels as artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: wheels_pyodide
+          path: ./wheelhouse/*.whl
+
   build_sdist:
     name: Build source distribution
     runs-on: ubuntu-latest
@@ -183,7 +214,7 @@ jobs:
 
   release:
     name: Release
-    needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist]
+    needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_wheels_pyodide, build_sdist]
     if: startsWith(github.ref, 'refs/tags/')
     runs-on: ubuntu-latest
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2cc78bfc7..59307cec2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: update llama.cpp to ggml-org/llama.cpp@e3a74b299
+- feat: add Pyodide wheel support by @abetlen in #2309
 
 ## [0.3.29]
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0474863a4..5feaaca5b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,14 +10,22 @@ function(llama_cpp_python_install_target target)
         return()
     endif()
 
-    install(
-        TARGETS ${target}
-        LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
-        RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
-        ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
-        FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
-        RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
-    )
+    if(EMSCRIPTEN)
+        set_target_properties(${target} PROPERTIES
+            OUTPUT_NAME "${target}.cpython-00-wasm32-emscripten"
+        )
+    endif()
+
+    if(NOT EMSCRIPTEN)
+        install(
+            TARGETS ${target}
+            LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
+            RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
+            ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
+            FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
+            RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
+        )
+    endif()
     install(
         TARGETS ${target}
         LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib
@@ -65,6 +73,32 @@ if (LLAMA_BUILD)
     # Disable building curl support
     set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE)
 
+    if (EMSCRIPTEN)
+        if (DEFINED EMSCRIPTEN_SYSTEM_PROCESSOR)
+            set(CMAKE_SYSTEM_PROCESSOR ${EMSCRIPTEN_SYSTEM_PROCESSOR} CACHE STRING "Target processor" FORCE)
+        else()
+            set(CMAKE_SYSTEM_PROCESSOR wasm32 CACHE STRING "Target processor" FORCE)
+        endif()
+
+        set(LLAMA_WASM_MEM64 OFF CACHE BOOL "llama.cpp: enable wasm64 memory" FORCE)
+        set(GGML_NATIVE OFF CACHE BOOL "ggml: enable -march=native" FORCE)
+        set(GGML_OPENMP OFF CACHE BOOL "ggml: use OpenMP" FORCE)
+        set(GGML_METAL OFF CACHE BOOL "ggml: use Metal" FORCE)
+        set(GGML_BLAS OFF CACHE BOOL "ggml: use BLAS" FORCE)
+        set(GGML_CUDA OFF CACHE BOOL "ggml: use CUDA" FORCE)
+        set(GGML_HIP OFF CACHE BOOL "ggml: use HIP" FORCE)
+        set(GGML_VULKAN OFF CACHE BOOL "ggml: use Vulkan" FORCE)
+        set(GGML_OPENCL OFF CACHE BOOL "ggml: use OpenCL" FORCE)
+        set(GGML_RPC OFF CACHE BOOL "ggml: use RPC" FORCE)
+
+        # Pyodide auto-loads side modules from top-level site-packages/lib
+        # before Python imports run, so keep upstream installs package-local.
+        set(CMAKE_INSTALL_BINDIR llama_cpp/lib CACHE PATH "Install binaries" FORCE)
+        set(CMAKE_INSTALL_INCLUDEDIR llama_cpp/include CACHE PATH "Install headers" FORCE)
+        set(CMAKE_INSTALL_LIBDIR llama_cpp/lib CACHE PATH "Install libraries" FORCE)
+        set(LLAMA_BUILD_COMMON OFF CACHE BOOL "Build llama.cpp common library" FORCE)
+    endif()
+
     # Architecture detection and settings for Apple platforms
     if (APPLE)
         # Get the target architecture
diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py
index e88ed387d..02cee8a88 100644
--- a/llama_cpp/_ctypes_extensions.py
+++ b/llama_cpp/_ctypes_extensions.py
@@ -19,6 +19,9 @@
 from typing_extensions import TypeAlias
 
 
+_EMSCRIPTEN_SIDE_MODULE_SUFFIX = ".cpython-00-wasm32-emscripten.so"
+
+
 # Load the library
 def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
     """Platform independent shared library loader"""
@@ -26,7 +29,12 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
     # for llamacpp) and "llama" (default name for this repo)
     lib_paths: List[pathlib.Path] = []
     # Determine the file extension based on the platform
-    if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"):
+    if sys.platform == "emscripten":
+        # Use a CPython-style tag that Pyodide skips during package auto-load.
+        lib_paths += [
+            base_path / f"lib{lib_base_name}{_EMSCRIPTEN_SIDE_MODULE_SUFFIX}",
+        ]
+    elif sys.platform.startswith("linux") or sys.platform.startswith("freebsd"):
         lib_paths += [
             base_path / f"lib{lib_base_name}.so",
         ]
@@ -60,6 +68,33 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
             os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib"))
         cdll_args["winmode"] = ctypes.RTLD_GLOBAL
 
+    if sys.platform == "emscripten":
+        cdll_args["mode"] = ctypes.RTLD_GLOBAL
+        lib_dir = str(base_path)
+        ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
+        if lib_dir not in ld_library_path.split(os.pathsep):
+            os.environ["LD_LIBRARY_PATH"] = (
+                lib_dir
+                if not ld_library_path
+                else f"{lib_dir}{os.pathsep}{ld_library_path}"
+            )
+
+        emscripten_dependencies = {
+            "llama": ("ggml-base", "ggml-cpu", "ggml"),
+            "mtmd": ("ggml-base", "ggml-cpu", "ggml", "llama"),
+        }
+        for dependency in emscripten_dependencies.get(lib_base_name, ()):
+            dependency_path = (
+                base_path / f"lib{dependency}{_EMSCRIPTEN_SIDE_MODULE_SUFFIX}"
+            )
+            if dependency_path.exists():
+                try:
+                    ctypes.CDLL(str(dependency_path), **cdll_args)  # type: ignore
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Failed to load shared library '{dependency_path}': {e}"
+                    )
+
     # Try to load the shared library, handling potential errors
     for lib_path in lib_paths:
         if lib_path.exists():

From ddb6a05848a550bd7383ab7e2341f76ec7e46af9 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Tue, 16 Jun 2026 00:47:08 -0700
Subject: [PATCH 11/15] chore: bump version to 0.3.30 (#2311)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 59307cec2..5b337ad92 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.30]
+
 - feat: update llama.cpp to ggml-org/llama.cpp@e3a74b299
 - feat: add Pyodide wheel support by @abetlen in #2309
 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 42f807ef6..b72459f65 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.29"
+__version__ = "0.3.30"

From 7440aaa3d3a793826b233e7334ea1f1ab1606bda Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Fri, 19 Jun 2026 18:05:37 -0700
Subject: [PATCH 12/15] feat: update llama.cpp to f449e0553 (#2312)

---
 CHANGELOG.md     | 2 ++
 vendor/llama.cpp | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b337ad92..a9371d342 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: update llama.cpp to ggml-org/llama.cpp@f449e0553
+
 ## [0.3.30]
 
 - feat: update llama.cpp to ggml-org/llama.cpp@e3a74b299
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index e3a74b299..f449e0553 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit e3a74b299085cd00013804f7fca2e03441b2da20
+Subproject commit f449e0553708b895adbd94a301431cef691f632d

From b11fe078898ef2e385c7d78563a300a8ab73cd27 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 20 Jun 2026 01:43:22 -0700
Subject: [PATCH 13/15] chore: bump version to 0.3.31 (#2317)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a9371d342..d6d33dbbe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.31]
+
 - feat: update llama.cpp to ggml-org/llama.cpp@f449e0553
 
 ## [0.3.30]
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index b72459f65..ed3c342f2 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.30"
+__version__ = "0.3.31"

From 9be3cd135bb87ef5c97662c8e60f5ec9689e94e5 Mon Sep 17 00:00:00 2001
From: Ankur Kaul <ankurkaul17@gmail.com>
Date: Mon, 22 Jun 2026 11:38:18 +0530
Subject: [PATCH 14/15] fix: preserve recurrent/hybrid model state when the
 full prompt is already cached (#2306)

Co-authored-by: Ankur Kaul <akaul36@gatech.edu>
---
 CHANGELOG.md        |   2 +
 llama_cpp/llama.py  |  49 +++++---
 tests/test_llama.py | 288 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 323 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d6d33dbbe..b1d5fb880 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: preserve recurrent/hybrid model state when the full prompt is already cached by @allthatido and @abetlen in #2306
+
 ## [0.3.31]
 
 - feat: update llama.cpp to ggml-org/llama.cpp@f449e0553
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 4a09b55ee..b5bffd46b 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -471,6 +471,8 @@ def free_lora_adapter():
         self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab)
 
         self.n_tokens = 0
+        # Restored or truncated state must decode before sampling.
+        self._requires_eval = True
         self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
         self.scores: npt.NDArray[np.single] = np.ndarray(
             (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single
@@ -647,6 +649,7 @@ def set_seed(self, seed: int):
     def reset(self):
         """Reset the model state."""
         self.n_tokens = 0
+        self._requires_eval = True
 
         if self._is_recurrent or self._is_hybrid:
             mem = llama_cpp.llama_get_memory(self._ctx.ctx)
@@ -689,6 +692,7 @@ def eval(self, tokens: Sequence[int]):
                 pass
             # Update n_tokens
             self.n_tokens += n_tokens
+            self._requires_eval = False
 
     def _init_sampler(
         self,
@@ -900,41 +904,53 @@ def generate(
             grammar=grammar,
         )
 
+        tokens = list(tokens)
+
         # Check for kv cache prefix match
         if reset and self.n_tokens > 0:
             longest_prefix = 0
-            for a, b in zip(self._input_ids, tokens[:-1]):
+            for a, b in zip(self._input_ids, tokens):
                 if a == b:
                     longest_prefix += 1
                 else:
                     break
 
-            # Recurrent and hybrid models cannot rewind state; reset if needed
-            if (
-                self._is_recurrent or self._is_hybrid
-            ) and longest_prefix < self.n_tokens:
-                longest_prefix = 0
-                reset = True
+            prompt_consumed = longest_prefix == len(tokens)
+            exact_prompt_cached = self.n_tokens == len(tokens) and prompt_consumed
+
+            # Exact cache hits can sample immediately only when the current
+            # logits were produced by a live decode, not restored state.
+            if exact_prompt_cached and not self._requires_eval:
+                reset = False
+                tokens = []
+                reuse_prefix = 0
                 if self.verbose:
                     print(
-                        "Llama.generate: recurrent/hybrid model requires full state reset",
+                        "Llama.generate: full prompt already cached, skipping reset",
                         file=sys.stderr,
                     )
-
-            if longest_prefix > 0:
-                if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
+            else:
+                # If there is no suffix to decode, replay one token to refresh
+                # logits after truncating to a valid prefix.
+                reuse_prefix = longest_prefix - 1 if prompt_consumed else longest_prefix
+
+            # Prefix hits can reuse memory because the suffix decode refreshes
+            # logits before sampling.
+            if reuse_prefix > 0:
+                if self._ctx.kv_cache_seq_rm(-1, reuse_prefix, -1):
                     reset = False
-                    tokens = tokens[longest_prefix:]
-                    self.n_tokens = longest_prefix
+                    tokens = tokens[reuse_prefix:]
+                    self.n_tokens = reuse_prefix
+                    self._requires_eval = True
                     if self.verbose:
                         print(
-                            f"Llama.generate: {longest_prefix} prefix-match hit, "
+                            f"Llama.generate: {reuse_prefix} prefix-match hit, "
                             f"remaining {len(tokens)} prompt tokens to eval",
                             file=sys.stderr,
                         )
                 elif self.verbose:
                     print(
-                        f"Llama.generate: {longest_prefix} prefix-match found "
+                        f"Llama.generate: {reuse_prefix} prefix-match found "
                         f"but partial kv removal not supported, re-evaluating full prompt",
                         file=sys.stderr,
                     )
@@ -948,7 +964,6 @@ def generate(
         #     grammar.reset()
 
         sample_idx = self.n_tokens + len(tokens) - 1
-        tokens = list(tokens)
 
         # Eval and sample
         while True:
@@ -988,6 +1003,7 @@ def generate(
                 if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
                     self.n_tokens = sample_idx
                     self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+                    self._requires_eval = True
                     break
 
             if self.draft_model is not None:
@@ -2217,6 +2233,7 @@ def load_state(self, state: LlamaState) -> None:
         rest[rest > 0] = 0.0
         self.input_ids = state.input_ids.copy()
         self.n_tokens = state.n_tokens
+        self._requires_eval = True
         self._seed = state.seed
         state_size = state.llama_state_size
         LLamaStateArrayType = ctypes.c_uint8 * state_size
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 336d6a612..70fce12d8 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -1,4 +1,5 @@
 import ctypes
+import itertools
 import multiprocessing
 
 import numpy as np
@@ -64,6 +65,14 @@ def llama_cpp_model_path():
     return model_path
 
 
+@pytest.fixture
+def llama_cpp_transformer_model_path():
+    repo_id = "ggml-org/models"
+    filename = "tinyllamas/stories15M-q4_0.gguf"
+    model_path = hf_hub_download(repo_id, filename)
+    return model_path
+
+
 @pytest.fixture
 def llama_cpp_embedding_model_path():
     repo_id = "CompendiumLabs/bge-small-en-v1.5-gguf"
@@ -339,6 +348,285 @@ def test_hybrid_model_prompt_cache_reset(llama_cpp_hybrid_model_path):
     )
 
 
+def _create_test_model(model_path):
+    return llama_cpp.Llama(
+        model_path,
+        n_ctx=64,
+        n_batch=64,
+        n_ubatch=64,
+        n_threads=multiprocessing.cpu_count(),
+        n_threads_batch=multiprocessing.cpu_count(),
+        logits_all=False,
+        verbose=False,
+    )
+
+
+def _generate_test_tokens(model, tokens, max_tokens=3):
+    return list(
+        itertools.islice(
+            model.generate(
+                tokens,
+                temp=0.0,
+            ),
+            max_tokens,
+        )
+    )
+
+
+MODEL_CACHE_CASES = (
+    ("llama_cpp_transformer_model_path", False, False),
+    ("llama_cpp_recurrent_model_path", True, False),
+    ("llama_cpp_hybrid_model_path", False, True),
+)
+
+RESTORED_CACHE_CASES = MODEL_CACHE_CASES
+
+
+def _eval_alternate_same_length_prompt(model, tokens, expected_next_token):
+    replacement_tokens = (
+        model.token_eos(),
+        model.token_nl(),
+        0,
+        1,
+        2,
+        model.n_vocab() - 1,
+    )
+
+    for replacement_token in replacement_tokens:
+        alternate_tokens = list(tokens)
+        alternate_tokens[-1] = replacement_token
+        if alternate_tokens == tokens:
+            continue
+
+        model.reset()
+        model.eval(alternate_tokens)
+        if model.sample(temp=0.0, idx=len(tokens) - 1) != expected_next_token:
+            return
+
+    raise AssertionError("failed to find an alternate same-length prompt")
+
+
+def _assert_exact_cached_prompt_reuse_matches_fresh(
+    model_path,
+    *,
+    is_recurrent: bool,
+    is_hybrid: bool,
+):
+    prompt = "The quick brown fox"
+    fresh = _create_test_model(model_path)
+    tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True)
+
+    assert fresh._is_recurrent is is_recurrent
+    assert fresh._is_hybrid is is_hybrid
+
+    expected_tokens = _generate_test_tokens(fresh, tokens)
+
+    cached = _create_test_model(model_path)
+    assert cached._is_recurrent is is_recurrent
+    assert cached._is_hybrid is is_hybrid
+
+    cached.eval(tokens)
+    assert cached.n_tokens == len(tokens)
+    assert cached.input_ids[: cached.n_tokens].tolist() == tokens
+    assert cached.sample(temp=0.0, idx=len(tokens) - 1) == expected_tokens[0]
+
+    reset_calls = 0
+    original_reset = cached.reset
+
+    def reset_tracker():
+        nonlocal reset_calls
+        reset_calls += 1
+        original_reset()
+
+    cached.reset = reset_tracker
+
+    cached_tokens = _generate_test_tokens(cached, tokens)
+    assert reset_calls == 0
+    assert cached_tokens == expected_tokens
+    assert cached.n_tokens == len(tokens) + len(cached_tokens) - 1
+
+
+def _assert_loaded_exact_cached_prompt_reuse_matches_fresh(
+    model_path,
+    *,
+    is_recurrent: bool,
+    is_hybrid: bool,
+):
+    prompt = "The quick brown fox"
+    fresh = _create_test_model(model_path)
+    tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True)
+    expected_tokens = _generate_test_tokens(fresh, tokens)
+
+    source = _create_test_model(model_path)
+    assert source._is_recurrent is is_recurrent
+    assert source._is_hybrid is is_hybrid
+
+    source.eval(tokens)
+    state = source.save_state()
+
+    loaded = _create_test_model(model_path)
+    assert loaded._is_recurrent is is_recurrent
+    assert loaded._is_hybrid is is_hybrid
+
+    _eval_alternate_same_length_prompt(
+        loaded,
+        tokens,
+        expected_tokens[0],
+    )
+    loaded.load_state(state)
+
+    assert loaded.n_tokens == len(tokens)
+    assert loaded.input_ids[: loaded.n_tokens].tolist() == tokens
+
+    loaded_tokens = _generate_test_tokens(loaded, tokens)
+    assert loaded_tokens == expected_tokens
+    assert loaded.n_tokens == len(tokens) + len(loaded_tokens) - 1
+
+
+def _assert_ram_cache_exact_prompt_hit_matches_fresh(
+    model_path,
+    *,
+    is_recurrent: bool,
+    is_hybrid: bool,
+):
+    prompt = "The quick brown fox"
+    fresh = _create_test_model(model_path)
+    tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True)
+    expected = fresh.create_completion(
+        tokens,
+        max_tokens=1,
+        temperature=0.0,
+        seed=1337,
+    )
+
+    cache = llama_cpp.LlamaRAMCache()
+    writer = _create_test_model(model_path)
+    writer.set_cache(cache)
+    writer.create_completion(
+        tokens,
+        max_tokens=1,
+        temperature=0.0,
+        seed=1337,
+    )
+
+    cached = _create_test_model(model_path)
+    assert cached._is_recurrent is is_recurrent
+    assert cached._is_hybrid is is_hybrid
+    cached.set_cache(cache)
+
+    load_state_calls = 0
+    original_load_state = cached.load_state
+
+    def load_state_tracker(state):
+        nonlocal load_state_calls
+        load_state_calls += 1
+        original_load_state(state)
+
+    cached.load_state = load_state_tracker
+
+    actual = cached.create_completion(
+        tokens,
+        max_tokens=1,
+        temperature=0.0,
+        seed=1337,
+    )
+
+    assert load_state_calls == 1
+    assert actual["choices"][0]["text"] == expected["choices"][0]["text"]
+    assert (
+        actual["usage"]["completion_tokens"] == expected["usage"]["completion_tokens"]
+    )
+
+
+def _assert_shorter_prompt_prefix_reuse_matches_fresh(
+    model_path,
+    *,
+    is_recurrent: bool,
+    is_hybrid: bool,
+):
+    prompt = "The quick brown fox"
+    history = " jumps over the lazy dog"
+    fresh = _create_test_model(model_path)
+    tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True)
+    history_tokens = fresh.tokenize(history.encode(), add_bos=False, special=True)
+    expected_tokens = _generate_test_tokens(fresh, tokens)
+
+    cached = _create_test_model(model_path)
+    assert cached._is_recurrent is is_recurrent
+    assert cached._is_hybrid is is_hybrid
+
+    cached.eval(tokens + history_tokens)
+    assert cached.n_tokens > len(tokens)
+    assert cached.input_ids[: len(tokens)].tolist() == tokens
+
+    cached_tokens = _generate_test_tokens(cached, tokens)
+    assert cached_tokens == expected_tokens
+
+
+@pytest.mark.parametrize(
+    ("model_path_fixture", "is_recurrent", "is_hybrid"), MODEL_CACHE_CASES
+)
+def test_exact_cached_prompt_reuse_matches_fresh(
+    request,
+    model_path_fixture,
+    is_recurrent,
+    is_hybrid,
+):
+    _assert_exact_cached_prompt_reuse_matches_fresh(
+        request.getfixturevalue(model_path_fixture),
+        is_recurrent=is_recurrent,
+        is_hybrid=is_hybrid,
+    )
+
+
+@pytest.mark.parametrize(
+    ("model_path_fixture", "is_recurrent", "is_hybrid"), RESTORED_CACHE_CASES
+)
+def test_loaded_exact_cached_prompt_reuse_matches_fresh(
+    request,
+    model_path_fixture,
+    is_recurrent,
+    is_hybrid,
+):
+    _assert_loaded_exact_cached_prompt_reuse_matches_fresh(
+        request.getfixturevalue(model_path_fixture),
+        is_recurrent=is_recurrent,
+        is_hybrid=is_hybrid,
+    )
+
+
+@pytest.mark.parametrize(
+    ("model_path_fixture", "is_recurrent", "is_hybrid"), RESTORED_CACHE_CASES
+)
+def test_ram_cache_exact_prompt_hit_matches_fresh(
+    request,
+    model_path_fixture,
+    is_recurrent,
+    is_hybrid,
+):
+    _assert_ram_cache_exact_prompt_hit_matches_fresh(
+        request.getfixturevalue(model_path_fixture),
+        is_recurrent=is_recurrent,
+        is_hybrid=is_hybrid,
+    )
+
+
+@pytest.mark.parametrize(
+    ("model_path_fixture", "is_recurrent", "is_hybrid"), MODEL_CACHE_CASES
+)
+def test_shorter_prompt_prefix_reuse_matches_fresh(
+    request,
+    model_path_fixture,
+    is_recurrent,
+    is_hybrid,
+):
+    _assert_shorter_prompt_prefix_reuse_matches_fresh(
+        request.getfixturevalue(model_path_fixture),
+        is_recurrent=is_recurrent,
+        is_hybrid=is_hybrid,
+    )
+
+
 def test_real_llama_embeddings(llama_cpp_embedding_model_path):
     model = llama_cpp.Llama(
         llama_cpp_embedding_model_path,

From 4bee85b352ec4aa7034dc13c3d80688805e47d63 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Tue, 23 Jun 2026 07:56:15 -0700
Subject: [PATCH 15/15] feat: update llama.cpp to 92e854ab8 (#2318)

---
 CHANGELOG.md               |  1 +
 llama_cpp/llama_cpp.py     |  5 +++++
 llama_cpp/llama_cpp_ext.py | 19 +++++++++++++++++++
 llama_cpp/mtmd_cpp.py      |  7 +++++++
 vendor/llama.cpp           |  2 +-
 5 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b1d5fb880..925e941d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: update llama.cpp to ggml-org/llama.cpp@92e854ab8
 - fix: preserve recurrent/hybrid model state when the full prompt is already cached by @allthatido and @abetlen in #2306
 
 ## [0.3.31]
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 21f85c81c..176709d96 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1744,6 +1744,11 @@ def llama_model_n_embd_out(model: llama_model_p, /) -> int:
 def llama_model_n_layer(model: llama_model_p, /) -> int: ...
 
 
+# LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model);
+@ctypes_function("llama_model_n_layer_nextn", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_layer_nextn(model: llama_model_p, /) -> int: ...
+
+
 # LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
 @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_model_n_head(model: llama_model_p, /) -> int: ...
diff --git a/llama_cpp/llama_cpp_ext.py b/llama_cpp/llama_cpp_ext.py
index 284811086..a4b424eb6 100644
--- a/llama_cpp/llama_cpp_ext.py
+++ b/llama_cpp/llama_cpp_ext.py
@@ -62,6 +62,25 @@ def llama_set_embeddings_nextn(
     ...
 
 
+# LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset);
+@_ctypes_function_from_names(
+    (
+        "llama_set_nextn_layer_offset",
+        "_Z28llama_set_nextn_layer_offsetP13llama_contexti",
+        "?llama_set_nextn_layer_offset@@YAXPEAUllama_context@@H@Z",
+    ),
+    [llama_cpp.llama_context_p_ctypes, ctypes.c_int32],
+    None,
+)
+def llama_set_nextn_layer_offset(
+    ctx: llama_cpp.llama_context_p,
+    offset: Union[ctypes.c_int32, int],
+    /,
+):
+    """Select which appended NextN block the decoder MTP graph runs."""
+    ...
+
+
 # LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
 @_ctypes_function_from_names(
     (
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 78f068aa9..35357a327 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -20,6 +20,7 @@
 )
 import pathlib
 from typing import (
+    Callable,
     Union,
     NewType,
     Optional,
@@ -84,6 +85,8 @@
 MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2
 
+mtmd_progress_callback = CFUNCTYPE(c_bool, c_float, c_void_p)
+
 
 # Structures
 class mtmd_context_params(Structure):
@@ -106,6 +109,8 @@ class mtmd_context_params(Structure):
         cb_eval: llama_cpp.ggml_backend_sched_eval_callback
         cb_eval_user_data: c_void_p
         batch_max_tokens: int
+        progress_callback: Callable[[float, c_void_p], bool]
+        progress_callback_user_data: c_void_p
 
     _fields_ = [
         ("use_gpu", c_bool),
@@ -120,6 +125,8 @@ class mtmd_context_params(Structure):
         ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback),
         ("cb_eval_user_data", c_void_p),
         ("batch_max_tokens", c_int),
+        ("progress_callback", mtmd_progress_callback),
+        ("progress_callback_user_data", c_void_p),
     ]
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index f449e0553..92e854ab8 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit f449e0553708b895adbd94a301431cef691f632d
+Subproject commit 92e854ab836254bb7f2eb49babd5613474bdb700