From 7440aaa3d3a793826b233e7334ea1f1ab1606bda Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Fri, 19 Jun 2026 18:05:37 -0700
Subject: [PATCH 1/4] feat: update llama.cpp to f449e0553 (#2312)

---
 CHANGELOG.md     | 2 ++
 vendor/llama.cpp | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b337ad92..a9371d342 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: update llama.cpp to ggml-org/llama.cpp@f449e0553
+
 ## [0.3.30]
 
 - feat: update llama.cpp to ggml-org/llama.cpp@e3a74b299
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index e3a74b299..f449e0553 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit e3a74b299085cd00013804f7fca2e03441b2da20
+Subproject commit f449e0553708b895adbd94a301431cef691f632d

From b11fe078898ef2e385c7d78563a300a8ab73cd27 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 20 Jun 2026 01:43:22 -0700
Subject: [PATCH 2/4] chore: bump version to 0.3.31 (#2317)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a9371d342..d6d33dbbe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.31]
+
 - feat: update llama.cpp to ggml-org/llama.cpp@f449e0553
 
 ## [0.3.30]
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index b72459f65..ed3c342f2 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.30"
+__version__ = "0.3.31"

From 9be3cd135bb87ef5c97662c8e60f5ec9689e94e5 Mon Sep 17 00:00:00 2001
From: Ankur Kaul <ankurkaul17@gmail.com>
Date: Mon, 22 Jun 2026 11:38:18 +0530
Subject: [PATCH 3/4] fix: preserve recurrent/hybrid model state when the full
 prompt is already cached (#2306)

Co-authored-by: Ankur Kaul <akaul36@gatech.edu>
---
 CHANGELOG.md        |   2 +
 llama_cpp/llama.py  |  49 +++++---
 tests/test_llama.py | 288 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 323 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d6d33dbbe..b1d5fb880 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: preserve recurrent/hybrid model state when the full prompt is already cached by @allthatido and @abetlen in #2306
+
 ## [0.3.31]
 
 - feat: update llama.cpp to ggml-org/llama.cpp@f449e0553
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 4a09b55ee..b5bffd46b 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -471,6 +471,8 @@ def free_lora_adapter():
         self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab)
 
         self.n_tokens = 0
+        # True until eval() refreshes logits after init, reset, restore, or truncation.
+        self._requires_eval = True
         self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
         self.scores: npt.NDArray[np.single] = np.ndarray(
             (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single
@@ -647,6 +649,7 @@ def set_seed(self, seed: int):
     def reset(self):
         """Reset the model state."""
         self.n_tokens = 0
+        self._requires_eval = True
 
         if self._is_recurrent or self._is_hybrid:
             mem = llama_cpp.llama_get_memory(self._ctx.ctx)
@@ -689,6 +692,7 @@ def eval(self, tokens: Sequence[int]):
                 pass
             # Update n_tokens
             self.n_tokens += n_tokens
+            self._requires_eval = False
 
     def _init_sampler(
         self,
@@ -900,41 +904,53 @@ def generate(
             grammar=grammar,
         )
 
+        tokens = list(tokens)
+
         # Check for kv cache prefix match
         if reset and self.n_tokens > 0:
             longest_prefix = 0
-            for a, b in zip(self._input_ids, tokens[:-1]):
+            for a, b in zip(self._input_ids, tokens):
                 if a == b:
                     longest_prefix += 1
                 else:
                     break
 
-            # Recurrent and hybrid models cannot rewind state; reset if needed
-            if (
-                self._is_recurrent or self._is_hybrid
-            ) and longest_prefix < self.n_tokens:
-                longest_prefix = 0
-                reset = True
+            prompt_consumed = longest_prefix == len(tokens)
+            exact_prompt_cached = self.n_tokens == len(tokens) and prompt_consumed
+
+            # Exact cache hits can sample immediately only when the current
+            # logits were produced by a live decode, not restored state.
+            if exact_prompt_cached and not self._requires_eval:
+                reset = False
+                tokens = []
+                reuse_prefix = 0
                 if self.verbose:
                     print(
-                        "Llama.generate: recurrent/hybrid model requires full state reset",
+                        "Llama.generate: full prompt already cached, skipping reset",
                         file=sys.stderr,
                     )
-
-            if longest_prefix > 0:
-                if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
+            else:
+                # If there is no suffix to decode, replay one token to refresh
+                # logits after truncating to a valid prefix.
+                reuse_prefix = longest_prefix - 1 if prompt_consumed else longest_prefix
+
+            # Prefix hits can reuse memory because the suffix decode refreshes
+            # logits before sampling.
+            if reuse_prefix > 0:
+                if self._ctx.kv_cache_seq_rm(-1, reuse_prefix, -1):
                     reset = False
-                    tokens = tokens[longest_prefix:]
-                    self.n_tokens = longest_prefix
+                    tokens = tokens[reuse_prefix:]
+                    self.n_tokens = reuse_prefix
+                    self._requires_eval = True
                     if self.verbose:
                         print(
-                            f"Llama.generate: {longest_prefix} prefix-match hit, "
+                            f"Llama.generate: {reuse_prefix} prefix-match hit, "
                             f"remaining {len(tokens)} prompt tokens to eval",
                             file=sys.stderr,
                         )
                 elif self.verbose:
                     print(
-                        f"Llama.generate: {longest_prefix} prefix-match found "
+                        f"Llama.generate: {reuse_prefix} prefix-match found "
                         f"but partial kv removal not supported, re-evaluating full prompt",
                         file=sys.stderr,
                     )
@@ -948,7 +964,6 @@ def generate(
         #     grammar.reset()
 
         sample_idx = self.n_tokens + len(tokens) - 1
-        tokens = list(tokens)
 
         # Eval and sample
         while True:
@@ -988,6 +1003,7 @@ def generate(
                 if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
                     self.n_tokens = sample_idx
                     self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+                    self._requires_eval = True
                     break
 
             if self.draft_model is not None:
@@ -2217,6 +2233,7 @@ def load_state(self, state: LlamaState) -> None:
         rest[rest > 0] = 0.0
         self.input_ids = state.input_ids.copy()
         self.n_tokens = state.n_tokens
+        self._requires_eval = True
         self._seed = state.seed
         state_size = state.llama_state_size
         LLamaStateArrayType = ctypes.c_uint8 * state_size
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 336d6a612..70fce12d8 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -1,4 +1,5 @@
 import ctypes
+import itertools
 import multiprocessing
 
 import numpy as np
@@ -64,6 +65,14 @@ def llama_cpp_model_path():
     return model_path
 
 
+@pytest.fixture
+def llama_cpp_transformer_model_path():
+    repo_id = "ggml-org/models"
+    filename = "tinyllamas/stories15M-q4_0.gguf"
+    model_path = hf_hub_download(repo_id, filename)
+    return model_path
+
+
 @pytest.fixture
 def llama_cpp_embedding_model_path():
     repo_id = "CompendiumLabs/bge-small-en-v1.5-gguf"
@@ -339,6 +348,285 @@ def test_hybrid_model_prompt_cache_reset(llama_cpp_hybrid_model_path):
     )
 
 
+def _create_test_model(model_path):
+    return llama_cpp.Llama(
+        model_path,
+        n_ctx=64,
+        n_batch=64,
+        n_ubatch=64,
+        n_threads=multiprocessing.cpu_count(),
+        n_threads_batch=multiprocessing.cpu_count(),
+        logits_all=False,
+        verbose=False,
+    )
+
+
+def _generate_test_tokens(model, tokens, max_tokens=3):
+    return list(
+        itertools.islice(
+            model.generate(
+                tokens,
+                temp=0.0,
+            ),
+            max_tokens,
+        )
+    )
+
+
+MODEL_CACHE_CASES = (
+    ("llama_cpp_transformer_model_path", False, False),
+    ("llama_cpp_recurrent_model_path", True, False),
+    ("llama_cpp_hybrid_model_path", False, True),
+)
+
+RESTORED_CACHE_CASES = MODEL_CACHE_CASES
+
+
+def _eval_alternate_same_length_prompt(model, tokens, expected_next_token):
+    replacement_tokens = (
+        model.token_eos(),
+        model.token_nl(),
+        0,
+        1,
+        2,
+        model.n_vocab() - 1,
+    )
+
+    for replacement_token in replacement_tokens:
+        alternate_tokens = list(tokens)
+        alternate_tokens[-1] = replacement_token
+        if alternate_tokens == tokens:
+            continue
+
+        model.reset()
+        model.eval(alternate_tokens)
+        if model.sample(temp=0.0, idx=len(tokens) - 1) != expected_next_token:
+            return
+
+    raise AssertionError("failed to find an alternate same-length prompt")
+
+
+def _assert_exact_cached_prompt_reuse_matches_fresh(
+    model_path,
+    *,
+    is_recurrent: bool,
+    is_hybrid: bool,
+):
+    prompt = "The quick brown fox"
+    fresh = _create_test_model(model_path)
+    tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True)
+
+    assert fresh._is_recurrent is is_recurrent
+    assert fresh._is_hybrid is is_hybrid
+
+    expected_tokens = _generate_test_tokens(fresh, tokens)
+
+    cached = _create_test_model(model_path)
+    assert cached._is_recurrent is is_recurrent
+    assert cached._is_hybrid is is_hybrid
+
+    cached.eval(tokens)
+    assert cached.n_tokens == len(tokens)
+    assert cached.input_ids[: cached.n_tokens].tolist() == tokens
+    assert cached.sample(temp=0.0, idx=len(tokens) - 1) == expected_tokens[0]
+
+    reset_calls = 0
+    original_reset = cached.reset
+
+    def reset_tracker():
+        nonlocal reset_calls
+        reset_calls += 1
+        original_reset()
+
+    cached.reset = reset_tracker
+
+    cached_tokens = _generate_test_tokens(cached, tokens)
+    assert reset_calls == 0
+    assert cached_tokens == expected_tokens
+    assert cached.n_tokens == len(tokens) + len(cached_tokens) - 1
+
+
+def _assert_loaded_exact_cached_prompt_reuse_matches_fresh(
+    model_path,
+    *,
+    is_recurrent: bool,
+    is_hybrid: bool,
+):
+    prompt = "The quick brown fox"
+    fresh = _create_test_model(model_path)
+    tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True)
+    expected_tokens = _generate_test_tokens(fresh, tokens)
+
+    source = _create_test_model(model_path)
+    assert source._is_recurrent is is_recurrent
+    assert source._is_hybrid is is_hybrid
+
+    source.eval(tokens)
+    state = source.save_state()
+
+    loaded = _create_test_model(model_path)
+    assert loaded._is_recurrent is is_recurrent
+    assert loaded._is_hybrid is is_hybrid
+
+    _eval_alternate_same_length_prompt(
+        loaded,
+        tokens,
+        expected_tokens[0],
+    )
+    loaded.load_state(state)
+
+    assert loaded.n_tokens == len(tokens)
+    assert loaded.input_ids[: loaded.n_tokens].tolist() == tokens
+
+    loaded_tokens = _generate_test_tokens(loaded, tokens)
+    assert loaded_tokens == expected_tokens
+    assert loaded.n_tokens == len(tokens) + len(loaded_tokens) - 1
+
+
+def _assert_ram_cache_exact_prompt_hit_matches_fresh(
+    model_path,
+    *,
+    is_recurrent: bool,
+    is_hybrid: bool,
+):
+    prompt = "The quick brown fox"
+    fresh = _create_test_model(model_path)
+    tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True)
+    expected = fresh.create_completion(
+        tokens,
+        max_tokens=1,
+        temperature=0.0,
+        seed=1337,
+    )
+
+    cache = llama_cpp.LlamaRAMCache()
+    writer = _create_test_model(model_path)
+    writer.set_cache(cache)
+    writer.create_completion(
+        tokens,
+        max_tokens=1,
+        temperature=0.0,
+        seed=1337,
+    )
+
+    cached = _create_test_model(model_path)
+    assert cached._is_recurrent is is_recurrent
+    assert cached._is_hybrid is is_hybrid
+    cached.set_cache(cache)
+
+    load_state_calls = 0
+    original_load_state = cached.load_state
+
+    def load_state_tracker(state):
+        nonlocal load_state_calls
+        load_state_calls += 1
+        original_load_state(state)
+
+    cached.load_state = load_state_tracker
+
+    actual = cached.create_completion(
+        tokens,
+        max_tokens=1,
+        temperature=0.0,
+        seed=1337,
+    )
+
+    assert load_state_calls == 1
+    assert actual["choices"][0]["text"] == expected["choices"][0]["text"]
+    assert (
+        actual["usage"]["completion_tokens"] == expected["usage"]["completion_tokens"]
+    )
+
+
+def _assert_shorter_prompt_prefix_reuse_matches_fresh(
+    model_path,
+    *,
+    is_recurrent: bool,
+    is_hybrid: bool,
+):
+    prompt = "The quick brown fox"
+    history = " jumps over the lazy dog"
+    fresh = _create_test_model(model_path)
+    tokens = fresh.tokenize(prompt.encode(), add_bos=True, special=True)
+    history_tokens = fresh.tokenize(history.encode(), add_bos=False, special=True)
+    expected_tokens = _generate_test_tokens(fresh, tokens)
+
+    cached = _create_test_model(model_path)
+    assert cached._is_recurrent is is_recurrent
+    assert cached._is_hybrid is is_hybrid
+
+    cached.eval(tokens + history_tokens)
+    assert cached.n_tokens > len(tokens)
+    assert cached.input_ids[: len(tokens)].tolist() == tokens
+
+    cached_tokens = _generate_test_tokens(cached, tokens)
+    assert cached_tokens == expected_tokens
+
+
+@pytest.mark.parametrize(
+    ("model_path_fixture", "is_recurrent", "is_hybrid"), MODEL_CACHE_CASES
+)
+def test_exact_cached_prompt_reuse_matches_fresh(
+    request,
+    model_path_fixture,
+    is_recurrent,
+    is_hybrid,
+):
+    _assert_exact_cached_prompt_reuse_matches_fresh(
+        request.getfixturevalue(model_path_fixture),
+        is_recurrent=is_recurrent,
+        is_hybrid=is_hybrid,
+    )
+
+
+@pytest.mark.parametrize(
+    ("model_path_fixture", "is_recurrent", "is_hybrid"), RESTORED_CACHE_CASES
+)
+def test_loaded_exact_cached_prompt_reuse_matches_fresh(
+    request,
+    model_path_fixture,
+    is_recurrent,
+    is_hybrid,
+):
+    _assert_loaded_exact_cached_prompt_reuse_matches_fresh(
+        request.getfixturevalue(model_path_fixture),
+        is_recurrent=is_recurrent,
+        is_hybrid=is_hybrid,
+    )
+
+
+@pytest.mark.parametrize(
+    ("model_path_fixture", "is_recurrent", "is_hybrid"), RESTORED_CACHE_CASES
+)
+def test_ram_cache_exact_prompt_hit_matches_fresh(
+    request,
+    model_path_fixture,
+    is_recurrent,
+    is_hybrid,
+):
+    _assert_ram_cache_exact_prompt_hit_matches_fresh(
+        request.getfixturevalue(model_path_fixture),
+        is_recurrent=is_recurrent,
+        is_hybrid=is_hybrid,
+    )
+
+
+@pytest.mark.parametrize(
+    ("model_path_fixture", "is_recurrent", "is_hybrid"), MODEL_CACHE_CASES
+)
+def test_shorter_prompt_prefix_reuse_matches_fresh(
+    request,
+    model_path_fixture,
+    is_recurrent,
+    is_hybrid,
+):
+    _assert_shorter_prompt_prefix_reuse_matches_fresh(
+        request.getfixturevalue(model_path_fixture),
+        is_recurrent=is_recurrent,
+        is_hybrid=is_hybrid,
+    )
+
+
 def test_real_llama_embeddings(llama_cpp_embedding_model_path):
     model = llama_cpp.Llama(
         llama_cpp_embedding_model_path,

From 4bee85b352ec4aa7034dc13c3d80688805e47d63 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Tue, 23 Jun 2026 07:56:15 -0700
Subject: [PATCH 4/4] feat: update llama.cpp to 92e854ab8 (#2318)

---
 CHANGELOG.md               |  1 +
 llama_cpp/llama_cpp.py     |  5 +++++
 llama_cpp/llama_cpp_ext.py | 19 +++++++++++++++++++
 llama_cpp/mtmd_cpp.py      |  7 +++++++
 vendor/llama.cpp           |  2 +-
 5 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b1d5fb880..925e941d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: update llama.cpp to ggml-org/llama.cpp@92e854ab8
 - fix: preserve recurrent/hybrid model state when the full prompt is already cached by @allthatido and @abetlen in #2306
 
 ## [0.3.31]
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 21f85c81c..176709d96 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1744,6 +1744,11 @@ def llama_model_n_embd_out(model: llama_model_p, /) -> int:
 def llama_model_n_layer(model: llama_model_p, /) -> int: ...
 
 
+# LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model);
+@ctypes_function("llama_model_n_layer_nextn", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_layer_nextn(model: llama_model_p, /) -> int: ...
+
+
 # LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
 @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_model_n_head(model: llama_model_p, /) -> int: ...
diff --git a/llama_cpp/llama_cpp_ext.py b/llama_cpp/llama_cpp_ext.py
index 284811086..a4b424eb6 100644
--- a/llama_cpp/llama_cpp_ext.py
+++ b/llama_cpp/llama_cpp_ext.py
@@ -62,6 +62,25 @@ def llama_set_embeddings_nextn(
     ...
 
 
+# LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset);
+@_ctypes_function_from_names(
+    (
+        "llama_set_nextn_layer_offset",
+        "_Z28llama_set_nextn_layer_offsetP13llama_contexti",
+        "?llama_set_nextn_layer_offset@@YAXPEAUllama_context@@H@Z",
+    ),
+    [llama_cpp.llama_context_p_ctypes, ctypes.c_int32],
+    None,
+)
+def llama_set_nextn_layer_offset(
+    ctx: llama_cpp.llama_context_p,
+    offset: Union[ctypes.c_int32, int],
+    /,
+):
+    """Select which appended NextN block the decoder MTP graph runs."""
+    ...
+
+
 # LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
 @_ctypes_function_from_names(
     (
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 78f068aa9..35357a327 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -20,6 +20,7 @@
 )
 import pathlib
 from typing import (
+    Callable,
     Union,
     NewType,
     Optional,
@@ -84,6 +85,8 @@
 MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2
 
+mtmd_progress_callback = CFUNCTYPE(c_bool, c_float, c_void_p)
+
 
 # Structures
 class mtmd_context_params(Structure):
@@ -106,6 +109,8 @@ class mtmd_context_params(Structure):
         cb_eval: llama_cpp.ggml_backend_sched_eval_callback
         cb_eval_user_data: c_void_p
         batch_max_tokens: int
+        progress_callback: Callable[[float, c_void_p], bool]
+        progress_callback_user_data: c_void_p
 
     _fields_ = [
         ("use_gpu", c_bool),
@@ -120,6 +125,8 @@ class mtmd_context_params(Structure):
         ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback),
         ("cb_eval_user_data", c_void_p),
         ("batch_max_tokens", c_int),
+        ("progress_callback", mtmd_progress_callback),
+        ("progress_callback_user_data", c_void_p),
     ]
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index f449e0553..92e854ab8 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit f449e0553708b895adbd94a301431cef691f632d
+Subproject commit 92e854ab836254bb7f2eb49babd5613474bdb700