From 31b1d95a6c19f5b615a3286069f181a415f872e8 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Thu, 2 May 2024 11:32:18 -0400
Subject: [PATCH 01/14] feat: Add llama-3-vision-alpha chat format

---
 llama_cpp/llama_chat_format.py | 64 ++++++++++++++++++++++++++++++++--
 llama_cpp/server/model.py      | 14 ++++++++
 2 files changed, 76 insertions(+), 2 deletions(-)
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 0af410a972..a17b86b30a 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -2165,7 +2165,7 @@ def create_completion(stop):
 
 
 class Llava15ChatHandler:
-    DEFAULT_SYSTEM_MESSAGE =  "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions."
+    DEFAULT_SYSTEM_MESSAGE: Optional[str] =  "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions."
 
     CHAT_FORMAT = (
         "{% for message in messages %}"
@@ -2288,7 +2288,7 @@ def __call__(
         assert self.clip_ctx is not None
 
         system_prompt = _get_system_message(messages)
-        if system_prompt == "":
+        if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
             messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages
 
         image_urls = self.get_image_urls(messages)
@@ -2771,6 +2771,66 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
         "{% endif %}"
     )
 
+class Llama3VisionAlpha(Llava15ChatHandler):
+    # Target prompt layout for a single user turn, where question = "<image>" + q:
+
+    # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+    DEFAULT_SYSTEM_MESSAGE = None
+
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+
+        "<|start_header_id|>"
+
+        "{% if message.role == 'user' %}"
+
+        "user<|end_header_id|>\n\n"
+
+        "{% if message.content is iterable %}"
+
+        # <image>
+        "{% for content in message.content %}"
+        "{% if content.type == 'image_url' %}"
+        "{% if content.image_url is string %}"
+        "{{ content.image_url }}"
+        "{% endif %}"
+        "{% if content.image_url is mapping %}"
+        "{{ content.image_url.url }}"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+
+        # Question (text parts of a list-valued content):
+        "{% for content in message.content %}"
+        "{% if content.type == 'text' %}"
+        "{{ content.text }}"
+        "{% endif %}"
+        "{% endfor %}"
+
+        "{% endif %}"
+
+        # Question (content given as a plain string):
+        "{% if message.content is string %}"
+        "{{ message.content }}"
+        "{% endif %}"
+
+        "{% endif %}"
+
+        # Answer:
+        "{% if message.role == 'assistant' %}"
+        "assistant<|end_header_id|>\n\n"
+        "{{ message.content }}"
+        "{% endif %}"
+
+        "<|eot_id|>"
+
+        "{% endfor %}"
+
+        # Generation prompt
+        "{% if add_generation_prompt %}"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+        "{% endif %}"
+    )
 
 @register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 1ad592d211..e102fadbd7 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -140,6 +140,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
                 chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler(
                     clip_model_path=settings.clip_model_path, verbose=settings.verbose
                 )
+        elif settings.chat_format == "llama-3-vision-alpha":
+            assert settings.clip_model_path is not None, "clip model not found"
+            if settings.hf_model_repo_id is not None:
+                chat_handler = (
+                    llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained(
+                        repo_id=settings.hf_model_repo_id,
+                        filename=settings.clip_model_path,
+                        verbose=settings.verbose,
+                    )
+                )
+            else:
+                chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha(
+                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
+                )
         elif settings.chat_format == "hf-autotokenizer":
             assert (
                 settings.hf_pretrained_model_name_or_path is not None

From d75dea18db61becae72784b9d750897b7cdcda26 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Thu, 2 May 2024 12:00:44 -0400
Subject: [PATCH 02/14] feat: Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index f364eb6fb5..6ecf3189e0 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961
+Subproject commit 6ecf3189e00a1e8e737a78b6d10e1d7006e050a2

From 21171223967896b9e941007841ebd1513b3cc4b9 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Thu, 2 May 2024 12:07:09 -0400
Subject: [PATCH 03/14] chore: Bump version

---
 CHANGELOG.md          | 10 +++++++++-
 llama_cpp/__init__.py |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c9681f355b..9b995509eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.69]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
+- feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
+- fix: Change default value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
+- fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
+- fix: UTF-8 handling with grammars by @jsoma in #1415
+
 ## [0.2.68]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@
+- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
 - feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
 - fix(ci): Fix build-and-release.yaml by @Smartappli in #1413
 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 63c622551c..95c8819660 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.68"
\ No newline at end of file
+__version__ = "0.2.69"
\ No newline at end of file

From 2138561fab5e60672c63b6c446b62a8bc26e17c4 Mon Sep 17 00:00:00 2001
From: Daniel Thuerck <dthuerck@users.noreply.github.com>
Date: Fri, 3 May 2024 18:17:07 +0200
Subject: [PATCH 04/14] fix(server): Propagate `flash_attn` to model load.
 (#1424)

---
 llama_cpp/server/model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index e102fadbd7..f002924109 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -242,6 +242,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
             logits_all=settings.logits_all,
             embedding=settings.embedding,
             offload_kqv=settings.offload_kqv,
+            flash_attn=settings.flash_attn,
             # Sampling Params
             last_n_tokens_size=settings.last_n_tokens_size,
             # LoRA Params

From 0a454bebe67d12a446981eb16028c168ca5faa81 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 3 May 2024 15:23:06 -0400
Subject: [PATCH 05/14] feat(server): Remove temperature bounds checks for
 server. Closes #1384

---
 llama_cpp/server/types.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index ce9c87a694..a20b3940f2 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -18,8 +18,6 @@
 
 temperature_field = Field(
     default=0.8,
-    ge=0.0,
-    le=2.0,
     description="Adjust the randomness of the generated text.\n\n"
     + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
 )

From 9f7a85571ae80d3b6ddbd3e1bae407b9f1e3448a Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 3 May 2024 19:07:50 -0400
Subject: [PATCH 06/14] fix: Use memmove to copy str_value kv_override. Closes
 #1417

---
 llama_cpp/llama.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index f927f0ca26..17576a69b8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -262,7 +262,12 @@ def __init__(
                         raise ValueError(f"Value for {k} is too long: {v}")
                     v_bytes = v_bytes.ljust(128, b"\0")
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
-                    self._kv_overrides_array[i].value.str_value[:128] = v_bytes
+                    # copy min(len(v_bytes), 128) bytes into str_value
+                    ctypes.memmove(
+                        self._kv_overrides_array[i].value.str_value,
+                        v_bytes,
+                        min(len(v_bytes), 128),
+                    )
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
 

From 1f56c648c3a4aed2b4fe3edb4dc745fae3e7d8ae Mon Sep 17 00:00:00 2001
From: Jeffrey Fong <jeffreyfong94@gmail.com>
Date: Sat, 4 May 2024 22:11:20 +0800
Subject: [PATCH 07/14] feat: Implement streaming for Functionary v2 + Bug
 fixes (#1419)

* set up streaming for v2

* assert v2 streaming, fix tool_call vs function_call

* fix streaming with tool_choice/function_call

* make functions return 1 function call only when 'auto'

* fix

---------

Co-authored-by: Andrei <abetlen@gmail.com>
---
 llama_cpp/llama_chat_format.py | 576 +++++++++++++++++++++++++--------
 1 file changed, 443 insertions(+), 133 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index a17b86b30a..3ab94e0d3c 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -1894,6 +1894,8 @@ def prepare_messages_for_inference(
         function_call = (
             tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
         )
+    elif function_call is not None:
+        pass
     else:
         function_call = "auto"
 
@@ -1930,11 +1932,10 @@ def prepare_messages_for_inference(
             logits_processor=logits_processor,
             grammar=grammar,
         )
-        completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
+        if stream is False:
+            completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
         return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream)  # type: ignore
 
-    assert stream is False  # TODO: support stream mode
-
     def get_grammar(function_call):
         function_body = None
         for function in functions or []:
@@ -1968,7 +1969,7 @@ def get_grammar(function_call):
 
         return grammar
 
-    def create_completion(stop):
+    def create_completion(prompt, stop, grammar):
         completion = cast(llama_types.Completion, llama.create_completion(
             prompt=prompt,
             temperature=temperature,
@@ -1976,7 +1977,7 @@ def create_completion(stop):
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
-            stream=False,
+            stream=stream,
             stop=stop,
             max_tokens=max_tokens,
             presence_penalty=presence_penalty,
@@ -1996,172 +1997,481 @@ def create_completion(stop):
     content = ""
     function_calls, function_bodies = [], []
     completion_tokens = 0
-
-    if version == "v1":
-        # If no or "auto" tool_choice/function_call
-        if isinstance(function_call, str) and function_call == "auto":
-            stops = ["\n", END_ASSISTANT_TOKEN]
-        # If tool_choice/function_call is provided
-        elif isinstance(function_call, dict):
-            prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n"
-            stops = END_FUNCTION_CALL_TOKEN
-            function_call = function_call["name"]
-            function_calls.append(function_call)
-            grammar = get_grammar(function_call)
-        else:
-            prompt = prompt
-            stops = ["\n", END_ASSISTANT_TOKEN]
-
-        completion = create_completion(stop=stops)
-        completion_text = completion["choices"][0]["text"]
-        completion_tokens += completion["usage"]["completion_tokens"]
+    
+    def generate_streaming(tools, functions, function_call, prompt):
+        assert version == "v2", "Streaming for v1 is not supported"
+        
+        chunk_id, chunk_created = None, None
         
-
-        # If the generation does not involve a function call
-        if (
-            START_FUNCTION_CALL_TOKEN not in prompt
-            and START_FUNCTION_CALL_TOKEN not in completion_text
-        ):
-            completion["usage"]["completion_tokens"] = completion_tokens
-            return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
-        # If the generation involves a function call in completion, generate the parameters
-        elif (
-            START_FUNCTION_CALL_TOKEN not in prompt
-            and START_FUNCTION_CALL_TOKEN in completion_text
-        ):
-            prompt += (
-                completion_text.replace(
-                    f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN
-                )
-                + "\n"
-            )
-            function_calls.append(
-                completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
-            )
-            grammar = get_grammar(function_calls[-1])
-            completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
-            completion_tokens += completion["usage"]["completion_tokens"]
-            function_bodies.append(completion["choices"][0]["text"].strip())
-        # If the prompt involves a function call, just append generated parameters to function_bodies
-        else:
-            function_bodies.append(completion_text.strip())
-    else:
         # If tool_choice/function_call is provided
         if isinstance(function_call, dict):
             prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
-            function_call = function_call["name"]
-            function_calls.append(function_call)
-            grammar = get_grammar(function_call)
+            grammar = get_grammar(function_call["name"])
             stops = [STOP_TOKEN, FROM_TOKEN]
-            completion = create_completion(stop=stops)
-            completion_text = completion["choices"][0]["text"]
-            completion_tokens += completion["usage"]["completion_tokens"]
-            function_bodies.append(completion_text.strip())
+            tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
+            completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+            completion_text = ""
+            first = True
+            for chunk in completion:
+                # Yield the tool/function name first
+                if first:
+                    if tools is not None:
+                        func_call_dict = {
+                            "tool_calls": [
+                                {
+                                    "index": 0,
+                                    "id": "call_" + tool_id,
+                                    "type": "function",
+                                    "function": {"name": function_call["name"], "arguments": ""},
+                                }
+                            ]
+                        }
+                    else:
+                        func_call_dict = {"function_call": {"name": function_call["name"], "arguments": ""}}
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk["id"],
+                        object="chat.completion.chunk",
+                        created=chunk["created"],
+                        model=chunk["model"],
+                        choices=[
+                            {"index": 0, "logprobs": None, "delta": {"role": None, "content": None, **func_call_dict}}
+                        ],
+                    )
+                    first = False
+                if tools is not None:
+                    func_call_dict = {
+                        "tool_calls": [
+                            {
+                                "index": 0,
+                                "id": "call_" + tool_id,
+                                "type": "function",
+                                "function": {
+                                    "name": None,
+                                    "arguments": chunk["choices"][0]["text"].rstrip(),
+                                },
+                            }
+                        ]
+                    }
+                else:
+                    func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
+                if len(chunk["choices"][0]["text"].rstrip()) > 0:
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk["id"],
+                        object="chat.completion.chunk",
+                        created=chunk["created"],
+                        model=chunk["model"],
+                        choices=[
+                            {
+                                "index": 0,
+                                "logprobs": chunk["choices"][0]["logprobs"],
+                                "delta": {
+                                    "role": None,
+                                    "content": None,
+                                    **func_call_dict,
+                                },
+                            }
+                        ],
+                    )
+            # Yield tool_call/function_call stop message
+            yield llama_types.CreateChatCompletionStreamResponse(
+                id="chat" + chunk["id"],
+                object="chat.completion.chunk",
+                created=chunk["created"],
+                model=chunk["model"],
+                choices=[
+                    {
+                        "index": 0,
+                        "finish_reason": "tool_calls" if tools is not None else "function_call",
+                        "logprobs": None,
+                        "delta": {
+                            "role": None, "content": None, "function_call": None, "tool_calls": None
+                        },
+                    }
+                ],
+            )
         # If "auto" or no tool_choice/function_call
         elif isinstance(function_call, str) and function_call == "auto":
+            tool_index = 0
             while True:
                 # Generate function name first
                 grammar = None
                 stops = CONTENT_TOKEN
-                completion = create_completion(stop=stops)
-                completion_text = completion["choices"][0]["text"]
-                completion_tokens += completion["usage"]["completion_tokens"]
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+                completion_text = ""
+                for chunk in completion:
+                    completion_text += chunk["choices"][0]["text"]
+                if chunk_id is None:
+                    chunk_id = chunk["id"]
+                if chunk_created is None:
+                    chunk_created = chunk["created"]
                 function_name = completion_text.strip()
                 if function_name == "all":
                     prompt += "all\n<|content|>"
+                    # Yield the first empty message for content
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk_id,
+                        model=chunk["model"],
+                        created=chunk_created,
+                        object="chat.completion.chunk",
+                        choices=[
+                            {
+                                "index": 0,
+                                "delta": {"role": "assistant", "content": ""},
+                                "logprobs": None,
+                                "finish_reason": None,
+                            }
+                        ],
+                    )
                 else:
-                    function_call = completion_text.strip()
-                    prompt += f"{function_call}\n<|content|>"
-                    function_calls.append(function_call)
-                    grammar = get_grammar(function_call)
+                    prompt += f"{function_name}\n<|content|>"
+                    grammar = get_grammar(function_name)
+                    tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
+                    if tools is not None:
+                        func_call_dict = {
+                            "tool_calls": [
+                                {
+                                    "index": tool_index,
+                                    "id": "call_" + tool_id,
+                                    "type": "function",
+                                    "function": {"name": function_name, "arguments": ""},
+                                }
+                            ]
+                        }
+                    else:
+                        func_call_dict = {"function_call": {"name": function_name, "arguments": ""}}
+                    # Stream function name
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk_id,
+                        object="chat.completion.chunk",
+                        created=chunk_created,
+                        model=chunk["model"],
+                        choices=[
+                            {
+                                "index": 0,
+                                "logprobs": chunk["choices"][0]["logprobs"],
+                                "delta": {
+                                    "role": "assistant",
+                                    "content": None,
+                                    **func_call_dict,
+                                },
+                            }
+                        ],
+                    )
                 # Generate content
                 stops = [RECIPIENT_TOKEN, STOP_TOKEN]
-                completion = create_completion(stop=stops)
-                completion_text = completion["choices"][0]["text"]
-                completion_tokens += completion["usage"]["completion_tokens"]
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                 if function_name == "all":
-                    if completion_text.endswith("\n<|from|>assistant\n"):
-                        content += completion_text[:-len("\n<|from|>assistant\n")]
-                    if completion_text.endswith("\n<|from|> assistant\n"):
-                        content += completion_text[-len("\n<|from|> assistant\n")]
-                    else:
-                        content += completion_text
-                    content = content.lstrip()
+                    completion_text = ""
+                    stop_sequence, buffer, is_end = "\n<|from|>assistant\n<|recipient|>", [], False
+                    for i, chunk in enumerate(completion):
+                        completion_text += chunk["choices"][0]["text"]
+                        if is_end:
+                            buffer.append(chunk["choices"][0]["text"].strip(" "))
+                            if stop_sequence.startswith("".join(buffer)):
+                                continue
+                            else:
+                                buffer.pop()
+                                while len(buffer) > 0:
+                                    yield llama_types.CreateChatCompletionStreamResponse(
+                                        id="chat" + chunk_id,
+                                        object="chat.completion.chunk",
+                                        created=chunk_created,
+                                        model=chunk["model"],
+                                        choices=[
+                                            {
+                                                "index": 0,
+                                                "logprobs": chunk["choices"][0]["logprobs"],
+                                                "delta": {
+                                                    "role": "assistant", "content": buffer.pop(0)
+                                                },
+                                            }
+                                        ],
+                                    )
+                                is_end = False
+                        elif chunk["choices"][0]["text"] == "\n":
+                            is_end = True
+                            buffer.append(chunk["choices"][0]["text"].strip(" "))
+                            continue
+
+                        if len(buffer) == 0 and len(chunk["choices"][0]["text"]) > 0:
+                            yield llama_types.CreateChatCompletionStreamResponse(
+                                id="chat" + chunk_id,
+                                object="chat.completion.chunk",
+                                created=chunk_created,
+                                model=chunk["model"],
+                                choices=[
+                                    {
+                                        "index": 0,
+                                        "logprobs": chunk["choices"][0]["logprobs"],
+                                        "delta": {
+                                            "role": "assistant",
+                                            "content": chunk["choices"][0]["text"] if i > 0 else chunk["choices"][0]["text"].lstrip()
+                                        },
+                                    }
+                                ],
+                            )
                     # Check whether the model wants to generate another turn
                     if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
                         if completion_text.endswith("\n<|from|>assistant\n"):
                             cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
                         elif completion_text.endswith("\n<|from|> assistant\n"):
-                            cleaned_completion_text = completion_text[-len("\n<|from|> assistant\n")].strip()
+                            cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
                         else:
                             cleaned_completion_text = completion_text.strip()
                         prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
                     else:
+                        # Yield stop message
+                        yield llama_types.CreateChatCompletionStreamResponse(
+                            id="chat" + chunk_id,
+                            model=chunk["model"],
+                            created=chunk_created,
+                            object="chat.completion.chunk",
+                            choices=[
+                                {
+                                    "index": 0,
+                                    "delta": {},
+                                    "logprobs": None,
+                                    "finish_reason": "stop",
+                                }
+                            ],
+                        )
                         break
                 else:
-                    function_bodies.append(completion_text.strip())
                     # Check whether the model wants to generate another turn
+                    completion_text = ""
+                    for chunk in completion:
+                        completion_text += chunk["choices"][0]["text"]
+                        if len(chunk["choices"][0]["text"].rstrip()) > 0:
+                            if tools is not None:
+                                func_call_dict = {
+                                    "tool_calls": [
+                                        {
+                                            "index": tool_index,
+                                            "id": "call_" + tool_id,
+                                            "type": "function",
+                                            "function": {
+                                                "name": None,
+                                                "arguments": chunk["choices"][0]["text"].rstrip(),
+                                            },
+                                        }
+                                    ]
+                                }
+                            else:
+                                func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
+                            yield llama_types.CreateChatCompletionStreamResponse(
+                                id="chat" + chunk_id,
+                                object="chat.completion.chunk",
+                                created=chunk_created,
+                                model=chunk["model"],
+                                choices=[
+                                    {
+                                        "index": 0,
+                                        "logprobs": chunk["choices"][0]["logprobs"],
+                                        "delta": {
+                                            "role": None,
+                                            "content": None,
+                                            **func_call_dict,
+                                        },
+                                    }
+                                ],
+                            )
                     prompt += completion_text.strip()
                     grammar = None
-                    completion = create_completion(stop=stops)
-                    completion_tokens += completion["usage"]["completion_tokens"]
-                    if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
+                    completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+                    completion_text += "".join([chunk["choices"][0]["text"] for chunk in completion])
+                    if ("<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text) and tools is not None:
                         prompt += "\n<|from|>assistant\n<|recipient|>"
+                        tool_index += 1
                     else:
+                        # Yield tool_call/function_call stop message
+                        yield llama_types.CreateChatCompletionStreamResponse(
+                            id="chat" + chunk_id,
+                            object="chat.completion.chunk",
+                            created=chunk_created,
+                            model=chunk["model"],
+                            choices=[
+                                {
+                                    "index": 0,
+                                    "finish_reason": "tool_calls" if tools is not None else "function_call",
+                                    "logprobs": None,
+                                    "delta": {
+                                        "role": None, "content": None, "function_call": None, "tool_calls": None
+                                    },
+                                }
+                            ],
+                        )
                         break
-
-    assert "usage" in completion
-    assert len(function_calls) == len(function_bodies)
-
-    tool_calls: List[llama_types.ChatCompletionMessageToolCall] = []
-    for function_call, function_body in zip(function_calls, function_bodies):
-        tool_calls.append(
-            {
-                "id": "call_"
-                + "".join(
-                    [
-                        random.choice(string.ascii_letters + string.digits)
-                        for _ in range(24)
-                    ]
-                ),
-                "type": "function",
-                "function": {
-                    "name": function_call,
-                    "arguments": function_body,
-                },
-            }
+        
+    if stream is not False:
+        return generate_streaming(
+            tools=tools, functions=functions, function_call=function_call, prompt=prompt
         )
+    else:
+        if version == "v1":
+            # If no or "auto" tool_choice/function_call
+            if isinstance(function_call, str) and function_call == "auto":
+                stops = ["\n", END_ASSISTANT_TOKEN]
+            # If tool_choice/function_call is provided
+            elif isinstance(function_call, dict):
+                prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n"
+                stops = END_FUNCTION_CALL_TOKEN
+                function_call = function_call["name"]
+                function_calls.append(function_call)
+                grammar = get_grammar(function_call)
+            else:
+                prompt = prompt
+                stops = ["\n", END_ASSISTANT_TOKEN]
 
-    # TODO: support stream mode
-    function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {}
-    if len(tool_calls) > 0:
-        if tools is not None:
-            function_call_dict["tool_calls"] = tool_calls
+            completion = create_completion(stop=stops)
+            completion_text = completion["choices"][0]["text"]
+            completion_tokens += completion["usage"]["completion_tokens"]
+            
+
+            # If the generation does not involve a function call
+            if (
+                START_FUNCTION_CALL_TOKEN not in prompt
+                and START_FUNCTION_CALL_TOKEN not in completion_text
+            ):
+                completion["usage"]["completion_tokens"] = completion_tokens
+                return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
+            # If the generation involves a function call in completion, generate the parameters
+            elif (
+                START_FUNCTION_CALL_TOKEN not in prompt
+                and START_FUNCTION_CALL_TOKEN in completion_text
+            ):
+                prompt += (
+                    completion_text.replace(
+                        f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN
+                    )
+                    + "\n"
+                )
+                function_calls.append(
+                    completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
+                )
+                grammar = get_grammar(function_calls[-1])
+                completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
+                completion_tokens += completion["usage"]["completion_tokens"]
+                function_bodies.append(completion["choices"][0]["text"].strip())
+            # If the prompt involves a function call, just append generated parameters to function_bodies
+            else:
+                function_bodies.append(completion_text.strip())
         else:
-            function_call_dict["function_call"] = {
-                "name": tool_calls[0]["function"]["name"],
-                "arguments": tool_calls[0]["function"]["arguments"],
-            }
-    completion["usage"]["completion_tokens"] = completion_tokens
-    return llama_types.CreateChatCompletionResponse(
-        id="chat" + completion["id"],
-        object="chat.completion",
-        created=completion["created"],
-        model=completion["model"],
-        choices=[
-            {
-                "index": 0,
-                "logprobs": completion["choices"][0]["logprobs"],
-                "message": {
-                    "role": "assistant",
-                    "content": None if content == "" else content,
-                    **function_call_dict,
-                },
-                "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
-            }
-        ],
-        usage=completion["usage"],
-    )
+            # If tool_choice/function_call is provided
+            if isinstance(function_call, dict):
+                prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
+                function_call = function_call["name"]
+                function_calls.append(function_call)
+                grammar = get_grammar(function_call)
+                stops = [STOP_TOKEN, FROM_TOKEN]
+                completion = create_completion(stop=stops)
+                completion_text = completion["choices"][0]["text"]
+                completion_tokens += completion["usage"]["completion_tokens"]
+                function_bodies.append(completion_text.strip())
+            # If "auto" or no tool_choice/function_call
+            elif isinstance(function_call, str) and function_call == "auto":
+                while True:
+                    # Generate function name first
+                    grammar = None
+                    stops = CONTENT_TOKEN
+                    completion = create_completion(stop=stops)
+                    completion_text = completion["choices"][0]["text"]
+                    completion_tokens += completion["usage"]["completion_tokens"]
+                    function_name = completion_text.strip()
+                    if function_name == "all":
+                        prompt += "all\n<|content|>"
+                    else:
+                        function_call = completion_text.strip()
+                        prompt += f"{function_call}\n<|content|>"
+                        function_calls.append(function_call)
+                        grammar = get_grammar(function_call)
+                    # Generate content
+                    stops = [RECIPIENT_TOKEN, STOP_TOKEN]
+                    completion = create_completion(stop=stops)
+                    completion_text = completion["choices"][0]["text"]
+                    completion_tokens += completion["usage"]["completion_tokens"]
+                    if function_name == "all":
+                        if completion_text.endswith("\n<|from|>assistant\n"):
+                            content += completion_text[:-len("\n<|from|>assistant\n")]
+                        if completion_text.endswith("\n<|from|> assistant\n"):
+                            content += completion_text[-len("\n<|from|> assistant\n")]
+                        else:
+                            content += completion_text
+                        content = content.lstrip()
+                        # Check whether the model wants to generate another turn
+                        if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
+                            if completion_text.endswith("\n<|from|>assistant\n"):
+                                cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
+                            elif completion_text.endswith("\n<|from|> assistant\n"):
+                                cleaned_completion_text = completion_text[-len("\n<|from|> assistant\n")].strip()
+                            else:
+                                cleaned_completion_text = completion_text.strip()
+                            prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
+                        else:
+                            break
+                    else:
+                        function_bodies.append(completion_text.strip())
+                        # Check whether the model wants to generate another turn
+                        prompt += completion_text.strip()
+                        grammar = None
+                        completion = create_completion(stop=stops)
+                        completion_tokens += completion["usage"]["completion_tokens"]
+                        if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
+                            prompt += "\n<|from|>assistant\n<|recipient|>"
+                        else:
+                            break
+
+        assert "usage" in completion
+        assert len(function_calls) == len(function_bodies)
+
+        tool_calls: List[llama_types.ChatCompletionMessageToolCall] = []
+        for function_call, function_body in zip(function_calls, function_bodies):
+            tool_calls.append(
+                {
+                    "id": "call_"
+                    + "".join(
+                        [
+                            random.choice(string.ascii_letters + string.digits)
+                            for _ in range(24)
+                        ]
+                    ),
+                    "type": "function",
+                    "function": {
+                        "name": function_call,
+                        "arguments": function_body,
+                    },
+                }
+            )
+
+        # TODO: support stream mode
+        function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {}
+        if len(tool_calls) > 0:
+            if tools is not None:
+                function_call_dict["tool_calls"] = tool_calls
+            else:
+                function_call_dict["function_call"] = {
+                    "name": tool_calls[0]["function"]["name"],
+                    "arguments": tool_calls[0]["function"]["arguments"],
+                }
+        completion["usage"]["completion_tokens"] = completion_tokens
+        return llama_types.CreateChatCompletionResponse(
+            id="chat" + completion["id"],
+            object="chat.completion",
+            created=completion["created"],
+            model=completion["model"],
+            choices=[
+                {
+                    "index": 0,
+                    "logprobs": completion["choices"][0]["logprobs"],
+                    "message": {
+                        "role": "assistant",
+                        "content": None if content == "" else content,
+                        **function_call_dict,
+                    },
+                    "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
+                }
+            ],
+            usage=completion["usage"],
+        )
 
 
 class Llava15ChatHandler:

From e0d7674e62bdc5b906d2461238993ea3a022f61f Mon Sep 17 00:00:00 2001
From: Noam Gat <noamgat@gmail.com>
Date: Sat, 4 May 2024 17:14:59 +0300
Subject: [PATCH 08/14] fix: detokenization case where first token does not
 start with a leading space (#1375)

* Fix tokenization edge case where llama output does not start with a space

See this notebook:
https://colab.research.google.com/drive/1Ooz11nFPk19zyJdMDx42CeesU8aWZMdI#scrollTo=oKpHw5PZ30uC

* Update _internals.py

Fixing to compare to b' ' instead of (str)' '

---------

Co-authored-by: Andrei <abetlen@gmail.com>
---
 llama_cpp/_internals.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index d7409f63a0..b404601d30 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -203,7 +203,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         # NOTE: Llama1 models automatically added a space at the start of the prompt
         # this line removes a leading space if the first token is a beginning of sentence token
         return (
-            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
         )
 
     # Extra
@@ -812,4 +812,4 @@ def sample(
     def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
         if apply_grammar and self.grammar is not None:
             ctx_main.grammar_accept_token(self.grammar, id)
-        self.prev.append(id)
\ No newline at end of file
+        self.prev.append(id)

From 3e2597eac888e6e12c7bc7021016ca5104db83ba Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sun, 5 May 2024 12:12:27 -0400
Subject: [PATCH 09/14] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 6 +++++-
 vendor/llama.cpp       | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 46aa51662f..9e934e0526 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -294,6 +294,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 #     LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
 #     LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
 #     LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+#     LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
+#     LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -303,6 +305,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_VOCAB_PRE_TYPE_MPT = 5
 LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
 LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
+LLAMA_VOCAB_PRE_TYPE_REFACT = 8
+LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -494,7 +498,7 @@ class llama_token_data_array(ctypes.Structure):
 
 llama_token_data_array_p = ctypes.POINTER(llama_token_data_array)
 
-# typedef bool (*llama_progress_callback)(float progress, void *ctx);
+# typedef bool (*llama_progress_callback)(float progress, void * user_data);
 llama_progress_callback = ctypes.CFUNCTYPE(
     ctypes.c_bool, ctypes.c_float, ctypes.c_void_p
 )
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 6ecf3189e0..628b299106 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
+Subproject commit 628b299106d1e9476fdecb3cbe546bf5c60f1b89

From 36668331074c79f40494b51e4d541cd03601b9f2 Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE <olivier.debauche@uliege.be>
Date: Sun, 5 May 2024 18:42:28 +0200
Subject: [PATCH 10/14] feat(ci): Add docker checks and check deps more
 frequently (#1426)

* Update dependabot.yml

Add github-actions update

* Update dependabot.yml

* Update dependabot.yml
---
 .github/dependabot.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index c58c9ae570..6bf90273ac 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -8,8 +8,12 @@ updates:
   - package-ecosystem: "pip" # See documentation for possible values
     directory: "/" # Location of package manifests
     schedule:
-      interval: "weekly"
+      interval: "daily"
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
-      interval: "weekly"    
+      interval: "daily"
+  - package-ecosystem: "docker"
+    directory: "/"
+    schedule:
+      interval: "daily"   

From 0318702cdc860999ee70f277425edbbfe0e60419 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sun, 5 May 2024 12:49:31 -0400
Subject: [PATCH 11/14] feat(server): Add support for setting root_path. Closes
 #1420

---
 llama_cpp/server/app.py      | 1 +
 llama_cpp/server/settings.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index b6ed9b1b6b..4cf10d1f66 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -132,6 +132,7 @@ def create_app(
         middleware=middleware,
         title="🦙 llama.cpp Python API",
         version=llama_cpp.__version__,
+        root_path=server_settings.root_path,
     )
     app.add_middleware(
         CORSMiddleware,
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index ed05a889f0..a3e185007d 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -215,6 +215,10 @@ class ServerSettings(BaseSettings):
         default=False,
         description="Disable EventSource pings (may be needed for some clients).",
     )
+    root_path: str = Field(
+        default="",
+        description="The root path for the server. Useful when running behind a reverse proxy.",
+    )
 
 
 class Settings(ServerSettings, ModelSettings):

From 19724458b164b7fae728a0e4da4c43514125ed62 Mon Sep 17 00:00:00 2001
From: juanroesel <juan.roesel@zenhub.com>
Date: Mon, 6 May 2024 17:57:45 -0700
Subject: [PATCH 12/14] Ported over prometheus implementation from previous
 repo

---
 llama_cpp/_utils.py     |  72 ++++++++++++++++++++++++-
 llama_cpp/llama.py      | 113 ++++++++++++++++++++++++++++++++++------
 llama_cpp/server/app.py |   7 +++
 pyproject.toml          |   2 +
 4 files changed, 176 insertions(+), 18 deletions(-)

diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py
index 781b265010..7ab94964b1 100644
--- a/llama_cpp/_utils.py
+++ b/llama_cpp/_utils.py
@@ -1,7 +1,9 @@
 import os
 import sys
+import psutil
+import subprocess
 
-from typing import Any, Dict
+from typing import Any, Dict, List, Tuple, Union
 
 # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
 outnull_file = open(os.devnull, "w")
@@ -75,3 +77,71 @@ class Singleton(object, metaclass=MetaSingleton):
 
     def __init__(self):
         super(Singleton, self).__init__()
+
+
+# Get snapshot of RAM and GPU usage before and after function execution.
+# Adapted from: https://github.com/abetlen/llama-cpp-python/issues/223#issuecomment-1556203616
+def get_cpu_usage(pid) -> float:
+    """CPU usage (percent) of process `pid` since the previous call for that pid (0.0 on the first call)."""
+    # psutil computes cpu_percent() relative to the previous call on the same Process
+    # object and returns 0.0 on an object's first call, so the object is kept between calls.
+    processes = vars(get_cpu_usage).setdefault("_processes", {})
+    return processes.setdefault(pid, psutil.Process(pid)).cpu_percent()
+
+def get_ram_usage(pid) -> float:
+    """
+    Resident set size (RSS) of process `pid`, in MiB.
+    """
+    bytes_per_mib = 1024 * 1024
+    rss_bytes = psutil.Process(pid).memory_info().rss
+    # Convert bytes to MiB
+    return rss_bytes / bytes_per_mib
+
+def get_gpu_info_by_pid(pid) -> float:
+    """
+    GPU memory used by process `pid` as reported by nvidia-smi (numeric part only); 0.0 if unavailable.
+    """
+    query = ["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader"]
+    try:
+        for row in subprocess.check_output(query).decode("utf-8").splitlines():
+            gpu_pid, _, gpu_ram_usage = row.partition(", ")
+            # Blank or foreign rows simply do not match, instead of failing to unpack.
+            if gpu_pid.strip() == str(pid):
+                return float(gpu_ram_usage.split()[0])
+    except (subprocess.CalledProcessError, FileNotFoundError, ValueError, IndexError):
+        pass  # best effort: nvidia-smi missing/failing, or an unparsable value such as "[N/A]"
+    return 0.0
+
+def get_gpu_general_info() -> Tuple[float, float, float]:
+    """
+    (utilization, memory used, memory free) of the first GPU listed by nvidia-smi; zeros if unavailable.
+    """
+    query = ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.free", "--format=csv,noheader"]
+    try:
+        first_gpu = subprocess.check_output(query).decode("utf-8").strip().split("\n")[0]
+        utilization, memory_used, memory_free = (float(field.split()[0]) for field in first_gpu.split(", "))
+        return utilization, memory_used, memory_free
+    except (subprocess.CalledProcessError, FileNotFoundError, ValueError, IndexError):
+        return 0.0, 0.0, 0.0  # best effort: nvidia-smi missing/failing, or unparsable output
+
+def infer_service_from_prompt(prompt: Union[str, List[str]]) -> str:
+    """
+    Infer the service for which a completion request is sent based on the prompt.
+
+    A list prompt is joined with spaces first (items are stringified). Returns the
+    first service whose task sentence occurs in the prompt, else "not-specified".
+    """
+    # `Union[...]` rather than `str | List[str]`: the annotation is evaluated when the
+    # function is defined, and the `|` form raises TypeError on Python < 3.10.
+    # Checked in order; the first task sentence found wins.
+    service_by_task = (
+        ("Your task is to select the most relevant labels for a GitHub issue title from a list of labels provided.", "label-suggestions"),
+        ("Your task is to write the acceptance criteria for a GitHub issue.", "acceptance-criteria"),
+        ("You are helping me prepare a sprint review.", "sprint-review"),
+    )
+    if isinstance(prompt, list):
+        prompt = " ".join(map(str, prompt))
+    for task, service in service_by_task:
+        if task in prompt:
+            return service
+    return "not-specified"
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index f927f0ca26..a32ea1e8df 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -38,6 +38,16 @@
 import llama_cpp.llama_cpp as llama_cpp
 import llama_cpp.llama_chat_format as llama_chat_format
 
+from llama_cpp.llama_metrics import Metrics, MetricsExporter
+
+from llama_cpp._utils import (
+    infer_service_from_prompt, 
+    get_cpu_usage, 
+    get_ram_usage, 
+    get_gpu_info_by_pid,
+    get_gpu_general_info,
+)
+
 from llama_cpp.llama_speculative import LlamaDraftModel
 
 import numpy as np
@@ -448,6 +458,9 @@ def __init__(
             if self.verbose:
                 print(f"Using fallback chat format: {chat_format}", file=sys.stderr)
 
+        # Prometheus metrics
+        self.metrics = MetricsExporter()
+
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         assert self._ctx.ctx is not None
@@ -950,6 +963,19 @@ def _create_completion(
 
         completion_id: str = f"cmpl-{str(uuid.uuid4())}"
         created: int = int(time.time())
+
+        # Variables required for metric collection
+        _metrics_dict = {}
+        _ttft_start = time.time()
+        _pid = os.getpid()
+        _tpot_metrics = []
+        _labels = {
+            "service": infer_service_from_prompt(prompt),  # Infer the service for which the completion is being generated
+            "request_type": "chat/completions",
+        }
+        # Get CPU usage before generating completion so it can be used to calculate CPU when called after completing the process
+        _ = get_cpu_usage(_pid)
+
         # If prompt is empty, initialize completion with BOS token to avoid
         # detokenization including a space at the beginning of the completion
         completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
@@ -1043,23 +1069,26 @@ def logit_bias_processor(
 
         finish_reason = "length"
         multibyte_fix = 0
-        for token in self.generate(
-            prompt_tokens,
-            top_k=top_k,
-            top_p=top_p,
-            min_p=min_p,
-            typical_p=typical_p,
-            temp=temperature,
-            tfs_z=tfs_z,
-            mirostat_mode=mirostat_mode,
-            mirostat_tau=mirostat_tau,
-            mirostat_eta=mirostat_eta,
-            frequency_penalty=frequency_penalty,
-            presence_penalty=presence_penalty,
-            repeat_penalty=repeat_penalty,
-            stopping_criteria=stopping_criteria,
-            logits_processor=logits_processor,
-            grammar=grammar,
+        _tpot_start = time.time()
+        for idx, token in enumerate(
+            self.generate(
+                prompt_tokens,
+                top_k=top_k,
+                top_p=top_p,
+                min_p=min_p,
+                typical_p=typical_p,
+                temp=temperature,
+                tfs_z=tfs_z,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
+                frequency_penalty=frequency_penalty,
+                presence_penalty=presence_penalty,
+                repeat_penalty=repeat_penalty,
+                stopping_criteria=stopping_criteria,
+                logits_processor=logits_processor,
+                grammar=grammar,
+            )
         ):
             assert self._model.model is not None
             if llama_cpp.llama_token_is_eog(self._model.model, token):
@@ -1216,6 +1245,14 @@ def logit_bias_processor(
                 finish_reason = "length"
                 break
 
+            # Record TTFT metric (once)
+            if idx == 0:
+                _metrics_dict["time_to_first_token"] = time.time() - _ttft_start
+            # Record TPOT metric
+            else:
+                _tpot_metrics.append(time.time() - _tpot_start)
+            _tpot_start = time.time()  # reset
+
         if stopping_criteria is not None and stopping_criteria(
             self._input_ids, self._scores[-1, :]
         ):
@@ -1403,6 +1440,48 @@ def logit_bias_processor(
                 "token_logprobs": token_logprobs,
                 "top_logprobs": top_logprobs,
             }
+        
+        # Record TPOT metrics (per generated token)
+        _metrics_dict["time_per_output_token"] = _tpot_metrics
+
+        # Record metrics from the C++ backend (converted to seconds)
+        _timings = llama_cpp.llama_get_timings(self._ctx.ctx)
+        _metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2)
+        _metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2)
+        _metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0
+        _metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2)
+        _metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0
+        _metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2)
+        _metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0
+        _metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2)
+
+        # Record prefill and generation token metrics
+        _metrics_dict["prefill_tokens"] = len(prompt_tokens)
+        _metrics_dict["generation_tokens"] = len(completion_tokens)
+
+        # Record system info
+        _gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info()
+        _metrics_dict["cpu_utilization"] = get_cpu_usage(_pid)  # TODO: Returning always 0.0 -> check
+        _metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid)
+        _metrics_dict["gpu_utilization"] = _gpu_utilization
+        _metrics_dict["gpu_ram_usage"] = _gpu_memory_used
+        _metrics_dict["gpu_ram_free"] = _gpu_memory_free
+        _metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid)
+        _metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx)
+        _metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2)
+        _metrics_dict["system_info"] = {
+            "model": model_name,
+            "n_params": str(llama_cpp.llama_model_n_params(self.model)),
+            "n_embd": str(self.n_embd()),
+            "n_ctx": str(self.n_ctx()),
+            "n_vocab": str(self.n_vocab()),
+            "n_threads": str(self.n_threads)
+        } 
+
+        # Log metrics to Prometheus
+        #print(_metrics_dict, file=sys.stderr)
+        _all_metrics = Metrics(**_metrics_dict)
+        self.metrics.log_metrics(_all_metrics, labels=_labels)
 
         yield {
             "id": completion_id,
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index b6ed9b1b6b..d9bed39225 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -7,6 +7,8 @@
 from functools import partial
 from typing import Iterator, List, Optional, Union, Dict
 
+from prometheus_client import make_asgi_app
+
 import llama_cpp
 
 import anyio
@@ -145,6 +147,11 @@ def create_app(
     assert model_settings is not None
     set_llama_proxy(model_settings=model_settings)
 
+    # Add prometheus asgi middleware to route /metrics requests
+    # see: https://prometheus.github.io/client_python/exporting/http/fastapi-gunicorn/
+    metrics_app = make_asgi_app()
+    app.mount("/metrics", metrics_app)
+
     if server_settings.disable_ping_events:
         set_ping_message_factory(lambda: bytes())
 
diff --git a/pyproject.toml b/pyproject.toml
index 8345cb1f09..4b0246623f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,8 @@ server = [
     "sse-starlette>=1.6.1",
     "starlette-context>=0.3.6,<0.4",
     "PyYAML>=5.1",
+    "prometheus_client>=0.20.0",
+    "psutil>=5.9.8"
 ]
 test = [
     "pytest>=7.4.0",

From edd0ec69a2c34d14756a5d79a7f93ae28dd3a958 Mon Sep 17 00:00:00 2001
From: juanroesel <juan.roesel@zenhub.com>
Date: Mon, 6 May 2024 17:58:40 -0700
Subject: [PATCH 13/14] Added kv_cache_usage_ratio metric

---
 llama_cpp/llama_metrics.py | 218 +++++++++++++++++++++++++++++++++++++
 1 file changed, 218 insertions(+)
 create mode 100644 llama_cpp/llama_metrics.py

diff --git a/llama_cpp/llama_metrics.py b/llama_cpp/llama_metrics.py
new file mode 100644
index 0000000000..9e00e8b99d
--- /dev/null
+++ b/llama_cpp/llama_metrics.py
@@ -0,0 +1,218 @@
+from dataclasses import dataclass
+from typing import Any, Optional, Dict, List
+
+from prometheus_client import Gauge, Info, Histogram
+
+
+LABELS = ["request_type", "service"]
+
+@dataclass
+class Metrics:
+    """
+    Per-request metrics snapshot; every field is read by MetricsExporter.log_metrics.
+    """
+    # System metrics (each is set on a gauge, except system_info)
+    system_info: Dict[str, Any]  # passed as-is to the `info` Info metric
+    state_size: int  # the gauge documentation describes this as a size in bytes
+    cpu_utilization: float  # NOTE(review): percent vs. fraction is not shown here - confirm
+    cpu_ram_pid: float  # exported as `cpu_memory_usage_by_pid`; unit not shown - confirm
+    gpu_utilization: float  # NOTE(review): percent vs. fraction is not shown here - confirm
+    gpu_ram_usage: float  # exported as `gpu_memory_usage`; unit not shown - confirm
+    gpu_ram_free: float  # exported as `gpu_memory_free`; unit not shown - confirm
+    gpu_ram_pid: float  # exported as `gpu_memory_usage_by_pid`; unit not shown - confirm
+
+    # Metrics from the C++ backend
+    load_time: float  # observed by the `load_t_seconds` histogram
+    sample_time: float  # observed by the `sample_t_seconds` histogram
+    sample_throughput: float  # gauge documentation: tokens/second
+    time_to_first_token: float  # observed by the `ttft_seconds` histogram
+    time_per_output_token: List[float]  # each element is observed separately (`tpot_seconds`)
+    prompt_eval_time: float  # observed by the `p_eval_t_seconds` histogram
+    prompt_eval_throughput: float  # gauge documentation: tokens/second
+    completion_eval_time: float  # observed by the `c_eval_t_seconds` histogram
+    completion_eval_throughput: float  # gauge documentation: tokens/second
+    end_to_end_latency: float
+    prefill_tokens: int
+    generation_tokens: int  
+    kv_cache_usage_ratio: int
+
+
+class MetricsExporter:
+    """
+    A custom Prometheus Metrics Exporter for the LLAMA C++ backend.
+    Creates one collector per metric in __init__; log_metrics records a request's values.
+    """
+    def __init__(self):
+        self.labels = LABELS
+        # One-time metrics
+        self._histogram_load_time = Histogram(
+            name="llama_cpp_python:load_t_seconds",
+            documentation="Histogram of load time in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.1, 0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+                8.0, 9.0, 10.0, 12.5, 15.0, 20.0, 25.0, 30.0
+            ]
+        )
+        # Request-level latencies
+        self._histogram_sample_time = Histogram(
+            name="llama_cpp_python:sample_t_seconds",
+            documentation="Histogram of token sampling time in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.00001, 0.00005, 0.0001, 0.00025, 0.0005, 0.001, 0.0025,
+                0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5,
+            ]
+        )
+        self._histogram_time_to_first_token = Histogram(
+            name="llama_cpp_python:ttft_seconds",
+            documentation="Histogram of time to first token in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 20.0, 25.0, 30.0
+            ]
+        )
+        self._histogram_time_per_output_token = Histogram(
+            name="llama_cpp_python:tpot_seconds",
+            documentation="Histogram of time per output token in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 20.0, 25.0, 30.0
+            ]
+        )
+        self._histogram_prompt_eval_time = Histogram(
+            name="llama_cpp_python:p_eval_t_seconds",
+            documentation="Histogram of prompt evaluation time in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0,
+                20.0, 25.0, 30.0, 40.0, 50.0, 60.0
+            ]
+        )
+        self._histogram_completion_eval_time = Histogram(
+            name="llama_cpp_python:c_eval_t_seconds",
+            documentation="Histogram of completion evaluation time in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0,
+                20.0, 25.0, 30.0, 40.0, 50.0, 60.0
+            ]
+        )
+        self._histogram_e2e_request_latency = Histogram(
+            name="llama_cpp_python:e2e_seconds",
+            documentation="Histogram of end-to-end request latency in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0,
+                20.0, 25.0, 30.0, 40.0, 50.0, 60.0
+            ]
+        )
+        # Prefill and generation tokens
+        self._histogram_prefill_tokens = Histogram(
+            name="llama_cpp_python:prefill_tokens_total",
+            documentation="Histogram of number of prefill tokens processed",
+            labelnames=self.labels,
+            buckets=[
+                1, 10, 25, 50, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000,
+                3500, 4000, 4500, 5000
+            ]
+        )
+        self._histogram_generation_tokens = Histogram(
+            name="llama_cpp_python:completion_tokens_total",
+            documentation="Histogram of number of generation tokens processed",
+            labelnames=self.labels,
+            buckets=[
+                1, 10, 25, 50, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000,
+                3500, 4000, 4500, 5000
+            ]
+        )
+        # Current throughput
+        self._gauge_prompt_eval_throughput = Gauge(
+            name="llama_cpp_python:prompt_eval_throughput",
+            documentation="Current throughput of the prompt evaluation process (in tokens/second)",
+            labelnames=self.labels
+        )
+        self._gauge_completion_eval_throughput = Gauge(
+            name="llama_cpp_python:completion_eval_throughput",
+            documentation="Current throughput of the completion evaluation process (in tokens/second)",
+            labelnames=self.labels
+        )
+        self._gauge_sample_throughput = Gauge(
+            name="llama_cpp_python:sample_throughput",
+            documentation="Current throughput of the token sampling process (in tokens/second)",
+            labelnames=self.labels
+        )
+        # System info
+        self._gauge_state_size = Gauge(
+            name="llama_cpp_python:state_size",
+            documentation="Current state size in bytes of various components such as rng (random number generator), logits, embedding, and kv_cache (key-value cache)",
+            labelnames=self.labels
+        )
+        self._gauge_cpu_utilization = Gauge(
+            name="llama_cpp_python:cpu_utilization",
+            documentation="Current CPU utilization",
+            labelnames=self.labels
+        )
+        self._gauge_cpu_ram_usage_by_pid = Gauge(
+            name="llama_cpp_python:cpu_memory_usage_by_pid",
+            documentation="Current CPU memory usage during the request",
+            labelnames=self.labels
+        )
+        self._gauge_gpu_utilization = Gauge(
+            name="llama_cpp_python:gpu_utilization",
+            documentation="Current GPU utilization",
+            labelnames=self.labels
+        )
+        self._gauge_gpu_memory_usage = Gauge(
+            name="llama_cpp_python:gpu_memory_usage",
+            documentation="Current GPU memory usage",
+            labelnames=self.labels
+        )
+        self._gauge_gpu_memory_free = Gauge(
+            name="llama_cpp_python:gpu_memory_free",
+            documentation="Current free GPU memory",
+            labelnames=self.labels
+        )
+        self._gauge_gpu_memory_usage_by_pid = Gauge(
+            name="llama_cpp_python:gpu_memory_usage_by_pid",
+            documentation="Current GPU memory usage during the request",
+            labelnames=self.labels
+        )
+        self._gauge_kv_cache_usage_ratio = Gauge(
+            name="llama_cpp_python:kv_cache_usage_ratio",
+            documentation="KV-cache usage. 1 means 100 percent usage",
+            labelnames=self.labels
+        )
+        self._info = Info(
+            name="llama_cpp_python:info",
+            documentation="Server metadata"
+        )
+
+    def log_metrics(self, metrics: Metrics, labels: Dict[str, str]):
+        """
+        Record `metrics` on the collectors; `labels` supplies the values for the LABELS names.
+        """
+        self._histogram_load_time.labels(**labels).observe(metrics.load_time)
+        self._histogram_sample_time.labels(**labels).observe(metrics.sample_time)
+        self._histogram_time_to_first_token.labels(**labels).observe(metrics.time_to_first_token)
+        for _tpot in metrics.time_per_output_token:  # one observation per list element
+            self._histogram_time_per_output_token.labels(**labels).observe(_tpot)
+        self._histogram_prompt_eval_time.labels(**labels).observe(metrics.prompt_eval_time)
+        self._histogram_completion_eval_time.labels(**labels).observe(metrics.completion_eval_time)
+        self._histogram_e2e_request_latency.labels(**labels).observe(metrics.end_to_end_latency)
+        self._histogram_prefill_tokens.labels(**labels).observe(metrics.prefill_tokens)
+        self._histogram_generation_tokens.labels(**labels).observe(metrics.generation_tokens)
+        self._gauge_prompt_eval_throughput.labels(**labels).set(metrics.prompt_eval_throughput)
+        self._gauge_completion_eval_throughput.labels(**labels).set(metrics.completion_eval_throughput)
+        self._gauge_sample_throughput.labels(**labels).set(metrics.sample_throughput)
+        self._gauge_cpu_utilization.labels(**labels).set(metrics.cpu_utilization)
+        self._gauge_cpu_ram_usage_by_pid.labels(**labels).set(metrics.cpu_ram_pid)
+        self._gauge_gpu_utilization.labels(**labels).set(metrics.gpu_utilization)
+        self._gauge_gpu_memory_usage.labels(**labels).set(metrics.gpu_ram_usage)
+        self._gauge_gpu_memory_free.labels(**labels).set(metrics.gpu_ram_free)
+        self._gauge_gpu_memory_usage_by_pid.labels(**labels).set(metrics.gpu_ram_pid)
+        self._gauge_state_size.labels(**labels).set(metrics.state_size)
+        self._gauge_kv_cache_usage_ratio.labels(**labels).set(metrics.kv_cache_usage_ratio)
+        self._info.info(metrics.system_info)  # Info is declared without labels, so `labels` is not applied
\ No newline at end of file

From bd84f3c179bac46dd1dc0604db638dfd3852fdae Mon Sep 17 00:00:00 2001
From: juanroesel <juan.roesel@zenhub.com>
Date: Tue, 7 May 2024 14:53:25 -0700
Subject: [PATCH 14/14] Pulled synced commits locally and changed data type

---
 llama_cpp/llama_metrics.py | 2 +-
 vendor/llama.cpp           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama_metrics.py b/llama_cpp/llama_metrics.py
index 9e00e8b99d..7334c71401 100644
--- a/llama_cpp/llama_metrics.py
+++ b/llama_cpp/llama_metrics.py
@@ -34,7 +34,7 @@ class Metrics:
     end_to_end_latency: float
     prefill_tokens: int
     generation_tokens: int  
-    kv_cache_usage_ratio: int
+    kv_cache_usage_ratio: float
 
 
 class MetricsExporter:
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 628b299106..f364eb6fb5 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 628b299106d1e9476fdecb3cbe546bf5c60f1b89
+Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961