From 31b1d95a6c19f5b615a3286069f181a415f872e8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 May 2024 11:32:18 -0400 Subject: [PATCH 01/14] feat: Add llama-3-vision-alpha chat format --- llama_cpp/llama_chat_format.py | 64 ++++++++++++++++++++++++++++++++-- llama_cpp/server/model.py | 14 ++++++++ 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0af410a972..a17b86b30a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2165,7 +2165,7 @@ def create_completion(stop): class Llava15ChatHandler: - DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + DEFAULT_SYSTEM_MESSAGE: Optional[str] = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." CHAT_FORMAT = ( "{% for message in messages %}" @@ -2288,7 +2288,7 @@ def __call__( assert self.clip_ctx is not None system_prompt = _get_system_message(messages) - if system_prompt == "": + if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages image_urls = self.get_image_urls(messages) @@ -2771,6 +2771,66 @@ class NanoLlavaChatHandler(Llava15ChatHandler): "{% endif %}" ) +class Llama3VisionAlpha(Llava15ChatHandler): + # question = "" + q + + # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + DEFAULT_SYSTEM_MESSAGE = None + + CHAT_FORMAT = ( + "{% for message in messages %}" + + "<|start_header_id|>" + + "{% if message.role == 'user' %}" + + "user<|end_header_id|>\n\n" + + "{% if message.content is iterable %}" + + # + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + + # Question: + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + + "{% endif %}" + + # Question: + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + + "{% endif %}" + + # Answer: + "{% if message.role == 'assistant' %}" + "assistant<|end_header_id|>\n\n" + "{{ message.content }}" + "{% endif %}" + + "<|eot_id|>" + + "{% endfor %}" + + # Generation prompt + "{% if add_generation_prompt %}" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + "{% endif %}" + ) @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 1ad592d211..e102fadbd7 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -140,6 +140,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) + elif settings.chat_format == "llama-3-vision-alpha": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) elif settings.chat_format == "hf-autotokenizer": assert ( settings.hf_pretrained_model_name_or_path is not None From d75dea18db61becae72784b9d750897b7cdcda26 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 May 2024 12:00:44 -0400 Subject: [PATCH 02/14] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f364eb6fb5..6ecf3189e0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961 +Subproject commit 6ecf3189e00a1e8e737a78b6d10e1d7006e050a2 From 21171223967896b9e941007841ebd1513b3cc4b9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 May 2024 12:07:09 -0400 Subject: [PATCH 03/14] chore: Bump version --- CHANGELOG.md | 10 +++++++++- llama_cpp/__init__.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c9681f355b..9b995509eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.69] + +- feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2 +- feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8 +- fix: Change default verbose value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94 +- fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e +- fix: UTF-8 handling with grammars by @jsoma in #1415 + ## [0.2.68] -- feat: Update llama.cpp to ggerganov/llama.cpp@ +- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167 - feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e - fix(ci): Fix build-and-release.yaml by @Smartappli in #1413 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 63c622551c..95c8819660 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.68" \ No newline at end of file +__version__ = "0.2.69" \ No newline at end of file From 2138561fab5e60672c63b6c446b62a8bc26e17c4 Mon Sep 17 00:00:00 2001 From: Daniel Thuerck Date: Fri, 3 May 2024 18:17:07 +0200 Subject: [PATCH 04/14] fix(server): Propagate `flash_attn` to model load. (#1424) --- llama_cpp/server/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index e102fadbd7..f002924109 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -242,6 +242,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: logits_all=settings.logits_all, embedding=settings.embedding, offload_kqv=settings.offload_kqv, + flash_attn=settings.flash_attn, # Sampling Params last_n_tokens_size=settings.last_n_tokens_size, # LoRA Params From 0a454bebe67d12a446981eb16028c168ca5faa81 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 3 May 2024 15:23:06 -0400 Subject: [PATCH 05/14] feat(server): Remove temperature bounds checks for server. Closes #1384 --- llama_cpp/server/types.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index ce9c87a694..a20b3940f2 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -18,8 +18,6 @@ temperature_field = Field( default=0.8, - ge=0.0, - le=2.0, description="Adjust the randomness of the generated text.\n\n" + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", ) From 9f7a85571ae80d3b6ddbd3e1bae407b9f1e3448a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 3 May 2024 19:07:50 -0400 Subject: [PATCH 06/14] fix: Use memmove to copy str_value kv_override. Closes #1417 --- llama_cpp/llama.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f927f0ca26..17576a69b8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -262,7 +262,12 @@ def __init__( raise ValueError(f"Value for {k} is too long: {v}") v_bytes = v_bytes.ljust(128, b"\0") self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR - self._kv_overrides_array[i].value.str_value[:128] = v_bytes + # copy min(v_bytes, 128) to str_value + ctypes.memmove( + self._kv_overrides_array[i].value.str_value, + v_bytes, + min(len(v_bytes), 128), + ) else: raise ValueError(f"Unknown value type for {k}: {v}") From 1f56c648c3a4aed2b4fe3edb4dc745fae3e7d8ae Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Sat, 4 May 2024 22:11:20 +0800 Subject: [PATCH 07/14] feat: Implement streaming for Functionary v2 + Bug fixes (#1419) * set up streaming for v2 * assert v2 streaming, fix tool_call vs function_call * fix streaming with tool_choice/function_call * make functions return 1 function call only when 'auto' * fix --------- Co-authored-by: Andrei --- llama_cpp/llama_chat_format.py | 576 +++++++++++++++++++++++++-------- 1 file changed, 443 insertions(+), 133 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a17b86b30a..3ab94e0d3c 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1894,6 +1894,8 @@ def prepare_messages_for_inference( function_call = ( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) + elif function_call is not None: + pass else: function_call = "auto" @@ -1930,11 +1932,10 @@ def prepare_messages_for_inference( logits_processor=logits_processor, grammar=grammar, ) - completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip() + if stream is False: + completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip() return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore - assert stream is False # TODO: support stream mode - def get_grammar(function_call): function_body = None for function in functions or []: @@ -1968,7 +1969,7 @@ def get_grammar(function_call): return grammar - def create_completion(stop): + def create_completion(prompt, stop, grammar): completion = cast(llama_types.Completion, llama.create_completion( prompt=prompt, temperature=temperature, @@ -1976,7 +1977,7 @@ def create_completion(stop): top_k=top_k, min_p=min_p, typical_p=typical_p, - stream=False, + stream=stream, stop=stop, max_tokens=max_tokens, presence_penalty=presence_penalty, @@ -1996,172 +1997,481 @@ def create_completion(stop): content = "" function_calls, function_bodies = [], [] completion_tokens = 0 - - if version == "v1": - # If no or "auto" tool_choice/function_call - if isinstance(function_call, str) and function_call == "auto": - stops = ["\n", END_ASSISTANT_TOKEN] - # If tool_choice/function_call is provided - elif isinstance(function_call, dict): - prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" - stops = END_FUNCTION_CALL_TOKEN - function_call = function_call["name"] - function_calls.append(function_call) - grammar = get_grammar(function_call) - else: - prompt = prompt - stops = ["\n", END_ASSISTANT_TOKEN] - - completion = create_completion(stop=stops) - completion_text = completion["choices"][0]["text"] - completion_tokens += completion["usage"]["completion_tokens"] + + def generate_streaming(tools, functions, function_call, prompt): + assert version == "v2", "Streaming for v1 is not supported" + + chunk_id, chunk_created = None, None - - # If the generation does not involve a function call - if ( - START_FUNCTION_CALL_TOKEN not in prompt - and START_FUNCTION_CALL_TOKEN not in completion_text - ): - completion["usage"]["completion_tokens"] = completion_tokens - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - # If the generation involves a function call in completion, generate the parameters - elif ( - START_FUNCTION_CALL_TOKEN not in prompt - and START_FUNCTION_CALL_TOKEN in completion_text - ): - prompt += ( - completion_text.replace( - f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN - ) - + "\n" - ) - function_calls.append( - completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() - ) - grammar = get_grammar(function_calls[-1]) - completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) - completion_tokens += completion["usage"]["completion_tokens"] - function_bodies.append(completion["choices"][0]["text"].strip()) - # If the prompt involves a function call, just append generated parameters to function_bodies - else: - function_bodies.append(completion_text.strip()) - else: # If tool_choice/function_call is provided if isinstance(function_call, dict): prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" - function_call = function_call["name"] - function_calls.append(function_call) - grammar = get_grammar(function_call) + grammar = get_grammar(function_call["name"]) stops = [STOP_TOKEN, FROM_TOKEN] - completion = create_completion(stop=stops) - completion_text = completion["choices"][0]["text"] - completion_tokens += completion["usage"]["completion_tokens"] - function_bodies.append(completion_text.strip()) + tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)]) + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) + completion_text = "" + first = True + for chunk in completion: + # Yield the tool/function name first + if first: + if tools is not None: + func_call_dict = { + "tool_calls": [ + { + "index": 0, + "id": "call_" + tool_id, + "type": "function", + "function": {"name": function_call["name"], "arguments": ""}, + } + ] + } + else: + func_call_dict = {"function_call": {"name": function_call["name"], "arguments": ""}} + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk["id"], + object="chat.completion.chunk", + created=chunk["created"], + model=chunk["model"], + choices=[ + {"index": 0, "logprobs": None, "delta": {"role": None, "content": None, **func_call_dict}} + ], + ) + first = False + if tools is not None: + func_call_dict = { + "tool_calls": [ + { + "index": 0, + "id": "call_" + tool_id, + "type": "function", + "function": { + "name": None, + "arguments": chunk["choices"][0]["text"].rstrip(), + }, + } + ] + } + else: + func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}} + if len(chunk["choices"][0]["text"].rstrip()) > 0: + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk["id"], + object="chat.completion.chunk", + created=chunk["created"], + model=chunk["model"], + choices=[ + { + "index": 0, + "logprobs": chunk["choices"][0]["logprobs"], + "delta": { + "role": None, + "content": None, + **func_call_dict, + }, + } + ], + ) + # Yield tool_call/function_call stop message + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk["id"], + object="chat.completion.chunk", + created=chunk["created"], + model=chunk["model"], + choices=[ + { + "index": 0, + "finish_reason": "tool_calls" if tools is not None else "function_call", + "logprobs": None, + "delta": { + "role": None, "content": None, "function_call": None, "tool_calls": None + }, + } + ], + ) # If "auto" or no tool_choice/function_call elif isinstance(function_call, str) and function_call == "auto": + tool_index = 0 while True: # Generate function name first grammar = None stops = CONTENT_TOKEN - completion = create_completion(stop=stops) - completion_text = completion["choices"][0]["text"] - completion_tokens += completion["usage"]["completion_tokens"] + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) + completion_text = "" + for chunk in completion: + completion_text += chunk["choices"][0]["text"] + if chunk_id is None: + chunk_id = chunk["id"] + if chunk_created is None: + chunk_created = chunk["created"] function_name = completion_text.strip() if function_name == "all": prompt += "all\n<|content|>" + # Yield the first empty message for content + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk_id, + model=chunk["model"], + created=chunk_created, + object="chat.completion.chunk", + choices=[ + { + "index": 0, + "delta": {"role": "assistant", "content": ""}, + "logprobs": None, + "finish_reason": None, + } + ], + ) else: - function_call = completion_text.strip() - prompt += f"{function_call}\n<|content|>" - function_calls.append(function_call) - grammar = get_grammar(function_call) + prompt += f"{function_name}\n<|content|>" + grammar = get_grammar(function_name) + tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)]) + if tools is not None: + func_call_dict = { + "tool_calls": [ + { + "index": tool_index, + "id": "call_" + tool_id, + "type": "function", + "function": {"name": function_name, "arguments": ""}, + } + ] + } + else: + func_call_dict = {"function_call": {"name": function_name, "arguments": ""}} + # Stream function name + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk_id, + object="chat.completion.chunk", + created=chunk_created, + model=chunk["model"], + choices=[ + { + "index": 0, + "logprobs": chunk["choices"][0]["logprobs"], + "delta": { + "role": "assistant", + "content": None, + **func_call_dict, + }, + } + ], + ) # Generate content stops = [RECIPIENT_TOKEN, STOP_TOKEN] - completion = create_completion(stop=stops) - completion_text = completion["choices"][0]["text"] - completion_tokens += completion["usage"]["completion_tokens"] + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) if function_name == "all": - if completion_text.endswith("\n<|from|>assistant\n"): - content += completion_text[:-len("\n<|from|>assistant\n")] - if completion_text.endswith("\n<|from|> assistant\n"): - content += completion_text[-len("\n<|from|> assistant\n")] - else: - content += completion_text - content = content.lstrip() + completion_text = "" + stop_sequence, buffer, is_end = "\n<|from|>assistant\n<|recipient|>", [], False + for i, chunk in enumerate(completion): + completion_text += chunk["choices"][0]["text"] + if is_end: + buffer.append(chunk["choices"][0]["text"].strip(" ")) + if stop_sequence.startswith("".join(buffer)): + continue + else: + buffer.pop() + while len(buffer) > 0: + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk_id, + object="chat.completion.chunk", + created=chunk_created, + model=chunk["model"], + choices=[ + { + "index": 0, + "logprobs": chunk["choices"][0]["logprobs"], + "delta": { + "role": "assistant", "content": buffer.pop(0) + }, + } + ], + ) + is_end = False + elif chunk["choices"][0]["text"] == "\n": + is_end = True + buffer.append(chunk["choices"][0]["text"].strip(" ")) + continue + + if len(buffer) == 0 and len(chunk["choices"][0]["text"]) > 0: + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk_id, + object="chat.completion.chunk", + created=chunk_created, + model=chunk["model"], + choices=[ + { + "index": 0, + "logprobs": chunk["choices"][0]["logprobs"], + "delta": { + "role": "assistant", + "content": chunk["choices"][0]["text"] if i > 0 else chunk["choices"][0]["text"].lstrip() + }, + } + ], + ) # Check whether the model wants to generate another turn if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text: if completion_text.endswith("\n<|from|>assistant\n"): cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip() elif completion_text.endswith("\n<|from|> assistant\n"): - cleaned_completion_text = completion_text[-len("\n<|from|> assistant\n")].strip() + cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip() else: cleaned_completion_text = completion_text.strip() prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>" else: + # Yield stop message + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk_id, + model=chunk["model"], + created=chunk_created, + object="chat.completion.chunk", + choices=[ + { + "index": 0, + "delta": {}, + "logprobs": None, + "finish_reason": "stop", + } + ], + ) break else: - function_bodies.append(completion_text.strip()) # Check whether the model wants to generate another turn + completion_text = "" + for chunk in completion: + completion_text += chunk["choices"][0]["text"] + if len(chunk["choices"][0]["text"].rstrip()) > 0: + if tools is not None: + func_call_dict = { + "tool_calls": [ + { + "index": tool_index, + "id": "call_" + tool_id, + "type": "function", + "function": { + "name": None, + "arguments": chunk["choices"][0]["text"].rstrip(), + }, + } + ] + } + else: + func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}} + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk_id, + object="chat.completion.chunk", + created=chunk_created, + model=chunk["model"], + choices=[ + { + "index": 0, + "logprobs": chunk["choices"][0]["logprobs"], + "delta": { + "role": None, + "content": None, + **func_call_dict, + }, + } + ], + ) prompt += completion_text.strip() grammar = None - completion = create_completion(stop=stops) - completion_tokens += completion["usage"]["completion_tokens"] - if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]: + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) + completion_text += "".join([chunk["choices"][0]["text"] for chunk in completion]) + if ("<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text) and tools is not None: prompt += "\n<|from|>assistant\n<|recipient|>" + tool_index += 1 else: + # Yield tool_call/function_call stop message + yield llama_types.CreateChatCompletionStreamResponse( + id="chat" + chunk_id, + object="chat.completion.chunk", + created=chunk_created, + model=chunk["model"], + choices=[ + { + "index": 0, + "finish_reason": "tool_calls" if tools is not None else "function_call", + "logprobs": None, + "delta": { + "role": None, "content": None, "function_call": None, "tool_calls": None + }, + } + ], + ) break - - assert "usage" in completion - assert len(function_calls) == len(function_bodies) - - tool_calls: List[llama_types.ChatCompletionMessageToolCall] = [] - for function_call, function_body in zip(function_calls, function_bodies): - tool_calls.append( - { - "id": "call_" - + "".join( - [ - random.choice(string.ascii_letters + string.digits) - for _ in range(24) - ] - ), - "type": "function", - "function": { - "name": function_call, - "arguments": function_body, - }, - } + + if stream is not False: + return generate_streaming( + tools=tools, functions=functions, function_call=function_call, prompt=prompt ) + else: + if version == "v1": + # If no or "auto" tool_choice/function_call + if isinstance(function_call, str) and function_call == "auto": + stops = ["\n", END_ASSISTANT_TOKEN] + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + stops = END_FUNCTION_CALL_TOKEN + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + else: + prompt = prompt + stops = ["\n", END_ASSISTANT_TOKEN] - # TODO: support stream mode - function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {} - if len(tool_calls) > 0: - if tools is not None: - function_call_dict["tool_calls"] = tool_calls + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + + + # If the generation does not involve a function call + if ( + START_FUNCTION_CALL_TOKEN not in prompt + and START_FUNCTION_CALL_TOKEN not in completion_text + ): + completion["usage"]["completion_tokens"] = completion_tokens + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # If the generation involves a function call in completion, generate the parameters + elif ( + START_FUNCTION_CALL_TOKEN not in prompt + and START_FUNCTION_CALL_TOKEN in completion_text + ): + prompt += ( + completion_text.replace( + f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN + ) + + "\n" + ) + function_calls.append( + completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() + ) + grammar = get_grammar(function_calls[-1]) + completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) + completion_tokens += completion["usage"]["completion_tokens"] + function_bodies.append(completion["choices"][0]["text"].strip()) + # If the prompt involves a function call, just append generated parameters to function_bodies + else: + function_bodies.append(completion_text.strip()) else: - function_call_dict["function_call"] = { - "name": tool_calls[0]["function"]["name"], - "arguments": tool_calls[0]["function"]["arguments"], - } - completion["usage"]["completion_tokens"] = completion_tokens - return llama_types.CreateChatCompletionResponse( - id="chat" + completion["id"], - object="chat.completion", - created=completion["created"], - model=completion["model"], - choices=[ - { - "index": 0, - "logprobs": completion["choices"][0]["logprobs"], - "message": { - "role": "assistant", - "content": None if content == "" else content, - **function_call_dict, - }, - "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop", - } - ], - usage=completion["usage"], - ) + # If tool_choice/function_call is provided + if isinstance(function_call, dict): + prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + stops = [STOP_TOKEN, FROM_TOKEN] + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + function_bodies.append(completion_text.strip()) + # If "auto" or no tool_choice/function_call + elif isinstance(function_call, str) and function_call == "auto": + while True: + # Generate function name first + grammar = None + stops = CONTENT_TOKEN + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + function_name = completion_text.strip() + if function_name == "all": + prompt += "all\n<|content|>" + else: + function_call = completion_text.strip() + prompt += f"{function_call}\n<|content|>" + function_calls.append(function_call) + grammar = get_grammar(function_call) + # Generate content + stops = [RECIPIENT_TOKEN, STOP_TOKEN] + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + if function_name == "all": + if completion_text.endswith("\n<|from|>assistant\n"): + content += completion_text[:-len("\n<|from|>assistant\n")] + if completion_text.endswith("\n<|from|> assistant\n"): + content += completion_text[-len("\n<|from|> assistant\n")] + else: + content += completion_text + content = content.lstrip() + # Check whether the model wants to generate another turn + if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text: + if completion_text.endswith("\n<|from|>assistant\n"): + cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip() + elif completion_text.endswith("\n<|from|> assistant\n"): + cleaned_completion_text = completion_text[-len("\n<|from|> assistant\n")].strip() + else: + cleaned_completion_text = completion_text.strip() + prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>" + else: + break + else: + function_bodies.append(completion_text.strip()) + # Check whether the model wants to generate another turn + prompt += completion_text.strip() + grammar = None + completion = create_completion(stop=stops) + completion_tokens += completion["usage"]["completion_tokens"] + if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]: + prompt += "\n<|from|>assistant\n<|recipient|>" + else: + break + + assert "usage" in completion + assert len(function_calls) == len(function_bodies) + + tool_calls: List[llama_types.ChatCompletionMessageToolCall] = [] + for function_call, function_body in zip(function_calls, function_bodies): + tool_calls.append( + { + "id": "call_" + + "".join( + [ + random.choice(string.ascii_letters + string.digits) + for _ in range(24) + ] + ), + "type": "function", + "function": { + "name": function_call, + "arguments": function_body, + }, + } + ) + + # TODO: support stream mode + function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {} + if len(tool_calls) > 0: + if tools is not None: + function_call_dict["tool_calls"] = tool_calls + else: + function_call_dict["function_call"] = { + "name": tool_calls[0]["function"]["name"], + "arguments": tool_calls[0]["function"]["arguments"], + } + completion["usage"]["completion_tokens"] = completion_tokens + return llama_types.CreateChatCompletionResponse( + id="chat" + completion["id"], + object="chat.completion", + created=completion["created"], + model=completion["model"], + choices=[ + { + "index": 0, + "logprobs": completion["choices"][0]["logprobs"], + "message": { + "role": "assistant", + "content": None if content == "" else content, + **function_call_dict, + }, + "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop", + } + ], + usage=completion["usage"], + ) class Llava15ChatHandler: From e0d7674e62bdc5b906d2461238993ea3a022f61f Mon Sep 17 00:00:00 2001 From: Noam Gat Date: Sat, 4 May 2024 17:14:59 +0300 Subject: [PATCH 08/14] fix: detokenization case where first token does not start with a leading space (#1375) * Fix tokenization edge case where llama output does not start with a space See this notebook: https://colab.research.google.com/drive/1Ooz11nFPk19zyJdMDx42CeesU8aWZMdI#scrollTo=oKpHw5PZ30uC * Update _internals.py Fixing to compare to b' ' instead of (str)' ' --------- Co-authored-by: Andrei --- llama_cpp/_internals.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index d7409f63a0..b404601d30 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -203,7 +203,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: # NOTE: Llama1 models automatically added a space at the start of the prompt # this line removes a leading space if the first token is a beginning of sentence token return ( - output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output + output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output ) # Extra @@ -812,4 +812,4 @@ def sample( def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool): if apply_grammar and self.grammar is not None: ctx_main.grammar_accept_token(self.grammar, id) - self.prev.append(id) \ No newline at end of file + self.prev.append(id) From 3e2597eac888e6e12c7bc7021016ca5104db83ba Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 5 May 2024 12:12:27 -0400 Subject: [PATCH 09/14] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 6 +++++- vendor/llama.cpp | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 46aa51662f..9e934e0526 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -294,6 +294,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # LLAMA_VOCAB_PRE_TYPE_MPT = 5, # LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, # LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, +# LLAMA_VOCAB_PRE_TYPE_REFACT = 8, +# LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -303,6 +305,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa LLAMA_VOCAB_PRE_TYPE_MPT = 5 LLAMA_VOCAB_PRE_TYPE_STARCODER = 6 LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 +LLAMA_VOCAB_PRE_TYPE_REFACT = 8 +LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9 # // note: these values should be synchronized with ggml_rope @@ -494,7 +498,7 @@ class llama_token_data_array(ctypes.Structure): llama_token_data_array_p = ctypes.POINTER(llama_token_data_array) -# typedef bool (*llama_progress_callback)(float progress, void *ctx); +# typedef bool (*llama_progress_callback)(float progress, void * user_data); llama_progress_callback = ctypes.CFUNCTYPE( ctypes.c_bool, ctypes.c_float, ctypes.c_void_p ) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6ecf3189e0..628b299106 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 6ecf3189e00a1e8e737a78b6d10e1d7006e050a2 +Subproject commit 628b299106d1e9476fdecb3cbe546bf5c60f1b89 From 36668331074c79f40494b51e4d541cd03601b9f2 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sun, 5 May 2024 18:42:28 +0200 Subject: [PATCH 10/14] feat(ci): Add docker checks and check deps more frequently (#1426) * Update dependabot.yml Add github-actions update * Update dependabot.yml * Update dependabot.yml --- .github/dependabot.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index c58c9ae570..6bf90273ac 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -8,8 +8,12 @@ updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/" # Location of package manifests schedule: - interval: "weekly" + interval: "daily" - package-ecosystem: "github-actions" directory: "/" schedule: - interval: "weekly" + interval: "daily" + - package-ecosystem: "docker" + directory: "/" + schedule: + interval: "daily" From 0318702cdc860999ee70f277425edbbfe0e60419 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 5 May 2024 12:49:31 -0400 Subject: [PATCH 11/14] feat(server): Add support for setting root_path. Closes #1420 --- llama_cpp/server/app.py | 1 + llama_cpp/server/settings.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b6ed9b1b6b..4cf10d1f66 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -132,6 +132,7 @@ def create_app( middleware=middleware, title="🦙 llama.cpp Python API", version=llama_cpp.__version__, + root_path=server_settings.root_path, ) app.add_middleware( CORSMiddleware, diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index ed05a889f0..a3e185007d 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -215,6 +215,10 @@ class ServerSettings(BaseSettings): default=False, description="Disable EventSource pings (may be needed for some clients).", ) + root_path: str = Field( + default="", + description="The root path for the server. Useful when running behind a reverse proxy.", + ) class Settings(ServerSettings, ModelSettings): From 19724458b164b7fae728a0e4da4c43514125ed62 Mon Sep 17 00:00:00 2001 From: juanroesel Date: Mon, 6 May 2024 17:57:45 -0700 Subject: [PATCH 12/14] Ported over prometheus implementation from previous repo --- llama_cpp/_utils.py | 72 ++++++++++++++++++++++++- llama_cpp/llama.py | 113 ++++++++++++++++++++++++++++++++++------ llama_cpp/server/app.py | 7 +++ pyproject.toml | 2 + 4 files changed, 176 insertions(+), 18 deletions(-) diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py index 781b265010..7ab94964b1 100644 --- a/llama_cpp/_utils.py +++ b/llama_cpp/_utils.py @@ -1,7 +1,9 @@ import os import sys +import psutil +import subprocess -from typing import Any, Dict +from typing import Any, Dict, List # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor outnull_file = open(os.devnull, "w") @@ -75,3 +77,71 @@ class Singleton(object, metaclass=MetaSingleton): def __init__(self): super(Singleton, self).__init__() + + +# Get snapshot of RAM and GPU usage before and after function execution. +# Adapted from: https://github.com/abetlen/llama-cpp-python/issues/223#issuecomment-1556203616 +def get_cpu_usage(pid) -> float: + """ + CPU usage in percentage by the current process. + """ + process = psutil.Process(pid) + return process.cpu_percent() + +def get_ram_usage(pid) -> float: + """ + RAM usage in MiB by the current process. + """ + process = psutil.Process(pid) + ram_info = process.memory_info() + ram_usage = ram_info.rss / (1024 * 1024) # Convert to MiB + return ram_usage + +def get_gpu_info_by_pid(pid) -> float: + """ + GPU memory usage by the current process (if GPU is available) + """ + try: + gpu_info = subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader"]).decode("utf-8") + gpu_info = gpu_info.strip().split("\n") + for info in gpu_info: + gpu_pid, gpu_ram_usage = info.split(", ") + if int(gpu_pid) == pid: + return float(gpu_ram_usage.split()[0]) + except (subprocess.CalledProcessError, FileNotFoundError): + pass + return 0.0 + +def get_gpu_general_info() -> tuple[float, float, float]: + """ + GPU general info (if GPU is available) + """ + try: + gpu_info = subprocess.check_output(["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.free", "--format=csv,noheader"]).decode("utf-8") + gpu_utilization, gpu_memory_used, gpu_memory_free = gpu_info.strip().split("\n")[0].split(", ") + return tuple(float(tup.split()[0]) for tup in [gpu_utilization, gpu_memory_used, gpu_memory_free]) + except (subprocess.CalledProcessError, FileNotFoundError): + pass + return 0.0, 0.0, 0.0 + +def infer_service_from_prompt(prompt: str | List[str]): + """ + Infer the service for which a completion request is sent based on the prompt. + """ + LABEL_SUGGESTIONS_TASK = "Your task is to select the most relevant labels for a GitHub issue title from a list of labels provided." + ACCEPTANCE_CRITERIA_TASK = "Your task is to write the acceptance criteria for a GitHub issue." + SPRINT_REVIEW_TASK = "You are helping me prepare a sprint review." + + if isinstance(prompt, list): + prompt = " ".join(prompt) + + if LABEL_SUGGESTIONS_TASK in prompt: + return "label-suggestions" + + elif ACCEPTANCE_CRITERIA_TASK in prompt: + return "acceptance-criteria" + + elif SPRINT_REVIEW_TASK in prompt: + return "sprint-review" + + return "not-specified" diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f927f0ca26..a32ea1e8df 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -38,6 +38,16 @@ import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama_chat_format as llama_chat_format +from llama_cpp.llama_metrics import Metrics, MetricsExporter + +from llama_cpp._utils import ( + infer_service_from_prompt, + get_cpu_usage, + get_ram_usage, + get_gpu_info_by_pid, + get_gpu_general_info, +) + from llama_cpp.llama_speculative import LlamaDraftModel import numpy as np @@ -448,6 +458,9 @@ def __init__( if self.verbose: print(f"Using fallback chat format: {chat_format}", file=sys.stderr) + # Prometheus metrics + self.metrics = MetricsExporter() + @property def ctx(self) -> llama_cpp.llama_context_p: assert self._ctx.ctx is not None @@ -950,6 +963,19 @@ def _create_completion( completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) + + # Variables required for metric collection + _metrics_dict = {} + _ttft_start = time.time() + _pid = os.getpid() + _tpot_metrics = [] + _labels = { + "service": infer_service_from_prompt(prompt), # Infer the service for which the completion is being generated + "request_type": "chat/completions", + } + # Get CPU usage before generating completion so it can be used to calculate CPU when called after completing the process + _ = get_cpu_usage(_pid) + # If prompt is empty, initialize completion with BOS token to avoid # detokenization including a space at the beginning of the completion completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()] @@ -1043,23 +1069,26 @@ def logit_bias_processor( finish_reason = "length" multibyte_fix = 0 - for token in self.generate( - prompt_tokens, - top_k=top_k, - top_p=top_p, - min_p=min_p, - typical_p=typical_p, - temp=temperature, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, - grammar=grammar, + _tpot_start = time.time() + for idx, token in enumerate( + self.generate( + prompt_tokens, + top_k=top_k, + top_p=top_p, + min_p=min_p, + typical_p=typical_p, + temp=temperature, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, + grammar=grammar, + ) ): assert self._model.model is not None if llama_cpp.llama_token_is_eog(self._model.model, token): @@ -1216,6 +1245,14 @@ def logit_bias_processor( finish_reason = "length" break + # Record TTFT metric (once) + if idx == 0: + _metrics_dict["time_to_first_token"] = time.time() - _ttft_start + # Record TPOT metric + else: + _tpot_metrics.append(time.time() - _tpot_start) + _tpot_start = time.time() # reset + if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): @@ -1403,6 +1440,48 @@ def logit_bias_processor( "token_logprobs": token_logprobs, "top_logprobs": top_logprobs, } + + # Record TPOT metrics (per generated token) + _metrics_dict["time_per_output_token"] = _tpot_metrics + + # Record metrics from the C++ backend (converted to seconds) + _timings = llama_cpp.llama_get_timings(self._ctx.ctx) + _metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2) + _metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2) + _metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0 + _metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2) + _metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0 + _metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2) + _metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0 + _metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2) + + # Record prefill and generation token metrics + _metrics_dict["prefill_tokens"] = len(prompt_tokens) + _metrics_dict["generation_tokens"] = len(completion_tokens) + + # Record system info + _gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info() + _metrics_dict["cpu_utilization"] = get_cpu_usage(_pid) # TODO: Returning always 0.0 -> check + _metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid) + _metrics_dict["gpu_utilization"] = _gpu_utilization + _metrics_dict["gpu_ram_usage"] = _gpu_memory_used + _metrics_dict["gpu_ram_free"] = _gpu_memory_free + _metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid) + _metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx) + _metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2) + _metrics_dict["system_info"] = { + "model": model_name, + "n_params": str(llama_cpp.llama_model_n_params(self.model)), + "n_embd": str(self.n_embd()), + "n_ctx": str(self.n_ctx()), + "n_vocab": str(self.n_vocab()), + "n_threads": str(self.n_threads) + } + + # Log metrics to Prometheus + #print(_metrics_dict, file=sys.stderr) + _all_metrics = Metrics(**_metrics_dict) + self.metrics.log_metrics(_all_metrics, labels=_labels) yield { "id": completion_id, diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b6ed9b1b6b..d9bed39225 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -7,6 +7,8 @@ from functools import partial from typing import Iterator, List, Optional, Union, Dict +from prometheus_client import make_asgi_app + import llama_cpp import anyio @@ -145,6 +147,11 @@ def create_app( assert model_settings is not None set_llama_proxy(model_settings=model_settings) + # Add prometheus asgi middleware to route /metrics requests + # see: https://prometheus.github.io/client_python/exporting/http/fastapi-gunicorn/ + metrics_app = make_asgi_app() + app.mount("/metrics", metrics_app) + if server_settings.disable_ping_events: set_ping_message_factory(lambda: bytes()) diff --git a/pyproject.toml b/pyproject.toml index 8345cb1f09..4b0246623f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,8 @@ server = [ "sse-starlette>=1.6.1", "starlette-context>=0.3.6,<0.4", "PyYAML>=5.1", + "prometheus_client>=0.20.0", + "psutil>=5.9.8" ] test = [ "pytest>=7.4.0", From edd0ec69a2c34d14756a5d79a7f93ae28dd3a958 Mon Sep 17 00:00:00 2001 From: juanroesel Date: Mon, 6 May 2024 17:58:40 -0700 Subject: [PATCH 13/14] Added kn_cache_usage_ratio metric --- llama_cpp/llama_metrics.py | 218 +++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 llama_cpp/llama_metrics.py diff --git a/llama_cpp/llama_metrics.py b/llama_cpp/llama_metrics.py new file mode 100644 index 0000000000..9e00e8b99d --- /dev/null +++ b/llama_cpp/llama_metrics.py @@ -0,0 +1,218 @@ +from dataclasses import dataclass +from typing import Any, Optional, Dict, List + +from prometheus_client import Gauge, Info, Histogram + + +LABELS = ["request_type", "service"] + +@dataclass +class Metrics: + """ + A dataclass to store metrics for a request. + """ + # System metrics + system_info: Dict[str, Any] + state_size: int + cpu_utilization: float + cpu_ram_pid: float + gpu_utilization: float + gpu_ram_usage: float + gpu_ram_free: float + gpu_ram_pid: float + + # Metrics from the C++ backend + load_time: float + sample_time: float + sample_throughput: float + time_to_first_token: float + time_per_output_token: List[float] + prompt_eval_time: float + prompt_eval_throughput: float + completion_eval_time: float + completion_eval_throughput: float + end_to_end_latency: float + prefill_tokens: int + generation_tokens: int + kv_cache_usage_ratio: int + + +class MetricsExporter: + """ + A custom Prometheus Metrics Explorer for the LLAMA C++ backend. + Collects metrics per request sent to the backend. + """ + def __init__(self): + self.labels = LABELS + # One-time metrics + self._histrogram_load_time = Histogram( + name="llama_cpp_python:load_t_seconds", + documentation="Histogram of load time in seconds", + labelnames=self.labels, + buckets=[ + 0.1, 0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, + 8.0, 9.0, 10.0, 12.5, 15.0, 20.0, 25.0, 30.0 + ] + ) + # Request-level latencies + self._histogram_sample_time = Histogram( + name="llama_cpp_python:sample_t_seconds", + documentation="Histogram of token sampling time in seconds", + labelnames=self.labels, + buckets=[ + 0.00001, 0.00005, 0.0001, 0.00025, 0.0005, 0.001, 0.0025, + 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, + ] + ) + self._histogram_time_to_first_token = Histogram( + name="llama_cpp_python:ttft_seconds", + documentation="Histogram of time to first token in seconds", + labelnames=self.labels, + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 20.0, 25.0, 30.0 + ] + ) + self._histogram_time_per_output_token = Histogram( + name="llama_cpp_python:tpot_seconds", + documentation="Histogram of time per output token in seconds", + labelnames=self.labels, + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 20.0, 25.0, 30.0 + ] + ) + self._histogram_prompt_eval_time = Histogram( + name="llama_cpp_python:p_eval_t_seconds", + documentation="Histogram of prompt evaluation time in seconds", + labelnames=self.labels, + buckets=[ + 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, + 20.0, 25.0, 30.0, 40.0, 50.0, 60.0 + ] + ) + self._histogram_completion_eval_time = Histogram( + name="llama_cpp_python:c_eval_t_seconds", + documentation="Histogram of completion evaluation time in seconds", + labelnames=self.labels, + buckets=[ + 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, + 20.0, 25.0, 30.0, 40.0, 50.0, 60.0 + ] + ) + self._histogram_e2e_request_latency = Histogram( + name="llama_cpp_python:e2e_seconds", + documentation="Histogram of end-to-end request latency in seconds", + labelnames=self.labels, + buckets=[ + 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, + 20.0, 25.0, 30.0, 40.0, 50.0, 60.0 + ] + ) + # Prefill and generation tokens + self._histogram_prefill_tokens = Histogram( + name="llama_cpp_python:prefill_tokens_total", + documentation="Histogram of number of prefill tokens processed", + labelnames=self.labels, + buckets=[ + 1, 10, 25, 50, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000, + 3500, 4000, 4500, 5000 + ] + ) + self._histogram_generation_tokens = Histogram( + name="llama_cpp_python:completion_tokens_total", + documentation="Histogram of number of generation tokens processed", + labelnames=self.labels, + buckets=[ + 1, 10, 25, 50, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000, + 3500, 4000, 4500, 5000 + ] + ) + # Current throughput + self._gauge_prompt_eval_throughput = Gauge( + name="llama_cpp_python:prompt_eval_throughput", + documentation="Current throughput of the prompt evaluation process (in tokens/second)", + labelnames=self.labels + ) + self._gauge_completion_eval_throughput = Gauge( + name="llama_cpp_python:completion_eval_throughput", + documentation="Current throughput of the completion evaluation process (in tokens/second)", + labelnames=self.labels + ) + self._gauge_sample_throughput = Gauge( + name="llama_cpp_python:sample_throughput", + documentation="Current throughput of the token sampling process (in tokens/second)", + labelnames=self.labels + ) + # System info + self._gauge_state_size = Gauge( + name="llama_cpp_python:state_size", + documentation="Current state size in bytes of various components such as rng (random number generator), logits, embedding, and kv_cache (key-value cache)", + labelnames=self.labels + ) + self._gauge_cpu_utilization = Gauge( + name="llama_cpp_python:cpu_utilization", + documentation="Current CPU utilization", + labelnames=self.labels + ) + self._gauge_cpu_ram_usage_by_pid = Gauge( + name="llama_cpp_python:cpu_memory_usage_by_pid", + documentation="Current CPU memory usage during the request", + labelnames=self.labels + ) + self._gauge_gpu_utilization = Gauge( + name="llama_cpp_python:gpu_utilization", + documentation="Current GPU utilization", + labelnames=self.labels + ) + self._gauge_gpu_memory_usage = Gauge( + name="llama_cpp_python:gpu_memory_usage", + documentation="Current GPU memory usage", + labelnames=self.labels + ) + self._gauge_gpu_memory_free = Gauge( + name="llama_cpp_python:gpu_memory_free", + documentation="Current free GPU memory", + labelnames=self.labels + ) + self._gauge_gpu_memory_usage_by_pid = Gauge( + name="llama_cpp_python:gpu_memory_usage_by_pid", + documentation="Current GPU memory usage during the request", + labelnames=self.labels + ) + self._gauge_kv_cache_usage_ratio = Gauge( + name="llama_cpp_python:kv_cache_usage_ratio", + documentation="KV-cache usage. 1 means 100 percent usage", + labelnames=self.labels + ) + self._info = Info( + name="llama_cpp_python:info", + documentation="Server metadata" + ) + + def log_metrics(self, metrics: Metrics, labels: Dict[str, str]): + """ + Log the metrics using the Prometheus client. + """ + self._histrogram_load_time.labels(**labels).observe(metrics.load_time) + self._histogram_sample_time.labels(**labels).observe(metrics.sample_time) + self._histogram_time_to_first_token.labels(**labels).observe(metrics.time_to_first_token) + for _tpot in metrics.time_per_output_token: + self._histogram_time_per_output_token.labels(**labels).observe(_tpot) + self._histogram_prompt_eval_time.labels(**labels).observe(metrics.prompt_eval_time) + self._histogram_completion_eval_time.labels(**labels).observe(metrics.completion_eval_time) + self._histogram_e2e_request_latency.labels(**labels).observe(metrics.end_to_end_latency) + self._histogram_prefill_tokens.labels(**labels).observe(metrics.prefill_tokens) + self._histogram_generation_tokens.labels(**labels).observe(metrics.generation_tokens) + self._gauge_prompt_eval_throughput.labels(**labels).set(metrics.prompt_eval_throughput) + self._gauge_completion_eval_throughput.labels(**labels).set(metrics.completion_eval_throughput) + self._gauge_sample_throughput.labels(**labels).set(metrics.sample_throughput) + self._gauge_cpu_utilization.labels(**labels).set(metrics.cpu_utilization) + self._gauge_cpu_ram_usage_by_pid.labels(**labels).set(metrics.cpu_ram_pid) + self._gauge_gpu_utilization.labels(**labels).set(metrics.gpu_utilization) + self._gauge_gpu_memory_usage.labels(**labels).set(metrics.gpu_ram_usage) + self._gauge_gpu_memory_free.labels(**labels).set(metrics.gpu_ram_free) + self._gauge_gpu_memory_usage_by_pid.labels(**labels).set(metrics.gpu_ram_pid) + self._gauge_state_size.labels(**labels).set(metrics.state_size) + self._gauge_kv_cache_usage_ratio.labels(**labels).set(metrics.kv_cache_usage_ratio) + self._info.info(metrics.system_info) \ No newline at end of file From bd84f3c179bac46dd1dc0604db638dfd3852fdae Mon Sep 17 00:00:00 2001 From: juanroesel Date: Tue, 7 May 2024 14:53:25 -0700 Subject: [PATCH 14/14] Pulled synced commits locally and changed data type --- llama_cpp/llama_metrics.py | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_metrics.py b/llama_cpp/llama_metrics.py index 9e00e8b99d..7334c71401 100644 --- a/llama_cpp/llama_metrics.py +++ b/llama_cpp/llama_metrics.py @@ -34,7 +34,7 @@ class Metrics: end_to_end_latency: float prefill_tokens: int generation_tokens: int - kv_cache_usage_ratio: int + kv_cache_usage_ratio: float class MetricsExporter: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 628b299106..f364eb6fb5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 628b299106d1e9476fdecb3cbe546bf5c60f1b89 +Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961