diff --git a/.dockerignore b/.dockerignore index fd64c09b37..1b85f8c9a3 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,6 +2,7 @@ _skbuild/ .envrc +# LLMs - comment if you'd like to bake the model into the image models/ # Byte-compiled / optimized / DLL files diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md new file mode 100644 index 0000000000..da84dda5c8 --- /dev/null +++ b/INSTRUCTIONS.md @@ -0,0 +1,35 @@ +## Syncing with upstream repo + +See [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork#syncing-a-fork-branch-from-the-web-ui) for more details. + +1. On the GitHub UI, create a new branch `repo-sync`, if the branch doesn't exist already. + +2. Click on the "Sync fork" button and then click on the "Update branch" button. This will import all the commits from the upstream repo. + +3. Create a local branch `repo-sync` and pull the contents from the remote `repo-sync` branch. + +4. Resolve any conflicts if they arise. Otherwise, proceed to the next step. + +5. Update all the git submodules: + +``` +git submodule update --recursive +``` + +6. Since changes have probably been made to the vendor libraries (`llama_cpp`, `kompute`), we need to recompile the `llama_cpp` package. Navigate to the `vendor/llama.cpp` folder and clean the build cache: + +``` +make clean +``` +7. Navigate back to the root directory and type the following to recompile the `llama_cpp` package and build the dependencies again: + +``` +make deps && make build +``` +8. Launch the `llama_cpp_python` server using the following command: +``` +python -m llama_cpp.server --model $MODEL --n_gpu_layers -1 +``` +NOTE: Modify the launch arguments as needed. Make sure the `MODEL` environment variable points to an absolute path containing a `.gguf` model. + +9. 
If the server launches without issues, then you can proceed to create a PR with the latest changes \ No newline at end of file diff --git a/dev.Dockerfile b/dev.Dockerfile new file mode 100644 index 0000000000..24f5be7270 --- /dev/null +++ b/dev.Dockerfile @@ -0,0 +1,44 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3.11.8 + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + libopenblas-dev \ + build-essential \ + git + +RUN mkdir /app +WORKDIR /app +COPY . /app + +RUN python3 -m pip install --upgrade pip + +RUN make deps && make build && make clean + +# Set environment variables for the host +ENV GH_TOKEN=$GH_TOKEN +ENV HOST=0.0.0.0 +ENV PORT=8000 +ENV MODEL=/app/models/mistral-7b-openorca.Q5_K_M.gguf + +# # Install dependencies +# RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context psutil prometheus_client + +# # Install llama-cpp-python (build with METAL) +# RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install git+https://${GH_TOKEN}@github.com/ZenHubHQ/llama-cpp-python.git --force-reinstall --upgrade --no-cache-dir --verbose + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/docker/simple/run.sh"] +# CMD python3 -m llama_cpp.server --n_gpu_layers -1 diff --git a/dev.docker-compose b/dev.docker-compose new file mode 100644 index 0000000000..7b21e468a2 --- /dev/null +++ b/dev.docker-compose @@ -0,0 +1,15 @@ +version: '3' +services: + dev-llama-cpp-python: + build: + context: . 
+ dockerfile: dev.Dockerfile + ports: + - 8000:8000 + volumes: + - ./llama_cpp:/app/llama_cpp + networks: + - zh-service-network +networks: + zh-service-network: + external: true \ No newline at end of file diff --git a/docker/simple/run.sh b/docker/simple/run.sh index c85e73d2b6..d4fd489a0e 100644 --- a/docker/simple/run.sh +++ b/docker/simple/run.sh @@ -1,4 +1,5 @@ #!/bin/bash make build -uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT +# uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT --reload +python3 -m llama_cpp.server --model $MODEL --n_gpu_layers -1 \ No newline at end of file diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 7638170a97..f3407c92bb 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -1,6 +1,7 @@ import sys import ctypes import logging +import logging.config import llama_cpp @@ -17,8 +18,38 @@ 5: logging.DEBUG, } -logger = logging.getLogger("llama-cpp-python") +UVICORN_LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": {"format": "%(asctime)s [%(levelname)s] %(message)s"}, + }, + "handlers": { + "default": { + "level": "INFO", + "formatter": "standard", + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", # Default is stderr + }, + }, + "loggers": { + "uvicorn.error": { + "level": "DEBUG", + "handlers": ["default"], + }, + "uvicorn.access": { + "level": "DEBUG", + "handlers": ["default"], + }, + }, +} +# Set up llama-cpp-python logger matching the format of uvicorn logger +logger = logging.getLogger("llama-cpp-python") +handler = logging.StreamHandler() +formatter = logging.Formatter("%(asctime)s - [%(levelname)s] - %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) @llama_cpp.llama_log_callback def llama_log_callback( @@ -27,7 +58,13 @@ def llama_log_callback( user_data: ctypes.c_void_p, ): if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]: - 
print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) + _text = text.decode("utf-8") + if _text.endswith("\n"): + _text = _text[:-1] + + # Skip if the message only contains "." + if not _text == ".": + logger.log(GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level], _text) llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0)) diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py index 781b265010..14d0542fcc 100644 --- a/llama_cpp/_utils.py +++ b/llama_cpp/_utils.py @@ -1,7 +1,13 @@ import os import sys +import psutil +import asyncio +import subprocess + +from typing import Any, Dict, List, Tuple, Union + +from llama_cpp.llama_metrics import QueueMetrics, MetricsExporter -from typing import Any, Dict # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor outnull_file = open(os.devnull, "w") @@ -10,6 +16,7 @@ STDOUT_FILENO = 1 STDERR_FILENO = 2 + class suppress_stdout_stderr(object): # NOTE: these must be "saved" here to avoid exceptions when using # this context manager inside of a __del__ method @@ -75,3 +82,117 @@ class Singleton(object, metaclass=MetaSingleton): def __init__(self): super(Singleton, self).__init__() + + +# Get snapshot of RAM and GPU usage before and after function execution. +# Adapted from: https://github.com/abetlen/llama-cpp-python/issues/223#issuecomment-1556203616 +def get_cpu_usage(pid) -> float: + """ + CPU usage in percentage by the current process. + """ + process = psutil.Process(pid) + return process.cpu_percent() + + +def get_ram_usage(pid) -> float: + """ + RAM usage in MiB by the current process. 
+ """ + process = psutil.Process(pid) + ram_info = process.memory_info() + ram_usage = ram_info.rss / (1024 * 1024) # Convert to MiB + return ram_usage + + +def get_gpu_info_by_pid(pid) -> float: + """ + GPU memory usage by the current process (if GPU is available) + """ + try: + gpu_info = subprocess.check_output( + [ + "nvidia-smi", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader", + ] + ).decode("utf-8") + gpu_info = gpu_info.strip().split("\n") + for info in gpu_info: + gpu_pid, gpu_ram_usage = info.split(", ") + if int(gpu_pid) == pid: + return float(gpu_ram_usage.split()[0]) + except (subprocess.CalledProcessError, FileNotFoundError): + pass + return 0.0 + + +def get_gpu_general_info() -> Tuple[float, float, float]: + """ + GPU general info (if GPU is available) + """ + try: + gpu_info = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=utilization.gpu,memory.used,memory.free", + "--format=csv,noheader", + ] + ).decode("utf-8") + gpu_utilization, gpu_memory_used, gpu_memory_free = ( + gpu_info.strip().split("\n")[0].split(", ") + ) + return tuple( + float(tup.split()[0]) + for tup in [gpu_utilization, gpu_memory_used, gpu_memory_free] + ) + except (subprocess.CalledProcessError, FileNotFoundError): + pass + return 0.0, 0.0, 0.0 + + +async def monitor_task_queue( + status_dict: Dict[str, Union[int, float]], metrics_exporter: MetricsExporter +): + """ + An asynchronous function that monitors the task queue and updates + a shared status dictionary with the number of tasks that have not + started and the number of tasks that are currently running. + It recursively calls itself to continuously monitor the task queue. + NOTE: There will always be 3 tasks running in the task queue: + - LifespanOn.main: Main application coroutine + - Server.serve: Server coroutine + - monitor_task_queue: Task queue monitoring coroutine + Any upcoming requests will be added to the task queue in the form of + another RequestReponseCycle.run_asgi coroutine. 
+ """ + if not isinstance(metrics_exporter, MetricsExporter): + raise ValueError("metrics_exporter must be an instance of MetricsExporter") + + all_tasks = asyncio.all_tasks() + + # Get count of all running tasks + _all_tasks = [task for task in all_tasks if task._state == "PENDING"] + status_dict["running_tasks_count"] = len(_all_tasks) + # Get basic metadata of all running tasks + status_dict["running_tasks"] = { + task.get_name(): str(task.get_coro()) + .lstrip("\u003C") + .rstrip("\u003E") + for task in all_tasks + } + + assert status_dict is not None + + # Register current running tasks as a Prometheus metric + _labels = { + "service": "general", + "request_type": "health_check", + } + _queue_metrics = QueueMetrics(**status_dict) + metrics_exporter.log_queue_metrics(_queue_metrics, _labels) + + await asyncio.sleep(5) # adds a delay of 5 seconds to avoid overloading the CPU + + asyncio.create_task( + monitor_task_queue(status_dict, metrics_exporter) + ) # pass status_dict to the next task diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5f5396683c..476d75d54e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -43,6 +43,15 @@ import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama_chat_format as llama_chat_format +from llama_cpp.llama_metrics import RequestMetrics + +from llama_cpp._utils import ( + get_cpu_usage, + get_ram_usage, + get_gpu_info_by_pid, + get_gpu_general_info +) + from llama_cpp.llama_speculative import LlamaDraftModel import numpy as np @@ -57,7 +66,7 @@ _LlamaSamplingContext, # type: ignore _normalize_embedding, # type: ignore ) -from ._logger import set_verbose +from ._logger import set_verbose, logger from ._utils import suppress_stdout_stderr @@ -394,7 +403,7 @@ def __init__( ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + logger.info(f'System info: {llama_cpp.llama_print_system_info().decode("utf-8")}') self.chat_format = chat_format self.chat_handler = 
chat_handler @@ -425,10 +434,10 @@ def __init__( except Exception as e: self.metadata = {} if self.verbose: - print(f"Failed to load metadata: {e}", file=sys.stderr) + logger.error(f"Failed to load metadata: {e}") if self.verbose: - print(f"Model metadata: {self.metadata}", file=sys.stderr) + logger.info(f"Model metadata: {self.metadata}") eos_token_id = self.token_eos() bos_token_id = self.token_bos() @@ -443,7 +452,7 @@ def __init__( template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"] if self.verbose and template_choices: - print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr) + logger.info(f"Available chat formats from metadata: {', '.join(template_choices.keys())}") for name, template in template_choices.items(): self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter( @@ -465,20 +474,20 @@ def __init__( if chat_format is not None: self.chat_format = chat_format if self.verbose: - print(f"Guessed chat format: {chat_format}", file=sys.stderr) + logger.info(f"Guessed chat format: {chat_format}") else: if self.verbose: - print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr) - print(f"Using chat eos_token: {eos_token}", file=sys.stderr) - print(f"Using chat bos_token: {bos_token}", file=sys.stderr) + logger.info(f"Using gguf chat template: {template_choices['chat_template.default']}") + logger.info(f"Using chat eos_token: {eos_token}") + logger.info(f"Using chat bos_token: {bos_token}") self.chat_format = "chat_template.default" if self.chat_format is None and self.chat_handler is None: self.chat_format = "llama-2" if self.verbose: - print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr) - + logger.info(f"Using fallback chat format: {self.chat_format}") + @property def ctx(self) -> llama_cpp.llama_context_p: assert self._ctx.ctx is not None @@ -719,7 +728,8 @@ def generate( break if longest_prefix > 0: if 
self.verbose: - print("Llama.generate: prefix-match hit", file=sys.stderr) + # print("Llama.generate: prefix-match hit", file=sys.stderr) + logger.info("Llama.generate: prefix-match hit") reset = False tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix @@ -945,7 +955,7 @@ def decode_batch(seq_sizes: List[int]): return output, total_tokens else: return output - + def _create_completion( self, prompt: Union[str, List[int]], @@ -973,11 +983,23 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + ai_service: Optional[str] = None ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str + # Variables required for metric collection + _metrics_dict = {} + _ttft_start = time.time() + _pid = os.getpid() + _tpot_metrics = [] + _labels = { + "service": ai_service if ai_service is not None else "not-specified", + "request_type": "chat/completions", + } + # Get CPU usage before generating completion so it can be used to calculate CPU when called after completing the process + _ = get_cpu_usage(_pid) completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) @@ -1129,23 +1151,26 @@ def logit_bias_processor( finish_reason = "length" multibyte_fix = 0 - for token in self.generate( - prompt_tokens, - top_k=top_k, - top_p=top_p, - min_p=min_p, - typical_p=typical_p, - temp=temperature, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, - grammar=grammar, + _tpot_start = time.time() + for idx, token in enumerate( + self.generate( + prompt_tokens, + top_k=top_k, + top_p=top_p, + 
min_p=min_p, + typical_p=typical_p, + temp=temperature, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, + grammar=grammar, + ) ): assert self._model.model is not None if llama_cpp.llama_token_is_eog(self._model.model, token): @@ -1302,6 +1327,14 @@ def logit_bias_processor( finish_reason = "length" break + # Record TTFT metric (once) + if idx == 0: + _metrics_dict["time_to_first_token"] = time.time() - _ttft_start + # Record TPOT metric + else: + _tpot_metrics.append(time.time() - _tpot_start) + _tpot_start = time.time() # reset + if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): @@ -1322,6 +1355,14 @@ def logit_bias_processor( token_end_position = 0 for token in remaining_tokens: + # Record TTFT metric (once) + if idx == 0: + _metrics_dict["time_to_first_token"] = time.time() - _ttft_start + # Record TPOT metric + else: + _tpot_metrics.append(time.time() - _tpot_start) + _tpot_start = time.time() # reset + token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens])) logprobs_or_none: Optional[CompletionLogprobs] = None @@ -1415,6 +1456,53 @@ def logit_bias_processor( print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() print("Llama._create_completion: cache saved", file=sys.stderr) + + ## PROMETHEUS METRICS IN STREAMING MODE ## + # Record TTFT metric -- Setting to None if no tokens were generated + if not _metrics_dict.get("time_to_first_token"): + _metrics_dict["time_to_first_token"] = None + + # Record TPOT metrics (per generated token) + _metrics_dict["time_per_output_token"] = _tpot_metrics + + # Record metrics from the C++ backend (converted to 
seconds) + _timings = llama_cpp.llama_get_timings(self._ctx.ctx) + _metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2) + _metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2) + _metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0 + _metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2) + _metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0 + _metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2) + _metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0 + _metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2) + + # Record prefill and generation token metrics + _metrics_dict["prefill_tokens"] = len(prompt_tokens) + _metrics_dict["generation_tokens"] = len(completion_tokens) + + # Record system info + _gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info() + _metrics_dict["cpu_utilization"] = get_cpu_usage(_pid) # TODO: Returning always 0.0 -> check + _metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid) + _metrics_dict["gpu_utilization"] = _gpu_utilization + _metrics_dict["gpu_ram_usage"] = _gpu_memory_used + _metrics_dict["gpu_ram_free"] = _gpu_memory_free + _metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid) + _metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx) + _metrics_dict["kv_cache_usage_ratio"] = round(1. 
* llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2) + _metrics_dict["system_info"] = { + "model": model_name, + "n_params": str(llama_cpp.llama_model_n_params(self.model)), + "n_embd": str(self.n_embd()), + "n_ctx": str(self.n_ctx()), + "n_vocab": str(self.n_vocab()), + "n_threads": str(self.n_threads) + } + + # Log metrics to Prometheus + _all_metrics = RequestMetrics(**_metrics_dict) + self.metrics.log_request_metrics(_all_metrics, labels=_labels) + return if self.cache: @@ -1489,12 +1577,59 @@ def logit_bias_processor( "token_logprobs": token_logprobs, "top_logprobs": top_logprobs, } + + ## PROMETHEUS METRICS IN CHAT COMPLETION MODE ## + # Record TTFT metric -- Setting to None if no tokens were generated + if not _metrics_dict.get("time_to_first_token"): + _metrics_dict["time_to_first_token"] = None + + # Record TPOT metrics (per generated token) + _metrics_dict["time_per_output_token"] = _tpot_metrics + + # Record metrics from the C++ backend (converted to seconds) + _timings = llama_cpp.llama_get_timings(self._ctx.ctx) + _metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2) + _metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2) + _metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0 + _metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2) + _metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0 + _metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2) + _metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0 + _metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2) + + # Record prefill and generation token metrics + _metrics_dict["prefill_tokens"] = len(prompt_tokens) + _metrics_dict["generation_tokens"] 
= len(completion_tokens) + + # Record system info + _gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info() + _metrics_dict["cpu_utilization"] = get_cpu_usage(_pid) # TODO: Returning always 0.0 -> check + _metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid) + _metrics_dict["gpu_utilization"] = _gpu_utilization + _metrics_dict["gpu_ram_usage"] = _gpu_memory_used + _metrics_dict["gpu_ram_free"] = _gpu_memory_free + _metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid) + _metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx) + _metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2) + _metrics_dict["system_info"] = { + "model": model_name, + "n_params": str(llama_cpp.llama_model_n_params(self.model)), + "n_embd": str(self.n_embd()), + "n_ctx": str(self.n_ctx()), + "n_vocab": str(self.n_vocab()), + "n_threads": str(self.n_threads) + } + + # Log metrics to Prometheus + _all_metrics = RequestMetrics(**_metrics_dict) + self.metrics.log_request_metrics(_all_metrics, labels=_labels) yield { "id": completion_id, "object": "text_completion", "created": created, "model": model_name, + "service": ai_service, "choices": [ { "text": text_str, @@ -1537,6 +1672,7 @@ def create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + ai_service: Optional[str] = None ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. 
@@ -1600,6 +1736,7 @@ def create_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + ai_service=ai_service ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -1634,6 +1771,7 @@ def __call__( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + ai_service: Optional[str] = None ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1697,6 +1835,7 @@ def __call__( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + ai_service=ai_service ) def create_chat_completion( @@ -1729,6 +1868,7 @@ def create_chat_completion( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + ai_service: Optional[str] = None ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -1798,6 +1938,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + ai_service=ai_service ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 975f74762d..60bef9e842 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -88,6 +88,7 @@ def __call__( grammar: Optional[llama.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + ai_service: Optional[str] = None, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -537,6 +538,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + ai_service: Optional[str] = None, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -627,6 +629,7 @@ def chat_completion_handler( 
stopping_criteria=stopping_criteria, grammar=grammar, logit_bias=logit_bias, + ai_service=ai_service ) if tool is not None: tool_name = tool["function"]["name"] @@ -1714,6 +1717,7 @@ def functionary_v1_v2_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + ai_service: Optional[str] = None, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" @@ -1930,6 +1934,7 @@ def prepare_messages_for_inference( model=model, logits_processor=logits_processor, grammar=grammar, + ai_service=ai_service ) if stream is False: completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip() diff --git a/llama_cpp/llama_metrics.py b/llama_cpp/llama_metrics.py new file mode 100644 index 0000000000..a7ffc7a094 --- /dev/null +++ b/llama_cpp/llama_metrics.py @@ -0,0 +1,425 @@ +from dataclasses import dataclass +from typing import Any, Optional, Dict, List + +from prometheus_client import Gauge, Info, Histogram + + +LABELS = ["request_type", "service"] + + +@dataclass +class RequestMetrics: + """ + A dataclass to store metrics for a given request. 
+ """ + + # System metrics + system_info: Dict[str, Any] + state_size: int + cpu_utilization: float + cpu_ram_pid: float + gpu_utilization: float + gpu_ram_usage: float + gpu_ram_free: float + gpu_ram_pid: float + + # Metrics from the C++ backend + load_time: float + sample_time: float + sample_throughput: float + time_to_first_token: float + time_per_output_token: List[float] + prompt_eval_time: float + prompt_eval_throughput: float + completion_eval_time: float + completion_eval_throughput: float + end_to_end_latency: float + prefill_tokens: int + generation_tokens: int + kv_cache_usage_ratio: float + + +@dataclass +class QueueMetrics: + """ + A dataclass to store metrics for the task queue. + """ + + running_tasks_count: int + running_tasks: dict + + +class MetricsExporter: + """ + A custom Prometheus Metrics Explorer for the LLAMA C++ backend. + Collects metrics per request sent to the backend. + """ + + def __init__(self): + self.labels = LABELS + # One-time metrics + self._histogram_load_time = Histogram( + name="llama_cpp_python:load_t_seconds", + documentation="Histogram of load time in seconds", + labelnames=self.labels, + buckets=[ + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.0, + 3.0, + 4.0, + 5.0, + 6.0, + 7.0, + 8.0, + 9.0, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + ], + ) + # Request-level latencies + self._histogram_sample_time = Histogram( + name="llama_cpp_python:sample_t_seconds", + documentation="Histogram of token sampling time in seconds", + labelnames=self.labels, + buckets=[ + 0.00001, + 0.00005, + 0.0001, + 0.00025, + 0.0005, + 0.001, + 0.0025, + 0.005, + 0.0075, + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.25, + 0.5, + ], + ) + self._histogram_time_to_first_token = Histogram( + name="llama_cpp_python:ttft_seconds", + documentation="Histogram of time to first token in seconds", + labelnames=self.labels, + buckets=[ + 0.001, + 0.005, + 0.01, + 0.02, + 0.04, + 0.06, + 0.08, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 
12.5, + 15.0, + 20.0, + 25.0, + 30.0, + ], + ) + self._histogram_time_per_output_token = Histogram( + name="llama_cpp_python:tpot_seconds", + documentation="Histogram of time per output token in seconds", + labelnames=self.labels, + buckets=[ + 0.001, + 0.005, + 0.01, + 0.02, + 0.04, + 0.06, + 0.08, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + ], + ) + self._histogram_prompt_eval_time = Histogram( + name="llama_cpp_python:p_eval_t_seconds", + documentation="Histogram of prompt evaluation time in seconds", + labelnames=self.labels, + buckets=[ + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + 40.0, + 50.0, + 60.0, + ], + ) + self._histogram_completion_eval_time = Histogram( + name="llama_cpp_python:c_eval_t_seconds", + documentation="Histogram of completion evaluation time in seconds", + labelnames=self.labels, + buckets=[ + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + 40.0, + 50.0, + 60.0, + ], + ) + self._histogram_e2e_request_latency = Histogram( + name="llama_cpp_python:e2e_seconds", + documentation="Histogram of end-to-end request latency in seconds", + labelnames=self.labels, + buckets=[ + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + 40.0, + 50.0, + 60.0, + ], + ) + # Prefill and generation tokens + self._histogram_prefill_tokens = Histogram( + name="llama_cpp_python:prefill_tokens_total", + documentation="Histogram of number of prefill tokens processed", + labelnames=self.labels, + buckets=[ + 1, + 10, + 25, + 50, + 100, + 250, + 500, + 750, + 1000, + 1500, + 2000, + 2500, + 3000, + 3500, + 4000, + 4500, + 5000, + ], + ) + self._histogram_generation_tokens = Histogram( + name="llama_cpp_python:completion_tokens_total", + documentation="Histogram of number of generation tokens processed", + labelnames=self.labels, + 
buckets=[ + 1, + 10, + 25, + 50, + 100, + 250, + 500, + 750, + 1000, + 1500, + 2000, + 2500, + 3000, + 3500, + 4000, + 4500, + 5000, + ], + ) + # Current throughput + self._gauge_prompt_eval_throughput = Gauge( + name="llama_cpp_python:prompt_eval_throughput", + documentation="Current throughput of the prompt evaluation process (in tokens/second)", + labelnames=self.labels, + ) + self._gauge_completion_eval_throughput = Gauge( + name="llama_cpp_python:completion_eval_throughput", + documentation="Current throughput of the completion evaluation process (in tokens/second)", + labelnames=self.labels, + ) + self._gauge_sample_throughput = Gauge( + name="llama_cpp_python:sample_throughput", + documentation="Current throughput of the token sampling process (in tokens/second)", + labelnames=self.labels, + ) + # System info + self._gauge_state_size = Gauge( + name="llama_cpp_python:state_size", + documentation="Current state size in bytes of various components such as rng (random number generator), logits, embedding, and kv_cache (key-value cache)", + labelnames=self.labels, + ) + self._gauge_cpu_utilization = Gauge( + name="llama_cpp_python:cpu_utilization", + documentation="Current CPU utilization", + labelnames=self.labels, + ) + self._gauge_cpu_ram_usage_by_pid = Gauge( + name="llama_cpp_python:cpu_memory_usage_by_pid", + documentation="Current CPU memory usage during the request", + labelnames=self.labels, + ) + self._gauge_gpu_utilization = Gauge( + name="llama_cpp_python:gpu_utilization", + documentation="Current GPU utilization", + labelnames=self.labels, + ) + self._gauge_gpu_memory_usage = Gauge( + name="llama_cpp_python:gpu_memory_usage", + documentation="Current GPU memory usage", + labelnames=self.labels, + ) + self._gauge_gpu_memory_free = Gauge( + name="llama_cpp_python:gpu_memory_free", + documentation="Current free GPU memory", + labelnames=self.labels, + ) + self._gauge_gpu_memory_usage_by_pid = Gauge( + name="llama_cpp_python:gpu_memory_usage_by_pid", + 
documentation="Current GPU memory usage during the request", + labelnames=self.labels, + ) + self._gauge_kv_cache_usage_ratio = Gauge( + name="llama_cpp_python:kv_cache_usage_ratio", + documentation="KV-cache usage. 1 means 100 percent usage", + labelnames=self.labels, + ) + self._gauge_running_tasks = Histogram( + name="llama_cpp_python:running_tasks", + documentation="Number of running tasks in the task queue", + labelnames=self.labels, + buckets=[ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 12, + 14, + 16, + 18, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + ], + ) + + # Server metadata + self._info = Info(name="llama_cpp_python:info", documentation="Server metadata") + + def log_request_metrics(self, metrics: RequestMetrics, labels: Dict[str, str]): + """ + Log the metrics using the Prometheus client. + """ + self._histogram_load_time.labels(**labels).observe(metrics.load_time) + self._histogram_sample_time.labels(**labels).observe(metrics.sample_time) + if metrics.time_to_first_token: + self._histogram_time_to_first_token.labels(**labels).observe( + metrics.time_to_first_token + ) + for _tpot in metrics.time_per_output_token: + self._histogram_time_per_output_token.labels(**labels).observe(_tpot) + self._histogram_prompt_eval_time.labels(**labels).observe( + metrics.prompt_eval_time + ) + self._histogram_completion_eval_time.labels(**labels).observe( + metrics.completion_eval_time + ) + self._histogram_e2e_request_latency.labels(**labels).observe( + metrics.end_to_end_latency + ) + self._histogram_prefill_tokens.labels(**labels).observe(metrics.prefill_tokens) + self._histogram_generation_tokens.labels(**labels).observe( + metrics.generation_tokens + ) + self._gauge_prompt_eval_throughput.labels(**labels).set( + metrics.prompt_eval_throughput + ) + self._gauge_completion_eval_throughput.labels(**labels).set( + metrics.completion_eval_throughput + ) + self._gauge_sample_throughput.labels(**labels).set(metrics.sample_throughput) + 
self._gauge_cpu_utilization.labels(**labels).set(metrics.cpu_utilization) + self._gauge_cpu_ram_usage_by_pid.labels(**labels).set(metrics.cpu_ram_pid) + self._gauge_gpu_utilization.labels(**labels).set(metrics.gpu_utilization) + self._gauge_gpu_memory_usage.labels(**labels).set(metrics.gpu_ram_usage) + self._gauge_gpu_memory_free.labels(**labels).set(metrics.gpu_ram_free) + self._gauge_gpu_memory_usage_by_pid.labels(**labels).set(metrics.gpu_ram_pid) + self._gauge_state_size.labels(**labels).set(metrics.state_size) + self._gauge_kv_cache_usage_ratio.labels(**labels).set( + metrics.kv_cache_usage_ratio + ) + self._info.info(metrics.system_info) + + def log_queue_metrics(self, metrics: QueueMetrics, labels: Dict[str, str]): + """ + Log the metrics for the task queue. + """ + self._gauge_running_tasks.labels(**labels).observe(metrics.running_tasks_count) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index a6f1f4e9ca..6ced7bc106 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -37,6 +37,7 @@ ConfigFileSettings, ) from llama_cpp.server.cli import add_args_from_model, parse_model_from_args +from llama_cpp._logger import logger, UVICORN_LOGGING_CONFIG def main(): @@ -75,7 +76,7 @@ def main(): server_settings = parse_model_from_args(ServerSettings, args) model_settings = [parse_model_from_args(ModelSettings, args)] except Exception as e: - print(e, file=sys.stderr) + logger.error(e) parser.print_help() sys.exit(1) assert server_settings is not None @@ -90,6 +91,7 @@ def main(): port=int(os.getenv("PORT", server_settings.port)), ssl_keyfile=server_settings.ssl_keyfile, ssl_certfile=server_settings.ssl_certfile, + log_config=UVICORN_LOGGING_CONFIG ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 8fd41a3fc7..ed90fa155e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,5 +1,7 @@ from __future__ import annotations +from contextlib import asynccontextmanager + import os import 
json import typing @@ -9,12 +11,15 @@ from functools import partial from typing import Iterator, List, Optional, Union, Dict +from prometheus_client import make_asgi_app + import llama_cpp import anyio from anyio.streams.memory import MemoryObjectSendStream from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body +from fastapi.responses import JSONResponse from fastapi.middleware import Middleware from fastapi.middleware.cors import CORSMiddleware from fastapi.security import HTTPBearer @@ -41,8 +46,12 @@ TokenizeInputCountResponse, DetokenizeInputRequest, DetokenizeInputResponse, + HealthMetrics ) from llama_cpp.server.errors import RouteErrorHandler +from llama_cpp._utils import monitor_task_queue +from llama_cpp.llama_metrics import MetricsExporter +from llama_cpp._logger import logger router = APIRouter(route_class=RouteErrorHandler) @@ -97,6 +106,29 @@ def set_ping_message_factory(factory: typing.Callable[[], bytes]): _ping_message_factory = factory +def set_metrics_exporter(): + global metrics_exporter + try: + metrics_exporter + except NameError: + metrics_exporter = MetricsExporter() + + return metrics_exporter + +task_queue_status = {} + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """ + A context manager that launches tasks to be run during the application's lifespan. 
+ """ + metrics_exporter = set_metrics_exporter() + + await monitor_task_queue(task_queue_status, metrics_exporter) + yield + + def create_app( settings: Settings | None = None, server_settings: ServerSettings | None = None, @@ -136,6 +168,7 @@ def create_app( title="🦙 llama.cpp Python API", version=llama_cpp.__version__, root_path=server_settings.root_path, + lifespan=lifespan ) app.add_middleware( CORSMiddleware, @@ -149,6 +182,11 @@ def create_app( assert model_settings is not None set_llama_proxy(model_settings=model_settings) + # Add prometheus asgi middleware to route /metrics requests + # see: https://prometheus.github.io/client_python/exporting/http/fastapi-gunicorn/ + metrics_app = make_asgi_app() + app.mount("/metrics", metrics_app) + if server_settings.disable_ping_events: set_ping_message_factory(lambda: bytes()) @@ -174,9 +212,9 @@ async def get_event_publisher( raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: - print("disconnected") + logger.warning(f"Disconnected from client {request.client}") with anyio.move_on_after(1, shield=True): - print(f"Disconnected from client (via refresh/close) {request.client}") + logger.error(f"Disconnected from client (via refresh/close) {request.client}") raise e finally: if on_complete: @@ -222,6 +260,32 @@ async def authenticate( openai_v1_tag = "OpenAI V1" +@router.get( + "/v1/health", + response_model=HealthMetrics, + summary="Server's health", +) +async def check_health(): + # 3 running tasks + new scheduled request + if 0 <= task_queue_status.get("running_tasks_count", 0) <= 4: + return JSONResponse( + content={"status": "OK", "task_queue_status": task_queue_status} + ) + # 2 - 6 scheduled requests + elif 4 < task_queue_status.get("running_tasks_count", 0) < 10: + return JSONResponse( + content={"status": "Warning", "task_queue_status": task_queue_status} + ) + # 7+ scheduled requests + # TODO: Evaluate if in this case we should 
manually stop the execution of certain tasks to clear the queue + elif task_queue_status.get("running_tasks_count", 0) >= 10: + return JSONResponse( + content={"status": "Critical", "task_queue_status": task_queue_status} + ) + else: + pass + + @router.post( "/v1/completions", summary="Completion", @@ -283,7 +347,6 @@ async def create_completion( if request.url.path != "/v1/engines/copilot-codex/completions" else "copilot-codex" ) - exclude = { "n", "best_of", @@ -360,7 +423,6 @@ async def create_embedding( **request.model_dump(exclude={"user"}), ) - @router.post( "/v1/chat/completions", summary="Chat", @@ -405,6 +467,7 @@ async def create_chat_completion( {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}, ], + "ai_service": "copilot" }, }, "json_mode": { @@ -485,7 +548,14 @@ async def create_chat_completion( "user", "min_tokens", } + + # Extract relevant kwargs from the request body kwargs = body.model_dump(exclude=exclude) + + # Adds the ai_service value from the request body to the kwargs + # to be passed downstream to the llama_cpp.ChatCompletion object + kwargs["ai_service"] = body.ai_service + llama = llama_proxy(body.model) if body.logit_bias is not None: kwargs["logit_bias"] = ( @@ -506,10 +576,13 @@ async def create_chat_completion( else: kwargs["logits_processor"].extend(_min_tokens_logits_processor) + # Set the metrics exporter for the llama object + llama.metrics = set_metrics_exporter() + iterator_or_completion: Union[ llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) - + if isinstance(iterator_or_completion, Iterator): # EAFP: It's easier to ask for forgiveness than permission first_response = await run_in_threadpool(next, iterator_or_completion) diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py index fbf9fd80d5..826e7ed945 100644 --- a/llama_cpp/server/errors.py +++ 
b/llama_cpp/server/errors.py @@ -21,6 +21,7 @@ CreateEmbeddingRequest, CreateChatCompletionRequest, ) +from llama_cpp._logger import logger class ErrorResponse(TypedDict): @@ -134,7 +135,7 @@ def error_message_wrapper( ] = None, ) -> Tuple[int, ErrorResponse]: """Wraps error message in OpenAI style error response""" - print(f"Exception: {str(error)}", file=sys.stderr) + logger.error(f"Exception: {str(error)}") traceback.print_exc(file=sys.stderr) if body is not None and isinstance( body, diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index ad39c1004b..c35fd37dde 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -9,6 +9,7 @@ import llama_cpp.llama_tokenizer as llama_tokenizer from llama_cpp.server.settings import ModelSettings +from llama_cpp._logger import logger class LlamaProxy: @@ -272,11 +273,11 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: if settings.cache: if settings.cache_type == "disk": if settings.verbose: - print(f"Using disk cache with size {settings.cache_size}") + logger.info(f"Using disk cache with size {settings.cache_size}") cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) else: if settings.verbose: - print(f"Using ram cache with size {settings.cache_size}") + logger.info(f"Using ram cache with size {settings.cache_size}") cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) _model.set_cache(cache) return _model diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index a75f9e55b9..9c32fe568e 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import List, Optional, Union, Dict +from typing import List, Optional, Union, Dict, Any from typing_extensions import TypedDict, Literal from pydantic import BaseModel, Field @@ -267,6 +267,9 @@ class CreateChatCompletionRequest(BaseModel): } } + # AI service added as request body parameter by 
Client + ai_service: Optional[str] = None + class ModelData(TypedDict): id: str @@ -314,3 +317,7 @@ class DetokenizeInputResponse(BaseModel): model_config = { "json_schema_extra": {"example": {"text": "How many tokens in this query?"}} } + +class HealthMetrics(BaseModel): + model_config = {"arbitrary_types_allowed": True} + task_queue_status: Dict[str, Any] \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 8345cb1f09..4b0246623f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,8 @@ server = [ "sse-starlette>=1.6.1", "starlette-context>=0.3.6,<0.4", "PyYAML>=5.1", + "prometheus_client>=0.20.0", + "psutil>=5.9.8" ] test = [ "pytest>=7.4.0", diff --git a/tests/test_llama.py b/tests/test_llama.py index 469ef91cab..49150c9f2b 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -5,9 +5,18 @@ from scipy.special import log_softmax import llama_cpp +from llama_cpp.llama_metrics import MetricsExporter MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf" +def set_test_metrics_exporter(): + global metrics_exporter + try: + metrics_exporter + except NameError: + metrics_exporter = MetricsExporter() + + return metrics_exporter def test_llama_cpp_tokenization(): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False) @@ -153,7 +162,11 @@ def mock_kv_cache_seq_add( def test_llama_patch(mock_llama): n_ctx = 128 + ai_service_completion = "test-label-suggestions" + ai_service_streaming = "test-acceptance-criteria" llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx) + llama.metrics = set_test_metrics_exporter() + n_vocab = llama_cpp.llama_n_vocab(llama._model.model) assert n_vocab == 32000 @@ -163,32 +176,32 @@ def test_llama_patch(mock_llama): ## Test basic completion from bos until eos mock_llama(llama, all_text) - completion = llama.create_completion("", max_tokens=36) + completion = llama.create_completion("", max_tokens=36, ai_service=ai_service_completion) assert 
completion["choices"][0]["text"] == all_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test basic completion until eos mock_llama(llama, all_text) - completion = llama.create_completion(text, max_tokens=20) + completion = llama.create_completion(text, max_tokens=20, ai_service=ai_service_completion) assert completion["choices"][0]["text"] == output_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until eos mock_llama(llama, all_text) - chunks = list(llama.create_completion(text, max_tokens=20, stream=True)) + chunks = list(llama.create_completion(text, max_tokens=20, stream=True, ai_service=ai_service_streaming)) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text assert chunks[-1]["choices"][0]["finish_reason"] == "stop" ## Test basic completion until stop sequence mock_llama(llama, all_text) - completion = llama.create_completion(text, max_tokens=20, stop=["lazy"]) + completion = llama.create_completion(text, max_tokens=20, stop=["lazy"], ai_service=ai_service_completion) assert completion["choices"][0]["text"] == " jumps over the " assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until stop sequence mock_llama(llama, all_text) chunks = list( - llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) + llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"], ai_service=ai_service_streaming) ) assert ( "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the " @@ -197,13 +210,13 @@ def test_llama_patch(mock_llama): ## Test basic completion until length mock_llama(llama, all_text) - completion = llama.create_completion(text, max_tokens=2) + completion = llama.create_completion(text, max_tokens=2, ai_service=ai_service_completion) assert completion["choices"][0]["text"] == " jumps" assert completion["choices"][0]["finish_reason"] == "length" ## Test streaming completion until length 
mock_llama(llama, all_text) - chunks = list(llama.create_completion(text, max_tokens=2, stream=True)) + chunks = list(llama.create_completion(text, max_tokens=2, stream=True, ai_service=ai_service_streaming)) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps" assert chunks[-1]["choices"][0]["finish_reason"] == "length" @@ -228,17 +241,19 @@ def test_llama_pickle(): def test_utf8(mock_llama): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, logits_all=True) + llama.metrics = set_test_metrics_exporter() output_text = "😀" + ai_service = "label-suggestions" ## Test basic completion with utf8 multibyte mock_llama(llama, output_text) - completion = llama.create_completion("", max_tokens=4) + completion = llama.create_completion("", max_tokens=4, ai_service=ai_service) assert completion["choices"][0]["text"] == output_text ## Test basic completion with incomplete utf8 multibyte mock_llama(llama, output_text) - completion = llama.create_completion("", max_tokens=1) + completion = llama.create_completion("", max_tokens=1, ai_service=ai_service) assert completion["choices"][0]["text"] == "" @@ -266,6 +281,22 @@ def test_llama_server(): } +def test_metrics_endpoint(): + from fastapi.testclient import TestClient + from llama_cpp.server.app import create_app, Settings + + settings = Settings( + model=MODEL, + vocab_only=True, + ) + app = create_app(settings) + client = TestClient(app) + response = client.get("/metrics") + assert response.status_code == 200 + assert "test-label-suggestions" in response.text + assert "test-acceptance-criteria" in response.text + + @pytest.mark.parametrize( "size_and_axis", [