diff --git a/.dockerignore b/.dockerignore
index fd64c09b37..1b85f8c9a3 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -2,6 +2,7 @@ _skbuild/
 
 .envrc
 
+# LLMs - comment out the line below if you'd like to bake the model into the image
 models/
 
 # Byte-compiled / optimized / DLL files
diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
new file mode 100644
index 0000000000..da84dda5c8
--- /dev/null
+++ b/INSTRUCTIONS.md
@@ -0,0 +1,35 @@
+## Syncing with upstream repo
+
+See [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork#syncing-a-fork-branch-from-the-web-ui) for more details. 
+
+1. On the GitHub UI, create a new branch `repo-sync`, if the branch doesn't exist already.
+
+2. Click on the "Sync fork" button and then click on the "Update branch" button. This will import all the commits from the upstream repo.
+
+3. Create a local branch `repo-sync` and pull the contents from the remote `repo-sync` branch.
+
+4. Resolve any merge conflicts if they arise. Otherwise, proceed to the next step.
+
+5. Update all the git submodules:
+
+```
+git submodule update --recursive
+```
+
+6. Since changes have probably been made to the vendor libraries (`llama_cpp`, `kompute`), we need to recompile the `llama_cpp` package. Navigate to the `vendor/llama.cpp` folder and clean the build cache:
+
+```
+make clean
+```
+7. Navigate back to the root directory and type the following to recompile the `llama_cpp` package and build the dependencies again:
+
+```
+make deps && make build
+```
+8. Launch the `llama_cpp_python` server using the following command:
+```
+python -m llama_cpp.server --model $MODEL --n_gpu_layers -1
+```
+NOTE: Modify the launch arguments as needed. Make sure the `MODEL` environment variable is set to the absolute path of a `.gguf` model file.
+
+9. If the server launches without issues, then you can proceed to create a PR with the latest changes.
\ No newline at end of file
diff --git a/dev.Dockerfile b/dev.Dockerfile
new file mode 100644
index 0000000000..24f5be7270
--- /dev/null
+++ b/dev.Dockerfile
@@ -0,0 +1,44 @@
+# Define the image argument and provide a default value
+ARG IMAGE=python:3.11.8
+
+# Use the image as specified
+FROM ${IMAGE}
+
+# Re-declare the ARG after FROM (an ARG declared before FROM is not visible to later instructions)
+ARG IMAGE
+
+# Upgrade the base packages and install the build toolchain; drop the apt lists to keep the layer small
+RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    ninja-build \
+    libopenblas-dev \
+    build-essential git \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN mkdir /app
+WORKDIR /app
+COPY . /app
+
+RUN python3 -m pip install --upgrade pip
+
+RUN make deps && make build && make clean
+
+# Runtime configuration (MODEL is consumed by docker/simple/run.sh)
+# NOTE: GH_TOKEN is deliberately not exported via ENV: no ARG declares it, and ENV values persist in the image
+ENV HOST=0.0.0.0
+ENV PORT=8000
+ENV MODEL=/app/models/mistral-7b-openorca.Q5_K_M.gguf
+
+# # Install dependencies
+# RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context psutil prometheus_client
+
+# # Install llama-cpp-python (build with METAL)
+# RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install git+https://${GH_TOKEN}@github.com/ZenHubHQ/llama-cpp-python.git --force-reinstall --upgrade --no-cache-dir --verbose
+
+# Expose a port for the server
+EXPOSE 8000
+
+# Run the server start script
+CMD ["/bin/sh", "/app/docker/simple/run.sh"]
+# CMD python3 -m llama_cpp.server --n_gpu_layers -1
diff --git a/dev.docker-compose b/dev.docker-compose
new file mode 100644
index 0000000000..7b21e468a2
--- /dev/null
+++ b/dev.docker-compose
@@ -0,0 +1,15 @@
+version: '3'
+services:
+    dev-llama-cpp-python:
+        build:
+            context: .
+            dockerfile: dev.Dockerfile
+        ports: 
+            - 8000:8000  # host:container; 8000 is the port exposed by dev.Dockerfile
+        volumes:
+            - ./llama_cpp:/app/llama_cpp  # bind-mount the local package over the copy made by `COPY . /app`
+        networks:
+            - zh-service-network
+networks:
+    zh-service-network:
+        external: true  # not created by this file; the network must already exist
\ No newline at end of file
diff --git a/docker/simple/run.sh b/docker/simple/run.sh
index c85e73d2b6..d4fd489a0e 100644
--- a/docker/simple/run.sh
+++ b/docker/simple/run.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
 
 make build
-uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT
+# Dev alternative (auto-reload): uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT --reload
+exec python3 -m llama_cpp.server --model "$MODEL" --n_gpu_layers -1  # exec: server replaces the shell and gets stop signals
\ No newline at end of file
diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py
index 7638170a97..f3407c92bb 100644
--- a/llama_cpp/_logger.py
+++ b/llama_cpp/_logger.py
@@ -1,6 +1,7 @@
 import sys
 import ctypes
 import logging
+import logging.config
 
 import llama_cpp
 
@@ -17,8 +18,38 @@
     5: logging.DEBUG,
 }
 
-logger = logging.getLogger("llama-cpp-python")
+UVICORN_LOGGING_CONFIG = {  # dict in the logging.config.dictConfig schema, targeting uvicorn's loggers
+    "version": 1,
+    "disable_existing_loggers": False,  # keep loggers that were created before this config is applied
+    "formatters": {
+        "standard": {"format": "%(asctime)s [%(levelname)s] %(message)s"},
+    },
+    "handlers": {
+        "default": {
+            "level": "INFO",  # handler threshold: records below INFO are dropped even though the loggers allow DEBUG
+            "formatter": "standard",
+            "class": "logging.StreamHandler",
+            "stream": "ext://sys.stdout",  # Default is stderr
+        },
+    },
+    "loggers": {
+        "uvicorn.error": {
+            "level": "DEBUG",
+            "handlers": ["default"],
+        },
+        "uvicorn.access": {
+            "level": "DEBUG",
+            "handlers": ["default"],
+        },
+    },
+}
 
+# Set up the llama-cpp-python logger; reuse uvicorn's format string so both loggers emit identically shaped lines
+logger = logging.getLogger("llama-cpp-python")
+handler = logging.StreamHandler()  # no stream given, so records go to stderr
+formatter = logging.Formatter(UVICORN_LOGGING_CONFIG["formatters"]["standard"]["format"])
+handler.setFormatter(formatter)
+logger.addHandler(handler)
 
 @llama_cpp.llama_log_callback
 def llama_log_callback(
@@ -27,7 +58,13 @@ def llama_log_callback(
     user_data: ctypes.c_void_p,
 ):
     if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]:
-        print(text.decode("utf-8"), end="", flush=True, file=sys.stderr)
+        _text = text.decode("utf-8")
+        if _text.endswith("\n"):
+            _text = _text[:-1]
+        
+        # Skip if the message only contains "."
+        if not _text == ".":
+            logger.log(GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level], _text)
 
 
 llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0))
diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py
index 781b265010..14d0542fcc 100644
--- a/llama_cpp/_utils.py
+++ b/llama_cpp/_utils.py
@@ -1,7 +1,13 @@
 import os
 import sys
+import psutil
+import asyncio
+import subprocess
+
+from typing import Any, Dict, List, Tuple, Union
+
+from llama_cpp.llama_metrics import QueueMetrics, MetricsExporter
 
-from typing import Any, Dict
 
 # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
 outnull_file = open(os.devnull, "w")
@@ -10,6 +16,7 @@
 STDOUT_FILENO = 1
 STDERR_FILENO = 2
 
+
 class suppress_stdout_stderr(object):
     # NOTE: these must be "saved" here to avoid exceptions when using
     #       this context manager inside of a __del__ method
@@ -75,3 +82,117 @@ class Singleton(object, metaclass=MetaSingleton):
 
     def __init__(self):
         super(Singleton, self).__init__()
+
+
+# Get snapshot of RAM and GPU usage before and after function execution.
+# Adapted from: https://github.com/abetlen/llama-cpp-python/issues/223#issuecomment-1556203616
+def get_cpu_usage(pid) -> float:
+    """CPU usage (%) of process `pid` since the previous call for that pid (0.0 on the first call)."""
+    cache = get_cpu_usage.__dict__.setdefault("_processes", {})  # cpu_percent() is stateful per Process: reuse it
+    if pid not in cache:
+        cache[pid] = psutil.Process(pid)
+    return cache[pid].cpu_percent()
+
+
+def get_ram_usage(pid) -> float:
+    """
+    Resident set size (RSS) of the process `pid`, converted from bytes to MiB.
+    """
+    bytes_per_mib = 1024 * 1024
+    rss_bytes = psutil.Process(pid).memory_info().rss
+    resident_mib = rss_bytes / bytes_per_mib
+    return resident_mib
+
+
+def get_gpu_info_by_pid(pid) -> float:
+    """
+    GPU memory used by process `pid` as reported by `nvidia-smi`; 0.0 if unavailable or the pid is not listed.
+    """
+    try:
+        gpu_info = subprocess.check_output(
+            [
+                "nvidia-smi",
+                "--query-compute-apps=pid,used_memory",
+                "--format=csv,noheader",
+            ]
+        ).decode("utf-8")
+        gpu_info = gpu_info.strip().split("\n")
+        for info in gpu_info:
+            gpu_pid, gpu_ram_usage = info.split(", ")
+            if int(gpu_pid) == pid:
+                return float(gpu_ram_usage.split()[0])
+    except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
+        pass  # best effort; ValueError covers empty or unparsable output (e.g. no compute apps running)
+    return 0.0
+
+
+def get_gpu_general_info() -> Tuple[float, float, float]:
+    """
+    (utilization.gpu, memory.used, memory.free) of the first GPU listed by `nvidia-smi`; all 0.0 if unavailable.
+    """
+    try:
+        gpu_info = subprocess.check_output(
+            [
+                "nvidia-smi",
+                "--query-gpu=utilization.gpu,memory.used,memory.free",
+                "--format=csv,noheader",
+            ]
+        ).decode("utf-8")
+        gpu_utilization, gpu_memory_used, gpu_memory_free = (
+            gpu_info.strip().split("\n")[0].split(", ")
+        )
+        return tuple(
+            float(tup.split()[0])
+            for tup in [gpu_utilization, gpu_memory_used, gpu_memory_free]
+        )
+    except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
+        pass  # best effort; ValueError covers empty output or non-numeric fields
+    return 0.0, 0.0, 0.0
+
+
+async def monitor_task_queue(
+    status_dict: Dict[str, Union[int, float]], metrics_exporter: MetricsExporter
+):
+    """
+    Publish a snapshot of the event loop's unfinished asyncio tasks as queue metrics.
+
+    Writes `running_tasks_count` and `running_tasks` (task name -> coroutine repr)
+    into the shared `status_dict`, logs them through `metrics_exporter`, sleeps for
+    5 seconds and then schedules itself again as a new task, so monitoring repeats.
+    NOTE: There will always be 3 tasks running in the task queue:
+    - LifespanOn.main: Main application coroutine
+    - Server.serve: Server coroutine
+    - monitor_task_queue: Task queue monitoring coroutine
+    Any upcoming requests will be added to the task queue in the form of
+    another RequestResponseCycle.run_asgi coroutine.
+
+    Raises ValueError if `metrics_exporter` is not a MetricsExporter.
+    """
+    if not isinstance(metrics_exporter, MetricsExporter):
+        raise ValueError("metrics_exporter must be an instance of MetricsExporter")
+
+    # Use the public Task API (`done()`) instead of the private `_state` attribute
+    pending_tasks = [task for task in asyncio.all_tasks() if not task.done()]
+
+    # Count and basic metadata are derived from the same task list so they agree
+    status_dict["running_tasks_count"] = len(pending_tasks)
+    status_dict["running_tasks"] = {
+        task.get_name(): str(task.get_coro())
+        .lstrip("\u003C")
+        .rstrip("\u003E")
+        for task in pending_tasks
+    }
+
+    # Register current running tasks as a Prometheus metric
+    _labels = {
+        "service": "general",
+        "request_type": "health_check",
+    }
+    _queue_metrics = QueueMetrics(**status_dict)
+    metrics_exporter.log_queue_metrics(_queue_metrics, _labels)
+
+    await asyncio.sleep(5)  # adds a delay of 5 seconds to avoid overloading the CPU
+
+    asyncio.create_task(
+        monitor_task_queue(status_dict, metrics_exporter)
+    )  # pass status_dict to the next task
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 5f5396683c..476d75d54e 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -43,6 +43,15 @@
 import llama_cpp.llama_cpp as llama_cpp
 import llama_cpp.llama_chat_format as llama_chat_format
 
+from llama_cpp.llama_metrics import RequestMetrics
+
+from llama_cpp._utils import (
+    get_cpu_usage, 
+    get_ram_usage, 
+    get_gpu_info_by_pid,
+    get_gpu_general_info
+)
+
 from llama_cpp.llama_speculative import LlamaDraftModel
 
 import numpy as np
@@ -57,7 +66,7 @@
     _LlamaSamplingContext,  # type: ignore
     _normalize_embedding,  # type: ignore
 )
-from ._logger import set_verbose
+from ._logger import set_verbose, logger
 from ._utils import suppress_stdout_stderr
 
 
@@ -394,7 +403,7 @@ def __init__(
                 )
 
         if self.verbose:
-            print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
+            logger.info(f'System info: {llama_cpp.llama_print_system_info().decode("utf-8")}')
 
         self.chat_format = chat_format
         self.chat_handler = chat_handler
@@ -425,10 +434,10 @@ def __init__(
         except Exception as e:
             self.metadata = {}
             if self.verbose:
-                print(f"Failed to load metadata: {e}", file=sys.stderr)
+                logger.error(f"Failed to load metadata: {e}")
 
         if self.verbose:
-            print(f"Model metadata: {self.metadata}", file=sys.stderr)
+            logger.info(f"Model metadata: {self.metadata}")
 
         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()
@@ -443,7 +452,7 @@ def __init__(
             template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"]
 
         if self.verbose and template_choices:
-            print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr)
+            logger.info(f"Available chat formats from metadata: {', '.join(template_choices.keys())}")
 
         for name, template in template_choices.items():
             self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter(
@@ -465,20 +474,20 @@ def __init__(
             if chat_format is not None:
                 self.chat_format = chat_format
                 if self.verbose:
-                    print(f"Guessed chat format: {chat_format}", file=sys.stderr)
+                    logger.info(f"Guessed chat format: {chat_format}")
             else:
                 if self.verbose:
-                    print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr)
-                    print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
-                    print(f"Using chat bos_token: {bos_token}", file=sys.stderr)
+                    logger.info(f"Using gguf chat template: {template_choices['chat_template.default']}")
+                    logger.info(f"Using chat eos_token: {eos_token}")
+                    logger.info(f"Using chat bos_token: {bos_token}")
 
                 self.chat_format = "chat_template.default"
 
         if self.chat_format is None and self.chat_handler is None:
             self.chat_format = "llama-2"
             if self.verbose:
-                print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)
-
+                logger.info(f"Using fallback chat format: {self.chat_format}")
+                
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         assert self._ctx.ctx is not None
@@ -719,7 +728,8 @@ def generate(
                     break
             if longest_prefix > 0:
                 if self.verbose:
-                    print("Llama.generate: prefix-match hit", file=sys.stderr)
+                    # print("Llama.generate: prefix-match hit", file=sys.stderr)
+                    logger.info("Llama.generate: prefix-match hit")
                 reset = False
                 tokens = tokens[longest_prefix:]
                 self.n_tokens = longest_prefix
@@ -945,7 +955,7 @@ def decode_batch(seq_sizes: List[int]):
             return output, total_tokens
         else:
             return output
-
+        
     def _create_completion(
         self,
         prompt: Union[str, List[int]],
@@ -973,11 +983,23 @@ def _create_completion(
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
         logit_bias: Optional[Dict[str, float]] = None,
+        ai_service: Optional[str] = None
     ) -> Union[
         Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
     ]:
         assert self._ctx is not None
         assert suffix is None or suffix.__class__ is str
+        # Variables required for metric collection
+        _metrics_dict = {}
+        _ttft_start = time.time()
+        _pid = os.getpid()
+        _tpot_metrics = []
+        _labels = {
+            "service": ai_service if ai_service is not None else "not-specified",
+            "request_type": "chat/completions",
+        }
+        # Get CPU usage before generating completion so it can be used to calculate CPU when called after completing the process
+        _ = get_cpu_usage(_pid)
 
         completion_id: str = f"cmpl-{str(uuid.uuid4())}"
         created: int = int(time.time())
@@ -1129,23 +1151,26 @@ def logit_bias_processor(
 
         finish_reason = "length"
         multibyte_fix = 0
-        for token in self.generate(
-            prompt_tokens,
-            top_k=top_k,
-            top_p=top_p,
-            min_p=min_p,
-            typical_p=typical_p,
-            temp=temperature,
-            tfs_z=tfs_z,
-            mirostat_mode=mirostat_mode,
-            mirostat_tau=mirostat_tau,
-            mirostat_eta=mirostat_eta,
-            frequency_penalty=frequency_penalty,
-            presence_penalty=presence_penalty,
-            repeat_penalty=repeat_penalty,
-            stopping_criteria=stopping_criteria,
-            logits_processor=logits_processor,
-            grammar=grammar,
+        _tpot_start = time.time()
+        for idx, token in enumerate(
+            self.generate(
+                prompt_tokens,
+                top_k=top_k,
+                top_p=top_p,
+                min_p=min_p,
+                typical_p=typical_p,
+                temp=temperature,
+                tfs_z=tfs_z,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
+                frequency_penalty=frequency_penalty,
+                presence_penalty=presence_penalty,
+                repeat_penalty=repeat_penalty,
+                stopping_criteria=stopping_criteria,
+                logits_processor=logits_processor,
+                grammar=grammar,
+            )
         ):
             assert self._model.model is not None
             if llama_cpp.llama_token_is_eog(self._model.model, token):
@@ -1302,6 +1327,14 @@ def logit_bias_processor(
                 finish_reason = "length"
                 break
 
+            # Record TTFT metric (once)
+            if idx == 0:
+                _metrics_dict["time_to_first_token"] = time.time() - _ttft_start
+            # Record TPOT metric
+            else:
+                _tpot_metrics.append(time.time() - _tpot_start)
+            _tpot_start = time.time()  # reset
+
         if stopping_criteria is not None and stopping_criteria(
             self._input_ids, self._scores[-1, :]
         ):
@@ -1322,6 +1355,14 @@ def logit_bias_processor(
 
             token_end_position = 0
             for token in remaining_tokens:
+                # Record TTFT metric (once)
+                if idx == 0:
+                    _metrics_dict["time_to_first_token"] = time.time() - _ttft_start
+                # Record TPOT metric
+                else:
+                    _tpot_metrics.append(time.time() - _tpot_start)
+                _tpot_start = time.time()  # reset
+
                 token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens]))
 
                 logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1415,6 +1456,53 @@ def logit_bias_processor(
                     print("Llama._create_completion: cache save", file=sys.stderr)
                 self.cache[prompt_tokens + completion_tokens] = self.save_state()
                 print("Llama._create_completion: cache saved", file=sys.stderr)
+            
+            ## PROMETHEUS METRICS IN STREAMING MODE ##
+            # Record TTFT metric -- Setting to None if no tokens were generated
+            if not _metrics_dict.get("time_to_first_token"):
+                _metrics_dict["time_to_first_token"] = None
+
+            # Record TPOT metrics (per generated token)
+            _metrics_dict["time_per_output_token"] = _tpot_metrics
+
+            # Record metrics from the C++ backend (converted to seconds)
+            _timings = llama_cpp.llama_get_timings(self._ctx.ctx)
+            _metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2)
+            _metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2)
+            _metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0
+            _metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2)
+            _metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0
+            _metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2)
+            _metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0
+            _metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2)
+
+            # Record prefill and generation token metrics
+            _metrics_dict["prefill_tokens"] = len(prompt_tokens)
+            _metrics_dict["generation_tokens"] = len(completion_tokens)
+
+            # Record system info
+            _gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info()
+            _metrics_dict["cpu_utilization"] = get_cpu_usage(_pid)  # TODO: Returning always 0.0 -> check
+            _metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid)
+            _metrics_dict["gpu_utilization"] = _gpu_utilization
+            _metrics_dict["gpu_ram_usage"] = _gpu_memory_used
+            _metrics_dict["gpu_ram_free"] = _gpu_memory_free
+            _metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid)
+            _metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx)
+            _metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2)
+            _metrics_dict["system_info"] = {
+                "model": model_name,
+                "n_params": str(llama_cpp.llama_model_n_params(self.model)),
+                "n_embd": str(self.n_embd()),
+                "n_ctx": str(self.n_ctx()),
+                "n_vocab": str(self.n_vocab()),
+                "n_threads": str(self.n_threads)
+            } 
+
+            # Log metrics to Prometheus
+            _all_metrics = RequestMetrics(**_metrics_dict)
+            self.metrics.log_request_metrics(_all_metrics, labels=_labels)
+            
             return
 
         if self.cache:
@@ -1489,12 +1577,59 @@ def logit_bias_processor(
                 "token_logprobs": token_logprobs,
                 "top_logprobs": top_logprobs,
             }
+        
+        ## PROMETHEUS METRICS IN CHAT COMPLETION MODE ##
+        # Record TTFT metric -- Setting to None if no tokens were generated
+        if not _metrics_dict.get("time_to_first_token"):
+            _metrics_dict["time_to_first_token"] = None
+
+        # Record TPOT metrics (per generated token)
+        _metrics_dict["time_per_output_token"] = _tpot_metrics
+
+        # Record metrics from the C++ backend (converted to seconds)
+        _timings = llama_cpp.llama_get_timings(self._ctx.ctx)
+        _metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2)
+        _metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2)
+        _metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0
+        _metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2)
+        _metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0
+        _metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2)
+        _metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0
+        _metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2)
+
+        # Record prefill and generation token metrics
+        _metrics_dict["prefill_tokens"] = len(prompt_tokens)
+        _metrics_dict["generation_tokens"] = len(completion_tokens)
+
+        # Record system info
+        _gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info()
+        _metrics_dict["cpu_utilization"] = get_cpu_usage(_pid)  # TODO: Returning always 0.0 -> check
+        _metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid)
+        _metrics_dict["gpu_utilization"] = _gpu_utilization
+        _metrics_dict["gpu_ram_usage"] = _gpu_memory_used
+        _metrics_dict["gpu_ram_free"] = _gpu_memory_free
+        _metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid)
+        _metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx)
+        _metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2)
+        _metrics_dict["system_info"] = {
+            "model": model_name,
+            "n_params": str(llama_cpp.llama_model_n_params(self.model)),
+            "n_embd": str(self.n_embd()),
+            "n_ctx": str(self.n_ctx()),
+            "n_vocab": str(self.n_vocab()),
+            "n_threads": str(self.n_threads)
+        } 
+
+        # Log metrics to Prometheus
+        _all_metrics = RequestMetrics(**_metrics_dict)
+        self.metrics.log_request_metrics(_all_metrics, labels=_labels)
 
         yield {
             "id": completion_id,
             "object": "text_completion",
             "created": created,
             "model": model_name,
+            "service": ai_service,
             "choices": [
                 {
                     "text": text_str,
@@ -1537,6 +1672,7 @@ def create_completion(
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
         logit_bias: Optional[Dict[str, float]] = None,
+        ai_service: Optional[str] = None
     ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
         """Generate text from a prompt.
 
@@ -1600,6 +1736,7 @@ def create_completion(
             logits_processor=logits_processor,
             grammar=grammar,
             logit_bias=logit_bias,
+            ai_service=ai_service
         )
         if stream:
             chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
@@ -1634,6 +1771,7 @@ def __call__(
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
         logit_bias: Optional[Dict[str, float]] = None,
+        ai_service: Optional[str] = None
     ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
         """Generate text from a prompt.
 
@@ -1697,6 +1835,7 @@ def __call__(
             logits_processor=logits_processor,
             grammar=grammar,
             logit_bias=logit_bias,
+            ai_service=ai_service
         )
 
     def create_chat_completion(
@@ -1729,6 +1868,7 @@ def create_chat_completion(
         logit_bias: Optional[Dict[str, float]] = None,
         logprobs: Optional[bool] = None,
         top_logprobs: Optional[int] = None,
+        ai_service: Optional[str] = None
     ) -> Union[
         CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
     ]:
@@ -1798,6 +1938,7 @@ def create_chat_completion(
             logits_processor=logits_processor,
             grammar=grammar,
             logit_bias=logit_bias,
+            ai_service=ai_service
         )
 
     def create_chat_completion_openai_v1(
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 975f74762d..60bef9e842 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -88,6 +88,7 @@ def __call__(
         grammar: Optional[llama.LlamaGrammar] = None,
         logprobs: Optional[bool] = None,
         top_logprobs: Optional[int] = None,
+        ai_service: Optional[str] = None,
         **kwargs,  # type: ignore
     ) -> Union[
         llama_types.CreateChatCompletionResponse,
@@ -537,6 +538,7 @@ def chat_completion_handler(
         logit_bias: Optional[Dict[str, float]] = None,
         logprobs: Optional[bool] = None,
         top_logprobs: Optional[int] = None,
+        ai_service: Optional[str] = None,
         **kwargs,  # type: ignore
     ) -> Union[
         llama_types.CreateChatCompletionResponse,
@@ -627,6 +629,7 @@ def chat_completion_handler(
             stopping_criteria=stopping_criteria,
             grammar=grammar,
             logit_bias=logit_bias,
+            ai_service=ai_service
         )
         if tool is not None:
             tool_name = tool["function"]["name"]
@@ -1714,6 +1717,7 @@ def functionary_v1_v2_chat_handler(
     model: Optional[str] = None,
     logits_processor: Optional[llama.LogitsProcessorList] = None,
     grammar: Optional[llama.LlamaGrammar] = None,
+    ai_service: Optional[str] = None,
     **kwargs,  # type: ignore
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
@@ -1930,6 +1934,7 @@ def prepare_messages_for_inference(
             model=model,
             logits_processor=logits_processor,
             grammar=grammar,
+            ai_service=ai_service
         )
         if stream is False:
             completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
diff --git a/llama_cpp/llama_metrics.py b/llama_cpp/llama_metrics.py
new file mode 100644
index 0000000000..a7ffc7a094
--- /dev/null
+++ b/llama_cpp/llama_metrics.py
@@ -0,0 +1,425 @@
+from dataclasses import dataclass
+from typing import Any, Optional, Dict, List
+
+from prometheus_client import Gauge, Info, Histogram
+
+
+LABELS = ["request_type", "service"]
+
+
+@dataclass
+class RequestMetrics:
+    """
+    Metrics gathered for one request; units noted below are those declared by MetricsExporter.
+    """
+
+    # System metrics
+    system_info: Dict[str, Any]  # published through the exporter's Info metric
+    state_size: int  # bytes, per the state_size gauge documentation
+    cpu_utilization: float  # unit/scale not shown in this file -- TODO confirm
+    cpu_ram_pid: float  # feeds the cpu_memory_usage_by_pid gauge
+    gpu_utilization: float  # unit/scale not shown in this file -- TODO confirm
+    gpu_ram_usage: float  # feeds the gpu_memory_usage gauge
+    gpu_ram_free: float  # feeds the gpu_memory_free gauge
+    gpu_ram_pid: float  # feeds the gpu_memory_usage_by_pid gauge
+
+    # Metrics from the C++ backend
+    load_time: float  # seconds
+    sample_time: float  # seconds
+    sample_throughput: float  # tokens/second
+    time_to_first_token: float  # seconds; only observed when truthy
+    time_per_output_token: List[float]  # seconds; each element is observed separately
+    prompt_eval_time: float  # seconds
+    prompt_eval_throughput: float  # tokens/second
+    completion_eval_time: float  # seconds
+    completion_eval_throughput: float  # tokens/second
+    end_to_end_latency: float  # seconds
+    prefill_tokens: int  # number of prefill tokens processed
+    generation_tokens: int  # number of generation tokens processed
+    kv_cache_usage_ratio: float  # 1 means 100 percent usage
+
+
+@dataclass
+class QueueMetrics:
+    """
+    Snapshot of the task queue, passed to MetricsExporter.log_queue_metrics.
+    """
+
+    running_tasks_count: int  # observed into the running_tasks histogram
+    running_tasks: dict  # not read by MetricsExporter in this file; schema not shown
+
+
+class MetricsExporter:
+    """
+    A custom Prometheus metrics exporter for the LLAMA C++ backend: builds one collector
+    per metric, each labelled with LABELS, and records the values of every request.
+    """
+
+    def __init__(self):
+        self.labels = LABELS
+        # One-time metrics
+        self._histogram_load_time = Histogram(
+            name="llama_cpp_python:load_t_seconds",
+            documentation="Histogram of load time in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.0,
+                3.0,
+                4.0,
+                5.0,
+                6.0,
+                7.0,
+                8.0,
+                9.0,
+                10.0,
+                12.5,
+                15.0,
+                20.0,
+                25.0,
+                30.0,
+            ],
+        )
+        # Request-level latencies
+        self._histogram_sample_time = Histogram(
+            name="llama_cpp_python:sample_t_seconds",
+            documentation="Histogram of token sampling time in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.00001,
+                0.00005,
+                0.0001,
+                0.00025,
+                0.0005,
+                0.001,
+                0.0025,
+                0.005,
+                0.0075,
+                0.01,
+                0.025,
+                0.05,
+                0.075,
+                0.1,
+                0.25,
+                0.5,
+            ],
+        )
+        self._histogram_time_to_first_token = Histogram(
+            name="llama_cpp_python:ttft_seconds",
+            documentation="Histogram of time to first token in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.001,
+                0.005,
+                0.01,
+                0.02,
+                0.04,
+                0.06,
+                0.08,
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                12.5,
+                15.0,
+                20.0,
+                25.0,
+                30.0,
+            ],
+        )
+        self._histogram_time_per_output_token = Histogram(
+            name="llama_cpp_python:tpot_seconds",
+            documentation="Histogram of time per output token in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.001,
+                0.005,
+                0.01,
+                0.02,
+                0.04,
+                0.06,
+                0.08,
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                12.5,
+                15.0,
+                20.0,
+                25.0,
+                30.0,
+            ],
+        )
+        self._histogram_prompt_eval_time = Histogram(
+            name="llama_cpp_python:p_eval_t_seconds",
+            documentation="Histogram of prompt evaluation time in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                12.5,
+                15.0,
+                20.0,
+                25.0,
+                30.0,
+                40.0,
+                50.0,
+                60.0,
+            ],
+        )
+        self._histogram_completion_eval_time = Histogram(
+            name="llama_cpp_python:c_eval_t_seconds",
+            documentation="Histogram of completion evaluation time in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                12.5,
+                15.0,
+                20.0,
+                25.0,
+                30.0,
+                40.0,
+                50.0,
+                60.0,
+            ],
+        )
+        self._histogram_e2e_request_latency = Histogram(
+            name="llama_cpp_python:e2e_seconds",
+            documentation="Histogram of end-to-end request latency in seconds",
+            labelnames=self.labels,
+            buckets=[
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                12.5,
+                15.0,
+                20.0,
+                25.0,
+                30.0,
+                40.0,
+                50.0,
+                60.0,
+            ],
+        )
+        # Prefill and generation tokens
+        self._histogram_prefill_tokens = Histogram(
+            name="llama_cpp_python:prefill_tokens_total",
+            documentation="Histogram of number of prefill tokens processed",
+            labelnames=self.labels,
+            buckets=[
+                1,
+                10,
+                25,
+                50,
+                100,
+                250,
+                500,
+                750,
+                1000,
+                1500,
+                2000,
+                2500,
+                3000,
+                3500,
+                4000,
+                4500,
+                5000,
+            ],
+        )
+        self._histogram_generation_tokens = Histogram(
+            name="llama_cpp_python:completion_tokens_total",
+            documentation="Histogram of number of generation tokens processed",
+            labelnames=self.labels,
+            buckets=[
+                1,
+                10,
+                25,
+                50,
+                100,
+                250,
+                500,
+                750,
+                1000,
+                1500,
+                2000,
+                2500,
+                3000,
+                3500,
+                4000,
+                4500,
+                5000,
+            ],
+        )
+        # Current throughput
+        self._gauge_prompt_eval_throughput = Gauge(
+            name="llama_cpp_python:prompt_eval_throughput",
+            documentation="Current throughput of the prompt evaluation process (in tokens/second)",
+            labelnames=self.labels,
+        )
+        self._gauge_completion_eval_throughput = Gauge(
+            name="llama_cpp_python:completion_eval_throughput",
+            documentation="Current throughput of the completion evaluation process (in tokens/second)",
+            labelnames=self.labels,
+        )
+        self._gauge_sample_throughput = Gauge(
+            name="llama_cpp_python:sample_throughput",
+            documentation="Current throughput of the token sampling process (in tokens/second)",
+            labelnames=self.labels,
+        )
+        # System info
+        self._gauge_state_size = Gauge(
+            name="llama_cpp_python:state_size",
+            documentation="Current state size in bytes of various components such as rng (random number generator), logits, embedding, and kv_cache (key-value cache)",
+            labelnames=self.labels,
+        )
+        self._gauge_cpu_utilization = Gauge(
+            name="llama_cpp_python:cpu_utilization",
+            documentation="Current CPU utilization",
+            labelnames=self.labels,
+        )
+        self._gauge_cpu_ram_usage_by_pid = Gauge(
+            name="llama_cpp_python:cpu_memory_usage_by_pid",
+            documentation="Current CPU memory usage during the request",
+            labelnames=self.labels,
+        )
+        self._gauge_gpu_utilization = Gauge(
+            name="llama_cpp_python:gpu_utilization",
+            documentation="Current GPU utilization",
+            labelnames=self.labels,
+        )
+        self._gauge_gpu_memory_usage = Gauge(
+            name="llama_cpp_python:gpu_memory_usage",
+            documentation="Current GPU memory usage",
+            labelnames=self.labels,
+        )
+        self._gauge_gpu_memory_free = Gauge(
+            name="llama_cpp_python:gpu_memory_free",
+            documentation="Current free GPU memory",
+            labelnames=self.labels,
+        )
+        self._gauge_gpu_memory_usage_by_pid = Gauge(
+            name="llama_cpp_python:gpu_memory_usage_by_pid",
+            documentation="Current GPU memory usage during the request",
+            labelnames=self.labels,
+        )
+        self._gauge_kv_cache_usage_ratio = Gauge(
+            name="llama_cpp_python:kv_cache_usage_ratio",
+            documentation="KV-cache usage. 1 means 100 percent usage",
+            labelnames=self.labels,
+        )
+        self._gauge_running_tasks = Histogram(  # a Histogram despite the "_gauge" prefix
+            name="llama_cpp_python:running_tasks",
+            documentation="Number of running tasks in the task queue",
+            labelnames=self.labels,
+            buckets=[
+                1,
+                2,
+                3,
+                4,
+                5,
+                6,
+                7,
+                8,
+                9,
+                10,
+                12,
+                14,
+                16,
+                18,
+                20,
+                25,
+                30,
+                35,
+                40,
+                45,
+                50,
+            ],
+        )
+
+        # Server metadata
+        self._info = Info(name="llama_cpp_python:info", documentation="Server metadata")
+
+    def log_request_metrics(self, metrics: RequestMetrics, labels: Dict[str, str]):
+        """Record one request's metrics; ``labels`` must cover every name in LABELS."""
+        self._histogram_load_time.labels(**labels).observe(metrics.load_time)
+        self._histogram_sample_time.labels(**labels).observe(metrics.sample_time)
+        # A falsy time-to-first-token (None or 0) is deliberately not observed.
+        if metrics.time_to_first_token:
+            self._histogram_time_to_first_token.labels(**labels).observe(
+                metrics.time_to_first_token
+            )
+        for _tpot in metrics.time_per_output_token:
+            self._histogram_time_per_output_token.labels(**labels).observe(_tpot)
+        self._histogram_prompt_eval_time.labels(**labels).observe(
+            metrics.prompt_eval_time
+        )
+        self._histogram_completion_eval_time.labels(**labels).observe(
+            metrics.completion_eval_time
+        )
+        self._histogram_e2e_request_latency.labels(**labels).observe(
+            metrics.end_to_end_latency
+        )
+        self._histogram_prefill_tokens.labels(**labels).observe(metrics.prefill_tokens)
+        self._histogram_generation_tokens.labels(**labels).observe(
+            metrics.generation_tokens
+        )
+        self._gauge_prompt_eval_throughput.labels(**labels).set(
+            metrics.prompt_eval_throughput
+        )
+        self._gauge_completion_eval_throughput.labels(**labels).set(
+            metrics.completion_eval_throughput
+        )
+        self._gauge_sample_throughput.labels(**labels).set(metrics.sample_throughput)
+        self._gauge_cpu_utilization.labels(**labels).set(metrics.cpu_utilization)
+        self._gauge_cpu_ram_usage_by_pid.labels(**labels).set(metrics.cpu_ram_pid)
+        self._gauge_gpu_utilization.labels(**labels).set(metrics.gpu_utilization)
+        self._gauge_gpu_memory_usage.labels(**labels).set(metrics.gpu_ram_usage)
+        self._gauge_gpu_memory_free.labels(**labels).set(metrics.gpu_ram_free)
+        self._gauge_gpu_memory_usage_by_pid.labels(**labels).set(metrics.gpu_ram_pid)
+        self._gauge_state_size.labels(**labels).set(metrics.state_size)
+        self._gauge_kv_cache_usage_ratio.labels(**labels).set(
+            metrics.kv_cache_usage_ratio
+        )
+        # Info values are exposed as label values and must be strings, so coerce them.
+        self._info.info({k: str(v) for k, v in metrics.system_info.items()})
+
+    def log_queue_metrics(self, metrics: QueueMetrics, labels: Dict[str, str]):
+        """
+        Observe ``metrics.running_tasks_count`` on the running-tasks histogram for ``labels``.
+        """
+        self._gauge_running_tasks.labels(**labels).observe(metrics.running_tasks_count)
diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index a6f1f4e9ca..6ced7bc106 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -37,6 +37,7 @@
     ConfigFileSettings,
 )
 from llama_cpp.server.cli import add_args_from_model, parse_model_from_args
+from llama_cpp._logger import logger, UVICORN_LOGGING_CONFIG
 
 
 def main():
@@ -75,7 +76,7 @@ def main():
             server_settings = parse_model_from_args(ServerSettings, args)
             model_settings = [parse_model_from_args(ModelSettings, args)]
     except Exception as e:
-        print(e, file=sys.stderr)
+        logger.error(e)
         parser.print_help()
         sys.exit(1)
     assert server_settings is not None
@@ -90,6 +91,7 @@ def main():
         port=int(os.getenv("PORT", server_settings.port)),
         ssl_keyfile=server_settings.ssl_keyfile,
         ssl_certfile=server_settings.ssl_certfile,
+        log_config=UVICORN_LOGGING_CONFIG
     )
 
 
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 8fd41a3fc7..ed90fa155e 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from contextlib import asynccontextmanager
+
 import os
 import json
 import typing
@@ -9,12 +11,15 @@
 from functools import partial
 from typing import Iterator, List, Optional, Union, Dict
 
+from prometheus_client import make_asgi_app
+
 import llama_cpp
 
 import anyio
 from anyio.streams.memory import MemoryObjectSendStream
 from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
 from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body
+from fastapi.responses import JSONResponse
 from fastapi.middleware import Middleware
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.security import HTTPBearer
@@ -41,8 +46,12 @@
     TokenizeInputCountResponse,
     DetokenizeInputRequest,
     DetokenizeInputResponse,
+    HealthMetrics
 )
 from llama_cpp.server.errors import RouteErrorHandler
+from llama_cpp._utils import monitor_task_queue
+from llama_cpp.llama_metrics import MetricsExporter
+from llama_cpp._logger import logger
 
 
 router = APIRouter(route_class=RouteErrorHandler)
@@ -97,6 +106,29 @@ def set_ping_message_factory(factory: typing.Callable[[], bytes]):
     _ping_message_factory = factory
 
 
+def set_metrics_exporter():
+    """Return the module-wide MetricsExporter, building it on the first call only."""
+    global metrics_exporter
+    if "metrics_exporter" not in globals():
+        # First call: the module global does not exist yet, so create it.
+        metrics_exporter = MetricsExporter()
+
+    return metrics_exporter
+
+task_queue_status = {}
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """
+    Lifespan hook: awaits monitor_task_queue with the shared exporter, then yields to the app.
+    """
+    # NOTE(review): this await completes before the app starts serving; presumably
+    # monitor_task_queue only schedules background work and returns -- confirm.
+    await monitor_task_queue(task_queue_status, set_metrics_exporter())
+    yield
+
+
 def create_app(
     settings: Settings | None = None,
     server_settings: ServerSettings | None = None,
@@ -136,6 +168,7 @@ def create_app(
         title="🦙 llama.cpp Python API",
         version=llama_cpp.__version__,
         root_path=server_settings.root_path,
+        lifespan=lifespan
     )
     app.add_middleware(
         CORSMiddleware,
@@ -149,6 +182,11 @@ def create_app(
     assert model_settings is not None
     set_llama_proxy(model_settings=model_settings)
 
+    # Add prometheus asgi middleware to route /metrics requests
+    # see: https://prometheus.github.io/client_python/exporting/http/fastapi-gunicorn/
+    metrics_app = make_asgi_app()
+    app.mount("/metrics", metrics_app)
+
     if server_settings.disable_ping_events:
         set_ping_message_factory(lambda: bytes())
 
@@ -174,9 +212,9 @@ async def get_event_publisher(
                     raise anyio.get_cancelled_exc_class()()
             await inner_send_chan.send(dict(data="[DONE]"))
         except anyio.get_cancelled_exc_class() as e:
-            print("disconnected")
+            logger.warning(f"Disconnected from client {request.client}")
             with anyio.move_on_after(1, shield=True):
-                print(f"Disconnected from client (via refresh/close) {request.client}")
+                logger.error(f"Disconnected from client (via refresh/close) {request.client}")
                 raise e
         finally:
             if on_complete:
@@ -222,6 +260,32 @@ async def authenticate(
 openai_v1_tag = "OpenAI V1"
 
 
+@router.get(
+    "/v1/health",
+    response_model=HealthMetrics,
+    summary="Server's health",
+)
+async def check_health():
+    """
+    Report server health derived from the number of running tasks.
+
+    The JSON body carries "status" ("OK" up to 4 running tasks, "Warning" for 5-9,
+    "Critical" from 10) and the current "task_queue_status" mapping.
+    """
+    running = task_queue_status.get("running_tasks_count", 0)  # read once, compare once
+    # TODO: Evaluate if in the critical case we should manually stop the execution
+    # of certain tasks to clear the queue
+    if running >= 10:
+        health = "Critical"
+    elif running > 4:
+        health = "Warning"
+    else:  # also covers an unexpected negative count, which used to return nothing
+        health = "OK"
+    return JSONResponse(
+        content={"status": health, "task_queue_status": task_queue_status}
+    )
+
+
 @router.post(
     "/v1/completions",
     summary="Completion",
@@ -283,7 +347,6 @@ async def create_completion(
         if request.url.path != "/v1/engines/copilot-codex/completions"
         else "copilot-codex"
     )
-
     exclude = {
         "n",
         "best_of",
@@ -360,7 +423,6 @@ async def create_embedding(
         **request.model_dump(exclude={"user"}),
     )
 
-
 @router.post(
     "/v1/chat/completions",
     summary="Chat",
@@ -405,6 +467,7 @@ async def create_chat_completion(
                         {"role": "system", "content": "You are a helpful assistant."},
                         {"role": "user", "content": "What is the capital of France?"},
                     ],
+                    "ai_service": "copilot"
                 },
             },
             "json_mode": {
@@ -485,7 +548,14 @@ async def create_chat_completion(
         "user",
         "min_tokens",
     }
+    
+    # Extract relevant kwargs from the request body
     kwargs = body.model_dump(exclude=exclude)
+    
+    # Adds the ai_service value from the request body to the kwargs
+    # to be passed downstream to the llama_cpp.ChatCompletion object
+    kwargs["ai_service"] = body.ai_service
+    
     llama = llama_proxy(body.model)
     if body.logit_bias is not None:
         kwargs["logit_bias"] = (
@@ -506,10 +576,13 @@ async def create_chat_completion(
         else:
             kwargs["logits_processor"].extend(_min_tokens_logits_processor)
 
+    # Set the metrics exporter for the llama object
+    llama.metrics = set_metrics_exporter()
+    
     iterator_or_completion: Union[
         llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
     ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
-
+    
     if isinstance(iterator_or_completion, Iterator):
         # EAFP: It's easier to ask for forgiveness than permission
         first_response = await run_in_threadpool(next, iterator_or_completion)
diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py
index fbf9fd80d5..826e7ed945 100644
--- a/llama_cpp/server/errors.py
+++ b/llama_cpp/server/errors.py
@@ -21,6 +21,7 @@
     CreateEmbeddingRequest,
     CreateChatCompletionRequest,
 )
+from llama_cpp._logger import logger
 
 
 class ErrorResponse(TypedDict):
@@ -134,7 +135,7 @@ def error_message_wrapper(
         ] = None,
     ) -> Tuple[int, ErrorResponse]:
         """Wraps error message in OpenAI style error response"""
-        print(f"Exception: {str(error)}", file=sys.stderr)
+        logger.error(f"Exception: {str(error)}")
         traceback.print_exc(file=sys.stderr)
         if body is not None and isinstance(
             body,
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index ad39c1004b..c35fd37dde 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -9,6 +9,7 @@
 import llama_cpp.llama_tokenizer as llama_tokenizer
 
 from llama_cpp.server.settings import ModelSettings
+from llama_cpp._logger import logger
 
 
 class LlamaProxy:
@@ -272,11 +273,11 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         if settings.cache:
             if settings.cache_type == "disk":
                 if settings.verbose:
-                    print(f"Using disk cache with size {settings.cache_size}")
+                    logger.info(f"Using disk cache with size {settings.cache_size}")
                 cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
             else:
                 if settings.verbose:
-                    print(f"Using ram cache with size {settings.cache_size}")
+                    logger.info(f"Using ram cache with size {settings.cache_size}")
                 cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)
             _model.set_cache(cache)
         return _model
diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index a75f9e55b9..9c32fe568e 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import List, Optional, Union, Dict
+from typing import List, Optional, Union, Dict, Any
 from typing_extensions import TypedDict, Literal
 
 from pydantic import BaseModel, Field
@@ -267,6 +267,9 @@ class CreateChatCompletionRequest(BaseModel):
         }
     }
 
+    # AI service added as request body parameter by Client
+    ai_service: Optional[str] = None
+
 
 class ModelData(TypedDict):
     id: str
@@ -314,3 +317,7 @@ class DetokenizeInputResponse(BaseModel):
     model_config = {
         "json_schema_extra": {"example": {"text": "How many tokens in this query?"}}
     }
+
+class HealthMetrics(BaseModel):  # declared as response_model of GET /v1/health
+    model_config = {"arbitrary_types_allowed": True}
+    task_queue_status: Dict[str, Any]  # task-queue snapshot; key schema not shown here
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 8345cb1f09..4b0246623f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,8 @@ server = [
     "sse-starlette>=1.6.1",
     "starlette-context>=0.3.6,<0.4",
     "PyYAML>=5.1",
+    "prometheus_client>=0.20.0",
+    "psutil>=5.9.8"
 ]
 test = [
     "pytest>=7.4.0",
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 469ef91cab..49150c9f2b 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -5,9 +5,18 @@
 from scipy.special import log_softmax
 
 import llama_cpp
+from llama_cpp.llama_metrics import MetricsExporter
 
 MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf"
 
+def set_test_metrics_exporter():
+    """Return the MetricsExporter shared by the tests, creating it on first use."""
+    global metrics_exporter
+    if "metrics_exporter" not in globals():
+        # First call in this module: the global does not exist yet.
+        metrics_exporter = MetricsExporter()
+
+    return metrics_exporter
 
 def test_llama_cpp_tokenization():
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False)
@@ -153,7 +162,11 @@ def mock_kv_cache_seq_add(
 
 def test_llama_patch(mock_llama):
     n_ctx = 128
+    ai_service_completion = "test-label-suggestions"
+    ai_service_streaming = "test-acceptance-criteria"
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)
+    llama.metrics = set_test_metrics_exporter()
+
     n_vocab = llama_cpp.llama_n_vocab(llama._model.model)
     assert n_vocab == 32000
 
@@ -163,32 +176,32 @@ def test_llama_patch(mock_llama):
 
     ## Test basic completion from bos until eos
     mock_llama(llama, all_text)
-    completion = llama.create_completion("", max_tokens=36)
+    completion = llama.create_completion("", max_tokens=36, ai_service=ai_service_completion)
     assert completion["choices"][0]["text"] == all_text
     assert completion["choices"][0]["finish_reason"] == "stop"
 
     ## Test basic completion until eos
     mock_llama(llama, all_text)
-    completion = llama.create_completion(text, max_tokens=20)
+    completion = llama.create_completion(text, max_tokens=20, ai_service=ai_service_completion)
     assert completion["choices"][0]["text"] == output_text
     assert completion["choices"][0]["finish_reason"] == "stop"
 
     ## Test streaming completion until eos
     mock_llama(llama, all_text)
-    chunks = list(llama.create_completion(text, max_tokens=20, stream=True))
+    chunks = list(llama.create_completion(text, max_tokens=20, stream=True, ai_service=ai_service_streaming))
     assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text
     assert chunks[-1]["choices"][0]["finish_reason"] == "stop"
 
     ## Test basic completion until stop sequence
     mock_llama(llama, all_text)
-    completion = llama.create_completion(text, max_tokens=20, stop=["lazy"])
+    completion = llama.create_completion(text, max_tokens=20, stop=["lazy"], ai_service=ai_service_completion)
     assert completion["choices"][0]["text"] == " jumps over the "
     assert completion["choices"][0]["finish_reason"] == "stop"
 
     ## Test streaming completion until stop sequence
     mock_llama(llama, all_text)
     chunks = list(
-        llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"])
+        llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"], ai_service=ai_service_streaming)
     )
     assert (
         "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the "
@@ -197,13 +210,13 @@ def test_llama_patch(mock_llama):
 
     ## Test basic completion until length
     mock_llama(llama, all_text)
-    completion = llama.create_completion(text, max_tokens=2)
+    completion = llama.create_completion(text, max_tokens=2, ai_service=ai_service_completion)
     assert completion["choices"][0]["text"] == " jumps"
     assert completion["choices"][0]["finish_reason"] == "length"
 
     ## Test streaming completion until length
     mock_llama(llama, all_text)
-    chunks = list(llama.create_completion(text, max_tokens=2, stream=True))
+    chunks = list(llama.create_completion(text, max_tokens=2, stream=True, ai_service=ai_service_streaming))
     assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps"
     assert chunks[-1]["choices"][0]["finish_reason"] == "length"
 
@@ -228,17 +241,19 @@ def test_llama_pickle():
 
 def test_utf8(mock_llama):
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, logits_all=True)
+    llama.metrics = set_test_metrics_exporter()
 
     output_text = "😀"
+    ai_service = "label-suggestions"
 
     ## Test basic completion with utf8 multibyte
     mock_llama(llama, output_text)
-    completion = llama.create_completion("", max_tokens=4)
+    completion = llama.create_completion("", max_tokens=4, ai_service=ai_service)
     assert completion["choices"][0]["text"] == output_text
 
     ## Test basic completion with incomplete utf8 multibyte
     mock_llama(llama, output_text)
-    completion = llama.create_completion("", max_tokens=1)
+    completion = llama.create_completion("", max_tokens=1, ai_service=ai_service)
     assert completion["choices"][0]["text"] == ""
 
 
@@ -266,6 +281,22 @@ def test_llama_server():
     }
 
 
+def test_metrics_endpoint():
+    """Check that GET /metrics answers 200 and exposes both test service labels."""
+    # NOTE(review): the expected strings are the ai_service values used by
+    # test_llama_patch, so this presumably depends on that test having run.
+    from fastapi.testclient import TestClient
+    from llama_cpp.server.app import create_app, Settings
+
+    app = create_app(Settings(model=MODEL, vocab_only=True))
+    response = TestClient(app).get("/metrics")
+
+    assert response.status_code == 200
+    body = response.text
+    for expected_service in ("test-label-suggestions", "test-acceptance-criteria"):
+        assert expected_service in body
+
+
 @pytest.mark.parametrize(
     "size_and_axis",
     [