diff --git a/README.md b/README.md
index dbaec50775..8ebf49c78a 100644
--- a/README.md
+++ b/README.md
@@ -272,20 +272,20 @@ The high-level API provides a simple managed interface through the [`Llama`](htt
 Below is a short example demonstrating how to use the high-level API to for basic text completion:
 
 ```python
-from llama_cpp import Llama
+from llama_cpp_python import Llama
 
 llm = Llama(
-      model_path="./models/7B/llama-model.gguf",
-      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
-      # seed=1337, # Uncomment to set a specific seed
-      # n_ctx=2048, # Uncomment to increase the context window
+    model_path="./models/7B/llama-model.gguf",
+    # n_gpu_layers=-1, # Uncomment to use GPU acceleration
+    # seed=1337, # Uncomment to set a specific seed
+    # n_ctx=2048, # Uncomment to increase the context window
 )
 output = llm(
-      "Q: Name the planets in the solar system? A: ", # Prompt
-      max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
-      stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
-      echo=True # Echo the prompt back in the output
-) # Generate a completion, can also call create_completion
+    "Q: Name the planets in the solar system? A: ",  # Prompt
+    max_tokens=32,  # Generate up to 32 tokens, set to None to generate up to the end of the context window
+    stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
+    echo=True  # Echo the prompt back in the output
+)  # Generate a completion, can also call create_completion
 print(output)
 ```
 
@@ -346,19 +346,20 @@ The model will will format the messages into a single prompt using the following
 Set `verbose=True` to see the selected chat format.
 
 ```python
-from llama_cpp import Llama
+from llama_cpp_python import Llama
+
 llm = Llama(
-      model_path="path/to/llama-2/llama-model.gguf",
-      chat_format="llama-2"
+    model_path="path/to/llama-2/llama-model.gguf",
+    chat_format="llama-2"
 )
 llm.create_chat_completion(
-      messages = [
-          {"role": "system", "content": "You are an assistant who perfectly describes images."},
-          {
-              "role": "user",
-              "content": "Describe this image in detail please."
-          }
-      ]
+    messages=[
+        {"role": "system", "content": "You are an assistant who perfectly describes images."},
+        {
+            "role": "user",
+            "content": "Describe this image in detail please."
+        }
+    ]
 )
 ```
 
@@ -376,7 +377,8 @@ To constrain chat responses to only valid JSON or a specific JSON Schema use the
 The following example will constrain the response to valid JSON strings only.
 
 ```python
-from llama_cpp import Llama
+from llama_cpp_python import Llama
+
 llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
 llm.create_chat_completion(
     messages=[
@@ -398,7 +400,8 @@ llm.create_chat_completion(
 To constrain the response further to a specific JSON Schema add the schema to the `schema` property of the `response_format` argument.
 
 ```python
-from llama_cpp import Llama
+from llama_cpp_python import Llama
+
 llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
 llm.create_chat_completion(
     messages=[
@@ -425,47 +428,48 @@ llm.create_chat_completion(
 The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format.
 
 ```python
-from llama_cpp import Llama
+from llama_cpp_python import Llama
+
 llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
 llm.create_chat_completion(
-      messages = [
+    messages=[
         {
-          "role": "system",
-          "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
+            "role": "system",
+            "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
 
         },
         {
-          "role": "user",
-          "content": "Extract Jason is 25 years old"
+            "role": "user",
+            "content": "Extract Jason is 25 years old"
         }
-      ],
-      tools=[{
+    ],
+    tools=[{
         "type": "function",
         "function": {
-          "name": "UserDetail",
-          "parameters": {
-            "type": "object",
-            "title": "UserDetail",
-            "properties": {
-              "name": {
-                "title": "Name",
-                "type": "string"
-              },
-              "age": {
-                "title": "Age",
-                "type": "integer"
-              }
-            },
-            "required": [ "name", "age" ]
-          }
+            "name": "UserDetail",
+            "parameters": {
+                "type": "object",
+                "title": "UserDetail",
+                "properties": {
+                    "name": {
+                        "title": "Name",
+                        "type": "string"
+                    },
+                    "age": {
+                        "title": "Age",
+                        "type": "integer"
+                    }
+                },
+                "required": ["name", "age"]
+            }
         }
-      }],
-      tool_choice={
+    }],
+    tool_choice={
         "type": "function",
         "function": {
-          "name": "UserDetail"
+            "name": "UserDetail"
         }
-      }
+    }
 )
 ```
 
@@ -477,13 +481,14 @@ The various gguf-converted files for this set of models can be found [here](http
 Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide HF Tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files.
 
 ```python
-from llama_cpp import Llama
-from llama_cpp.llama_tokenizer import LlamaHFTokenizer
+from llama_cpp_python import Llama
+from llama_cpp_python.llama_tokenizer import LlamaHFTokenizer
+
 llm = Llama.from_pretrained(
-  repo_id="meetkai/functionary-small-v2.2-GGUF",
-  filename="functionary-small-v2.2.q4_0.gguf",
-  chat_format="functionary-v2",
-  tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF")
+    repo_id="meetkai/functionary-small-v2.2-GGUF",
+    filename="functionary-small-v2.2.q4_0.gguf",
+    chat_format="functionary-v2",
+    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF")
 )
 ```
 
@@ -509,22 +514,24 @@ Below are the supported multi-modal models and their respective chat handlers (P
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
 
 ```python
-from llama_cpp import Llama
-from llama_cpp.llama_chat_format import Llava15ChatHandler
+from llama_cpp_python import Llama
+from llama_cpp_python.llama_chat_format import Llava15ChatHandler
+
 chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
 llm = Llama(
-  model_path="./path/to/llava/llama-model.gguf",
-  chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
+    model_path="./path/to/llava/llama-model.gguf",
+    chat_handler=chat_handler,
+    n_ctx=2048,  # n_ctx should be increased to accommodate the image embedding
 )
 llm.create_chat_completion(
-    messages = [
+    messages=[
         {"role": "system", "content": "You are an assistant who perfectly describes images."},
         {
             "role": "user",
             "content": [
-                {"type" : "text", "text": "What's in this image?"},
-                {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
+                {"type": "text", "text": "What's in this image?"},
+                {"type": "image_url", "image_url": {
+                    "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}}
             ]
         }
     ]
@@ -534,28 +541,29 @@ llm.create_chat_completion(
 You can also pull the model from the Hugging Face Hub using the `from_pretrained` method.
 
 ```python
-from llama_cpp import Llama
-from llama_cpp.llama_chat_format import MoondreamChatHandler
+from llama_cpp_python import Llama
+from llama_cpp_python.llama_chat_format import MoondreamChatHandler
 
 chat_handler = MoondreamChatHandler.from_pretrained(
-  repo_id="vikhyatk/moondream2",
-  filename="*mmproj*",
+    repo_id="vikhyatk/moondream2",
+    filename="*mmproj*",
 )
 
 llm = Llama.from_pretrained(
-  repo_id="vikhyatk/moondream2",
-  filename="*text-model*",
-  chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
+    repo_id="vikhyatk/moondream2",
+    filename="*text-model*",
+    chat_handler=chat_handler,
+    n_ctx=2048,  # n_ctx should be increased to accommodate the image embedding
 )
 
 response = llm.create_chat_completion(
-    messages = [
+    messages=[
         {
             "role": "user",
             "content": [
-                {"type" : "text", "text": "What's in this image?"},
-                {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
+                {"type": "text", "text": "What's in this image?"},
+                {"type": "image_url", "image_url": {
+                    "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}}
 
             ]
         }
@@ -607,12 +615,13 @@ The fastest way to use speculative decoding is through the `LlamaPromptLookupDec
 Just pass this as a draft model to the `Llama` class during initialization.
 
 ```python
-from llama_cpp import Llama
-from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+from llama_cpp_python import Llama
+from llama_cpp_python.llama_speculative import LlamaPromptLookupDecoding
 
 llama = Llama(
     model_path="path/to/model.gguf",
-    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.
+    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10)
+    # num_pred_tokens is the number of tokens to predict; 10 is the default and generally good for GPU, while 2 performs better on CPU-only machines.
 )
 ```
 
@@ -621,9 +630,9 @@ llama = Llama(
 To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding) or [`embed`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly.
 
 ```python
-import llama_cpp
+import llama_cpp_python
 
-llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True)
+llm = llama_cpp_python.Llama(model_path="path/to/model.gguf", embedding=True)
 
 embeddings = llm.create_embedding("Hello, world!")
 
@@ -657,14 +666,14 @@ To install the server package and get started:
 
 ```bash
 pip install 'llama-cpp-python[server]'
-python3 -m llama_cpp.server --model models/7B/llama-model.gguf
+python3 -m llama_cpp_python.server --model models/7B/llama-model.gguf
 ```
 
 Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this:
 
 ```bash
 CMAKE_ARGS="-DGGML_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
-python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
+python3 -m llama_cpp_python.server --model models/7B/llama-model.gguf --n_gpu_layers 35
 ```
 
 Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation.
@@ -675,16 +684,16 @@ Similarly, to change the port (default is 8000), use `--port`.
 You probably also want to set the prompt format. For chatml, use
 
 ```bash
-python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format chatml
+python3 -m llama_cpp_python.server --model models/7B/llama-model.gguf --chat_format chatml
 ```
 
 That will format the prompt according to how model expects it. You can find the prompt format in the model card.
-For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format".
+For possible options, see [llama_cpp_python/llama_chat_format.py](llama_cpp_python/llama_chat_format.py) and look for lines starting with "@register_chat_format".
 
 If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
 
 ```bash
-python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
+python3 -m llama_cpp_python.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
 ```
 
 ### Web Server Features
@@ -714,18 +723,20 @@ The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github
 Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
 
 ```python
-import llama_cpp
+import llama_cpp_python
 import ctypes
-llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
-params = llama_cpp.llama_context_default_params()
+
+llama_cpp_python.llama_backend_init(False)  # Must be called once at the start of each program
+params = llama_cpp_python.llama_context_default_params()
 # use bytes for char * params
-model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
-ctx = llama_cpp.llama_new_context_with_model(model, params)
+model = llama_cpp_python.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
+ctx = llama_cpp_python.llama_new_context_with_model(model, params)
 max_tokens = params.n_ctx
 # use ctypes arrays for array params
-tokens = (llama_cpp.llama_token * int(max_tokens))()
-n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
-llama_cpp.llama_free(ctx)
+tokens = (llama_cpp_python.llama_token * int(max_tokens))()
+n_tokens = llama_cpp_python.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens,
+                                           llama_cpp_python.c_bool(True))
+llama_cpp_python.llama_free(ctx)
 ```
 
 Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
diff --git a/docs/server.md b/docs/server.md
index cd6f86c513..f4d04956e4 100644
--- a/docs/server.md
+++ b/docs/server.md
@@ -19,7 +19,7 @@ pip install llama-cpp-python[server]
 The server can then be started by running the following command:
 
 ```bash
-python3 -m llama_cpp.server --model <model_path>
+python3 -m llama_cpp_python.server --model <model_path>
 ```
 
 ### Server options
@@ -27,7 +27,7 @@ python3 -m llama_cpp.server --model <model_path>
 For a full list of options, run:
 
 ```bash
-python3 -m llama_cpp.server --help
+python3 -m llama_cpp_python.server --help
 ```
 
 NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.
@@ -53,7 +53,7 @@ You'll first need to download one of the available code completion models in GGU
 Then you'll need to run the OpenAI compatible web server with a increased context size substantially for GitHub Copilot requests:
 
 ```bash
-python3 -m llama_cpp.server --model <model_path> --n_ctx 16192
+python3 -m llama_cpp_python.server --model <model_path> --n_ctx 16192
 ```
 
 Then just update your settings in `.vscode/settings.json` to point to your code completion server:
@@ -83,7 +83,7 @@ Then when you run the server you'll need to also specify either `functionary-v1`
 Note that since functionary requires a HF Tokenizer due to discrepancies between llama.cpp and HuggingFace's tokenizers as mentioned [here](https://github.com/abetlen/llama-cpp-python/blob/main?tab=readme-ov-file#function-calling), you will need to pass in the path to the tokenizer too. The tokenizer files are already included in the respective HF repositories hosting the gguf files.
 
 ```bash
-python3 -m llama_cpp.server --model <model_path_to_functionary_v2_model> --chat_format functionary-v2 --hf_pretrained_model_name_or_path <model_path_to_functionary_v2_tokenizer>
+python3 -m llama_cpp_python.server --model <model_path_to_functionary_v2_model> --chat_format functionary-v2 --hf_pretrained_model_name_or_path <model_path_to_functionary_v2_tokenizer>
 ```
 
 Check out this [example notebook](https://github.com/abetlen/llama-cpp-python/blob/main/examples/notebooks/Functions.ipynb) for a walkthrough of some interesting use cases for function calling.
@@ -104,7 +104,7 @@ You'll first need to download one of the available multi-modal models in GGUF fo
 Then when you run the server you'll need to also specify the path to the clip model used for image embedding and the `llava-1-5` chat_format
 
 ```bash
-python3 -m llama_cpp.server --model <model_path> --clip_model_path <clip_model_path> --chat_format llava-1-5
+python3 -m llama_cpp_python.server --model <model_path> --clip_model_path <clip_model_path> --chat_format llava-1-5
 ```
 
 Then you can just use the OpenAI API as normal
@@ -138,7 +138,7 @@ print(response)
 The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable.
 
 ```bash
-python3 -m llama_cpp.server --config_file <config_file>
+python3 -m llama_cpp_python.server --config_file <config_file>
 ```
 
 Config files support all of the server and model options supported by the cli and environment variables however instead of only a single model the config file can specify multiple models.
diff --git a/examples/batch-processing/server.py b/examples/batch-processing/server.py
index 0b36746f91..9dd0b583de 100644
--- a/examples/batch-processing/server.py
+++ b/examples/batch-processing/server.py
@@ -1,19 +1,19 @@
 """llama-cpp-python server from scratch in a single file.
 """
 
-# import llama_cpp
+# import llama_cpp_python
 
 # path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf"
 
-# model_params = llama_cpp.llama_model_default_params()
-# model = llama_cpp.llama_load_model_from_file(path, model_params)
+# model_params = llama_cpp_python.llama_model_default_params()
+# model = llama_cpp_python.llama_load_model_from_file(path, model_params)
 
 # if model is None:
 #     raise RuntimeError(f"Failed to load model from file: {path}")
 
 
-# ctx_params = llama_cpp.llama_context_default_params()
-# ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
+# ctx_params = llama_cpp_python.llama_context_default_params()
+# ctx = llama_cpp_python.llama_new_context_with_model(model, ctx_params)
 
 # if ctx is None:
 #     raise RuntimeError("Failed to create context")
diff --git a/examples/gradio_chat/local.py b/examples/gradio_chat/local.py
index e16bf234ab..0bdeffddac 100644
--- a/examples/gradio_chat/local.py
+++ b/examples/gradio_chat/local.py
@@ -1,12 +1,12 @@
-import llama_cpp
-import llama_cpp.llama_tokenizer
+import llama_cpp_python
+import llama_cpp_python.llama_tokenizer
 
 import gradio as gr
 
-llama = llama_cpp.Llama.from_pretrained(
+llama = llama_cpp_python.Llama.from_pretrained(
     repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
     filename="*q8_0.gguf",
-    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+    tokenizer=llama_cpp_python.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
         "Qwen/Qwen1.5-0.5B"
     ),
     verbose=False,
diff --git a/examples/hf_pull/main.py b/examples/hf_pull/main.py
index dfed17516d..bf064f83cc 100644
--- a/examples/hf_pull/main.py
+++ b/examples/hf_pull/main.py
@@ -1,11 +1,11 @@
-import llama_cpp
-import llama_cpp.llama_tokenizer
+import llama_cpp_python
+import llama_cpp_python.llama_tokenizer
 
 
-llama = llama_cpp.Llama.from_pretrained(
+llama = llama_cpp_python.Llama.from_pretrained(
     repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
     filename="*q8_0.gguf",
-    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+    tokenizer=llama_cpp_python.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
         "Qwen/Qwen1.5-0.5B"
     ),
     verbose=False,
diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py
index ee59767d65..6ca7bbe263 100644
--- a/examples/high_level_api/fastapi_server.py
+++ b/examples/high_level_api/fastapi_server.py
@@ -9,26 +9,26 @@
 
 Then run:
 ```
-uvicorn --factory llama_cpp.server.app:create_app --reload
+uvicorn --factory llama_cpp_python.server.app:create_app --reload
 ```
 
 or
 
 ```
-python3 -m llama_cpp.server
+python3 -m llama_cpp_python.server
 ```
 
 Then visit http://localhost:8000/docs to see the interactive API docs.
 
 
-To actually see the implementation of the server, see llama_cpp/server/app.py
+To actually see the implementation of the server, see llama_cpp_python/server/app.py
 
 """
 
 import os
 import uvicorn
 
-from llama_cpp.server.app import create_app
+from llama_cpp_python.server.app import create_app
 
 if __name__ == "__main__":
     app = create_app()
diff --git a/examples/high_level_api/high_level_api_embedding.py b/examples/high_level_api/high_level_api_embedding.py
index feb0ed68d9..85de8654f0 100644
--- a/examples/high_level_api/high_level_api_embedding.py
+++ b/examples/high_level_api/high_level_api_embedding.py
@@ -1,6 +1,6 @@
 import argparse
 
-from llama_cpp import Llama
+from llama_cpp_python import Llama
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-model.bin")
diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py
index e41f375774..fe0de7e253 100644
--- a/examples/high_level_api/high_level_api_inference.py
+++ b/examples/high_level_api/high_level_api_inference.py
@@ -1,7 +1,7 @@
 import json
 import argparse
 
-from llama_cpp import Llama
+from llama_cpp_python import Llama
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin")
diff --git a/examples/high_level_api/high_level_api_infill.py b/examples/high_level_api/high_level_api_infill.py
index 282333e5a8..dfe0f754d6 100644
--- a/examples/high_level_api/high_level_api_infill.py
+++ b/examples/high_level_api/high_level_api_infill.py
@@ -1,6 +1,6 @@
 import argparse
 
-from llama_cpp import Llama
+from llama_cpp_python import Llama
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin")
diff --git a/examples/high_level_api/high_level_api_streaming.py b/examples/high_level_api/high_level_api_streaming.py
index 747c6130e3..5e61a15248 100644
--- a/examples/high_level_api/high_level_api_streaming.py
+++ b/examples/high_level_api/high_level_api_streaming.py
@@ -1,7 +1,7 @@
 import json
 import argparse
 
-from llama_cpp import Llama
+from llama_cpp_python import Llama
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin")
diff --git a/examples/high_level_api/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py
index b91632f5bc..8c681d39fe 100644
--- a/examples/high_level_api/langchain_custom_llm.py
+++ b/examples/high_level_api/langchain_custom_llm.py
@@ -1,6 +1,6 @@
 import argparse
 
-from llama_cpp import Llama
+from llama_cpp_python import Llama
 
 from langchain.llms.base import LLM
 from typing import Optional, List, Mapping, Any
diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py
index 39081be17a..759bdfbdd8 100644
--- a/examples/low_level_api/low_level_api_chat_cpp.py
+++ b/examples/low_level_api/low_level_api_chat_cpp.py
@@ -16,7 +16,7 @@
 from time import time
 from os import cpu_count, path
 
-import llama_cpp
+import llama_cpp_python
 from common import GptParams, gpt_params_parse, gpt_random_prompt
 import util
 
@@ -71,7 +71,7 @@ def __init__(self, params: GptParams) -> None:
         self.multibyte_fix = []
 
         # model load
-        self.lparams = llama_cpp.llama_model_default_params()
+        self.lparams = llama_cpp_python.llama_model_default_params()
         self.lparams.n_ctx = self.params.n_ctx
         self.lparams.n_parts = self.params.n_parts
         self.lparams.seed = self.params.seed
@@ -79,23 +79,23 @@ def __init__(self, params: GptParams) -> None:
         self.lparams.use_mlock = self.params.use_mlock
         self.lparams.use_mmap = self.params.use_mmap
 
-        self.model = llama_cpp.llama_load_model_from_file(
+        self.model = llama_cpp_python.llama_load_model_from_file(
             self.params.model.encode("utf8"), self.lparams
         )
 
         # Context Params.
-        self.cparams = llama_cpp.llama_context_default_params()
+        self.cparams = llama_cpp_python.llama_context_default_params()
 
-        self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
+        self.ctx = llama_cpp_python.llama_new_context_with_model(self.model, self.cparams)
         if not self.ctx:
             raise RuntimeError(f"error: failed to load model '{self.params.model}'")
 
         if self.params.ignore_eos:
-            self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")
+            self.params.logit_bias[llama_cpp_python.llama_token_eos()] = -float("inf")
 
         if len(self.params.lora_adapter) > 0:
             if (
-                llama_cpp.llama_apply_lora_from_file(
+                llama_cpp_python.llama_apply_lora_from_file(
                     self.ctx,
                     self.params.lora_adapter.encode("utf8"),
                     (
@@ -113,26 +113,26 @@ def __init__(self, params: GptParams) -> None:
         print(file=sys.stderr)
         print(
             f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \
-| {llama_cpp.llama_print_system_info().decode('utf8')}",
+| {llama_cpp_python.llama_print_system_info().decode('utf8')}",
             file=sys.stderr,
         )
 
         # determine the required inference memory per token:
         if self.params.mem_test:
             tmp = [0, 1, 2, 3]
-            llama_cpp.llama_eval(
+            llama_cpp_python.llama_eval(
                 self.ctx,
-                (llama_cpp.c_int * len(tmp))(*tmp),
+                (llama_cpp_python.c_int * len(tmp))(*tmp),
                 len(tmp),
                 0,
                 self.n_threads,
             )
-            llama_cpp.llama_print_timings(self.ctx)
+            llama_cpp_python.llama_print_timings(self.ctx)
             self.exit()
             return
 
         # create internal context
-        self.n_ctx = llama_cpp.llama_n_ctx(self.ctx)
+        self.n_ctx = llama_cpp_python.llama_n_ctx(self.ctx)
 
         # Add a space in front of the first character to match OG llama tokenizer behavior
         self.params.prompt = " " + self.params.prompt
@@ -142,7 +142,7 @@ def __init__(self, params: GptParams) -> None:
             with open(self.params.file) as f:
                 self.params.prompt = f.read()
 
-        self.session_tokens: list[llama_cpp.llama_token] = []
+        self.session_tokens: list[llama_cpp_python.llama_token] = []
         if len(self.params.path_session) > 0:
             print(
                 f"attempting to load saved session from '{self.params.path_session}'",
@@ -150,10 +150,10 @@ def __init__(self, params: GptParams) -> None:
             )
 
             if path.exists(self.params.path_session):
-                _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
-                _n_token_count_out = llama_cpp.c_size_t()
+                _session_tokens = (llama_cpp_python.llama_token * (self.params.n_ctx))()
+                _n_token_count_out = llama_cpp_python.c_size_t()
                 if (
-                    llama_cpp.llama_load_session_file(
+                    llama_cpp_python.llama_load_session_file(
                         self.ctx,
                         self.params.path_session.encode("utf8"),
                         _session_tokens,
@@ -312,8 +312,8 @@ def __init__(self, params: GptParams) -> None:
 
     # tokenize a prompt
     def _tokenize(self, prompt, bos=True):
-        _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
-        _n = llama_cpp.llama_tokenize(
+        _arr = (llama_cpp_python.llama_token * ((len(prompt) + 1) * 4))()
+        _n = llama_cpp_python.llama_tokenize(
             self.model,
             prompt.encode("utf8", errors="ignore"),
             len(prompt),
@@ -379,17 +379,17 @@ def generate(self):
                 # TODO BUG: The batching code causes nonsensical generation
                 """for i in range(0, len(self.embd), self.params.n_batch):
 					n_eval = self.params.n_batch
-					_arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval])
-					if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0:
+					_arr = (llama_cpp_python.llama_token * n_eval)(*self.embd[i:i + n_eval])
+					if llama_cpp_python.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0:
 						print(f"failed to eval")
 						return
 					
 					self.n_past += n_eval"""
 
                 if (
-                    llama_cpp.llama_eval(
+                    llama_cpp_python.llama_eval(
                         self.ctx,
-                        (llama_cpp.llama_token * len(self.embd))(*self.embd),
+                        (llama_cpp_python.llama_token * len(self.embd))(*self.embd),
                         len(self.embd),
                         self.n_past,
                     )
@@ -406,7 +406,7 @@ def generate(self):
             if len(self.embd_inp) <= self.input_consumed:  # && !is_interacting
                 # out of user input, sample next token
                 top_k = (
-                    llama_cpp.llama_n_vocab(self.ctx)
+                    llama_cpp_python.llama_n_vocab(self.ctx)
                     if self.params.top_k <= 0
                     else self.params.top_k
                 )
@@ -419,10 +419,10 @@ def generate(self):
                 # optionally save the session on first sample (for faster prompt loading next time)
                 if len(self.params.path_session) > 0 and self.need_to_save_session:
                     self.need_to_save_session = False
-                    llama_cpp.llama_save_session_file(
+                    llama_cpp_python.llama_save_session_file(
                         self.ctx,
                         self.params.path_session.encode("utf8"),
-                        (llama_cpp.llama_token * len(self.session_tokens))(
+                        (llama_cpp_python.llama_token * len(self.session_tokens))(
                             *self.session_tokens
                         ),
                         len(self.session_tokens),
@@ -430,108 +430,108 @@ def generate(self):
 
                 id = 0
 
-                logits = llama_cpp.llama_get_logits(self.ctx)
-                n_vocab = llama_cpp.llama_n_vocab(self.model)
+                logits = llama_cpp_python.llama_get_logits(self.ctx)
+                n_vocab = llama_cpp_python.llama_n_vocab(self.model)
 
                 # Apply params.logit_bias map
                 for key, value in self.params.logit_bias.items():
                     logits[key] += value
 
-                _arr = (llama_cpp.llama_token_data * n_vocab)(
+                _arr = (llama_cpp_python.llama_token_data * n_vocab)(
                     *[
-                        llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
+                        llama_cpp_python.llama_token_data(token_id, logits[token_id], 0.0)
                         for token_id in range(n_vocab)
                     ]
                 )
-                candidates_p = llama_cpp.ctypes.pointer(
-                    llama_cpp.llama_token_data_array(_arr, len(_arr), False)
+                candidates_p = llama_cpp_python.ctypes.pointer(
+                    llama_cpp_python.llama_token_data_array(_arr, len(_arr), False)
                 )
 
                 # Apply penalties
-                nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)]
+                nl_logit = logits[llama_cpp_python.llama_token_nl(self.ctx)]
                 last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
 
-                _arr = (llama_cpp.llama_token * last_n_repeat)(
+                _arr = (llama_cpp_python.llama_token * last_n_repeat)(
                     *self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat :]
                 )
-                llama_cpp.llama_sample_repetition_penalties(
+                llama_cpp_python.llama_sample_repetition_penalties(
                     ctx=self.ctx,
                     candidates=candidates_p,
                     last_tokens_data=_arr,
                     penalty_last_n=last_n_repeat,
-                    penalty_repeat=llama_cpp.c_float(self.params.repeat_penalty),
-                    penalty_freq=llama_cpp.c_float(self.params.frequency_penalty),
-                    penalty_present=llama_cpp.c_float(self.params.presence_penalty),
+                    penalty_repeat=llama_cpp_python.c_float(self.params.repeat_penalty),
+                    penalty_freq=llama_cpp_python.c_float(self.params.frequency_penalty),
+                    penalty_present=llama_cpp_python.c_float(self.params.presence_penalty),
                 )
 
                 # NOT PRESENT IN CURRENT VERSION ?
-                # llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p,
+                # llama_cpp_python.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
                 # 	_arr,
-                # 	last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
+                # 	last_n_repeat, llama_cpp_python.c_float(self.params.frequency_penalty), llama_cpp_python.c_float(self.params.presence_penalty))
 
                 if not self.params.penalize_nl:
-                    logits[llama_cpp.llama_token_nl()] = nl_logit
+                    logits[llama_cpp_python.llama_token_nl()] = nl_logit
 
                 if self.params.temp <= 0:
                     # Greedy sampling
-                    id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)
+                    id = llama_cpp_python.llama_sample_token_greedy(self.ctx, candidates_p)
                 else:
                     if self.params.mirostat == 1:
                         mirostat_mu = 2.0 * self.params.mirostat_tau
                         mirostat_m = 100
-                        llama_cpp.llama_sample_temperature(
-                            self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)
+                        llama_cpp_python.llama_sample_temperature(
+                            self.ctx, candidates_p, llama_cpp_python.c_float(self.params.temp)
                         )
-                        id = llama_cpp.llama_sample_token_mirostat(
+                        id = llama_cpp_python.llama_sample_token_mirostat(
                             self.ctx,
                             candidates_p,
-                            llama_cpp.c_float(self.params.mirostat_tau),
-                            llama_cpp.c_float(self.params.mirostat_eta),
-                            llama_cpp.c_int(mirostat_m),
-                            llama_cpp.c_float(mirostat_mu),
+                            llama_cpp_python.c_float(self.params.mirostat_tau),
+                            llama_cpp_python.c_float(self.params.mirostat_eta),
+                            llama_cpp_python.c_int(mirostat_m),
+                            llama_cpp_python.c_float(mirostat_mu),
                         )
                     elif self.params.mirostat == 2:
                         mirostat_mu = 2.0 * self.params.mirostat_tau
-                        llama_cpp.llama_sample_temperature(
-                            self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)
+                        llama_cpp_python.llama_sample_temperature(
+                            self.ctx, candidates_p, llama_cpp_python.c_float(self.params.temp)
                         )
-                        id = llama_cpp.llama_sample_token_mirostat_v2(
+                        id = llama_cpp_python.llama_sample_token_mirostat_v2(
                             self.ctx,
                             candidates_p,
-                            llama_cpp.c_float(self.params.mirostat_tau),
-                            llama_cpp.c_float(self.params.mirostat_eta),
-                            llama_cpp.c_float(mirostat_mu),
+                            llama_cpp_python.c_float(self.params.mirostat_tau),
+                            llama_cpp_python.c_float(self.params.mirostat_eta),
+                            llama_cpp_python.c_float(mirostat_mu),
                         )
                     else:
                         # Temperature sampling
-                        llama_cpp.llama_sample_top_k(
+                        llama_cpp_python.llama_sample_top_k(
                             self.ctx,
                             candidates_p,
                             top_k,
-                            min_keep=llama_cpp.c_size_t(1),
+                            min_keep=llama_cpp_python.c_size_t(1),
                         )
-                        llama_cpp.llama_sample_tail_free(
+                        llama_cpp_python.llama_sample_tail_free(
                             self.ctx,
                             candidates_p,
-                            llama_cpp.c_float(self.params.tfs_z),
-                            min_keep=llama_cpp.c_size_t(1),
+                            llama_cpp_python.c_float(self.params.tfs_z),
+                            min_keep=llama_cpp_python.c_size_t(1),
                         )
-                        llama_cpp.llama_sample_typical(
+                        llama_cpp_python.llama_sample_typical(
                             self.ctx,
                             candidates_p,
-                            llama_cpp.c_float(self.params.typical_p),
-                            min_keep=llama_cpp.c_size_t(1),
+                            llama_cpp_python.c_float(self.params.typical_p),
+                            min_keep=llama_cpp_python.c_size_t(1),
                         )
-                        llama_cpp.llama_sample_top_p(
+                        llama_cpp_python.llama_sample_top_p(
                             self.ctx,
                             candidates_p,
-                            llama_cpp.c_float(self.params.top_p),
-                            min_keep=llama_cpp.c_size_t(1),
+                            llama_cpp_python.c_float(self.params.top_p),
+                            min_keep=llama_cpp_python.c_size_t(1),
                         )
-                        llama_cpp.llama_sample_temperature(
-                            self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)
+                        llama_cpp_python.llama_sample_temperature(
+                            self.ctx, candidates_p, llama_cpp_python.c_float(self.params.temp)
                         )
-                        id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
+                        id = llama_cpp_python.llama_sample_token(self.ctx, candidates_p)
                 # print("`{}`".format(candidates_p.size))
 
                 self.last_n_tokens.pop(0)
@@ -539,7 +539,7 @@ def generate(self):
 
                 # replace end of text token with newline token when in interactive mode
                 if (
-                    id == llama_cpp.llama_token_eos(self.ctx)
+                    id == llama_cpp_python.llama_token_eos(self.ctx)
                     and self.params.interactive
                     and not self.params.instruct
                 ):
@@ -599,7 +599,7 @@ def generate(self):
                     break
 
             # end of text token
-            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(
+            if len(self.embd) > 0 and self.embd[-1] == llama_cpp_python.llama_token_eos(
                 self.ctx
             ):
                 if not self.params.instruct:
@@ -629,14 +629,14 @@ def __exit__(self, type, value, tb):
         self.exit()
 
     def exit(self):
-        llama_cpp.llama_free(self.ctx)
+        llama_cpp_python.llama_free(self.ctx)
         self.set_color(util.CONSOLE_COLOR_DEFAULT)
 
     def token_to_str(self, token_id: int) -> bytes:
         size = 32
         buffer = (ctypes.c_char * size)()
-        n = llama_cpp.llama_token_to_piece(
-            self.model, llama_cpp.llama_token(token_id), buffer, size
+        n = llama_cpp_python.llama_token_to_piece(
+            self.model, llama_cpp_python.llama_token(token_id), buffer, size
         )
         assert n <= size
         return bytes(buffer[:n])
diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py
index ba3545771d..928c00bcfd 100644
--- a/examples/low_level_api/low_level_api_llama_cpp.py
+++ b/examples/low_level_api/low_level_api_llama_cpp.py
@@ -2,32 +2,32 @@
 import os
 import multiprocessing
 
-import llama_cpp
+import llama_cpp_python
 
-llama_cpp.llama_backend_init(numa=False)
+llama_cpp_python.llama_backend_init(numa=False)
 
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get("MODEL", "../models/7B/ggml-model.bin")
 
 prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"
 
-lparams = llama_cpp.llama_model_default_params()
-cparams = llama_cpp.llama_context_default_params()
-model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
-ctx = llama_cpp.llama_new_context_with_model(model, cparams)
+lparams = llama_cpp_python.llama_model_default_params()
+cparams = llama_cpp_python.llama_context_default_params()
+model = llama_cpp_python.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
+ctx = llama_cpp_python.llama_new_context_with_model(model, cparams)
 
 # determine the required inference memory per token:
 tmp = [0, 1, 2, 3]
-llama_cpp.llama_eval(
-    ctx=ctx, tokens=(llama_cpp.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0
+llama_cpp_python.llama_eval(
+    ctx=ctx, tokens=(llama_cpp_python.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0
 )  # Deprecated
 
 n_past = 0
 
 prompt = b" " + prompt
 
-embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
-n_of_tok = llama_cpp.llama_tokenize(
+embd_inp = (llama_cpp_python.llama_token * (len(prompt) + 1))()
+n_of_tok = llama_cpp_python.llama_tokenize(
     model=model,
     text=bytes(str(prompt), "utf-8"),
     text_len=len(embd_inp),
@@ -38,7 +38,7 @@
 )
 embd_inp = embd_inp[:n_of_tok]
 
-n_ctx = llama_cpp.llama_n_ctx(ctx)
+n_ctx = llama_cpp_python.llama_n_ctx(ctx)
 
 n_predict = 20
 n_predict = min(n_predict, n_ctx - len(embd_inp))
@@ -59,9 +59,9 @@
 
 while remaining_tokens > 0:
     if len(embd) > 0:
-        llama_cpp.llama_eval(
+        llama_cpp_python.llama_eval(
             ctx=ctx,
-            tokens=(llama_cpp.c_int * len(embd))(*embd),
+            tokens=(llama_cpp_python.c_int * len(embd))(*embd),
             n_tokens=len(embd),
             n_past=n_past,
         )  # Deprecated
@@ -69,21 +69,21 @@
     n_past += len(embd)
     embd = []
     if len(embd_inp) <= input_consumed:
-        logits = llama_cpp.llama_get_logits(ctx)
-        n_vocab = llama_cpp.llama_n_vocab(model)
+        logits = llama_cpp_python.llama_get_logits(ctx)
+        n_vocab = llama_cpp_python.llama_n_vocab(model)
 
-        _arr = (llama_cpp.llama_token_data * n_vocab)(
+        _arr = (llama_cpp_python.llama_token_data * n_vocab)(
             *[
-                llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
+                llama_cpp_python.llama_token_data(token_id, logits[token_id], 0.0)
                 for token_id in range(n_vocab)
             ]
         )
-        candidates_p = llama_cpp.ctypes.pointer(
-            llama_cpp.llama_token_data_array(_arr, len(_arr), False)
+        candidates_p = llama_cpp_python.ctypes.pointer(
+            llama_cpp_python.llama_token_data_array(_arr, len(_arr), False)
         )
 
-        _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
-        llama_cpp.llama_sample_repetition_penalties(
+        _arr = (llama_cpp_python.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
+        llama_cpp_python.llama_sample_repetition_penalties(
             ctx,
             candidates_p,
             _arr,
@@ -93,10 +93,10 @@
             penalty_present=presence_penalty,
         )
 
-        llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1)
-        llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1)
-        llama_cpp.llama_sample_temperature(ctx, candidates_p, temp=0.2)
-        id = llama_cpp.llama_sample_token(ctx, candidates_p)
+        llama_cpp_python.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1)
+        llama_cpp_python.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1)
+        llama_cpp_python.llama_sample_temperature(ctx, candidates_p, temp=0.2)
+        id = llama_cpp_python.llama_sample_token(ctx, candidates_p)
 
         last_n_tokens_data = last_n_tokens_data[1:] + [id]
         embd.append(id)
@@ -113,8 +113,8 @@
         for id in embd:
             size = 32
             buffer = (ctypes.c_char * size)()
-            n = llama_cpp.llama_token_to_piece(
-                model, llama_cpp.llama_token(id), buffer, size
+            n = llama_cpp_python.llama_token_to_piece(
+                model, llama_cpp_python.llama_token(id), buffer, size
             )
             assert n <= size
             print(
@@ -123,11 +123,11 @@
                 flush=True,
             )
 
-    if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx):
+    if len(embd) > 0 and embd[-1] == llama_cpp_python.llama_token_eos(ctx):
         break
 
 print()
 
-llama_cpp.llama_print_timings(ctx)
+llama_cpp_python.llama_print_timings(ctx)
 
-llama_cpp.llama_free(ctx)
+llama_cpp_python.llama_free(ctx)
diff --git a/examples/low_level_api/quantize.py b/examples/low_level_api/quantize.py
index 057ac389eb..89620681fe 100644
--- a/examples/low_level_api/quantize.py
+++ b/examples/low_level_api/quantize.py
@@ -1,6 +1,6 @@
 import os
 import argparse
-import llama_cpp
+import llama_cpp_python
 
 
 def main(args):
@@ -11,9 +11,9 @@ def main(args):
     if os.path.exists(fname_out):
         raise RuntimeError(f"Output file already exists ({fname_out})")
     ftype = args.type
-    args = llama_cpp.llama_model_quantize_default_params()
+    args = llama_cpp_python.llama_model_quantize_default_params()
     args.ftype = ftype
-    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args)
+    return_code = llama_cpp_python.llama_model_quantize(fname_inp, fname_out, args)
     if return_code != 0:
         raise RuntimeError("Failed to quantize model")
 
@@ -25,7 +25,7 @@ def main(args):
     parser.add_argument(
         "type",
         type=int,
-        help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum",
+        help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp_python.py for enum",
     )
     args = parser.parse_args()
     main(args)
diff --git a/examples/ray/llm.py b/examples/ray/llm.py
index 2325dd303b..855a164e3e 100755
--- a/examples/ray/llm.py
+++ b/examples/ray/llm.py
@@ -2,7 +2,7 @@
 from typing import Dict
 from ray import serve
 from ray.serve import Application
-from llama_cpp import Llama
+from llama_cpp_python import Llama
 
 
 @serve.deployment
diff --git a/llama.cpp-server.cmd b/llama.cpp-server.cmd
new file mode 100644
index 0000000000..76d261cf48
--- /dev/null
+++ b/llama.cpp-server.cmd
@@ -0,0 +1,2 @@
+venv\Scripts\python.exe -m llama_cpp_python.server --config_file config.json
+pause
\ No newline at end of file
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 994d5f1497..0aff348442 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -362,6 +362,13 @@ def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: i
             self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep
         )
 
+    def sample_tail_free(
+        self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int
+    ):
+        llama_cpp.llama_sample_tail_free(
+            self.ctx, llama_cpp.byref(candidates.candidates), z, min_keep
+        )
+
     def sample_typical(
         self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
     ):
@@ -678,6 +685,9 @@ def sample(
                 ctx_main.sample_top_k(
                     token_data_array, self.params.top_k, min_keep=min_keep
                 )
+                ctx_main.sample_tail_free(
+                    token_data_array, self.params.tfs_z, min_keep=min_keep
+                )
                 ctx_main.sample_typical(
                     token_data_array, self.params.typical_p, min_keep=min_keep
                 )
@@ -766,6 +776,10 @@ def add_min_p(self, p: float, min_keep: int):
         sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep)
         self._add_sampler(sampler)
 
+    def add_tail_free(self, z: float, min_keep: int):
+        sampler = llama_cpp.llama_sampler_init_tail_free(z, min_keep)
+        self._add_sampler(sampler)
+
     def add_typical(self, p: float, min_keep: int):
         sampler = llama_cpp.llama_sampler_init_typical(p, min_keep)
         self._add_sampler(sampler)
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index d15a88b00e..babb30cf09 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -745,6 +745,7 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p):
                 n_probs = 0
                 min_keep = max(1, n_probs)
                 sampler.add_top_k(top_k)
+                sampler.add_tail_free(tfs_z, min_keep)
                 sampler.add_typical(typical_p, min_keep)
                 sampler.add_top_p(top_p, min_keep)
                 sampler.add_min_p(min_p, min_keep)
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 457c6dddb4..97c9691363 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -464,8 +464,6 @@ class llama_token_data(ctypes.Structure):
 
 
 # typedef struct llama_token_data_array {
-#     // TODO: consider SoA
-#     // NOTE: this pointer can be modified by the samplers
 #     llama_token_data * data;
 #     size_t size;
 #     int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -509,11 +507,8 @@ class llama_token_data_array(ctypes.Structure):
 # // - token  : the token ids of the input (used when embd is NULL)
 # // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
 # // - pos    : the positions of the respective token in the sequence
-# //            (if set to NULL, the token position will be tracked automatically by llama_decode)
 # // - seq_id : the sequence to which the respective token belongs
-# //            (if set to NULL, the sequence ID will be assumed to be 0)
 # // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
-# //            (if set to NULL, only the logits for last token will be returned)
 # //
 # typedef struct llama_batch {
 #     int32_t n_tokens;
@@ -524,6 +519,16 @@ class llama_token_data_array(ctypes.Structure):
 #     int32_t      *  n_seq_id;
 #     llama_seq_id ** seq_id;
 #     int8_t       *  logits; // TODO: rename this to "output"
+
+
+#     // NOTE: helpers for smooth API transition - can be deprecated in the future
+#     //       for future-proof code, use the above fields instead and ignore everything below
+#     //
+#     // pos[i] = all_pos_0 + i*all_pos_1
+#     //
+#     llama_pos    all_pos_0;  // used if pos == NULL
+#     llama_pos    all_pos_1;  // used if pos == NULL
+#     llama_seq_id all_seq_id; // used if seq_id == NULL
 # } llama_batch;
 class llama_batch(ctypes.Structure):
     """Input data for llama_decode
@@ -558,6 +563,9 @@ class llama_batch(ctypes.Structure):
         ("n_seq_id", ctypes.POINTER(ctypes.c_int32)),
         ("seq_id", ctypes.POINTER(ctypes.POINTER(llama_seq_id))),
         ("logits", ctypes.POINTER(ctypes.c_int8)),
+        ("all_pos_0", llama_pos),
+        ("all_pos_1", llama_pos),
+        ("all_seq_id", llama_seq_id),
     ]
 
 
@@ -1162,12 +1170,6 @@ def llama_supports_gpu_offload() -> bool:
     ...
 
 
-# LLAMA_API bool llama_supports_rpc        (void);
-@ctypes_function("llama_supports_rpc", [], ctypes.c_bool)
-def llama_supports_rpc() -> bool:
-    ...
-
-
 # LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
 @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_ctx(ctx: llama_context_p, /) -> int:
@@ -2253,26 +2255,30 @@ def llama_state_seq_load_file(
 # //
 
 
-# // Return batch for single sequence of tokens
-# // The sequence ID will be fixed to 0
-# // The position of the tokens will be tracked automatically by llama_decode
+# // Return batch for single sequence of tokens starting at pos_0
 # //
 # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
 # //
 # LLAMA_API struct llama_batch llama_batch_get_one(
 #               llama_token * tokens,
-#                   int32_t   n_tokens);
+#                   int32_t   n_tokens,
+#                 llama_pos   pos_0,
+#              llama_seq_id   seq_id);
 @ctypes_function(
     "llama_batch_get_one",
     [
         llama_token_p,
-        ctypes.c_int32,
+        ctypes.c_int,
+        llama_pos,
+        llama_seq_id,
     ],
     llama_batch,
 )
 def llama_batch_get_one(
     tokens: CtypesArray[llama_token],
     n_tokens: Union[ctypes.c_int, int],
+    pos_0: Union[llama_pos, int],
+    seq_id: llama_seq_id,
     /,
 ) -> llama_batch:
     """Return batch for single sequence of tokens starting at pos_0
@@ -2610,13 +2616,6 @@ def llama_token_eos(model: llama_model_p, /) -> int:
     ...
 
 
-# LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
-@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
-def llama_token_eot(model: llama_model_p, /) -> int:
-    """end-of-turn"""
-    ...
-
-
 # LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
 @ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token)
 def llama_token_cls(model: llama_model_p, /) -> int:
@@ -2651,54 +2650,30 @@ def llama_add_eos_token(model: llama_model_p, /) -> bool:
 
 
 # // Codellama infill tokens
-# DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
+# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
 @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token)
 def llama_token_prefix(model: llama_model_p) -> int:
     """codellama infill tokens"""
     ...
 
 
-# DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
+# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
 @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
 def llama_token_middle(model: llama_model_p, /) -> int:
     ...
 
 
-# DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
+# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
 @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
 def llama_token_suffix(model: llama_model_p, /) -> int:
     ...
 
 
-# LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
-@ctypes_function("llama_token_fim_pre", [llama_model_p_ctypes], llama_token)
-def llama_token_fim_pre(model: llama_model_p, /) -> int:
-    ...
-
-# LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
-@ctypes_function("llama_token_fim_suf", [llama_model_p_ctypes], llama_token)
-def llama_token_fim_suf(model: llama_model_p, /) -> int:
-    ...
-
-# LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
-@ctypes_function("llama_token_fim_mid", [llama_model_p_ctypes], llama_token)
-def llama_token_fim_mid(model: llama_model_p, /) -> int:
-    ...
-
-# LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
-@ctypes_function("llama_token_fim_pad", [llama_model_p_ctypes], llama_token)
-def llama_token_fim_pad(model: llama_model_p, /) -> int:
-    ...
-
-# LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
-@ctypes_function("llama_token_fim_rep", [llama_model_p_ctypes], llama_token)
-def llama_token_fim_rep(model: llama_model_p, /) -> int:
+# LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
+@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
+def llama_token_eot(model: llama_model_p, /) -> int:
     ...
 
-# LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
-@ctypes_function("llama_token_fim_sep", [llama_model_p_ctypes], llama_token)
-def llama_token_fim_sep(model: llama_model_p, /) -> int:
-    ...
 
 # //
 # // Tokenization
@@ -2811,23 +2786,6 @@ def llama_token_to_piece(
     ...
 
 
-# # // check if token0 is contained as a prefix in token1
-# # LLAMA_API bool llama_token_is_prefix(
-# #           const struct llama_model * model,
-# #                        llama_token   token0,
-# #                        llama_token   token1);
-# @ctypes_function(
-#     "llama_token_is_prefix",
-#     [llama_model_p_ctypes, llama_token, llama_token],
-#     ctypes.c_bool,
-# )
-# def llama_token_is_prefix(
-#     model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], /
-# ) -> bool:
-#     """Check if token0 is contained as a prefix in token1"""
-#     ...
-
-
 # /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
 # /// @param text The char pointer must be large enough to hold the resulting text.
 # /// @return Returns the number of chars/bytes on success, no more than text_len_max.
@@ -3141,22 +3099,20 @@ def llama_sampler_chain_remove(
 
 # // available samplers:
 #
-# LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+# LLAMA_API struct llama_sampler * llama_sampler_init_greedy     (void);
 @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes)
 def llama_sampler_init_greedy() -> llama_sampler_p:
     ...
 
 
-# LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
+# LLAMA_API struct llama_sampler * llama_sampler_init_dist       (uint32_t seed);
 @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes)
 def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
     ...
 
 
 # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
-#     "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
+# LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void);
 @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
 def llama_sampler_init_softmax() -> llama_sampler_p:
     ...
@@ -3191,6 +3147,17 @@ def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p:
     ...
 
 
+# /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+# LLAMA_API struct llama_sampler * llama_sampler_init_tail_free  (float   z, size_t min_keep);
+@ctypes_function(
+    "llama_sampler_init_tail_free",
+    [ctypes.c_float, ctypes.c_size_t],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_tail_free(z: float, min_keep: int) -> llama_sampler_p:
+    ...
+
+
 # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
 # LLAMA_API struct llama_sampler * llama_sampler_init_typical    (float   p, size_t min_keep);
 @ctypes_function(
@@ -3221,19 +3188,6 @@ def llama_sampler_init_temp_ext(
     ...
 
 
-# /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
-# LLAMA_API struct llama_sampler * llama_sampler_init_xtc        (float   p, float   t,     size_t min_keep, uint32_t seed);
-@ctypes_function(
-    "llama_sampler_init_xtc",
-    [ctypes.c_float, ctypes.c_float, ctypes.c_size_t, ctypes.c_uint32],
-    llama_sampler_p_ctypes,
-)
-def llama_sampler_init_xtc(
-    p: float, t: float, min_keep: int, seed: int, /
-) -> llama_sampler_p:
-    ...
-
-
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 # /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -3332,41 +3286,6 @@ def llama_sampler_init_penalties(
     ...
 
 
-# ///  @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
-# LLAMA_API struct llama_sampler *    llama_sampler_init_dry(
-#         const struct llama_model *  model,
-#                            float    dry_multiplier,
-#                            float    dry_base,
-#                          int32_t    dry_allowed_length,
-#                          int32_t    dry_penalty_last_n,
-#                       const char ** seq_breakers,
-#                           size_t    num_breakers);
-@ctypes_function(
-    "llama_sampler_init_dry",
-    [
-        llama_model_p_ctypes,
-        ctypes.c_float,
-        ctypes.c_float,
-        ctypes.c_int32,
-        ctypes.c_int32,
-        ctypes.POINTER(ctypes.c_char_p),
-        ctypes.c_size_t,
-    ],
-    llama_sampler_p_ctypes,
-)
-def llama_sampler_init_dry(
-    model: llama_model_p,
-    dry_multiplier: float,
-    dry_base: float,
-    dry_allowed_length: int,
-    dry_penalty_last_n: int,
-    seq_breakers: CtypesArray[bytes],
-    num_breakers: int,
-    /,
-) -> llama_sampler_p:
-    ...
-
-
 # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
 #                          int32_t   n_vocab,
 #                          int32_t   n_logit_bias,
@@ -3382,39 +3301,6 @@ def llama_sampler_init_logit_bias(
     ...
 
 
-# // this sampler is meant to be used for fill-in-the-middle infilling
-# // it's supposed to be used after top_k + top_p sampling
-# //
-# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
-# // 2. combine probs of tokens that have the same prefix
-# //
-# // example:
-# //
-# // - before:
-# //   "hel":   0.5
-# //   "hell":  0.2
-# //   "hello": 0.1
-# //   "dummy": 0.1
-# //
-# // - after:
-# //   "hel":   0.8
-# //   "dummy": 0.1
-# //
-# // 3. discard non-EOG tokens with low prob
-# // 4. if no tokens are left -> pick EOT
-# //
-# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
-@ctypes_function(
-    "llama_sampler_init_infill",
-    [llama_model_p_ctypes],
-    llama_sampler_p_ctypes,
-)
-def llama_sampler_init_infill(model: llama_model_p, /) -> llama_sampler_p:
-    """This sampler is meant to be used for fill-in-the-middle infilling.
-    """
-    ...
-
-
 # // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
 # LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
 @ctypes_function(
diff --git a/mkdocs.yml b/mkdocs.yml
index 79a9e67a1a..a8bf03b0c2 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -41,7 +41,7 @@ plugins:
             - https://numpy.org/doc/stable/objects.inv
 
 watch:
-  - llama_cpp
+  - llama_cpp_python
   - README.md
 
 nav:
diff --git a/tests/test_llama.py b/tests/test_llama.py
index fc182ae20f..f2b3066f35 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -4,6 +4,7 @@
 import numpy as np
 from scipy.special import log_softmax
 
+import llama_cpp_python
 from huggingface_hub import hf_hub_download
 
 import pytest
@@ -20,7 +21,7 @@ def test_llama_cpp_version():
 
 
 def test_llama_cpp_tokenization():
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False)
+    llama = llama_cpp_python.Llama(model_path=MODEL, vocab_only=True, verbose=False)
 
     assert llama
     assert llama._ctx.ctx is not None
@@ -90,6 +91,11 @@ def test_real_model(llama_cpp_model_path):
     assert tokens == [9707, 11, 1879, 0]
 
     tokens = model.tokenize(b"The quick brown fox jumps", add_bos=True, special=True)
+def test_llama_patch(mock_llama):
+    n_ctx = 128
+    llama = llama_cpp_python.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)
+    n_vocab = llama_cpp_python.llama_n_vocab(llama._model.model)
+    assert n_vocab == 32000
 
     batch = internals.LlamaBatch(n_tokens=len(tokens), embd=0, n_seq_max=1)
 
@@ -216,3 +222,6 @@ def logit_processor_func(input_ids, logits):
 
     assert number_1 != number_2
     assert number_1 == number_3
+
+def test_llama_cpp_version():
+    assert llama_cpp_python.__version__
diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py
index f031bf72b7..1633d9da43 100644
--- a/tests/test_llama_chat_format.py
+++ b/tests/test_llama_chat_format.py
@@ -2,13 +2,13 @@
 
 import jinja2
 
-from llama_cpp import (
+from llama_cpp_python import (
     ChatCompletionRequestUserMessage,
 )
-import llama_cpp.llama_types as llama_types
-import llama_cpp.llama_chat_format as llama_chat_format
+import llama_cpp_python.llama_types as llama_types
+import llama_cpp_python.llama_chat_format as llama_chat_format
 
-from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter
+from llama_cpp_python.llama_chat_format import hf_tokenizer_config_to_chat_formatter
 
 def test_mistral_instruct():
     chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
diff --git a/tests/test_llama_grammar.py b/tests/test_llama_grammar.py
index 34ef2874df..e8fe9e3b08 100644
--- a/tests/test_llama_grammar.py
+++ b/tests/test_llama_grammar.py
@@ -1,4 +1,4 @@
-import llama_cpp
+import llama_cpp_python
 import json
 
 tree = """
@@ -9,7 +9,7 @@
 
 
 def test_grammar_from_string():
-    grammar = llama_cpp.LlamaGrammar.from_string(tree)
+    grammar = llama_cpp_python.LlamaGrammar.from_string(tree)
     # assert grammar._n_rules == 3
     # assert grammar._start_rule_index == 2
     # assert grammar.grammar is not None
@@ -47,7 +47,7 @@ class B(BaseModel):
         "type": "object",
     }
 
-    grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(schema))
+    grammar = llama_cpp_python.LlamaGrammar.from_json_schema(json.dumps(schema))
 
     # assert grammar.grammar is not None
 
@@ -73,6 +73,6 @@ def test_grammar_anyof():
         "type": "object",
     }
 
-    grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(sch))
+    grammar = llama_cpp_python.LlamaGrammar.from_json_schema(json.dumps(sch))
 
     # assert grammar.grammar is not None
diff --git a/tests/test_llama_speculative.py b/tests/test_llama_speculative.py
index b5d450567b..fe8e3545ab 100644
--- a/tests/test_llama_speculative.py
+++ b/tests/test_llama_speculative.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+from llama_cpp_python.llama_speculative import LlamaPromptLookupDecoding
 
 def test_find_candidate_pred_tokens():
     find_candidate_pred_tokens = LlamaPromptLookupDecoding.find_candidate_pred_tokens