diff --git a/README.md b/README.md index dbaec50775..8ebf49c78a 100644 --- a/README.md +++ b/README.md @@ -272,20 +272,20 @@ The high-level API provides a simple managed interface through the [`Llama`](htt Below is a short example demonstrating how to use the high-level API to for basic text completion: ```python -from llama_cpp import Llama +from llama_cpp_python import Llama llm = Llama( - model_path="./models/7B/llama-model.gguf", - # n_gpu_layers=-1, # Uncomment to use GPU acceleration - # seed=1337, # Uncomment to set a specific seed - # n_ctx=2048, # Uncomment to increase the context window + model_path="./models/7B/llama-model.gguf", + # n_gpu_layers=-1, # Uncomment to use GPU acceleration + # seed=1337, # Uncomment to set a specific seed + # n_ctx=2048, # Uncomment to increase the context window ) output = llm( - "Q: Name the planets in the solar system? A: ", # Prompt - max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window - stop=["Q:", "\n"], # Stop generating just before the model would generate a new question - echo=True # Echo the prompt back in the output -) # Generate a completion, can also call create_completion + "Q: Name the planets in the solar system? A: ", # Prompt + max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window + stop=["Q:", "\n"], # Stop generating just before the model would generate a new question + echo=True # Echo the prompt back in the output +) # Generate a completion, can also call create_completion print(output) ``` @@ -346,19 +346,20 @@ The model will will format the messages into a single prompt using the following Set `verbose=True` to see the selected chat format. 
```python -from llama_cpp import Llama +from llama_cpp_python import Llama + llm = Llama( - model_path="path/to/llama-2/llama-model.gguf", - chat_format="llama-2" + model_path="path/to/llama-2/llama-model.gguf", + chat_format="llama-2" ) llm.create_chat_completion( - messages = [ - {"role": "system", "content": "You are an assistant who perfectly describes images."}, - { - "role": "user", - "content": "Describe this image in detail please." - } - ] + messages=[ + {"role": "system", "content": "You are an assistant who perfectly describes images."}, + { + "role": "user", + "content": "Describe this image in detail please." + } + ] ) ``` @@ -376,7 +377,8 @@ To constrain chat responses to only valid JSON or a specific JSON Schema use the The following example will constrain the response to valid JSON strings only. ```python -from llama_cpp import Llama +from llama_cpp_python import Llama + llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") llm.create_chat_completion( messages=[ @@ -398,7 +400,8 @@ llm.create_chat_completion( To constrain the response further to a specific JSON Schema add the schema to the `schema` property of the `response_format` argument. ```python -from llama_cpp import Llama +from llama_cpp_python import Llama + llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") llm.create_chat_completion( messages=[ @@ -425,47 +428,48 @@ llm.create_chat_completion( The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format. ```python -from llama_cpp import Llama +from llama_cpp_python import Llama + llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling") llm.create_chat_completion( - messages = [ + messages=[ { - "role": "system", - "content": "A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary" + "role": "system", + "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary" }, { - "role": "user", - "content": "Extract Jason is 25 years old" + "role": "user", + "content": "Extract Jason is 25 years old" } - ], - tools=[{ + ], + tools=[{ "type": "function", "function": { - "name": "UserDetail", - "parameters": { - "type": "object", - "title": "UserDetail", - "properties": { - "name": { - "title": "Name", - "type": "string" - }, - "age": { - "title": "Age", - "type": "integer" - } - }, - "required": [ "name", "age" ] - } + "name": "UserDetail", + "parameters": { + "type": "object", + "title": "UserDetail", + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "age": { + "title": "Age", + "type": "integer" + } + }, + "required": ["name", "age"] + } } - }], - tool_choice={ + }], + tool_choice={ "type": "function", "function": { - "name": "UserDetail" + "name": "UserDetail" } - } + } ) ``` @@ -477,13 +481,14 @@ The various gguf-converted files for this set of models can be found [here](http Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide HF Tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files. 
```python -from llama_cpp import Llama -from llama_cpp.llama_tokenizer import LlamaHFTokenizer +from llama_cpp_python import Llama +from llama_cpp_python.llama_tokenizer import LlamaHFTokenizer + llm = Llama.from_pretrained( - repo_id="meetkai/functionary-small-v2.2-GGUF", - filename="functionary-small-v2.2.q4_0.gguf", - chat_format="functionary-v2", - tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF") + repo_id="meetkai/functionary-small-v2.2-GGUF", + filename="functionary-small-v2.2.q4_0.gguf", + chat_format="functionary-v2", + tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF") ) ``` @@ -509,22 +514,24 @@ Below are the supported multi-modal models and their respective chat handlers (P Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. ```python -from llama_cpp import Llama -from llama_cpp.llama_chat_format import Llava15ChatHandler +from llama_cpp_python import Llama +from llama_cpp_python.llama_chat_format import Llava15ChatHandler + chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin") llm = Llama( - model_path="./path/to/llava/llama-model.gguf", - chat_handler=chat_handler, - n_ctx=2048, # n_ctx should be increased to accommodate the image embedding + model_path="./path/to/llava/llama-model.gguf", + chat_handler=chat_handler, + n_ctx=2048, # n_ctx should be increased to accommodate the image embedding ) llm.create_chat_completion( - messages = [ + messages=[ {"role": "system", "content": "You are an assistant who perfectly describes images."}, { "role": "user", "content": [ - {"type" : "text", "text": "What's in this image?"}, - {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } } + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", 
"image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}} ] } ] @@ -534,28 +541,29 @@ llm.create_chat_completion( You can also pull the model from the Hugging Face Hub using the `from_pretrained` method. ```python -from llama_cpp import Llama -from llama_cpp.llama_chat_format import MoondreamChatHandler +from llama_cpp_python import Llama +from llama_cpp_python.llama_chat_format import MoondreamChatHandler chat_handler = MoondreamChatHandler.from_pretrained( - repo_id="vikhyatk/moondream2", - filename="*mmproj*", + repo_id="vikhyatk/moondream2", + filename="*mmproj*", ) llm = Llama.from_pretrained( - repo_id="vikhyatk/moondream2", - filename="*text-model*", - chat_handler=chat_handler, - n_ctx=2048, # n_ctx should be increased to accommodate the image embedding + repo_id="vikhyatk/moondream2", + filename="*text-model*", + chat_handler=chat_handler, + n_ctx=2048, # n_ctx should be increased to accommodate the image embedding ) response = llm.create_chat_completion( - messages = [ + messages=[ { "role": "user", "content": [ - {"type" : "text", "text": "What's in this image?"}, - {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } } + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}} ] } @@ -607,12 +615,13 @@ The fastest way to use speculative decoding is through the `LlamaPromptLookupDec Just pass this as a draft model to the `Llama` class during initialization. 
```python -from llama_cpp import Llama -from llama_cpp.llama_speculative import LlamaPromptLookupDecoding +from llama_cpp_python import Llama +from llama_cpp_python.llama_speculative import LlamaPromptLookupDecoding llama = Llama( model_path="path/to/model.gguf", - draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. + draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) + # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. ) ``` @@ -621,9 +630,9 @@ llama = Llama( To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding) or [`embed`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly. 
```python -import llama_cpp +import llama_cpp_python -llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True) +llm = llama_cpp_python.Llama(model_path="path/to/model.gguf", embedding=True) embeddings = llm.create_embedding("Hello, world!") @@ -657,14 +666,14 @@ To install the server package and get started: ```bash pip install 'llama-cpp-python[server]' -python3 -m llama_cpp.server --model models/7B/llama-model.gguf +python3 -m llama_cpp_python.server --model models/7B/llama-model.gguf ``` Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this: ```bash CMAKE_ARGS="-DGGML_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]' -python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35 +python3 -m llama_cpp_python.server --model models/7B/llama-model.gguf --n_gpu_layers 35 ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. @@ -675,16 +684,16 @@ Similarly, to change the port (default is 8000), use `--port`. You probably also want to set the prompt format. For chatml, use ```bash -python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format chatml +python3 -m llama_cpp_python.server --model models/7B/llama-model.gguf --chat_format chatml ``` That will format the prompt according to how model expects it. You can find the prompt format in the model card. -For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format". +For possible options, see [llama_cpp_python/llama_chat_format.py](llama_cpp_python/llama_chat_format.py) and look for lines starting with "@register_chat_format". If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub. 
```bash -python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf' +python3 -m llama_cpp_python.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf' ``` ### Web Server Features @@ -714,18 +723,20 @@ The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github Below is a short example demonstrating how to use the low-level API to tokenize a prompt: ```python -import llama_cpp +import llama_cpp_python import ctypes -llama_cpp.llama_backend_init(False) # Must be called once at the start of each program -params = llama_cpp.llama_context_default_params() + +llama_cpp_python.llama_backend_init(False) # Must be called once at the start of each program +params = llama_cpp_python.llama_context_default_params() # use bytes for char * params -model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params) -ctx = llama_cpp.llama_new_context_with_model(model, params) +model = llama_cpp_python.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params) +ctx = llama_cpp_python.llama_new_context_with_model(model, params) max_tokens = params.n_ctx # use ctypes arrays for array params -tokens = (llama_cpp.llama_token * int(max_tokens))() -n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True)) -llama_cpp.llama_free(ctx) +tokens = (llama_cpp_python.llama_token * int(max_tokens))() +n_tokens = llama_cpp_python.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, + llama_cpp_python.c_bool(True)) +llama_cpp_python.llama_free(ctx) ``` Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. 
diff --git a/docs/server.md b/docs/server.md index cd6f86c513..f4d04956e4 100644 --- a/docs/server.md +++ b/docs/server.md @@ -19,7 +19,7 @@ pip install llama-cpp-python[server] The server can then be started by running the following command: ```bash -python3 -m llama_cpp.server --model +python3 -m llama_cpp_python.server --model ``` ### Server options @@ -27,7 +27,7 @@ python3 -m llama_cpp.server --model For a full list of options, run: ```bash -python3 -m llama_cpp.server --help +python3 -m llama_cpp_python.server --help ``` NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable. @@ -53,7 +53,7 @@ You'll first need to download one of the available code completion models in GGU Then you'll need to run the OpenAI compatible web server with a increased context size substantially for GitHub Copilot requests: ```bash -python3 -m llama_cpp.server --model --n_ctx 16192 +python3 -m llama_cpp_python.server --model --n_ctx 16192 ``` Then just update your settings in `.vscode/settings.json` to point to your code completion server: @@ -83,7 +83,7 @@ Then when you run the server you'll need to also specify either `functionary-v1` Note that since functionary requires a HF Tokenizer due to discrepancies between llama.cpp and HuggingFace's tokenizers as mentioned [here](https://github.com/abetlen/llama-cpp-python/blob/main?tab=readme-ov-file#function-calling), you will need to pass in the path to the tokenizer too. The tokenizer files are already included in the respective HF repositories hosting the gguf files. 
```bash -python3 -m llama_cpp.server --model --chat_format functionary-v2 --hf_pretrained_model_name_or_path +python3 -m llama_cpp_python.server --model --chat_format functionary-v2 --hf_pretrained_model_name_or_path ``` Check out this [example notebook](https://github.com/abetlen/llama-cpp-python/blob/main/examples/notebooks/Functions.ipynb) for a walkthrough of some interesting use cases for function calling. @@ -104,7 +104,7 @@ You'll first need to download one of the available multi-modal models in GGUF fo Then when you run the server you'll need to also specify the path to the clip model used for image embedding and the `llava-1-5` chat_format ```bash -python3 -m llama_cpp.server --model --clip_model_path --chat_format llava-1-5 +python3 -m llama_cpp_python.server --model --clip_model_path --chat_format llava-1-5 ``` Then you can just use the OpenAI API as normal @@ -138,7 +138,7 @@ print(response) The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable. ```bash -python3 -m llama_cpp.server --config_file +python3 -m llama_cpp_python.server --config_file ``` Config files support all of the server and model options supported by the cli and environment variables however instead of only a single model the config file can specify multiple models. diff --git a/examples/batch-processing/server.py b/examples/batch-processing/server.py index 0b36746f91..9dd0b583de 100644 --- a/examples/batch-processing/server.py +++ b/examples/batch-processing/server.py @@ -1,19 +1,19 @@ """llama-cpp-python server from scratch in a single file. 
""" -# import llama_cpp +# import llama_cpp_python # path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf" -# model_params = llama_cpp.llama_model_default_params() -# model = llama_cpp.llama_load_model_from_file(path, model_params) +# model_params = llama_cpp_python.llama_model_default_params() +# model = llama_cpp_python.llama_load_model_from_file(path, model_params) # if model is None: # raise RuntimeError(f"Failed to load model from file: {path}") -# ctx_params = llama_cpp.llama_context_default_params() -# ctx = llama_cpp.llama_new_context_with_model(model, ctx_params) +# ctx_params = llama_cpp_python.llama_context_default_params() +# ctx = llama_cpp_python.llama_new_context_with_model(model, ctx_params) # if ctx is None: # raise RuntimeError("Failed to create context") diff --git a/examples/gradio_chat/local.py b/examples/gradio_chat/local.py index e16bf234ab..0bdeffddac 100644 --- a/examples/gradio_chat/local.py +++ b/examples/gradio_chat/local.py @@ -1,12 +1,12 @@ -import llama_cpp -import llama_cpp.llama_tokenizer +import llama_cpp_python +import llama_cpp_python.llama_tokenizer import gradio as gr -llama = llama_cpp.Llama.from_pretrained( +llama = llama_cpp_python.Llama.from_pretrained( repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", filename="*q8_0.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( + tokenizer=llama_cpp_python.llama_tokenizer.LlamaHFTokenizer.from_pretrained( "Qwen/Qwen1.5-0.5B" ), verbose=False, diff --git a/examples/hf_pull/main.py b/examples/hf_pull/main.py index dfed17516d..bf064f83cc 100644 --- a/examples/hf_pull/main.py +++ b/examples/hf_pull/main.py @@ -1,11 +1,11 @@ -import llama_cpp -import llama_cpp.llama_tokenizer +import llama_cpp_python +import llama_cpp_python.llama_tokenizer -llama = llama_cpp.Llama.from_pretrained( +llama = llama_cpp_python.Llama.from_pretrained( repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", filename="*q8_0.gguf", - 
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( + tokenizer=llama_cpp_python.llama_tokenizer.LlamaHFTokenizer.from_pretrained( "Qwen/Qwen1.5-0.5B" ), verbose=False, diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index ee59767d65..6ca7bbe263 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -9,26 +9,26 @@ Then run: ``` -uvicorn --factory llama_cpp.server.app:create_app --reload +uvicorn --factory llama_cpp_python.server.app:create_app --reload ``` or ``` -python3 -m llama_cpp.server +python3 -m llama_cpp_python.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. -To actually see the implementation of the server, see llama_cpp/server/app.py +To actually see the implementation of the server, see llama_cpp_python/server/app.py """ import os import uvicorn -from llama_cpp.server.app import create_app +from llama_cpp_python.server.app import create_app if __name__ == "__main__": app = create_app() diff --git a/examples/high_level_api/high_level_api_embedding.py b/examples/high_level_api/high_level_api_embedding.py index feb0ed68d9..85de8654f0 100644 --- a/examples/high_level_api/high_level_api_embedding.py +++ b/examples/high_level_api/high_level_api_embedding.py @@ -1,6 +1,6 @@ import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-model.bin") diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py index e41f375774..fe0de7e253 100644 --- a/examples/high_level_api/high_level_api_inference.py +++ b/examples/high_level_api/high_level_api_inference.py @@ -1,7 +1,7 @@ import json import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama parser = argparse.ArgumentParser() parser.add_argument("-m", 
"--model", type=str, default="../models/7B/ggml-models.bin") diff --git a/examples/high_level_api/high_level_api_infill.py b/examples/high_level_api/high_level_api_infill.py index 282333e5a8..dfe0f754d6 100644 --- a/examples/high_level_api/high_level_api_infill.py +++ b/examples/high_level_api/high_level_api_infill.py @@ -1,6 +1,6 @@ import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") diff --git a/examples/high_level_api/high_level_api_streaming.py b/examples/high_level_api/high_level_api_streaming.py index 747c6130e3..5e61a15248 100644 --- a/examples/high_level_api/high_level_api_streaming.py +++ b/examples/high_level_api/high_level_api_streaming.py @@ -1,7 +1,7 @@ import json import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") diff --git a/examples/high_level_api/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py index b91632f5bc..8c681d39fe 100644 --- a/examples/high_level_api/langchain_custom_llm.py +++ b/examples/high_level_api/langchain_custom_llm.py @@ -1,6 +1,6 @@ import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama from langchain.llms.base import LLM from typing import Optional, List, Mapping, Any diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 39081be17a..759bdfbdd8 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -16,7 +16,7 @@ from time import time from os import cpu_count, path -import llama_cpp +import llama_cpp_python from common import GptParams, gpt_params_parse, gpt_random_prompt import util @@ -71,7 +71,7 @@ def __init__(self, params: GptParams) -> None: 
self.multibyte_fix = [] # model load - self.lparams = llama_cpp.llama_model_default_params() + self.lparams = llama_cpp_python.llama_model_default_params() self.lparams.n_ctx = self.params.n_ctx self.lparams.n_parts = self.params.n_parts self.lparams.seed = self.params.seed @@ -79,23 +79,23 @@ def __init__(self, params: GptParams) -> None: self.lparams.use_mlock = self.params.use_mlock self.lparams.use_mmap = self.params.use_mmap - self.model = llama_cpp.llama_load_model_from_file( + self.model = llama_cpp_python.llama_load_model_from_file( self.params.model.encode("utf8"), self.lparams ) # Context Params. - self.cparams = llama_cpp.llama_context_default_params() + self.cparams = llama_cpp_python.llama_context_default_params() - self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams) + self.ctx = llama_cpp_python.llama_new_context_with_model(self.model, self.cparams) if not self.ctx: raise RuntimeError(f"error: failed to load model '{self.params.model}'") if self.params.ignore_eos: - self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + self.params.logit_bias[llama_cpp_python.llama_token_eos()] = -float("inf") if len(self.params.lora_adapter) > 0: if ( - llama_cpp.llama_apply_lora_from_file( + llama_cpp_python.llama_apply_lora_from_file( self.ctx, self.params.lora_adapter.encode("utf8"), ( @@ -113,26 +113,26 @@ def __init__(self, params: GptParams) -> None: print(file=sys.stderr) print( f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ -| {llama_cpp.llama_print_system_info().decode('utf8')}", +| {llama_cpp_python.llama_print_system_info().decode('utf8')}", file=sys.stderr, ) # determine the required inference memory per token: if self.params.mem_test: tmp = [0, 1, 2, 3] - llama_cpp.llama_eval( + llama_cpp_python.llama_eval( self.ctx, - (llama_cpp.c_int * len(tmp))(*tmp), + (llama_cpp_python.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads, ) - llama_cpp.llama_print_timings(self.ctx) + 
llama_cpp_python.llama_print_timings(self.ctx) self.exit() return # create internal context - self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) + self.n_ctx = llama_cpp_python.llama_n_ctx(self.ctx) # Add a space in front of the first character to match OG llama tokenizer behavior self.params.prompt = " " + self.params.prompt @@ -142,7 +142,7 @@ def __init__(self, params: GptParams) -> None: with open(self.params.file) as f: self.params.prompt = f.read() - self.session_tokens: list[llama_cpp.llama_token] = [] + self.session_tokens: list[llama_cpp_python.llama_token] = [] if len(self.params.path_session) > 0: print( f"attempting to load saved session from '{self.params.path_session}'", @@ -150,10 +150,10 @@ def __init__(self, params: GptParams) -> None: ) if path.exists(self.params.path_session): - _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() - _n_token_count_out = llama_cpp.c_size_t() + _session_tokens = (llama_cpp_python.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp_python.c_size_t() if ( - llama_cpp.llama_load_session_file( + llama_cpp_python.llama_load_session_file( self.ctx, self.params.path_session.encode("utf8"), _session_tokens, @@ -312,8 +312,8 @@ def __init__(self, params: GptParams) -> None: # tokenize a prompt def _tokenize(self, prompt, bos=True): - _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() - _n = llama_cpp.llama_tokenize( + _arr = (llama_cpp_python.llama_token * ((len(prompt) + 1) * 4))() + _n = llama_cpp_python.llama_tokenize( self.model, prompt.encode("utf8", errors="ignore"), len(prompt), @@ -379,17 +379,17 @@ def generate(self): # TODO BUG: The batching code causes nonsensical generation """for i in range(0, len(self.embd), self.params.n_batch): n_eval = self.params.n_batch - _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) - if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: + _arr = (llama_cpp_python.llama_token * n_eval)(*self.embd[i:i 
+ n_eval]) + if llama_cpp_python.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: print(f"failed to eval") return self.n_past += n_eval""" if ( - llama_cpp.llama_eval( + llama_cpp_python.llama_eval( self.ctx, - (llama_cpp.llama_token * len(self.embd))(*self.embd), + (llama_cpp_python.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, ) @@ -406,7 +406,7 @@ def generate(self): if len(self.embd_inp) <= self.input_consumed: # && !is_interacting # out of user input, sample next token top_k = ( - llama_cpp.llama_n_vocab(self.ctx) + llama_cpp_python.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k ) @@ -419,10 +419,10 @@ def generate(self): # optionally save the session on first sample (for faster prompt loading next time) if len(self.params.path_session) > 0 and self.need_to_save_session: self.need_to_save_session = False - llama_cpp.llama_save_session_file( + llama_cpp_python.llama_save_session_file( self.ctx, self.params.path_session.encode("utf8"), - (llama_cpp.llama_token * len(self.session_tokens))( + (llama_cpp_python.llama_token * len(self.session_tokens))( *self.session_tokens ), len(self.session_tokens), @@ -430,108 +430,108 @@ def generate(self): id = 0 - logits = llama_cpp.llama_get_logits(self.ctx) - n_vocab = llama_cpp.llama_n_vocab(self.model) + logits = llama_cpp_python.llama_get_logits(self.ctx) + n_vocab = llama_cpp_python.llama_n_vocab(self.model) # Apply params.logit_bias map for key, value in self.params.logit_bias.items(): logits[key] += value - _arr = (llama_cpp.llama_token_data * n_vocab)( + _arr = (llama_cpp_python.llama_token_data * n_vocab)( *[ - llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + llama_cpp_python.llama_token_data(token_id, logits[token_id], 0.0) for token_id in range(n_vocab) ] ) - candidates_p = llama_cpp.ctypes.pointer( - llama_cpp.llama_token_data_array(_arr, len(_arr), False) + candidates_p = llama_cpp_python.ctypes.pointer( + 
llama_cpp_python.llama_token_data_array(_arr, len(_arr), False) ) # Apply penalties - nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)] + nl_logit = logits[llama_cpp_python.llama_token_nl(self.ctx)] last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) - _arr = (llama_cpp.llama_token * last_n_repeat)( + _arr = (llama_cpp_python.llama_token * last_n_repeat)( *self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat :] ) - llama_cpp.llama_sample_repetition_penalties( + llama_cpp_python.llama_sample_repetition_penalties( ctx=self.ctx, candidates=candidates_p, last_tokens_data=_arr, penalty_last_n=last_n_repeat, - penalty_repeat=llama_cpp.c_float(self.params.repeat_penalty), - penalty_freq=llama_cpp.c_float(self.params.frequency_penalty), - penalty_present=llama_cpp.c_float(self.params.presence_penalty), + penalty_repeat=llama_cpp_python.c_float(self.params.repeat_penalty), + penalty_freq=llama_cpp_python.c_float(self.params.frequency_penalty), + penalty_present=llama_cpp_python.c_float(self.params.presence_penalty), ) # NOT PRESENT IN CURRENT VERSION ? 
- # llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p, + # llama_cpp_python.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p, # _arr, - # last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) + # last_n_repeat, llama_cpp_python.c_float(self.params.frequency_penalty), llama_cpp_python.c_float(self.params.presence_penalty)) if not self.params.penalize_nl: - logits[llama_cpp.llama_token_nl()] = nl_logit + logits[llama_cpp_python.llama_token_nl()] = nl_logit if self.params.temp <= 0: # Greedy sampling - id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + id = llama_cpp_python.llama_sample_token_greedy(self.ctx, candidates_p) else: if self.params.mirostat == 1: mirostat_mu = 2.0 * self.params.mirostat_tau mirostat_m = 100 - llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + llama_cpp_python.llama_sample_temperature( + self.ctx, candidates_p, llama_cpp_python.c_float(self.params.temp) ) - id = llama_cpp.llama_sample_token_mirostat( + id = llama_cpp_python.llama_sample_token_mirostat( self.ctx, candidates_p, - llama_cpp.c_float(self.params.mirostat_tau), - llama_cpp.c_float(self.params.mirostat_eta), - llama_cpp.c_int(mirostat_m), - llama_cpp.c_float(mirostat_mu), + llama_cpp_python.c_float(self.params.mirostat_tau), + llama_cpp_python.c_float(self.params.mirostat_eta), + llama_cpp_python.c_int(mirostat_m), + llama_cpp_python.c_float(mirostat_mu), ) elif self.params.mirostat == 2: mirostat_mu = 2.0 * self.params.mirostat_tau - llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + llama_cpp_python.llama_sample_temperature( + self.ctx, candidates_p, llama_cpp_python.c_float(self.params.temp) ) - id = llama_cpp.llama_sample_token_mirostat_v2( + id = llama_cpp_python.llama_sample_token_mirostat_v2( self.ctx, candidates_p, - 
llama_cpp.c_float(self.params.mirostat_tau), - llama_cpp.c_float(self.params.mirostat_eta), - llama_cpp.c_float(mirostat_mu), + llama_cpp_python.c_float(self.params.mirostat_tau), + llama_cpp_python.c_float(self.params.mirostat_eta), + llama_cpp_python.c_float(mirostat_mu), ) else: # Temperature sampling - llama_cpp.llama_sample_top_k( + llama_cpp_python.llama_sample_top_k( self.ctx, candidates_p, top_k, - min_keep=llama_cpp.c_size_t(1), + min_keep=llama_cpp_python.c_size_t(1), ) - llama_cpp.llama_sample_tail_free( + llama_cpp_python.llama_sample_tail_free( self.ctx, candidates_p, - llama_cpp.c_float(self.params.tfs_z), - min_keep=llama_cpp.c_size_t(1), + llama_cpp_python.c_float(self.params.tfs_z), + min_keep=llama_cpp_python.c_size_t(1), ) - llama_cpp.llama_sample_typical( + llama_cpp_python.llama_sample_typical( self.ctx, candidates_p, - llama_cpp.c_float(self.params.typical_p), - min_keep=llama_cpp.c_size_t(1), + llama_cpp_python.c_float(self.params.typical_p), + min_keep=llama_cpp_python.c_size_t(1), ) - llama_cpp.llama_sample_top_p( + llama_cpp_python.llama_sample_top_p( self.ctx, candidates_p, - llama_cpp.c_float(self.params.top_p), - min_keep=llama_cpp.c_size_t(1), + llama_cpp_python.c_float(self.params.top_p), + min_keep=llama_cpp_python.c_size_t(1), ) - llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + llama_cpp_python.llama_sample_temperature( + self.ctx, candidates_p, llama_cpp_python.c_float(self.params.temp) ) - id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + id = llama_cpp_python.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) self.last_n_tokens.pop(0) @@ -539,7 +539,7 @@ def generate(self): # replace end of text token with newline token when in interactive mode if ( - id == llama_cpp.llama_token_eos(self.ctx) + id == llama_cpp_python.llama_token_eos(self.ctx) and self.params.interactive and not self.params.instruct ): @@ -599,7 +599,7 @@ def 
generate(self): break # end of text token - if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos( + if len(self.embd) > 0 and self.embd[-1] == llama_cpp_python.llama_token_eos( self.ctx ): if not self.params.instruct: @@ -629,14 +629,14 @@ def __exit__(self, type, value, tb): self.exit() def exit(self): - llama_cpp.llama_free(self.ctx) + llama_cpp_python.llama_free(self.ctx) self.set_color(util.CONSOLE_COLOR_DEFAULT) def token_to_str(self, token_id: int) -> bytes: size = 32 buffer = (ctypes.c_char * size)() - n = llama_cpp.llama_token_to_piece( - self.model, llama_cpp.llama_token(token_id), buffer, size + n = llama_cpp_python.llama_token_to_piece( + self.model, llama_cpp_python.llama_token(token_id), buffer, size ) assert n <= size return bytes(buffer[:n]) diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index ba3545771d..928c00bcfd 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -2,32 +2,32 @@ import os import multiprocessing -import llama_cpp +import llama_cpp_python -llama_cpp.llama_backend_init(numa=False) +llama_cpp_python.llama_backend_init(numa=False) N_THREADS = multiprocessing.cpu_count() MODEL_PATH = os.environ.get("MODEL", "../models/7B/ggml-model.bin") prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" -lparams = llama_cpp.llama_model_default_params() -cparams = llama_cpp.llama_context_default_params() -model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams) -ctx = llama_cpp.llama_new_context_with_model(model, cparams) +lparams = llama_cpp_python.llama_model_default_params() +cparams = llama_cpp_python.llama_context_default_params() +model = llama_cpp_python.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams) +ctx = llama_cpp_python.llama_new_context_with_model(model, cparams) # determine the required inference memory per token: 
tmp = [0, 1, 2, 3] -llama_cpp.llama_eval( - ctx=ctx, tokens=(llama_cpp.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0 +llama_cpp_python.llama_eval( + ctx=ctx, tokens=(llama_cpp_python.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0 ) # Deprecated n_past = 0 prompt = b" " + prompt -embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))() -n_of_tok = llama_cpp.llama_tokenize( +embd_inp = (llama_cpp_python.llama_token * (len(prompt) + 1))() +n_of_tok = llama_cpp_python.llama_tokenize( model=model, text=bytes(str(prompt), "utf-8"), text_len=len(embd_inp), @@ -38,7 +38,7 @@ ) embd_inp = embd_inp[:n_of_tok] -n_ctx = llama_cpp.llama_n_ctx(ctx) +n_ctx = llama_cpp_python.llama_n_ctx(ctx) n_predict = 20 n_predict = min(n_predict, n_ctx - len(embd_inp)) @@ -59,9 +59,9 @@ while remaining_tokens > 0: if len(embd) > 0: - llama_cpp.llama_eval( + llama_cpp_python.llama_eval( ctx=ctx, - tokens=(llama_cpp.c_int * len(embd))(*embd), + tokens=(llama_cpp_python.c_int * len(embd))(*embd), n_tokens=len(embd), n_past=n_past, ) # Deprecated @@ -69,21 +69,21 @@ n_past += len(embd) embd = [] if len(embd_inp) <= input_consumed: - logits = llama_cpp.llama_get_logits(ctx) - n_vocab = llama_cpp.llama_n_vocab(model) + logits = llama_cpp_python.llama_get_logits(ctx) + n_vocab = llama_cpp_python.llama_n_vocab(model) - _arr = (llama_cpp.llama_token_data * n_vocab)( + _arr = (llama_cpp_python.llama_token_data * n_vocab)( *[ - llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + llama_cpp_python.llama_token_data(token_id, logits[token_id], 0.0) for token_id in range(n_vocab) ] ) - candidates_p = llama_cpp.ctypes.pointer( - llama_cpp.llama_token_data_array(_arr, len(_arr), False) + candidates_p = llama_cpp_python.ctypes.pointer( + llama_cpp_python.llama_token_data_array(_arr, len(_arr), False) ) - _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) - llama_cpp.llama_sample_repetition_penalties( + _arr = (llama_cpp_python.c_int * 
len(last_n_tokens_data))(*last_n_tokens_data) + llama_cpp_python.llama_sample_repetition_penalties( ctx, candidates_p, _arr, @@ -93,10 +93,10 @@ penalty_present=presence_penalty, ) - llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1) - llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1) - llama_cpp.llama_sample_temperature(ctx, candidates_p, temp=0.2) - id = llama_cpp.llama_sample_token(ctx, candidates_p) + llama_cpp_python.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1) + llama_cpp_python.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1) + llama_cpp_python.llama_sample_temperature(ctx, candidates_p, temp=0.2) + id = llama_cpp_python.llama_sample_token(ctx, candidates_p) last_n_tokens_data = last_n_tokens_data[1:] + [id] embd.append(id) @@ -113,8 +113,8 @@ for id in embd: size = 32 buffer = (ctypes.c_char * size)() - n = llama_cpp.llama_token_to_piece( - model, llama_cpp.llama_token(id), buffer, size + n = llama_cpp_python.llama_token_to_piece( + model, llama_cpp_python.llama_token(id), buffer, size ) assert n <= size print( @@ -123,11 +123,11 @@ flush=True, ) - if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx): + if len(embd) > 0 and embd[-1] == llama_cpp_python.llama_token_eos(ctx): break print() -llama_cpp.llama_print_timings(ctx) +llama_cpp_python.llama_print_timings(ctx) -llama_cpp.llama_free(ctx) +llama_cpp_python.llama_free(ctx) diff --git a/examples/low_level_api/quantize.py b/examples/low_level_api/quantize.py index 057ac389eb..89620681fe 100644 --- a/examples/low_level_api/quantize.py +++ b/examples/low_level_api/quantize.py @@ -1,6 +1,6 @@ import os import argparse -import llama_cpp +import llama_cpp_python def main(args): @@ -11,9 +11,9 @@ def main(args): if os.path.exists(fname_out): raise RuntimeError(f"Output file already exists ({fname_out})") ftype = args.type - args = llama_cpp.llama_model_quantize_default_params() + args = llama_cpp_python.llama_model_quantize_default_params() args.ftype 
= ftype - return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args) + return_code = llama_cpp_python.llama_model_quantize(fname_inp, fname_out, args) if return_code != 0: raise RuntimeError("Failed to quantize model") @@ -25,7 +25,7 @@ def main(args): parser.add_argument( "type", type=int, - help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum", + help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp_python.py for enum", ) args = parser.parse_args() main(args) diff --git a/examples/ray/llm.py b/examples/ray/llm.py index 2325dd303b..855a164e3e 100755 --- a/examples/ray/llm.py +++ b/examples/ray/llm.py @@ -2,7 +2,7 @@ from typing import Dict from ray import serve from ray.serve import Application -from llama_cpp import Llama +from llama_cpp_python import Llama @serve.deployment diff --git a/llama.cpp-server.cmd b/llama.cpp-server.cmd new file mode 100644 index 0000000000..76d261cf48 --- /dev/null +++ b/llama.cpp-server.cmd @@ -0,0 +1,2 @@ +venv\Scripts\python.exe -m llama_cpp_python.server --config_file config.json +pause \ No newline at end of file diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 994d5f1497..0aff348442 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -362,6 +362,13 @@ def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: i self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep ) + def sample_tail_free( + self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int + ): + llama_cpp.llama_sample_tail_free( + self.ctx, llama_cpp.byref(candidates.candidates), z, min_keep + ) + def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): @@ -678,6 +685,9 @@ def sample( ctx_main.sample_top_k( token_data_array, self.params.top_k, min_keep=min_keep ) + ctx_main.sample_tail_free( + token_data_array, self.params.tfs_z, min_keep=min_keep + ) ctx_main.sample_typical( token_data_array, self.params.typical_p, 
min_keep=min_keep ) @@ -766,6 +776,10 @@ def add_min_p(self, p: float, min_keep: int): sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep) self._add_sampler(sampler) + def add_tail_free(self, z: float, min_keep: int): + sampler = llama_cpp.llama_sampler_init_tail_free(z, min_keep) + self._add_sampler(sampler) + def add_typical(self, p: float, min_keep: int): sampler = llama_cpp.llama_sampler_init_typical(p, min_keep) self._add_sampler(sampler) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d15a88b00e..babb30cf09 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -745,6 +745,7 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): n_probs = 0 min_keep = max(1, n_probs) sampler.add_top_k(top_k) + sampler.add_tail_free(tfs_z, min_keep) sampler.add_typical(typical_p, min_keep) sampler.add_top_p(top_p, min_keep) sampler.add_min_p(min_p, min_keep) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 457c6dddb4..97c9691363 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -464,8 +464,6 @@ class llama_token_data(ctypes.Structure): # typedef struct llama_token_data_array { -# // TODO: consider SoA -# // NOTE: this pointer can be modified by the samplers # llama_token_data * data; # size_t size; # int64_t selected; // this is the index in the data array (i.e. not the token id) @@ -509,11 +507,8 @@ class llama_token_data_array(ctypes.Structure): # // - token : the token ids of the input (used when embd is NULL) # // - embd : token embeddings (i.e. 
float vector of size n_embd) (used when token is NULL) # // - pos : the positions of the respective token in the sequence -# // (if set to NULL, the token position will be tracked automatically by llama_decode) # // - seq_id : the sequence to which the respective token belongs -# // (if set to NULL, the sequence ID will be assumed to be 0) # // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output -# // (if set to NULL, only the logits for last token will be returned) # // # typedef struct llama_batch { # int32_t n_tokens; @@ -524,6 +519,16 @@ class llama_token_data_array(ctypes.Structure): # int32_t * n_seq_id; # llama_seq_id ** seq_id; # int8_t * logits; // TODO: rename this to "output" + + +# // NOTE: helpers for smooth API transition - can be deprecated in the future +# // for future-proof code, use the above fields instead and ignore everything below +# // +# // pos[i] = all_pos_0 + i*all_pos_1 +# // +# llama_pos all_pos_0; // used if pos == NULL +# llama_pos all_pos_1; // used if pos == NULL +# llama_seq_id all_seq_id; // used if seq_id == NULL # } llama_batch; class llama_batch(ctypes.Structure): """Input data for llama_decode @@ -558,6 +563,9 @@ class llama_batch(ctypes.Structure): ("n_seq_id", ctypes.POINTER(ctypes.c_int32)), ("seq_id", ctypes.POINTER(ctypes.POINTER(llama_seq_id))), ("logits", ctypes.POINTER(ctypes.c_int8)), + ("all_pos_0", llama_pos), + ("all_pos_1", llama_pos), + ("all_seq_id", llama_seq_id), ] @@ -1162,12 +1170,6 @@ def llama_supports_gpu_offload() -> bool: ... -# LLAMA_API bool llama_supports_rpc (void); -@ctypes_function("llama_supports_rpc", [], ctypes.c_bool) -def llama_supports_rpc() -> bool: - ... 
- - # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_ctx(ctx: llama_context_p, /) -> int: @@ -2253,26 +2255,30 @@ def llama_state_seq_load_file( # // -# // Return batch for single sequence of tokens -# // The sequence ID will be fixed to 0 -# // The position of the tokens will be tracked automatically by llama_decode +# // Return batch for single sequence of tokens starting at pos_0 # // # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it # // # LLAMA_API struct llama_batch llama_batch_get_one( # llama_token * tokens, -# int32_t n_tokens); +# int32_t n_tokens, +# llama_pos pos_0, +# llama_seq_id seq_id); @ctypes_function( "llama_batch_get_one", [ llama_token_p, - ctypes.c_int32, + ctypes.c_int, + llama_pos, + llama_seq_id, ], llama_batch, ) def llama_batch_get_one( tokens: CtypesArray[llama_token], n_tokens: Union[ctypes.c_int, int], + pos_0: Union[llama_pos, int], + seq_id: llama_seq_id, /, ) -> llama_batch: """Return batch for single sequence of tokens starting at pos_0 @@ -2610,13 +2616,6 @@ def llama_token_eos(model: llama_model_p, /) -> int: ... -# LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn -@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token) -def llama_token_eot(model: llama_model_p, /) -> int: - """end-of-turn""" - ... 
- - # LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification @ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token) def llama_token_cls(model: llama_model_p, /) -> int: @@ -2651,54 +2650,30 @@ def llama_add_eos_token(model: llama_model_p, /) -> bool: # // Codellama infill tokens -# DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead"); +# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token) def llama_token_prefix(model: llama_model_p) -> int: """codellama infill tokens""" ... -# DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead"); +# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token) def llama_token_middle(model: llama_model_p, /) -> int: ... -# DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead"); +# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token) def llama_token_suffix(model: llama_model_p, /) -> int: ... -# LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model); -@ctypes_function("llama_token_fim_pre", [llama_model_p_ctypes], llama_token) -def llama_token_fim_pre(model: llama_model_p, /) -> int: - ... - -# LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model); -@ctypes_function("llama_token_fim_suf", [llama_model_p_ctypes], llama_token) -def llama_token_fim_suf(model: llama_model_p, /) -> int: - ... 
- -# LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model); -@ctypes_function("llama_token_fim_mid", [llama_model_p_ctypes], llama_token) -def llama_token_fim_mid(model: llama_model_p, /) -> int: - ... - -# LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model); -@ctypes_function("llama_token_fim_pad", [llama_model_p_ctypes], llama_token) -def llama_token_fim_pad(model: llama_model_p, /) -> int: - ... - -# LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model); -@ctypes_function("llama_token_fim_rep", [llama_model_p_ctypes], llama_token) -def llama_token_fim_rep(model: llama_model_p, /) -> int: +# LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle +@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token) +def llama_token_eot(model: llama_model_p, /) -> int: ... -# LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model); -@ctypes_function("llama_token_fim_sep", [llama_model_p_ctypes], llama_token) -def llama_token_fim_sep(model: llama_model_p, /) -> int: - ... # // # // Tokenization @@ -2811,23 +2786,6 @@ def llama_token_to_piece( ... -# # // check if token0 is contained as a prefix in token1 -# # LLAMA_API bool llama_token_is_prefix( -# # const struct llama_model * model, -# # llama_token token0, -# # llama_token token1); -# @ctypes_function( -# "llama_token_is_prefix", -# [llama_model_p_ctypes, llama_token, llama_token], -# ctypes.c_bool, -# ) -# def llama_token_is_prefix( -# model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], / -# ) -> bool: -# """Check if token0 is contained as a prefix in token1""" -# ... - - # /// @details Convert the provided tokens into text (inverse of llama_tokenize()). # /// @param text The char pointer must be large enough to hold the resulting text. # /// @return Returns the number of chars/bytes on success, no more than text_len_max. 
@@ -3141,22 +3099,20 @@ def llama_sampler_chain_remove( # // available samplers: # -# LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); +# LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void); @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes) def llama_sampler_init_greedy() -> llama_sampler_p: ... -# LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); +# LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes) def llama_sampler_init_dist(seed: int) -> llama_sampler_p: ... # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. -# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. -# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), -# "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)"); +# LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void); @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) def llama_sampler_init_softmax() -> llama_sampler_p: ... @@ -3191,6 +3147,17 @@ def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: ... +# /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep); +@ctypes_function( + "llama_sampler_init_tail_free", + [ctypes.c_float, ctypes.c_size_t], + llama_sampler_p_ctypes, +) +def llama_sampler_init_tail_free(z: float, min_keep: int) -> llama_sampler_p: + ... + + # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
# LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep); @ctypes_function( @@ -3221,19 +3188,6 @@ def llama_sampler_init_temp_ext( ... -# /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 -# LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); -@ctypes_function( - "llama_sampler_init_xtc", - [ctypes.c_float, ctypes.c_float, ctypes.c_size_t, ctypes.c_uint32], - llama_sampler_p_ctypes, -) -def llama_sampler_init_xtc( - p: float, t: float, min_keep: int, seed: int, / -) -> llama_sampler_p: - ... - - # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. @@ -3332,41 +3286,6 @@ def llama_sampler_init_penalties( ... 
-# /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 -# LLAMA_API struct llama_sampler * llama_sampler_init_dry( -# const struct llama_model * model, -# float dry_multiplier, -# float dry_base, -# int32_t dry_allowed_length, -# int32_t dry_penalty_last_n, -# const char ** seq_breakers, -# size_t num_breakers); -@ctypes_function( - "llama_sampler_init_dry", - [ - llama_model_p_ctypes, - ctypes.c_float, - ctypes.c_float, - ctypes.c_int32, - ctypes.c_int32, - ctypes.POINTER(ctypes.c_char_p), - ctypes.c_size_t, - ], - llama_sampler_p_ctypes, -) -def llama_sampler_init_dry( - model: llama_model_p, - dry_multiplier: float, - dry_base: float, - dry_allowed_length: int, - dry_penalty_last_n: int, - seq_breakers: CtypesArray[bytes], - num_breakers: int, - /, -) -> llama_sampler_p: - ... - - # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( # int32_t n_vocab, # int32_t n_logit_bias, @@ -3382,39 +3301,6 @@ def llama_sampler_init_logit_bias( ... -# // this sampler is meant to be used for fill-in-the-middle infilling -# // it's supposed to be used after top_k + top_p sampling -# // -# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG -# // 2. combine probs of tokens that have the same prefix -# // -# // example: -# // -# // - before: -# // "hel": 0.5 -# // "hell": 0.2 -# // "hello": 0.1 -# // "dummy": 0.1 -# // -# // - after: -# // "hel": 0.8 -# // "dummy": 0.1 -# // -# // 3. discard non-EOG tokens with low prob -# // 4. 
if no tokens are left -> pick EOT -# // -# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model); -@ctypes_function( - "llama_sampler_init_infill", - [llama_model_p_ctypes], - llama_sampler_p_ctypes, -) -def llama_sampler_init_infill(model: llama_model_p, /) -> llama_sampler_p: - """This sampler is meant to be used for fill-in-the-middle infilling. - """ - ... - - # // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise # LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); @ctypes_function( diff --git a/mkdocs.yml b/mkdocs.yml index 79a9e67a1a..a8bf03b0c2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -41,7 +41,7 @@ plugins: - https://numpy.org/doc/stable/objects.inv watch: - - llama_cpp + - llama_cpp_python - README.md nav: diff --git a/tests/test_llama.py b/tests/test_llama.py index fc182ae20f..f2b3066f35 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -4,6 +4,7 @@ import numpy as np from scipy.special import log_softmax +import llama_cpp_python from huggingface_hub import hf_hub_download import pytest @@ -20,7 +21,7 @@ def test_llama_cpp_version(): def test_llama_cpp_tokenization(): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False) + llama = llama_cpp_python.Llama(model_path=MODEL, vocab_only=True, verbose=False) assert llama assert llama._ctx.ctx is not None @@ -90,6 +91,11 @@ def test_real_model(llama_cpp_model_path): assert tokens == [9707, 11, 1879, 0] tokens = model.tokenize(b"The quick brown fox jumps", add_bos=True, special=True) +def test_llama_patch(mock_llama): + n_ctx = 128 + llama = llama_cpp_python.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx) + n_vocab = llama_cpp_python.llama_n_vocab(llama._model.model) + assert n_vocab == 32000 batch = internals.LlamaBatch(n_tokens=len(tokens), embd=0, n_seq_max=1) @@ -216,3 +222,6 @@ def logit_processor_func(input_ids, logits): assert number_1 != number_2 assert number_1 == 
number_3 + +def test_llama_cpp_version(): + assert llama_cpp_python.__version__ diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index f031bf72b7..1633d9da43 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -2,13 +2,13 @@ import jinja2 -from llama_cpp import ( +from llama_cpp_python import ( ChatCompletionRequestUserMessage, ) -import llama_cpp.llama_types as llama_types -import llama_cpp.llama_chat_format as llama_chat_format +import llama_cpp_python.llama_types as llama_types +import llama_cpp_python.llama_chat_format as llama_chat_format -from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter +from llama_cpp_python.llama_chat_format import hf_tokenizer_config_to_chat_formatter def test_mistral_instruct(): chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" diff --git a/tests/test_llama_grammar.py b/tests/test_llama_grammar.py index 34ef2874df..e8fe9e3b08 100644 --- a/tests/test_llama_grammar.py +++ b/tests/test_llama_grammar.py @@ -1,4 +1,4 @@ -import llama_cpp +import llama_cpp_python import json tree = """ @@ -9,7 +9,7 @@ def test_grammar_from_string(): - grammar = llama_cpp.LlamaGrammar.from_string(tree) + grammar = llama_cpp_python.LlamaGrammar.from_string(tree) # assert grammar._n_rules == 3 # assert grammar._start_rule_index == 2 # assert grammar.grammar is not None @@ -47,7 +47,7 @@ class B(BaseModel): "type": "object", } - grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(schema)) + grammar = 
llama_cpp_python.LlamaGrammar.from_json_schema(json.dumps(schema)) # assert grammar.grammar is not None @@ -73,6 +73,6 @@ def test_grammar_anyof(): "type": "object", } - grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(sch)) + grammar = llama_cpp_python.LlamaGrammar.from_json_schema(json.dumps(sch)) # assert grammar.grammar is not None diff --git a/tests/test_llama_speculative.py b/tests/test_llama_speculative.py index b5d450567b..fe8e3545ab 100644 --- a/tests/test_llama_speculative.py +++ b/tests/test_llama_speculative.py @@ -1,6 +1,6 @@ import numpy as np -from llama_cpp.llama_speculative import LlamaPromptLookupDecoding +from llama_cpp_python.llama_speculative import LlamaPromptLookupDecoding def test_find_candidate_pred_tokens(): find_candidate_pred_tokens = LlamaPromptLookupDecoding.find_candidate_pred_tokens