From e40fcb05754d0ec9c65359e245a436794cbfefdb Mon Sep 17 00:00:00 2001
From: Lucas Doyle
Date: Sat, 29 Apr 2023 00:47:35 -0700
Subject: [PATCH 1/9] llama_cpp server: mark model as required

`model` is ignored, but it is currently marked "optional"... on the one hand it could be marked "required" to make it explicit in case the server ever supports multiple llamas at the same time, but it could also be deleted since it's ignored.

decision: mark it required for the sake of openai api compatibility. I think that out of all the parameters, `model` is probably the most important one for people to keep sending, even if it's ignored for now.
---
 llama_cpp/server/app.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 640dd3fb2..5d87e780d 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -66,6 +66,10 @@ def get_llama():
     with llama_lock:
         yield llama
 
+model_field = Field(
+    description="The model to use for generating completions."
+)
+
 class CreateCompletionRequest(BaseModel):
     prompt: Union[str, List[str]]
     suffix: Optional[str] = Field(None)
@@ -76,8 +80,9 @@ class CreateCompletionRequest(BaseModel):
     stop: Optional[List[str]] = []
     stream: bool = False
 
-    # ignored or currently unsupported
-    model: Optional[str] = Field(None)
+    # ignored, but marked as required for the sake of compatibility with openai's api
+    model: str = model_field
+
     n: Optional[int] = 1
     logprobs: Optional[int] = Field(None)
     presence_penalty: Optional[float] = 0
@@ -133,7 +138,8 @@ def create_completion(
 
 class CreateEmbeddingRequest(BaseModel):
-    model: Optional[str]
+    # ignored, but marked as required for the sake of compatibility with openai's api
+    model: str = model_field
     input: str
     user: Optional[str]
 
@@ -173,8 +179,9 @@ class CreateChatCompletionRequest(BaseModel):
     stop: Optional[List[str]] = []
     max_tokens: int = 128
 
-    # ignored or currently unsupported
-    model: Optional[str] = Field(None)
+    # ignored, but marked as required for the sake of compatibility with openai's api
+    model: str = model_field
+
     n: Optional[int] = 1
     presence_penalty: Optional[float] = 0
     frequency_penalty: Optional[float] = 0
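
To make the compatibility argument concrete: with Pydantic v1 (which these request models use), a `Field(...)` that is given no default makes the attribute required, so OpenAI-style clients that always send `model` validate cleanly while the server never forwards the value. The sketch below is illustrative only; `CompletionRequest` is a stand-in name, not the server's actual class.

```python
# Minimal sketch (Pydantic v1 style, as used by the server request models).
from pydantic import BaseModel, Field, ValidationError

model_field = Field(description="The model to use for generating completions.")

class CompletionRequest(BaseModel):        # stand-in for CreateCompletionRequest
    prompt: str = ""
    model: str = model_field               # no default value => field is required

try:
    CompletionRequest(prompt="hi")         # rejected: `model` is missing
except ValidationError as err:
    print(err)

ok = CompletionRequest(prompt="hi", model="text-davinci-003")
print(ok.model)                            # accepted, then ignored by the handler
```
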
From b47b9549d57f146a00ee19cd7d2bb294111abb67 Mon Sep 17 00:00:00 2001
From: Lucas Doyle
Date: Sat, 29 Apr 2023 01:19:30 -0700
Subject: [PATCH 2/9] llama_cpp server: delete some ignored / unused parameters

`n`, `presence_penalty`, `frequency_penalty`, `best_of`, `logit_bias`, `user`: not supported, excluded from the calls into llama.

decision: delete them
---
 llama_cpp/llama_types.py |  2 --
 llama_cpp/server/app.py  | 30 +++---------------------------
 2 files changed, 3 insertions(+), 29 deletions(-)

diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py
index b62ff1b19..b8bdb0811 100644
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@@ -60,8 +60,6 @@ class Completion(TypedDict):
 class ChatCompletionMessage(TypedDict):
     role: Union[Literal["assistant"], Literal["user"], Literal["system"]]
     content: str
-    user: NotRequired[str]
-
 
 class ChatCompletionChoice(TypedDict):
     index: int
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 5d87e780d..cc467db98 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -83,13 +83,7 @@ class CreateCompletionRequest(BaseModel):
     # ignored, but marked as required for the sake of compatibility with openai's api
     model: str = model_field
 
-    n: Optional[int] = 1
     logprobs: Optional[int] = Field(None)
-    presence_penalty: Optional[float] = 0
-    frequency_penalty: Optional[float] = 0
-    best_of: Optional[int] = 1
-    logit_bias: Optional[Dict[str, float]] = Field(None)
-    user: Optional[str] = Field(None)
 
     # llama.cpp specific parameters
     top_k: int = 40
@@ -120,13 +114,7 @@ def create_completion(
     completion_or_chunks = llama(
         **request.dict(
             exclude={
-                "model",
-                "n",
-                "frequency_penalty",
-                "presence_penalty",
-                "best_of",
-                "logit_bias",
-                "user",
+                "model"
             }
         )
     )
@@ -141,7 +129,6 @@ class CreateEmbeddingRequest(BaseModel):
     # ignored, but marked as required for the sake of compatibility with openai's api
     model: str = model_field
     input: str
-    user: Optional[str]
 
     class Config:
         schema_extra = {
@@ -161,7 +148,7 @@ class Config:
 def create_embedding(
     request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
-    return llama.create_embedding(**request.dict(exclude={"model", "user"}))
+    return llama.create_embedding(**request.dict(exclude={"model"}))
 
 
 class ChatCompletionRequestMessage(BaseModel):
@@ -181,12 +168,6 @@ class CreateChatCompletionRequest(BaseModel):
     # ignored, but marked as required for the sake of compatibility with openai's api
     model: str = model_field
-
-    n: Optional[int] = 1
-    presence_penalty: Optional[float] = 0
-    frequency_penalty: Optional[float] = 0
-    logit_bias: Optional[Dict[str, float]] = Field(None)
-    user: Optional[str] = Field(None)
 
     # llama.cpp specific parameters
     repeat_penalty: float = 1.1
@@ -220,12 +201,7 @@ def create_chat_completion(
     completion_or_chunks = llama.create_chat_completion(
         **request.dict(
             exclude={
-                "model",
-                "n",
-                "presence_penalty",
-                "frequency_penalty",
-                "logit_bias",
-                "user",
+                "model"
             }
         ),
     )
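
The `exclude` pattern above is easy to misread in diff form, so here is a hedged, self-contained sketch of what it does; the `fake_llama` callable and trimmed-down request model below are stand-ins, not real library code.

```python
# Illustrative only: .dict(exclude=...) strips the OpenAI-compat `model` field
# before the remaining keyword arguments are forwarded to the model call.
from typing import List, Optional
from pydantic import BaseModel

class CreateCompletionRequest(BaseModel):   # trimmed-down stand-in
    prompt: str = ""
    temperature: float = 0.8
    stop: Optional[List[str]] = []
    model: str                              # required, but never forwarded

def fake_llama(**kwargs):
    # Stand-in for calling a llama_cpp.Llama instance; it echoes what it received.
    return kwargs

req = CreateCompletionRequest(prompt="Hello", model="gpt-3.5-turbo")
print(fake_llama(**req.dict(exclude={"model"})))
# -> {'prompt': 'Hello', 'temperature': 0.8, 'stop': []}
```
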
From 1e429135993f4e1298d8c801f2628bae3d8f18a9 Mon Sep 17 00:00:00 2001
From: Lucas Doyle
Date: Sat, 29 Apr 2023 11:48:49 -0700
Subject: [PATCH 3/9] llama_cpp server: move logprobs to supported

I think this is actually supported (it's in the arguments of `Llama.__call__`, which is how the completion is invoked).

decision: mark it as supported
---
 llama_cpp/server/app.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index cc467db98..2d20f3795 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -79,12 +79,11 @@ class CreateCompletionRequest(BaseModel):
     echo: bool = False
     stop: Optional[List[str]] = []
     stream: bool = False
+    logprobs: Optional[int] = Field(None)
 
     # ignored, but marked as required for the sake of compatibility with openai's api
     model: str = model_field
 
-    logprobs: Optional[int] = Field(None)
-
     # llama.cpp specific parameters
     top_k: int = 40
     repeat_penalty: float = 1.1

From a5aa6c1478de7cc16b654df533be3dee6519c42a Mon Sep 17 00:00:00 2001
From: Lucas Doyle
Date: Sat, 29 Apr 2023 11:52:20 -0700
Subject: [PATCH 4/9] llama_cpp server: add missing top_k param to CreateChatCompletionRequest

`llama.create_chat_completion` definitely has a `top_k` argument, but it's missing from `CreateChatCompletionRequest`.

decision: add it
---
 llama_cpp/server/app.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 2d20f3795..e1045afbc 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -169,6 +169,7 @@ class CreateChatCompletionRequest(BaseModel):
     model: str = model_field
 
     # llama.cpp specific parameters
+    top_k: int = 40,
     repeat_penalty: float = 1.1
 
     class Config:
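
The two patches above only touch the request schema; at the library level both parameters already exist. Below is a rough, hedged sketch of what they map to in `llama_cpp` itself: the model path is a placeholder, `logits_all=True` is assumed to be needed for per-token logprobs, and the printed keys assume the OpenAI-style response dicts defined in llama_types.py.

```python
# Sketch of the underlying llama_cpp calls the server forwards to.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/7B/ggml-model.bin",  # placeholder path, adjust locally
    logits_all=True,                           # assumed requirement for logprobs
)

# logprobs is an argument of Llama.__call__ (the completion path):
completion = llm("Q: Name the planets in the solar system. A: ",
                 max_tokens=32,
                 logprobs=5)                   # top-5 token log-probabilities
print(completion["choices"][0]["logprobs"])

# top_k is an argument of Llama.create_chat_completion (the chat path):
chat = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
    top_k=40,
)
print(chat["choices"][0]["message"]["content"])
```
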
From 978b6daf9313a11367d0a9393226379173fdb688 Mon Sep 17 00:00:00 2001
From: Lucas Doyle
Date: Sat, 29 Apr 2023 14:37:36 -0700
Subject: [PATCH 5/9] llama_cpp server: add some more information to fields for completions

---
 llama_cpp/server/app.py | 70 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 11 deletions(-)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index e1045afbc..e16848558 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -71,22 +71,70 @@ def get_llama():
 )
 
 class CreateCompletionRequest(BaseModel):
-    prompt: Union[str, List[str]]
-    suffix: Optional[str] = Field(None)
-    max_tokens: int = 16
-    temperature: float = 0.8
-    top_p: float = 0.95
-    echo: bool = False
-    stop: Optional[List[str]] = []
-    stream: bool = False
-    logprobs: Optional[int] = Field(None)
+    prompt: Union[str, List[str]] = Field(
+        default="",
+        description="The prompt to generate completions for."
+    )
+    suffix: Optional[str] = Field(
+        default=None,
+        description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots."
+    )
+    max_tokens: int = Field(
+        default=16,
+        ge=1,
+        le=2048,
+        description="The maximum number of tokens to generate."
+    )
+    temperature: float = Field(
+        default=0.8,
+        ge=0.0,
+        le=2.0,
+        description="Adjust the randomness of the generated text.\n\n" +
+        "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run."
+    )
+    top_p: float = Field(
+        default=0.95,
+        ge=0.0,
+        le=1.0,
+        description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" +
+        "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text."
+    )
+    echo: bool = Field(
+        default=False,
+        description="Whether to echo the prompt in the generated text. Useful for chatbots."
+    )
+    stop: Optional[List[str]] = Field(
+        default=None,
+        description="A list of tokens at which to stop generation. If None, no stop tokens are used."
+    )
+    stream: bool = Field(
+        default=False,
+        description="Whether to stream the results as they are generated. Useful for chatbots."
+    )
+    logprobs: Optional[int] = Field(
+        default=None,
+        ge=0,
+        description="The number of logprobs to generate. If None, no logprobs are generated."
+    )
+
 
     # ignored, but marked as required for the sake of compatibility with openai's api
     model: str = model_field
 
     # llama.cpp specific parameters
-    top_k: int = 40
-    repeat_penalty: float = 1.1
+    top_k: int = Field(
+        default=40,
+        ge=0,
+        description="Limit the next token selection to the K most probable tokens.\n\n" +
+        "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text."
+    )
+    repeat_penalty: float = Field(
+        default=1.0,
+        ge=0.0,
+        description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" +
+        "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient."
+    )
 
     class Config:
         schema_extra = {
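
Beyond documentation, the `ge`/`le` bounds above change behavior: out-of-range values are rejected when the request body is parsed, and FastAPI turns that validation failure into a 422 response instead of passing bad values through to the sampler. A small, self-contained illustration (the class name is a stand-in, not the server's model):

```python
# Illustration of the ge/le constraints added in the patch above (Pydantic v1).
from pydantic import BaseModel, Field, ValidationError

class CompletionParams(BaseModel):            # stand-in, not the server's model
    max_tokens: int = Field(default=16, ge=1, le=2048)
    temperature: float = Field(default=0.8, ge=0.0, le=2.0)
    top_p: float = Field(default=0.95, ge=0.0, le=1.0)

print(CompletionParams(temperature=0.2))      # within bounds: accepted

try:
    CompletionParams(max_tokens=0, temperature=5.0)
except ValidationError as err:
    print(err)                                # both violations are reported
```
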
From 8dcbf65a45d729eedb4363f4e92247e6325d5b7d Mon Sep 17 00:00:00 2001
From: Lucas Doyle
Date: Sat, 29 Apr 2023 18:37:43 -0700
Subject: [PATCH 6/9] llama_cpp server: define fields for chat completions

Slight refactor for common fields shared between completion and chat completion
---
 llama_cpp/server/app.py | 125 +++++++++++++++++++++++-----------------
 1 file changed, 71 insertions(+), 54 deletions(-)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index e16848558..ec5dbd3f4 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -70,6 +70,55 @@ def get_llama():
     description="The model to use for generating completions."
 )
 
+max_tokens_field = Field(
+    default=16,
+    ge=1,
+    le=2048,
+    description="The maximum number of tokens to generate."
+)
+
+temperature_field = Field(
+    default=0.8,
+    ge=0.0,
+    le=2.0,
+    description="Adjust the randomness of the generated text.\n\n" +
+    "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run."
+)
+
+top_p_field = Field(
+    default=0.95,
+    ge=0.0,
+    le=1.0,
+    description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" +
+    "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text."
+)
+
+stop_field = Field(
+    default=None,
+    description="A list of tokens at which to stop generation. If None, no stop tokens are used."
+)
+
+stream_field = Field(
+    default=False,
+    description="Whether to stream the results as they are generated. Useful for chatbots."
+)
+
+top_k_field = Field(
+    default=40,
+    ge=0,
+    description="Limit the next token selection to the K most probable tokens.\n\n" +
+    "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text."
+)
+
+repeat_penalty_field = Field(
+    default=1.0,
+    ge=0.0,
+    description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" +
+    "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient."
+)
+
+
 class CreateCompletionRequest(BaseModel):
     prompt: Union[str, List[str]] = Field(
         default="",
@@ -79,62 +128,27 @@ class CreateCompletionRequest(BaseModel):
         default=None,
         description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots."
     )
-    max_tokens: int = Field(
-        default=16,
-        ge=1,
-        le=2048,
-        description="The maximum number of tokens to generate."
-    )
-    temperature: float = Field(
-        default=0.8,
-        ge=0.0,
-        le=2.0,
-        description="Adjust the randomness of the generated text.\n\n" +
-        "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run."
-    )
-    top_p: float = Field(
-        default=0.95,
-        ge=0.0,
-        le=1.0,
-        description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" +
-        "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text."
-    )
+    max_tokens: int = max_tokens_field
+    temperature: float = temperature_field
+    top_p: float = top_p_field
     echo: bool = Field(
         default=False,
         description="Whether to echo the prompt in the generated text. Useful for chatbots."
     )
-    stop: Optional[List[str]] = Field(
-        default=None,
-        description="A list of tokens at which to stop generation. If None, no stop tokens are used."
-    )
-    stream: bool = Field(
-        default=False,
-        description="Whether to stream the results as they are generated. Useful for chatbots."
-    )
+    stop: Optional[List[str]] = stop_field
+    stream: bool = stream_field
     logprobs: Optional[int] = Field(
         default=None,
         ge=0,
         description="The number of logprobs to generate. If None, no logprobs are generated."
     )
-
     # ignored, but marked as required for the sake of compatibility with openai's api
     model: str = model_field
 
     # llama.cpp specific parameters
-    top_k: int = Field(
-        default=40,
-        ge=0,
-        description="Limit the next token selection to the K most probable tokens.\n\n" +
-        "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text."
-    )
-    repeat_penalty: float = Field(
-        default=1.0,
-        ge=0.0,
-        description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" +
-        "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient."
-    )
+    top_k: int = top_k_field
+    repeat_penalty: float = repeat_penalty_field
 
     class Config:
         schema_extra = {
@@ -199,26 +213,29 @@ def create_embedding(
 
 class ChatCompletionRequestMessage(BaseModel):
-    role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
-    content: str
-    user: Optional[str] = None
+    role: Union[Literal["system"], Literal["user"], Literal["assistant"]] = Field(
+        default=Literal["user"], description="The role of the message."
+    )
+    content: str = Field(default="", description="The content of the message.")
 
 class CreateChatCompletionRequest(BaseModel):
-    model: Optional[str]
-    messages: List[ChatCompletionRequestMessage]
-    temperature: float = 0.8
-    top_p: float = 0.95
-    stream: bool = False
-    stop: Optional[List[str]] = []
-    max_tokens: int = 128
+    messages: List[ChatCompletionRequestMessage] = Field(
+        default=[],
+        description="A list of messages to generate completions for."
+    )
+    max_tokens: int = max_tokens_field
+    temperature: float = temperature_field
+    top_p: float = top_p_field
+    stop: Optional[List[str]] = stop_field
+    stream: bool = stream_field
 
     # ignored, but marked as required for the sake of compatibility with openai's api
     model: str = model_field
 
     # llama.cpp specific parameters
-    top_k: int = 40,
-    repeat_penalty: float = 1.1
+    top_k: int = top_k_field
+    repeat_penalty: float = repeat_penalty_field
 
     class Config:
         schema_extra = {
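
The point of the refactor above is that one module-level `Field(...)` object can back the same parameter in several request models, so bounds and descriptions stay in sync. A stripped-down sketch under that assumption (class names are illustrative, not the server's):

```python
# Sharing a single Field definition between two Pydantic v1 models.
from pydantic import BaseModel, Field

max_tokens_field = Field(
    default=16, ge=1, le=2048,
    description="The maximum number of tokens to generate.",
)

class CompletionRequest(BaseModel):
    max_tokens: int = max_tokens_field

class ChatCompletionRequest(BaseModel):
    max_tokens: int = max_tokens_field

# Both models expose identical constraints and docs in their JSON schema:
print(CompletionRequest.schema()["properties"]["max_tokens"])
print(ChatCompletionRequest.schema()["properties"]["max_tokens"])
```
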
From fa2a61e06569bb600d36d7ea5fee2ab456b3434d Mon Sep 17 00:00:00 2001
From: Lucas Doyle
Date: Sat, 29 Apr 2023 18:46:01 -0700
Subject: [PATCH 7/9] llama_cpp server: fields for the embedding endpoint

---
 llama_cpp/server/app.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index ec5dbd3f4..9adddcd87 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -189,7 +189,9 @@ def create_completion(
 
 class CreateEmbeddingRequest(BaseModel):
     # ignored, but marked as required for the sake of compatibility with openai's api
     model: str = model_field
-    input: str
+    input: str = Field(
+        description="The input to embed."
+    )
 
     class Config:
         schema_extra = {

From dbbfc4ba2f8460e130dc268096f5906d3d22347b Mon Sep 17 00:00:00 2001
From: Lucas Doyle
Date: Mon, 1 May 2023 11:48:37 -0700
Subject: [PATCH 8/9] llama_cpp server: fix to ChatCompletionRequestMessage

When I generate a client, it breaks because it fails to process the schema of ChatCompletionRequestMessage.

These changes fix that:
- I think `Union[Literal["user"], Literal["channel"], ...]` is the same as `Literal["user", "channel", ...]`
- Turns out the default value `Literal["user"]` isn't JSON serializable, so replace it with "user"
---
 llama_cpp/llama_types.py | 2 +-
 llama_cpp/server/app.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py
index b8bdb0811..b770a0144 100644
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@@ -58,7 +58,7 @@ class Completion(TypedDict):
 
 class ChatCompletionMessage(TypedDict):
-    role: Union[Literal["assistant"], Literal["user"], Literal["system"]]
+    role: Literal["assistant", "user", "system"]
     content: str
 
 class ChatCompletionChoice(TypedDict):
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 9adddcd87..886ee6d65 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -215,8 +215,8 @@ def create_embedding(
 
 class ChatCompletionRequestMessage(BaseModel):
-    role: Union[Literal["system"], Literal["user"], Literal["assistant"]] = Field(
-        default=Literal["user"], description="The role of the message."
+    role: Literal["system", "user", "assistant"] = Field(
+        default="user", description="The role of the message."
     )
     content: str = Field(default="", description="The content of the message.")
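
Both bullet points above are easy to check outside the server with plain Pydantic v1 era APIs: the flattened `Literal` keeps the same allowed values, and a plain string default serializes into the JSON schema that client generators read. A hedged sketch, with `ChatMessage` as a stand-in name:

```python
# Checking the fixed field definition in isolation.
from typing import Literal
from pydantic import BaseModel, Field

class ChatMessage(BaseModel):             # stand-in for ChatCompletionRequestMessage
    role: Literal["system", "user", "assistant"] = Field(
        default="user", description="The role of the message."
    )
    content: str = Field(default="", description="The content of the message.")

print(ChatMessage().role)                 # "user" -- an ordinary, serializable string
print(ChatMessage.schema_json(indent=2))  # role restricted to the three allowed values
```
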
From b9098b0ef7309b63ebff99cdfadf641223c15025 Mon Sep 17 00:00:00 2001
From: Lucas Doyle
Date: Tue, 2 May 2023 14:08:51 -0700
Subject: [PATCH 9/9] llama_cpp server: prompt is a string

Not sure why this union type was here, but taking a look at llama.py, prompt is only ever processed as a string for completion.

This was breaking types when generating an openapi client.
---
 llama_cpp/server/app.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index ef8aa4eb5..595476f3c 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -126,7 +126,7 @@ def get_llama():
 )
 
 class CreateCompletionRequest(BaseModel):
-    prompt: Union[str, List[str]] = Field(
+    prompt: Optional[str] = Field(
         default="",
         description="The prompt to generate completions for."
     )
@@ -175,9 +175,6 @@ class Config:
 def create_completion(
     request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
-    if isinstance(request.prompt, list):
-        request.prompt = "".join(request.prompt)
-
     completion_or_chunks = llama(
         **request.dict(
             exclude={