Better server params and fields by Stonelinks · Pull Request #130 · abetlen/llama-cpp-python

Better server params and fields #130

Merged · 11 commits · May 7, 2023

4 changes: 1 addition & 3 deletions llama_cpp/llama_types.py
@@ -58,10 +58,8 @@ class Completion(TypedDict):


class ChatCompletionMessage(TypedDict):
role: Union[Literal["assistant"], Literal["user"], Literal["system"]]
role: Literal["assistant", "user", "system"]
content: str
user: NotRequired[str]


class ChatCompletionChoice(TypedDict):
index: int
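
The llama_types.py change collapses the union of single-value Literal types into a single Literal with three values; per PEP 586 the two spellings accept exactly the same set of role strings. A minimal standalone sketch (not part of the diff, using a hypothetical ChatMessage stand-in) of the simplified annotation in use:

from typing import Literal, TypedDict


class ChatMessage(TypedDict):
    # Equivalent to Union[Literal["assistant"], Literal["user"], Literal["system"]]
    role: Literal["assistant", "user", "system"]
    content: str


# A type checker accepts the three allowed role strings and flags anything else.
msg: ChatMessage = {"role": "user", "content": "Hello, llama!"}
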
162 changes: 103 additions & 59 deletions llama_cpp/server/app.py
@@ -74,30 +74,87 @@ def get_llama():
with llama_lock:
yield llama

model_field = Field(
description="The model to use for generating completions."
)

max_tokens_field = Field(
default=16,
ge=1,
le=2048,
description="The maximum number of tokens to generate."
)

temperature_field = Field(
default=0.8,
ge=0.0,
le=2.0,
description="Adjust the randomness of the generated text.\n\n" +
"Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run."
)

top_p_field = Field(
default=0.95,
ge=0.0,
le=1.0,
description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" +
"Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text."
)

stop_field = Field(
default=None,
description="A list of tokens at which to stop generation. If None, no stop tokens are used."
)

stream_field = Field(
default=False,
description="Whether to stream the results as they are generated. Useful for chatbots."
)

top_k_field = Field(
default=40,
ge=0,
description="Limit the next token selection to the K most probable tokens.\n\n" +
"Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text."
)

repeat_penalty_field = Field(
default=1.0,
ge=0.0,
description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" +
"Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient."
)
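
These module-level Field objects centralize defaults, bounds, and OpenAPI descriptions so the request models below can share them. A minimal standalone sketch (assuming the pydantic v1-style Field API the server used when this PR was merged; the SamplingParams model is hypothetical) of how a reused Field with ge/le bounds behaves:

from pydantic import BaseModel, Field, ValidationError

# Mirrors temperature_field above, with a shortened description.
temperature_field = Field(
    default=0.8,
    ge=0.0,
    le=2.0,
    description="Adjust the randomness of the generated text.",
)


class SamplingParams(BaseModel):
    temperature: float = temperature_field


print(SamplingParams().temperature)                  # 0.8, the shared default
print(SamplingParams(temperature=0.2).temperature)   # 0.2, within bounds

try:
    SamplingParams(temperature=5.0)                  # violates le=2.0
except ValidationError as err:
    print(err)                                       # reports the constraint failure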

class CreateCompletionRequest(BaseModel):
prompt: Union[str, List[str]]
suffix: Optional[str] = Field(None)
max_tokens: int = 16
temperature: float = 0.8
top_p: float = 0.95
echo: bool = False
stop: Optional[List[str]] = []
stream: bool = False

# ignored or currently unsupported
model: Optional[str] = Field(None)
n: Optional[int] = 1
logprobs: Optional[int] = Field(None)
presence_penalty: Optional[float] = 0
frequency_penalty: Optional[float] = 0
best_of: Optional[int] = 1
logit_bias: Optional[Dict[str, float]] = Field(None)
user: Optional[str] = Field(None)
prompt: Optional[str] = Field(
default="",
description="The prompt to generate completions for."
)
suffix: Optional[str] = Field(
default=None,
description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots."
)
max_tokens: int = max_tokens_field
temperature: float = temperature_field
top_p: float = top_p_field
echo: bool = Field(
default=False,
description="Whether to echo the prompt in the generated text. Useful for chatbots."
)
stop: Optional[List[str]] = stop_field
stream: bool = stream_field
logprobs: Optional[int] = Field(
default=None,
ge=0,
description="The number of logprobs to generate. If None, no logprobs are generated."
)

# ignored, but marked as required for the sake of compatibility with openai's api
model: str = model_field

# llama.cpp specific parameters
top_k: int = 40
repeat_penalty: float = 1.1
top_k: int = top_k_field
repeat_penalty: float = repeat_penalty_field

class Config:
schema_extra = {
@@ -118,19 +175,10 @@ class Config:
def create_completion(
request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
):
if isinstance(request.prompt, list):
request.prompt = "".join(request.prompt)

completion_or_chunks = llama(
**request.dict(
exclude={
"model",
"n",
"frequency_penalty",
"presence_penalty",
"best_of",
"logit_bias",
"user",
"model"
}
)
)
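
With the exclude set reduced to just "model", every remaining field of CreateCompletionRequest is forwarded to the Llama call. A hedged client sketch, assuming the server runs locally on port 8000 and exposes the OpenAI-compatible /v1/completions route (neither is stated in this diff):

import requests

# Field names follow CreateCompletionRequest above; URL, port, and response shape are assumptions.
payload = {
    "prompt": "Q: Name the planets in the solar system. A:",
    "max_tokens": 64,
    "temperature": 0.8,
    "top_p": 0.95,
    "top_k": 40,
    "repeat_penalty": 1.1,
    "stop": ["\n"],
    "model": "ignored-but-required",  # accepted for OpenAI compatibility, then excluded
}

response = requests.post("http://localhost:8000/v1/completions", json=payload)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
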
@@ -142,9 +190,11 @@ def create_completion(


class CreateEmbeddingRequest(BaseModel):
model: Optional[str]
input: str
user: Optional[str]
# ignored, but marked as required for the sake of compatibility with openai's api
model: str = model_field
input: str = Field(
description="The input to embed."
)

class Config:
schema_extra = {
@@ -164,34 +214,33 @@ class Config:
def create_embedding(
request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
):
return llama.create_embedding(**request.dict(exclude={"model", "user"}))
return llama.create_embedding(**request.dict(exclude={"model"}))
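
Since model is kept only for OpenAI compatibility, the endpoint drops it before forwarding the remaining fields as keyword arguments. A small standalone sketch (the EmbeddingRequest model is a hypothetical mirror of CreateEmbeddingRequest) of what pydantic's .dict(exclude=...) hands to llama.create_embedding:

from pydantic import BaseModel, Field


class EmbeddingRequest(BaseModel):
    model: str = Field(description="Ignored; present for OpenAI compatibility.")
    input: str = Field(description="The input to embed.")


request = EmbeddingRequest(model="ignored-model", input="hello world")
kwargs = request.dict(exclude={"model"})
print(kwargs)  # {'input': 'hello world'} -- only the argument create_embedding expects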


class ChatCompletionRequestMessage(BaseModel):
role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
content: str
user: Optional[str] = None
role: Literal["system", "user", "assistant"] = Field(
default="user", description="The role of the message."
)
content: str = Field(default="", description="The content of the message.")


class CreateChatCompletionRequest(BaseModel):
model: Optional[str]
messages: List[ChatCompletionRequestMessage]
temperature: float = 0.8
top_p: float = 0.95
stream: bool = False
stop: Optional[List[str]] = []
max_tokens: int = 128

# ignored or currently unsupported
model: Optional[str] = Field(None)
n: Optional[int] = 1
presence_penalty: Optional[float] = 0
frequency_penalty: Optional[float] = 0
logit_bias: Optional[Dict[str, float]] = Field(None)
user: Optional[str] = Field(None)
messages: List[ChatCompletionRequestMessage] = Field(
default=[],
description="A list of messages to generate completions for."
)
max_tokens: int = max_tokens_field
temperature: float = temperature_field
top_p: float = top_p_field
stop: Optional[List[str]] = stop_field
stream: bool = stream_field

# ignored, but marked as required for the sake of compatibility with openai's api
model: str = model_field

# llama.cpp specific parameters
repeat_penalty: float = 1.1
top_k: int = top_k_field
repeat_penalty: float = repeat_penalty_field

class Config:
schema_extra = {
@@ -222,12 +271,7 @@ def create_chat_completion(
completion_or_chunks = llama.create_chat_completion(
**request.dict(
exclude={
"model",
"n",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
"model"
}
),
)
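
The chat endpoint accepts the same shared sampling fields plus a list of role/content messages. A hedged client sketch, again assuming a local server on port 8000 with the OpenAI-compatible /v1/chat/completions route (not stated in this diff):

import requests

# Message shape follows ChatCompletionRequestMessage above; URL, port, and response shape are assumptions.
payload = {
    "model": "ignored-but-required",
    "messages": [
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "What is llama.cpp?"},
    ],
    "max_tokens": 128,
    "temperature": 0.7,
    "top_k": 40,
    "repeat_penalty": 1.1,
}

response = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])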