Commit e7962d2 · tk-master/llama-cpp-python
Fix: default max_tokens matches openai api (16 for completion, max length for chat completion)
1 parent 8207280 commit e7962d2

2 files changed (+9, -7 lines)
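In practice, the new defaults bring the Python API in line with OpenAI's: create_completion caps generation at 16 tokens unless told otherwise, while create_chat_completion generates until the context window runs out. A rough usage sketch under those assumptions (the model path and prompts below are placeholders):

    from llama_cpp import Llama

    llm = Llama(model_path="./models/model.gguf")  # placeholder path

    # Completion: max_tokens now defaults to 16, matching the OpenAI completions API.
    llm.create_completion("Q: What is the capital of France? A:")

    # Chat completion: max_tokens now defaults to None, i.e. generate up to the
    # remaining context length, matching the OpenAI chat completions API.
    llm.create_chat_completion(
        messages=[{"role": "user", "content": "Name the capital of France."}]
    )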

llama_cpp/llama.py

Lines changed: 6 additions & 6 deletions
@@ -1296,7 +1296,7 @@ def _create_completion(
         self,
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
-        max_tokens: int = 16,
+        max_tokens: Optional[int] = 16,
         temperature: float = 0.8,
         top_p: float = 0.95,
         logprobs: Optional[int] = None,
@@ -1350,7 +1350,7 @@ def _create_completion(
                 f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
 
-        if max_tokens <= 0:
+        if max_tokens is None or max_tokens <= 0:
             # Unlimited, depending on n_ctx.
             max_tokens = self._n_ctx - len(prompt_tokens)
 
@@ -1762,7 +1762,7 @@ def create_completion(
         self,
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
-        max_tokens: int = 128,
+        max_tokens: Optional[int] = 16,
         temperature: float = 0.8,
         top_p: float = 0.95,
         logprobs: Optional[int] = None,
@@ -1788,7 +1788,7 @@ def create_completion(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -1921,7 +1921,7 @@ def create_chat_completion(
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[ChatCompletionRequestResponseFormat] = None,
-        max_tokens: int = 256,
+        max_tokens: Optional[int] = None,
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
@@ -1944,7 +1944,7 @@ def create_chat_completion(
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
-            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             repeat_penalty: The penalty to apply to repeated tokens.
 
         Returns:
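For illustration only, a minimal sketch of what the updated check amounts to (resolve_max_tokens, n_ctx, and prompt_tokens are stand-ins, not names from the library): passing None now behaves like a non-positive value, so the effective budget falls back to whatever space is left in the context window.

    from typing import List, Optional

    def resolve_max_tokens(max_tokens: Optional[int], n_ctx: int, prompt_tokens: List[int]) -> int:
        # Mirrors the changed condition: None or a non-positive value means
        # "unlimited", capped by the remaining context window.
        if max_tokens is None or max_tokens <= 0:
            return n_ctx - len(prompt_tokens)
        return max_tokens

    # With a 2048-token context and a 10-token prompt:
    # resolve_max_tokens(None, 2048, list(range(10)))  -> 2038  (new chat-completion default)
    # resolve_max_tokens(16, 2048, list(range(10)))    -> 16    (completion default, matching the OpenAI API)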

llama_cpp/server/app.py

Lines changed: 3 additions & 1 deletion
@@ -783,7 +783,9 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="A tool to apply to the generated completions.",
     )  # TODO: verify
-    max_tokens: int = max_tokens_field
+    max_tokens: Optional[int] = Field(
+        default=None, description="The maximum number of tokens to generate. Defaults to inf"
+    )
     temperature: float = temperature_field
     top_p: float = top_p_field
     stop: Optional[List[str]] = stop_field
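A standalone sketch of the same pattern (the model name is made up and only the max_tokens field is shown): declaring the field as Optional[int] with default=None lets clients omit it entirely, and the server treats the missing value as "no explicit limit".

    from typing import Optional
    from pydantic import BaseModel, Field

    class ChatCompletionRequestSketch(BaseModel):
        # Optional with default=None: an omitted max_tokens means "no explicit
        # limit", and the backend falls back to the model's context window.
        max_tokens: Optional[int] = Field(
            default=None,
            description="The maximum number of tokens to generate. Defaults to inf",
        )

    print(ChatCompletionRequestSketch().max_tokens)                # None -> unlimited
    print(ChatCompletionRequestSketch(max_tokens=256).max_tokens)  # 256  -> explicit cap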
