Merge pull request #64 from jm12138/add_unlimited_max_tokens · KoolSameer/llama-cpp-python@2472420 · GitHub

Commit 2472420

Merge pull request abetlen#64 from jm12138/add_unlimited_max_tokens
Add unlimited max_tokens
2 parents 236c4cf + 5d756de commit 2472420

File tree: 1 file changed (+12, -4 lines)


llama_cpp/llama.py

Lines changed: 12 additions & 4 deletions
@@ -824,7 +824,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if len(prompt_tokens) > self._n_ctx:
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
+                raise ValueError(
+                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                )
+            else:
+                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
+        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
             )
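
In effect, a non-positive max_tokens is replaced by whatever room is left in the context window after the prompt. A minimal sketch of that arithmetic, using an assumed n_ctx of 2048 and a 200-token prompt (both values are illustrative, not taken from the commit):

# Sketch of the fallback introduced above; n_ctx and the prompt length are assumed values.
n_ctx = 2048                  # what llama_cpp.llama_n_ctx(self.ctx) would report
prompt_tokens = [0] * 200     # stand-in for a tokenized 200-token prompt

max_tokens = 0                # caller asked for "unlimited"
if max_tokens <= 0:
    if len(prompt_tokens) >= n_ctx:
        raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
    max_tokens = n_ctx - len(prompt_tokens)

print(max_tokens)             # 1848: the remainder of the context window is the generation budget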
@@ -1231,7 +1239,7 @@ def create_completion(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -1304,7 +1312,7 @@ def __call__(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -1432,7 +1440,7 @@ def create_chat_completion(
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             repeat_penalty: The penalty to apply to repeated tokens.
 
         Returns:
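
From the caller's side, the high-level completion APIs can now be asked for an open-ended generation. A usage sketch under assumptions (the model path and prompt are placeholders; the Llama class and create_completion are the ones documented in the hunks above):

from llama_cpp import Llama

# Placeholder path; point this at a real model file.
llm = Llama(model_path="./models/ggml-model.bin")

# With this commit, max_tokens=0 (or any value <= 0) means "generate until the
# context window is exhausted or a stop string is hit" rather than a hard cap.
output = llm.create_completion(
    prompt="Q: Name the planets in the solar system. A:",
    max_tokens=0,
    stop=["Q:"],
)
print(output["choices"][0]["text"])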
