8000 Add unlimited max_tokens · MobinX/llama-cpp-python@90e1021 · GitHub
Commit 90e1021

Add unlimited max_tokens

1 parent: a5554a2


llama_cpp/llama.py

Lines changed: 12 additions & 4 deletions
@@ -317,7 +317,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
+                raise ValueError(
+                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                )
+            else:
+                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
+        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
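
In effect, max_tokens <= 0 now means "fill the remaining context": max_tokens is rewritten to n_ctx - len(prompt_tokens) before generation, and a prompt that already fills the context window still raises. A minimal usage sketch (the model path and prompt are placeholders; the call goes through the __call__ / create_completion path changed in this commit):

from llama_cpp import Llama

# Placeholder model path; n_ctx sets the context window that
# llama_cpp.llama_n_ctx(self.ctx) reports inside _create_completion.
llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=512)

# With this commit, max_tokens=0 (or any value <= 0) generates until the
# context window is exhausted, instead of tripping the overflow check.
output = llm("Q: Name the planets in the solar system. A: ", max_tokens=0)
print(output["choices"][0]["text"])
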
@@ -455,7 +463,7 @@ def create_completion(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -510,7 +518,7 @@ def __call__(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -619,7 +627,7 @@ def create_chat_completion(
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             repeat_penalty: The penalty to apply to repeated tokens.
 
         Returns:
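
The same max_tokens semantics carry over to chat completion, since create_chat_completion ultimately calls the same completion path; a short sketch under the same assumptions (the message content is a placeholder):

# Any max_tokens <= 0 takes the new unlimited path, bounded by n_ctx.
chat = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Write a haiku about llamas."}],
    max_tokens=-1,
)
print(chat["choices"][0]["message"]["content"])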
