@@ -317,7 +317,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)

-        if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
+                raise ValueError(
+                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                )
+            else:
+                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
+        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
@@ -455,7 +463,7 @@ def create_completion(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -510,7 +518,7 @@ def __call__(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -619,7 +627,7 @@ def create_chat_completion(
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             repeat_penalty: The penalty to apply to repeated tokens.

         Returns:
0 commit comments