@@ -824,7 +824,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)

-        if len(prompt_tokens) > self._n_ctx:
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
+                raise ValueError(
+                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                )
+            else:
+                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
+        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
             )
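
Read outside the diff, the new branch resolves max_tokens like the minimal sketch below. This is a standalone illustration only: resolve_max_tokens is a hypothetical helper, not part of the module, and the llama_cpp.llama_n_ctx(self.ctx) call is replaced by a plain n_ctx int.

def resolve_max_tokens(n_prompt_tokens: int, max_tokens: int, n_ctx: int) -> int:
    """Mirror of the hunk above: map max_tokens <= 0 to 'fill the rest of the context'."""
    if max_tokens <= 0:
        # Unlimited, depending on n_ctx: the prompt must still fit.
        if n_prompt_tokens >= n_ctx:
            raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
        return n_ctx - n_prompt_tokens
    if n_prompt_tokens + max_tokens > n_ctx:
        raise ValueError(
            f"Requested tokens ({n_prompt_tokens}) exceed context window of {n_ctx}"
        )
    return max_tokens

assert resolve_max_tokens(10, -1, 512) == 502   # unlimited -> remaining context
assert resolve_max_tokens(10, 100, 512) == 100  # explicit cap passes through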
@@ -1231,7 +1239,7 @@ def create_completion(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the number of tokens to generate is unlimited, subject to n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -1304,7 +1312,7 @@ def __call__(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the number of tokens to generate is unlimited, subject to n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -1432,7 +1440,7 @@ def create_chat_completion(
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the number of tokens to generate is unlimited, subject to n_ctx.
             repeat_penalty: The penalty to apply to repeated tokens.

         Returns:
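
From the caller's side, the change means any non-positive max_tokens now requests "generate until the context window is full". A minimal usage sketch, assuming a local GGML model file at a placeholder path:

from llama_cpp import Llama

# Hypothetical model path and n_ctx; substitute your own model file.
llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=512)

# max_tokens=-1 (<= 0) fills the remaining context instead of stopping
# at a fixed count; a ValueError is still raised if the prompt alone
# already exceeds n_ctx.
out = llm("Q: Name the planets in the solar system. A:", max_tokens=-1)
print(out["choices"][0]["text"])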