bugfix: truncate completion max_tokens to fit context length by default · MobinX/llama-cpp-python@a86bfdf · GitHub


Commit a86bfdf

bugfix: truncate completion max_tokens to fit context length by default
1 parent 6f70cc4 commit a86bfdf

File tree

1 file changed: +6 additions, -10 deletions

llama_cpp/llama.py

Lines changed: 6 additions & 10 deletions
@@ -824,19 +824,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if max_tokens <= 0:
-            # Unlimited, depending on n_ctx.
-            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
-                raise ValueError(
-                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
-                )
-            else:
-                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
-        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
+        if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx):
             raise ValueError(
-                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
+                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
 
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens)
+
         # Truncate max_tokens if requested tokens would exceed the context window
         max_tokens = (
             max_tokens

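The hunk is cut off before the full truncation expression, so the exact clamp is not visible here. Below is a minimal sketch of the behavior this commit describes, assuming the final expression clamps max_tokens to the tokens remaining in the context window; clamp_max_tokens is a hypothetical helper for illustration, not code from the library.

# Minimal sketch (illustrative, not the library's actual code) of the
# post-change behavior: the prompt must fit in the context window, and
# max_tokens is clamped so prompt + completion never exceed n_ctx.
def clamp_max_tokens(n_prompt_tokens: int, max_tokens: int, n_ctx: int) -> int:
    if n_prompt_tokens >= n_ctx:
        raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
    if max_tokens <= 0:
        # Unlimited, depending on n_ctx.
        max_tokens = n_ctx - n_prompt_tokens
    # Truncate max_tokens if requested tokens would exceed the context window.
    return min(max_tokens, n_ctx - n_prompt_tokens)

# Example: a 2048-token context with a 2000-token prompt leaves at most 48
# completion tokens, even if the caller asked for more (or for "unlimited").
assert clamp_max_tokens(2000, 256, 2048) == 48
assert clamp_max_tokens(2000, -1, 2048) == 48

The practical effect is that an over-large max_tokens is silently truncated instead of raising an error, while a prompt that already fills the context window still raises ValueError.
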
0 commit comments