Add verbose flag. Closes #19 · coderonion/llama-cpp-python@c137789 · GitHub

Commit c137789

Add verbose flag. Closes abetlen#19
1 parent 5075c16 commit c137789

File tree

1 file changed: +21 −0 lines changed

llama_cpp/llama.py

Lines changed: 21 additions & 0 deletions
@@ -1,4 +1,5 @@
 import os
+import sys
 import uuid
 import time
 import multiprocessing
@@ -27,6 +28,7 @@ def __init__(
         n_threads: Optional[int] = None,
         n_batch: int = 8,
         last_n_tokens_size: int = 64,
+        verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
 
@@ -43,13 +45,15 @@ def __init__(
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
             n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
+            verbose: Print verbose output to stderr.
 
         Raises:
             ValueError: If the model path does not exist.
 
         Returns:
             A Llama instance.
         """
+        self.verbose = verbose
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
@@ -79,6 +83,9 @@ def __init__(
             self.model_path.encode("utf-8"), self.params
         )
 
+        if self.verbose:
+            print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
+
     def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]:
         """Tokenize a string.
 
@@ -239,13 +246,21 @@ def create_embedding(self, input: str) -> Embedding:
             An embedding object.
         """
         assert self.ctx is not None
+
+        if self.verbose:
+            llama_cpp.llama_reset_timings(self.ctx)
+
         tokens = self.tokenize(input.encode("utf-8"))
         self.reset()
         self.eval(tokens)
         n_tokens = len(tokens)
         embedding = llama_cpp.llama_get_embeddings(self.ctx)[
             : llama_cpp.llama_n_embd(self.ctx)
         ]
+
+        if self.verbose:
+            llama_cpp.llama_print_timings(self.ctx)
+
         return {
             "object": "list",
             "data": [
@@ -296,6 +311,9 @@ def _create_completion(
         text = b""
         returned_characters = 0
 
+        if self.verbose:
+            llama_cpp.llama_reset_timings(self.ctx)
+
         if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
@@ -392,6 +410,9 @@ def _create_completion(
         if logprobs is not None:
             raise NotImplementedError("logprobs not implemented")
 
+        if self.verbose:
+            llama_cpp.llama_print_timings(self.ctx)
+
         yield {
             "id": completion_id,
             "object": "text_completion",

0 commit comments
