@@ -1,4 +1,5 @@
 import os
+import sys
 import uuid
 import time
 import multiprocessing
@@ -27,6 +28,7 @@ def __init__(
         n_threads: Optional[int] = None,
         n_batch: int = 8,
         last_n_tokens_size: int = 64,
+        verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.

@@ -43,13 +45,15 @@ def __init__(
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
             n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
+            verbose: Print verbose output to stderr.

         Raises:
             ValueError: If the model path does not exist.

         Returns:
             A Llama instance.
         """
+        self.verbose = verbose
         self.model_path = model_path

         self.params = llama_cpp.llama_context_default_params()
@@ -79,6 +83,9 @@ def __init__(
             self.model_path.encode("utf-8"), self.params
         )

+        if self.verbose:
+            print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
+
     def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]:
         """Tokenize a string.

@@ -239,13 +246,21 @@ def create_embedding(self, input: str) -> Embedding:
             An embedding object.
         """
         assert self.ctx is not None
+
+        if self.verbose:
+            llama_cpp.llama_reset_timings(self.ctx)
+
         tokens = self.tokenize(input.encode("utf-8"))
         self.reset()
         self.eval(tokens)
         n_tokens = len(tokens)
         embedding = llama_cpp.llama_get_embeddings(self.ctx)[
             : llama_cpp.llama_n_embd(self.ctx)
         ]
+
+        if self.verbose:
+            llama_cpp.llama_print_timings(self.ctx)
+
         return {
             "object": "list",
             "data": [
@@ -296,6 +311,9 @@ def _create_completion(
         text = b""
         returned_characters = 0

+        if self.verbose:
+            llama_cpp.llama_reset_timings(self.ctx)
+
         if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
@@ -392,6 +410,9 @@ def _create_completion(
         if logprobs is not None:
             raise NotImplementedError("logprobs not implemented")

+        if self.verbose:
+            llama_cpp.llama_print_timings(self.ctx)
+
         yield {
             "id": completion_id,
             "object": "text_completion",
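A minimal usage sketch of the new flag follows (the model path below is a placeholder, not part of this diff): with verbose=True, the default, the constructor prints llama.cpp system info to stderr at load time and timing statistics after each embedding or completion call; passing verbose=False suppresses that output.

    from llama_cpp import Llama

    # Default: system info goes to stderr at load, per-call timings after each call.
    llm = Llama(model_path="./models/ggml-model.bin")

    # Pass verbose=False to keep stderr quiet.
    quiet_llm = Llama(model_path="./models/ggml-model.bin", verbose=False)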