@@ -1,4 +1,5 @@
 import os
+import sys
 import uuid
 import time
 import multiprocessing
@@ -27,6 +28,7 @@ def __init__(
         n_threads: Optional[int] = None,
         n_batch: int = 8,
         last_n_tokens_size: int = 64,
+        verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.

@@ -43,13 +45,15 @@ def __init__(
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
             n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
+            verbose: Print verbose output to stderr.

         Raises:
             ValueError: If the model path does not exist.

         Returns:
             A Llama instance.
         """
+        self.verbose = verbose
         self.model_path = model_path

         self.params = llama_cpp.llama_context_default_params()
@@ -79,6 +83,9 @@ def __init__(
             self.model_path.encode("utf-8"), self.params
         )

+        if self.verbose:
+            print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
+
     def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]:
         """Tokenize a string.

@@ -239,13 +246,21 @@ def create_embedding(self, input: str) -> Embedding:
             An embedding object.
         """
         assert self.ctx is not None
+
+        if self.verbose:
+            llama_cpp.llama_reset_timings(self.ctx)
+
         tokens = self.tokenize(input.encode("utf-8"))
         self.reset()
         self.eval(tokens)
         n_tokens = len(tokens)
         embedding = llama_cpp.llama_get_embeddings(self.ctx)[
             : llama_cpp.llama_n_embd(self.ctx)
         ]
+
+        if self.verbose:
+            llama_cpp.llama_print_timings(self.ctx)
+
         return {
             "object": "list",
             "data": [
@@ -296,6 +311,9 @@ def _create_completion(
         text = b""
         returned_characters = 0

+        if self.verbose:
+            llama_cpp.llama_reset_timings(self.ctx)
+
         if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
@@ -392,6 +410,9 @@ def _create_completion(
         if logprobs is not None:
             raise NotImplementedError("logprobs not implemented")

+        if self.verbose:
+            llama_cpp.llama_print_timings(self.ctx)
+
         yield {
             "id": completion_id,
             "object": "text_completion",
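A minimal usage sketch of the new flag follows (the model path below is a placeholder, not part of this diff): with verbose=True, the default, the constructor prints llama.cpp system info to stderr at load time and timing statistics after each embedding or completion call; passing verbose=False suppresses that output.

    from llama_cpp import Llama

    # Default: system info goes to stderr at load, per-call timings after each call.
    llm = Llama(model_path="./models/ggml-model.bin")

    # Pass verbose=False to keep stderr quiet.
    quiet_llm = Llama(model_path="./models/ggml-model.bin", verbose=False)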