@@ -155,6 +155,7 @@ class llama_token_data_array(Structure):
# int n_gpu_layers; // number of layers to store in VRAM
# int main_gpu; // the GPU that is used for scratch and small tensors
# float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+ # bool low_vram; // if true, reduce VRAM usage at the cost of performance
# int seed; // RNG seed, -1 for random
# bool f16_kv; // use fp16 for KV cache
@@ -177,6 +178,7 @@ class llama_context_params(Structure):
("n_gpu_layers" , c_int ),
178
179
("main_gpu" , c_int ),
179
180
("tensor_split" , c_float * LLAMA_MAX_DEVICES .value ),
181
+ ("low_vram" , c_bool ),
180
182
("seed" , c_int ),
181
183
("f16_kv" , c_bool ),
(
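For reference, a minimal sketch of how a caller might use the new flag. It assumes llama_context_default_params and llama_init_from_file from elsewhere in this module, and the model path is purely illustrative:

params = llama_context_default_params()  # assumed helper from this module
params.low_vram = True                   # trade some speed for a smaller VRAM footprint
params.n_gpu_layers = 20                 # offload layers to the GPU as before
ctx = llama_init_from_file(b"./models/7B/ggml-model.bin", params)  # illustrative path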
@@ -555,6 +557,26 @@ def llama_n_embd(ctx: llama_context_p) -> int:
_lib.llama_n_embd.restype = c_int
+ # // Get the vocabulary as output parameters.
+ # // Returns number of results.
+ # LLAMA_API int llama_get_vocab(
+ # const struct llama_context * ctx,
+ # const char * * strings,
+ # float * scores,
+ # int capacity);
+ def llama_get_vocab(
+     ctx: llama_context_p,
+     strings,  # type: Array[c_char_p] # type: ignore
+     scores,  # type: Array[c_float] # type: ignore
+     capacity: c_int,
+ ) -> int:
+     return _lib.llama_get_vocab(ctx, strings, scores, capacity)
+
+
+ _lib.llama_get_vocab.argtypes = [llama_context_p, POINTER(c_char_p), POINTER(c_float), c_int]
+ _lib.llama_get_vocab.restype = c_int
+
+
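A rough usage sketch for the new binding, illustrative only; it assumes an already-initialized ctx and uses llama_n_vocab from elsewhere in this module to size the output buffers:

n_vocab = llama_n_vocab(ctx)        # assumed helper from this module
strings = (c_char_p * n_vocab)()    # output buffer for token strings
scores = (c_float * n_vocab)()      # output buffer for token scores
n = llama_get_vocab(ctx, strings, scores, n_vocab)
for i in range(n):                  # n is the number of results written
    print(strings[i], scores[i])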
# Token logits obtained from the last call to llama_eval()
# The logits for the last token are stored in the last row
# Can be mutated in order to change the probabilities of the next token