Update llama.cpp · shawnx11/llama-cpp-python@f1edc66 · GitHub


Commit f1edc66

Update llama.cpp
1 parent f3b844e commit f1edc66

File tree: 2 files changed (+46 −7 lines)

llama_cpp/llama_cpp.py (45 additions, 6 deletions)
@@ -103,8 +103,8 @@ def _load_shared_library(lib_base_name: str):

# define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 2
-LLAMA_SESSION_VERSION = 2
+# define LLAMA_SESSION_VERSION 3
+LLAMA_SESSION_VERSION = 3


# struct llama_model;
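Side note on the version bump: llama.cpp validates both LLAMA_SESSION_MAGIC and LLAMA_SESSION_VERSION when loading a saved session, so state files written under version 2 will be rejected after this update. A minimal guard, assuming only the module-level constant shown in the diff:

import llama_cpp

# The bindings mirror llama.h's LLAMA_SESSION_VERSION; llama.cpp refuses to
# load session/state files written under a different format version.
if llama_cpp.LLAMA_SESSION_VERSION != 3:
    raise RuntimeError(
        f"expected session format 3, got {llama_cpp.LLAMA_SESSION_VERSION}; "
        "saved state files may need to be re-created"
    )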
@@ -309,6 +309,35 @@ class llama_batch(Structure):
        ("all_seq_id", llama_seq_id),
    ]

+# enum llama_model_kv_override_type {
+#     LLAMA_KV_OVERRIDE_INT,
+#     LLAMA_KV_OVERRIDE_FLOAT,
+#     LLAMA_KV_OVERRIDE_BOOL,
+# };
+class llama_model_kv_override_type(Structure):
+    _fields_ = [
+        ("LLAMA_KV_OVERRIDE_INT", c_int),
+        ("LLAMA_KV_OVERRIDE_FLOAT", c_int),
+        ("LLAMA_KV_OVERRIDE_BOOL", c_int),
+    ]
+
+# struct llama_model_kv_override {
+#     char key[128];
+#     enum llama_model_kv_override_type tag;
+#     union {
+#         int64_t int_value;
+#         double float_value;
+#         bool bool_value;
+#     };
+# };
+class llama_model_kv_override(Structure):
+    _fields_ = [
+        ("key", ctypes.c_char * 128),
+        ("tag", llama_model_kv_override_type),
+        ("int_value", ctypes.c_int64),
+        ("float_value", c_double),
+        ("bool_value", c_bool),
+    ]

# struct llama_model_params {
#     int32_t n_gpu_layers; // number of layers to store in VRAM
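As committed, the Python struct above does not byte-match the C declaration it quotes: llama_model_kv_override_type is a 12-byte Structure standing in for a 4-byte enum, and the C union's three members are flattened into three consecutive fields instead of sharing storage. A sketch of a layout that would mirror llama.h, using a real ctypes.Union and a plain c_int tag (an illustration, not what this commit ships; the enum values follow declaration order in the comment above):

import ctypes

# Declaration order in the llama.h enum shown in the diff: INT, FLOAT, BOOL.
LLAMA_KV_OVERRIDE_INT, LLAMA_KV_OVERRIDE_FLOAT, LLAMA_KV_OVERRIDE_BOOL = 0, 1, 2

class _llama_kv_override_value(ctypes.Union):
    # The three members share storage, mirroring the anonymous C union.
    _fields_ = [
        ("int_value", ctypes.c_int64),
        ("float_value", ctypes.c_double),
        ("bool_value", ctypes.c_bool),
    ]

class llama_model_kv_override_sketch(ctypes.Structure):
    _anonymous_ = ("value",)
    _fields_ = [
        ("key", ctypes.c_char * 128),
        ("tag", ctypes.c_int),  # enum llama_model_kv_override_type
        ("value", _llama_kv_override_value),
    ]

# One real override plus a zeroed terminator: llama.cpp's loader walks the
# array until it hits an entry whose key is empty.
overrides = (llama_model_kv_override_sketch * 2)()
overrides[0].key = b"some.metadata.key"  # hypothetical key, for illustration
overrides[0].tag = LLAMA_KV_OVERRIDE_INT
overrides[0].int_value = 42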
@@ -320,6 +349,8 @@ class llama_batch(Structure):
#     // context pointer passed to the progress callback
#     void * progress_callback_user_data;

+#     // override key-value pairs of the model meta data
+#     const struct llama_model_kv_override * kv_overrides;

#     // Keep the booleans together to avoid misalignment during copy-by-value.
#     bool vocab_only; // only load the vocabulary, no weights
@@ -335,6 +366,7 @@ class llama_model_params(Structure):
        tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
        progress_callback (llama_progress_callback): called with a progress value between 0 and 1, pass NULL to disable
        progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
+        kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
        vocab_only (bool): only load the vocabulary, no weights
        use_mmap (bool): use mmap if possible
        use_mlock (bool): force system to keep model in RAM"""
@@ -344,6 +376,7 @@ class llama_model_params(Structure):
        ("tensor_split", c_float_p),
        ("progress_callback", llama_progress_callback),
        ("progress_callback_user_data", c_void_p),
+        ("kv_overrides", POINTER(llama_model_kv_override)),
        ("vocab_only", c_bool),
        ("use_mmap", c_bool),
        ("use_mlock", c_bool),
@@ -367,12 +400,14 @@ class llama_model_params(Structure):
#     float    yarn_beta_slow;  // YaRN high correction dim
#     uint32_t yarn_orig_ctx;   // YaRN original context size

+#     enum ggml_type type_k; // data type for K cache
+#     enum ggml_type type_v; // data type for V cache

#     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-#     bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-#     bool logits_all; // the llama_eval() call computes all logits, not just the last one
-#     bool embedding;  // embedding mode only
+#     bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+#     bool logits_all;  // the llama_eval() call computes all logits, not just the last one
+#     bool embedding;   // embedding mode only
+#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
# };
class llama_context_params(Structure):
    """Parameters for llama_context
@@ -391,6 +426,8 @@ class llama_context_params(Structure):
        yarn_beta_fast (float): YaRN low correction dim
        yarn_beta_slow (float): YaRN high correction dim
        yarn_orig_ctx (int): YaRN original context size
+        type_k (int): data type for K cache
+        type_v (int): data type for V cache
        mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
        f16_kv (bool): use fp16 for KV cache, fp32 otherwise
        logits_all (bool): the llama_eval() call computes all logits, not just the last one
@@ -409,6 +446,8 @@ class llama_context_params(Structure):
        ("yarn_beta_fast", c_float),
        ("yarn_beta_slow", c_float),
        ("yarn_orig_ctx", c_uint32),
+        ("type_k", c_int),
+        ("type_v", c_int),
        ("mul_mat_q", c_bool),
        ("f16_kv", c_bool),
        ("logits_all", c_bool),

vendor/llama.cpp (submodule updated: 1 addition, 1 deletion)
