feat: Update llama.cpp · asusevski/llama-cpp-python@901fe02 · GitHub
[go: up one dir, main page]

Skip to content

Commit 901fe02

Browse files
committed
feat: Update llama.cpp
1 parent b64fa4e commit 901fe02

File tree

2 files changed

+21
-11
lines changed

2 files changed

+21
-11
lines changed

llama_cpp/llama_cpp.py

Lines changed: 20 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -175,8 +175,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
175175

176176
# define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
177177
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
178-
# define LLAMA_SESSION_VERSION 4
179-
LLAMA_SESSION_VERSION = 4
178+
# define LLAMA_SESSION_VERSION 5
179+
LLAMA_SESSION_VERSION = 5
180180

181181

182182
# struct llama_model;
@@ -274,6 +274,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
274274
# LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
275275
# LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
276276
# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
277+
# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
277278

278279
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
279280
# };
@@ -677,6 +678,7 @@ class llama_context_params(ctypes.Structure):
677678
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
678679
# bool pure; // quantize all tensors to the default type
679680
# void * imatrix; // pointer to importance matrix data
681+
# void * kv_overrides; // pointer to vector containing overrides
680682
# } llama_model_quantize_params;
681683
class llama_model_quantize_params(ctypes.Structure):
682684
"""Parameters for llama_model_quantize
@@ -691,6 +693,7 @@ class llama_model_quantize_params(ctypes.Structure):
691693
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
692694
pure (bool): quantize all tensors to the default type
693695
imatrix (ctypes.c_void_p): pointer to importance matrix data
696+
kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
694697
"""
695698

696699
_fields_ = [
@@ -703,6 +706,7 @@ class llama_model_quantize_params(ctypes.Structure):
703706
("only_copy", ctypes.c_bool),
704707
("pure", ctypes.c_bool),
705708
("imatrix", ctypes.c_void_p),
709+
("kv_overrides", ctypes.c_void_p),
706710
]
707711

708712

@@ -1838,9 +1842,9 @@ def llama_synchronize(ctx: llama_context_p, /):
18381842

18391843

18401844
# // Token logits obtained from the last call to llama_decode()
1841-
# // The logits for the last token are stored in the last row
1842-
# // Logits for which llama_batch.logits[i] == 0 are undefined
1843-
# // Rows: n_tokens provided with llama_batch
1845+
# // The logits for which llama_batch.logits[i] != 0 are stored contiguously
1846+
# // in the order they have appeared in the batch.
1847+
# // Rows: number of tokens for which llama_batch.logits[i] != 0
18441848
# // Cols: n_vocab
18451849
# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
18461850
@ctypes_function(
@@ -1859,7 +1863,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
18591863

18601864

18611865
# // Logits for the ith token. Equivalent to:
1862-
# // llama_get_logits(ctx) + i*n_vocab
1866+
# // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
1867+
# // returns NULL for invalid ids.
18631868
# LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
18641869
@ctypes_function(
18651870
"llama_get_logits_ith",
@@ -1874,8 +1879,12 @@ def llama_get_logits_ith(
18741879
...
18751880

18761881

1877-
# // Get all output token embeddings
1878-
# // shape: [n_tokens*n_embd] (1-dimensional)
1882+
# // Get all output token embeddings.
1883+
# // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
1884+
# // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
1885+
# // in the order they have appeared in the batch.
1886+
# // shape: [n_outputs*n_embd]
1887+
# // Otherwise, returns NULL.
18791888
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
18801889
@ctypes_function(
18811890
"llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -1886,9 +1895,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
18861895
...
18871896

18881897

1889-
# // Get the embeddings for the ith token
1890-
# // llama_get_embeddings(ctx) + i*n_embd
1898+
# // Get the embeddings for the ith token. Equivalent to:
1899+
# // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
18911900
# // shape: [n_embd] (1-dimensional)
1901+
# // returns NULL for invalid ids.
18921902
# LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
18931903
@ctypes_function(
18941904
"llama_get_embeddings_ith",

vendor/llama.cpp

0 commit comments

Comments (0)