feat: Update llama.cpp · coderonion/llama-cpp-python@8d298b4 · GitHub

Commit 8d298b4

feat: Update llama.cpp
1 parent 6eb2523 commit 8d298b4

File tree

2 files changed: +78 -9


llama_cpp/llama_cpp.py

Lines changed: 77 additions & 8 deletions
```diff
@@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
 # bool embeddings; // if true, extract embeddings (together with logits)
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 
+
 # // Abort callback
 # // if it returns true, execution of llama_decode() will be aborted
 # // currently works only with CPU execution
```
```diff
@@ -1006,6 +1007,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
 def llama_n_embd(model: llama_model_p, /) -> int: ...
 
 
+# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
+@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_n_layer(model: llama_model_p, /) -> int: ...
+
+
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
```
```diff
@@ -1166,12 +1172,18 @@ def llama_model_quantize(
     ...
 
 
+# // Apply a LoRA adapter to a loaded model
+# // path_base_model is the path to a higher quality model to use as a base for
+# // the layers modified by the adapter. Can be NULL to use the current loaded model.
+# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+# // will be applied on top of the previous one
+# // Returns 0 on success
 # LLAMA_API int32_t llama_model_apply_lora_from_file(
 #     const struct llama_model * model,
-#     const char * path_lora,
-#     float scale,
-#     const char * path_base_model,
-#     int32_t n_threads);
+#     const char * path_lora,
+#     float scale,
+#     const char * path_base_model,
+#     int32_t n_threads);
 @ctypes_function(
     "llama_model_apply_lora_from_file",
     [
```
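Per the restored comment, a second adapter stacks on top of any previously applied one unless the model is reloaded, and a 0 return signals success. A hedged sketch of calling the binding as declared in this diff; both file paths are placeholders:

```python
import llama_cpp

llama_cpp.llama_backend_init()
params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"model.gguf", params)  # placeholder

# Apply a LoRA adapter at full strength; path_base_model=None means the
# layers are patched against the currently loaded model. Returns 0 on success.
rc = llama_cpp.llama_model_apply_lora_from_file(
    model,
    b"adapter.bin",  # placeholder adapter path
    1.0,             # scale
    None,            # path_base_model
    4,               # n_threads
)
if rc != 0:
    raise RuntimeError("llama_model_apply_lora_from_file failed")
```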
```diff
@@ -1190,7 +1202,57 @@ def llama_model_apply_lora_from_file(
     path_base_model: Union[ctypes.c_char_p, bytes, None],
     n_threads: Union[ctypes.c_int32, int],
     /,
-) -> int: ...
+) -> int:
+    """Apply a LoRA adapter to a loaded model
+    path_base_model is the path to a higher quality model to use as a base for
+    the layers modified by the adapter. Can be NULL to use the current loaded model.
+    The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    will be applied on top of the previous one
+    Returns 0 on success"""
+    ...
+
+
+# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+# // the currently loaded vector.
+# // n_embd should be the size of a single layer's control, and data should point
+# // to an n_embd x n_layers buffer starting from layer 1.
+# // il_start and il_end are the layer range the vector should apply to (both inclusive)
+# // See llama_control_vector_load in common to load a control vector.
+# LLAMA_API int32_t llama_control_vector_apply(
+#     struct llama_context * lctx,
+#     const float * data,
+#     size_t len,
+#     int32_t n_embd,
+#     int32_t il_start,
+#     int32_t il_end);
+@ctypes_function(
+    "llama_control_vector_apply",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_float),
+        ctypes.c_size_t,
+        ctypes.c_int32,
+        ctypes.c_int32,
+        ctypes.c_int32,
+    ],
+    ctypes.c_int32,
+)
+def llama_control_vector_apply(
+    lctx: llama_context_p,
+    data: CtypesPointerOrRef[ctypes.c_float],
+    len: int,
+    n_embd: int,
+    il_start: int,
+    il_end: int,
+    /,
+) -> int:
+    """Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    the currently loaded vector.
+    n_embd should be the size of a single layer's control, and data should point
+    to an n_embd x n_layers buffer starting from layer 1.
+    il_start and il_end are the layer range the vector should apply to (both inclusive)
+    See llama_control_vector_load in common to load a control vector."""
+    ...
 
 
 # //
```
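The docstring fixes the buffer contract: `data` holds `n_embd` floats per layer for `n_layers` layers, packed contiguously starting at layer 1, and `il_start`/`il_end` bound the affected layers inclusively. A sketch under those assumptions, reusing `model` from the earlier sketches; the zero-filled buffer is a stand-in for directions a tool like `llama_control_vector_load` would produce:

```python
import ctypes
import llama_cpp

ctx_params = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

n_embd = llama_cpp.llama_n_embd(model)
n_layers = llama_cpp.llama_n_layer(model)  # also new in this commit

# n_embd floats per layer, for layers 1..n_layers, packed contiguously.
# Zeros are a placeholder for real control directions.
data = (ctypes.c_float * (n_embd * n_layers))()

rc = llama_cpp.llama_control_vector_apply(
    ctx, data, len(data), n_embd, 1, n_layers  # layer range is inclusive
)
if rc != 0:
    raise RuntimeError("llama_control_vector_apply failed")

# Per the docstring, passing NULL for data clears the currently loaded vector.
llama_cpp.llama_control_vector_apply(ctx, None, 0, n_embd, -1, -1)
```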
```diff
@@ -1205,6 +1267,12 @@ def llama_model_apply_lora_from_file(
 #     llama_pos pos;
 # };
 class llama_kv_cache_view_cell(ctypes.Structure):
+    """Information associated with an individual cell in the KV cache view.
+
+    Attributes:
+        pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
+            May be negative if the cell is not populated."""
+
     _fields_ = [("pos", llama_pos)]
 
 
```
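This struct is one cell of the KV cache view. A rough sketch of counting populated cells, assuming the view helpers already bound elsewhere in this file (`llama_kv_cache_view_init`, `llama_kv_cache_view_update`, `llama_kv_cache_view_free`) and the view's `n_cells`/`cells` fields keep their documented shapes, with `ctx` from the sketch above:

```python
import ctypes
import llama_cpp

view = llama_cpp.llama_kv_cache_view_init(ctx, 1)  # track 1 sequence per cell
llama_cpp.llama_kv_cache_view_update(ctx, ctypes.byref(view))

# pos >= 0 means the cell is populated; negative means empty (per the docstring).
used = sum(1 for i in range(view.n_cells) if view.cells[i].pos >= 0)
print(f"{used} of {view.n_cells} KV cells populated")

llama_cpp.llama_kv_cache_view_free(ctypes.byref(view))
```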
```diff
@@ -1985,7 +2053,7 @@ def llama_tokenize(
     /,
 ) -> int:
     """Convert the provided text into tokens.
-
+
     Args:
         model: The model to use for tokenization.
         text: The text to tokenize.
@@ -1995,10 +2063,11 @@ def llama_tokenize(
         add_bos: Whether to add a beginning-of-sentence token.
         special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
             Does not insert a leading space.
-
+
     Returns:
         Returns the number of tokens on success, no more than n_tokens_max
-        Returns a negative number on failure - the number of tokens that would have been returned"""
+        Returns a negative number on failure - the number of tokens that would have been returned
+    """
     ...
 
 
```
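The clarified return contract supports the usual two-call pattern: if the buffer is too small, the negative return's magnitude is the size to retry with. A sketch against this binding, assuming `model` from the earlier sketches and the argument order given in the docstring:

```python
import llama_cpp

text = b"Hello, world!"
n_tokens_max = 8
buf = (llama_cpp.llama_token * n_tokens_max)()

n = llama_cpp.llama_tokenize(model, text, len(text), buf, n_tokens_max, True, False)
if n < 0:
    # Buffer too small: -n is the number of tokens that would have been returned.
    n_tokens_max = -n
    buf = (llama_cpp.llama_token * n_tokens_max)()
    n = llama_cpp.llama_tokenize(model, text, len(text), buf, n_tokens_max, True, False)

print(list(buf[:n]))
```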
vendor/llama.cpp

Submodule reference updated (the remaining +1 -1 of the 2-file, +78 -9 total).

0 commit comments