feat: Update llama.cpp · ducky777/llama-cpp-python@0e70984 · GitHub

Commit 0e70984

feat: Update llama.cpp

1 parent d5df431

File tree

2 files changed: +35, -3 lines changed


llama_cpp/llama_cpp.py

Lines changed: 34 additions & 2 deletions
@@ -148,6 +148,12 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]
     ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p
 )
 
+# // Abort callback
+# // If not NULL, called before ggml computation
+# // If it returns true, the computation is aborted
+# typedef bool (*ggml_abort_callback)(void * data);
+ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p)
+
 # llama.h bindings
 
 _lib.llama_max_devices.argtypes = []
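The new ggml_abort_callback symbol is a plain ctypes callback type: it receives the registered void * data pointer and returns a bool, where True means "abort the current ggml computation". A minimal sketch of wrapping a Python function in it (illustrative only, not part of this commit; the predicate name is made up):

import ctypes

# Mirror of the type added above: bool (*ggml_abort_callback)(void * data)
ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p)

def _should_abort(user_data):
    # user_data is whatever address was registered as abort_callback_data (or None).
    # Return True to request that the computation be aborted.
    return False

# Keep a reference to the wrapper for as long as it is registered, otherwise
# ctypes may garbage-collect the C trampoline while llama.cpp still calls it.
abort_cb = ggml_abort_callback(_should_abort)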
@@ -560,10 +566,16 @@ class llama_model_params(ctypes.Structure):
 # enum ggml_type type_v; // data type for V cache
 
 # // Keep the booleans together to avoid misalignment during copy-by-value.
-# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 # bool embedding; // embedding mode only
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 # bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+
+# // Abort callback
+# // if it returns true, execution of llama_decode() will be aborted
+# // currently works only with CPU execution
+# ggml_abort_callback abort_callback;
+# void * abort_callback_data;
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -591,6 +603,8 @@ class llama_context_params(ctypes.Structure):
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
         do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
+        abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
     """
 
     _fields_ = [
@@ -616,6 +630,8 @@ class llama_context_params(ctypes.Structure):
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
         ("do_pooling", ctypes.c_bool),
+        ("abort_callback", ggml_abort_callback),
+        ("abort_callback_data", ctypes.c_void_p),
     ]
 
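With abort_callback and abort_callback_data now part of the llama_context_params struct, the callback can also be supplied at context-creation time. A rough sketch, assuming llama_context_default_params() and llama_new_context_with_model() from these bindings are available; the no-op callback is illustrative:

from llama_cpp import llama_cpp

@llama_cpp.ggml_abort_callback
def _never_abort(user_data):
    # Return True from here to make llama_decode() stop early.
    return False

params = llama_cpp.llama_context_default_params()
params.abort_callback = _never_abort   # field added by this commit
params.abort_callback_data = None      # handed back to the callback as user_data
# params would then be passed to llama_new_context_with_model() as usual.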

@@ -1703,8 +1719,24 @@ def llama_set_n_threads(
     """
     ...
 
+# // Set abort callback
+# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+@ctypes_function(
+    "llama_set_abort_callback",
+    [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
+    None,
+)
+def llama_set_abort_callback(
+    ctx: llama_context_p,
+    abort_callback: Callable[[ctypes.c_void_p], None],
+    abort_callback_data: ctypes.c_void_p,
+    /,
+):
+    """Set abort callback"""
+    ...
+
 
-# // Token logits obtained from the last call to llama_eval()
+# // Token logits obtained from the last call to llama_decode()
 # // The logits for the last token are stored in the last row
 # // Logits for which llama_batch.logits[i] == 0 are undefined
 # // Rows: n_tokens provided with llama_batch
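The new llama_set_abort_callback binding installs the callback on an already-created context. Note that although the Python signature above annotates the callback's return type as None, the underlying ggml_abort_callback type returns a c_bool, so the wrapped function should return True to abort, and per the header comment this currently only takes effect for CPU execution. A rough usage sketch; the deadline logic is made up, and ctx is assumed to come from llama_new_context_with_model():

import time
from llama_cpp import llama_cpp

deadline = time.monotonic() + 30.0  # hypothetical 30-second decode budget

@llama_cpp.ggml_abort_callback
def _abort_when_over_budget(user_data):
    # Returning True asks llama_decode() to stop as soon as ggml checks the callback.
    return time.monotonic() > deadline

# ctx: a llama_context_p obtained elsewhere, e.g. from llama_new_context_with_model(model, params)
# llama_cpp.llama_set_abort_callback(ctx, _abort_when_over_budget, None)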

vendor/llama.cpp (submodule pointer updated: 1 addition & 1 deletion)

0 commit comments
