Commit 8c2b24d · brookareru/llama-cpp-python

feat: Update llama.cpp
1 parent 6332527 commit 8c2b24d

2 files changed: +8, -4

llama_cpp/llama_cpp.py

Lines changed: 7 additions & 3 deletions
@@ -242,8 +242,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 5
-LLAMA_SESSION_VERSION = 5
+# define LLAMA_SESSION_VERSION 6
+LLAMA_SESSION_VERSION = 6
 
 # define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
 LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ
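
Bumping LLAMA_SESSION_VERSION from 5 to 6 means session files saved with older builds will be rejected on load by these bindings. As a minimal sketch (not part of this commit), one could pre-check a file's header before loading, assuming llama.cpp's session layout of a little-endian uint32 magic followed by a uint32 version:

import struct

import llama_cpp

def check_session_header(path: str) -> None:
    # Hypothetical helper: reads the first 8 bytes of a saved session.
    # Assumes the file starts with uint32 magic, then uint32 version.
    with open(path, "rb") as f:
        magic, version = struct.unpack("<II", f.read(8))
    if magic != llama_cpp.LLAMA_SESSION_MAGIC:
        raise ValueError(f"not a llama session file (magic={magic:#010x})")
    if version != llama_cpp.LLAMA_SESSION_VERSION:
        raise ValueError(
            f"session version {version} != expected {llama_cpp.LLAMA_SESSION_VERSION}"
        )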
@@ -730,6 +730,7 @@ class llama_model_params(ctypes.Structure):
 # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 # bool embeddings; // if true, extract embeddings (together with logits)
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+# bool flash_attn; // whether to use flash attention
 
 
 # // Abort callback
@@ -766,6 +767,7 @@ class llama_context_params(ctypes.Structure):
 logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 embeddings (bool): if true, extract embeddings (together with logits)
 offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+flash_attn (bool): whether to use flash attention
 abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
 abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
 """
@@ -795,6 +797,7 @@ class llama_context_params(ctypes.Structure):
 logits_all: bool
 embeddings: bool
 offload_kqv: bool
+flash_attn: bool
 abort_callback: Callable[[ctypes.c_void_p], bool]
 abort_callback_data: ctypes.c_void_p
 
@@ -823,6 +826,7 @@ class llama_context_params(ctypes.Structure):
 ("logits_all", ctypes.c_bool),
 ("embeddings", ctypes.c_bool),
 ("offload_kqv", ctypes.c_bool),
+("flash_attn", ctypes.c_bool),
 ("abort_callback", ggml_abort_callback),
 ("abort_callback_data", ctypes.c_void_p),
 ]
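
Since flash_attn is added to llama_context_params as a plain ctypes.c_bool, it can be toggled on the default params before creating a context. A minimal usage sketch with the low-level bindings (the model path is a placeholder):

import llama_cpp

llama_cpp.llama_backend_init()

model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(
    b"/path/to/model.gguf", model_params  # placeholder path
)

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.flash_attn = True  # field added in this commit
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)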
@@ -1615,7 +1619,7 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
     ...
 
 
-# // Clear the KV cache
+# // Clear the KV cache - both cell info is erased and KV data is zeroed
 # LLAMA_API void llama_kv_cache_clear(
 #     struct llama_context * ctx);
 @ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
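
The llama_kv_cache_clear binding itself is unchanged; only the vendored comment now spells out that clearing erases cell metadata and zeroes the KV data. Continuing the sketch above, it is invoked as:

# Drop all cached KV state before reusing ctx for an unrelated prompt.
llama_cpp.llama_kv_cache_clear(ctx)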

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (git submodule pointer updated to the new llama.cpp revision)
