@@ -242,8 +242,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 5
-LLAMA_SESSION_VERSION = 5
+# define LLAMA_SESSION_VERSION 6
+LLAMA_SESSION_VERSION = 6
 
 # define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
 LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ
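For code that manages its own saved sessions, the version bump matters: state files are tagged with the magic/version pair above, so a file written by a build that used session version 5 is expected to be rejected when loaded against this build. A minimal guard, as a sketch (the SUPPORTED_SESSION_VERSION name is illustrative, not part of the library):

import llama_cpp

# Illustrative compatibility check: refuse to proceed if the bindings were built
# against a different session-file version than the one this code expects.
SUPPORTED_SESSION_VERSION = 6
assert llama_cpp.LLAMA_SESSION_VERSION == SUPPORTED_SESSION_VERSION
assert llama_cpp.LLAMA_SESSION_MAGIC == llama_cpp.LLAMA_FILE_MAGIC_GGSN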
@@ -730,6 +730,7 @@ class llama_model_params(ctypes.Structure):
 #     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+#     bool flash_attn;  // whether to use flash attention
 
 
 #     // Abort callback
@@ -766,6 +767,7 @@ class llama_context_params(ctypes.Structure):
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+        flash_attn (bool): whether to use flash attention
         abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
         abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
     """
@@ -795,6 +797,7 @@ class llama_context_params(ctypes.Structure):
         logits_all: bool
         embeddings: bool
         offload_kqv: bool
+        flash_attn: bool
         abort_callback: Callable[[ctypes.c_void_p], bool]
         abort_callback_data: ctypes.c_void_p
 
@@ -823,6 +826,7 @@ class llama_context_params(ctypes.Structure):
         ("logits_all", ctypes.c_bool),
         ("embeddings", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
+        ("flash_attn", ctypes.c_bool),
         ("abort_callback", ggml_abort_callback),
         ("abort_callback_data", ctypes.c_void_p),
     ]
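Taken together, the llama_context_params hunks expose flash attention as an ordinary context parameter. Below is a minimal sketch of turning it on through the low-level bindings; the model path and n_ctx value are placeholders, and the surrounding calls are functions these bindings already provide:

import llama_cpp

llama_cpp.llama_backend_init()

# Placeholder path; any GGUF model file works here.
model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./model.gguf", model_params)

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.n_ctx = 4096
ctx_params.flash_attn = True  # the field added by this change

ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
# ... call llama_decode() as usual ...
llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)

If the high-level Llama wrapper in this release forwards a matching keyword, Llama(model_path="./model.gguf", flash_attn=True) would be the shorter route; either way, the struct field above is what ends up being set.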
@@ -1615,7 +1619,7 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
     ...
 
 
-# // Clear the KV cache
+# // Clear the KV cache - both cell info is erased and KV data is zeroed
 # LLAMA_API void llama_kv_cache_clear(
 #         struct llama_context * ctx);
 @ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
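The reworded comment spells out what the call does to the context's cache. A short usage sketch, reusing ctx from the sketch above:

# Wipe the cache between unrelated prompts: cell bookkeeping is erased and the
# KV data itself is zeroed, so the next llama_decode() starts from an empty cache.
llama_cpp.llama_kv_cache_clear(ctx)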