@@ -549,10 +549,7 @@ class llama_model_params(ctypes.Structure):
# uint32_t n_batch; // prompt processing maximum batch size
# uint32_t n_threads; // number of threads to use for generation
# uint32_t n_threads_batch; // number of threads to use for batch processing
-
- # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
- # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
- #                                       // (ignored if no pooling layer)
+ # int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`

# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
# float rope_freq_base; // RoPE base frequency, 0 = from model
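Reviewer note: with `rope_scaling_type` now declared `int32_t` instead of the enum typedef, the Python binding keeps it as a plain `ctypes.c_int` (see the `_fields_` hunk below) and callers pass raw integer values from `enum llama_rope_scaling_type`. A minimal sketch, assuming the upstream constant value `LLAMA_ROPE_SCALING_LINEAR == 1` and the import path `llama_cpp` — neither is shown in this diff:

```python
import ctypes
import llama_cpp  # assumed import path, as in upstream llama-cpp-python

LLAMA_ROPE_SCALING_LINEAR = 1  # assumed enum member value, for illustration only

params = llama_cpp.llama_context_params()
params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR  # enum fields travel as plain ints

# int32_t and ctypes.c_int agree on size on the ABIs llama.cpp targets
assert ctypes.sizeof(ctypes.c_int) == 4
```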
@@ -571,15 +568,13 @@ class llama_model_params(ctypes.Structure):
# enum ggml_type type_v; // data type for V cache

# // Keep the booleans together to avoid misalignment during copy-by-value.
- # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+ # bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+ # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool embedding; // embedding mode only
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+ # bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)

- # // Abort callback
- # // if it returns true, execution of llama_decode() will be aborted
- # // currently works only with CPU execution
- # ggml_abort_callback abort_callback;
- # void * abort_callback_data;
+ # bool enable_timing; // enable timing op
# };
class llama_context_params(ctypes.Structure):
    """Parameters for llama_context
@@ -591,7 +586,6 @@ class llama_context_params(ctypes.Structure):
        n_threads (int): number of threads to use for generation
        n_threads_batch (int): number of threads to use for batch processing
        rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
-         pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
        rope_freq_base (float): RoPE base frequency, 0 = from model
        rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
        yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -604,11 +598,12 @@ class llama_context_params(ctypes.Structure):
        cb_eval_user_data (ctypes.c_void_p): user data for cb_eval
        type_k (int): data type for K cache
        type_v (int): data type for V cache
+         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
        logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
        embedding (bool): embedding mode only
        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-         abort_callback (ggml_abort_callback): abort callback, if it returns true, execution of llama_decode() will be aborted
-         abort_callback_data (ctypes.c_void_p): data for abort_callback
+         do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+         enable_timing (bool): enable timing op
    """

    _fields_ = [
@@ -618,7 +613,6 @@ class llama_context_params(ctypes.Structure):
        ("n_threads", ctypes.c_uint32),
        ("n_threads_batch", ctypes.c_uint32),
        ("rope_scaling_type", ctypes.c_int),
-         ("pooling_type", ctypes.c_int),
        ("rope_freq_base", ctypes.c_float),
        ("rope_freq_scale", ctypes.c_float),
        ("yarn_ext_factor", ctypes.c_float),
@@ -631,11 +625,12 @@ class llama_context_params(ctypes.Structure):
        ("cb_eval_user_data", ctypes.c_void_p),
        ("type_k", ctypes.c_int),
        ("type_v", ctypes.c_int),
+         ("mul_mat_q", ctypes.c_bool),
        ("logits_all", ctypes.c_bool),
        ("embedding", ctypes.c_bool),
        ("offload_kqv", ctypes.c_bool),
-         ("abort_callback", ggml_abort_callback),
-         ("abort_callback_data", ctypes.c_void_p),
+         ("do_pooling", ctypes.c_bool),
+         ("enable_timing", ctypes.c_bool),
    ]

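Reviewer note: after this reshuffle the three booleans sit together at the tail of the struct, matching the "keep the booleans together to avoid misalignment during copy-by-value" comment in the C header. A minimal usage sketch for the new fields, assuming `llama_context_default_params()` is still exported as in upstream llama-cpp-python:

```python
import llama_cpp  # assumed import path

params = llama_cpp.llama_context_default_params()  # assumed helper, as upstream
params.mul_mat_q = True       # deprecated upstream ("always true"), kept for ABI parity
params.do_pooling = True      # replaces the removed pooling_type enum field
params.enable_timing = False  # fork-specific flag added by this diff
params.logits_all = False     # prefer per-token llama_batch.logits, per the comment
```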
@@ -1723,22 +1718,6 @@ def llama_set_n_threads(
    """
    ...

- # // Set abort callback
- # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
- # @ctypes_function(
- #     "llama_set_abort_callback",
- #     [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
- #     None,
- # )
- # def llama_set_abort_callback(
- #     ctx: llama_context_p,
- #     abort_callback: Callable[[ctypes.c_void_p], None],
- #     abort_callback_data: ctypes.c_void_p,
- #     /,
- # ):
- #     """Set abort callback"""
- #     ...
-

# // Token logits obtained from the last call to llama_decode()
# // The logits for the last token are stored in the last row
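Reviewer note: dropping `llama_set_abort_callback` here (together with the two struct fields removed above) takes away the only cooperative way to cancel a running `llama_decode()`. For comparison, this is roughly how a caller used the removed upstream binding; the sketch assumes the upstream typedef `bool (*ggml_abort_callback)(void * data)`, which is not shown in this excerpt:

```python
import ctypes

# Upstream C typedef: bool (*ggml_abort_callback)(void * data)
ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p)

stop_requested = False

@ggml_abort_callback  # keep a module-level reference so ctypes doesn't GC the thunk
def _abort_cb(_data):
    return stop_requested  # returning True aborts llama_decode() (CPU backend only)

# Upstream call, no longer available after this change:
# llama_set_abort_callback(ctx, _abort_cb, None)
```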
@@ -2711,6 +2690,7 @@ def llama_set_timestamp(ctx: llama_context_p, name: bytes):
    """Set timestamp with name"""
    ...

+
# LLAMA_API int64_t llama_get_timestamp(struct llama_context * ctx, const char * name);
@ctypes_function(
    "llama_get_timestamp",