updata python binding APIs · abetlen/llama-cpp-python@dcffc11 · GitHub

Commit dcffc11

updata python binding APIs
1 parent e32ffd4 commit dcffc11

File tree

1 file changed: +12 -32 lines changed

llama_cpp/llama_cpp.py

Lines changed: 12 additions & 32 deletions
@@ -549,10 +549,7 @@ class llama_model_params(ctypes.Structure):
 # uint32_t n_batch; // prompt processing maximum batch size
 # uint32_t n_threads; // number of threads to use for generation
 # uint32_t n_threads_batch; // number of threads to use for batch processing
-
-# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
-# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
-#                                       // (ignored if no pooling layer)
+# int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 # float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -571,15 +568,13 @@ class llama_model_params(ctypes.Structure):
 # enum ggml_type type_v; // data type for V cache
 
 # // Keep the booleans together to avoid misalignment during copy-by-value.
-# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 # bool embedding; // embedding mode only
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
 
-# // Abort callback
-# // if it returns true, execution of llama_decode() will be aborted
-# // currently works only with CPU execution
-# ggml_abort_callback abort_callback;
-# void * abort_callback_data;
+# bool enable_timing; // enable timing op
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -591,7 +586,6 @@ class llama_context_params(ctypes.Structure):
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
-        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -604,11 +598,12 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
+        mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-        abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
-        abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
+        do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        enable_timing (bool): enable timing op
     """
 
     _fields_ = [
@@ -618,7 +613,6 @@ class llama_context_params(ctypes.Structure):
         ("n_threads", ctypes.c_uint32),
         ("n_threads_batch", ctypes.c_uint32),
         ("rope_scaling_type", ctypes.c_int),
-        ("pooling_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),
@@ -631,11 +625,12 @@ class llama_context_params(ctypes.Structure):
         ("cb_eval_user_data", ctypes.c_void_p),
         ("type_k", ctypes.c_int),
         ("type_v", ctypes.c_int),
+        ("mul_mat_q", ctypes.c_bool),
         ("logits_all", ctypes.c_bool),
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
-        ("abort_callback", ggml_abort_callback),
-        ("abort_callback_data", ctypes.c_void_p),
+        ("do_pooling", ctypes.c_bool),
+        ("enable_timing", ctypes.c_bool),
     ]
 
 
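After this hunk, llama_context_params exposes the mul_mat_q, do_pooling and enable_timing booleans in place of the pooling_type and abort_callback fields. A minimal sketch of how a caller might populate the updated struct through this binding; the field names come from the diff above, while llama_context_default_params() is the usual default-params helper in upstream llama_cpp.py and is assumed to be unchanged in this fork:

import llama_cpp.llama_cpp as llama_cpp

# Start from the library defaults, then flip the booleans added in this commit.
# Assumes llama_context_default_params() still returns a llama_context_params
# instance, as it does in upstream llama-cpp-python.
ctx_params = llama_cpp.llama_context_default_params()
ctx_params.n_ctx = 2048
ctx_params.n_threads = 8
ctx_params.mul_mat_q = True      # deprecated upstream; always true
ctx_params.do_pooling = True     # pool (sum) embedding results by sequence id
ctx_params.enable_timing = True  # fork-specific timing switch added here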
@@ -1723,22 +1718,6 @@ def llama_set_n_threads(
     """
     ...
 
-# // Set abort callback
-# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
-# @ctypes_function(
-#     "llama_set_abort_callback",
-#     [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p],
-#     None,
-# )
-# def llama_set_abort_callback(
-#     ctx: llama_context_p,
-#     abort_callback: Callable[[ctypes.c_void_p], None],
-#     abort_callback_data: ctypes.c_void_p,
-#     /,
-# ):
-#     """Set abort callback"""
-#     ...
-
 
 # // Token logits obtained from the last call to llama_decode()
 # // The logits for the last token are stored in the last row
@@ -2711,6 +2690,7 @@ def llama_set_timestamp(ctx: llama_context_p, name: bytes):
     """Set timestamp with name"""
     ...
 
+
 # LLAMA_API int64_t llama_get_timestamp(struct llama_context * ctx, const char * name);
 @ctypes_function(
     "llama_get_timestamp",
0 commit comments