feat: Update llama.cpp · zeroAucrux/llama-cpp-python@87a6e57 · GitHub

Commit 87a6e57

feat: Update llama.cpp
1 parent 13177aa commit 87a6e57

2 files changed: 10 additions & 6 deletions

llama_cpp/llama_cpp.py

Lines changed: 9 additions & 5 deletions
@@ -320,10 +320,12 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]
 LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
 
 # enum llama_pooling_type {
+# LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
 # LLAMA_POOLING_TYPE_NONE = 0,
 # LLAMA_POOLING_TYPE_MEAN = 1,
 # LLAMA_POOLING_TYPE_CLS = 2,
 # };
+LLAMA_POOLING_TYPE_UNSPECIFIED = -1
 LLAMA_POOLING_TYPE_NONE = 0
 LLAMA_POOLING_TYPE_MEAN = 1
 LLAMA_POOLING_TYPE_CLS = 2
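
The constants mirror the C enum one-to-one, with the new UNSPECIFIED value (-1) deferring the choice to the model's own metadata. A minimal sketch of how they might be used together with the pooling_type field added further down in this diff (assumes llama_context_default_params() from these bindings):

    import llama_cpp

    # Sketch only, not part of this commit: choose a pooling mode for an
    # embedding context. UNSPECIFIED (-1) defers to the model's metadata.
    params = llama_cpp.llama_context_default_params()
    params.embedding = True
    params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_MEAN  # or _CLS / _NONE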
@@ -547,7 +549,10 @@ class llama_model_params(ctypes.Structure):
 # uint32_t n_batch; // prompt processing maximum batch size
 # uint32_t n_threads; // number of threads to use for generation
 # uint32_t n_threads_batch; // number of threads to use for batch processing
-# int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+
+# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
+# // (ignored if no pooling layer)
 
 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 # float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -569,7 +574,6 @@ class llama_model_params(ctypes.Structure):
 # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 # bool embedding; // embedding mode only
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
 
 # // Abort callback
 # // if it returns true, execution of llama_decode() will be aborted
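
With this hunk the boolean do_pooling flag disappears; the enum-valued pooling_type above replaces it. A hedged migration sketch for downstream callers (the do_pooling lines show the pre-commit field, now removed):

    import llama_cpp

    # Hypothetical migration: the removed boolean becomes an explicit mode.
    params = llama_cpp.llama_context_default_params()
    # before this commit: params.do_pooling = True
    params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_MEAN
    # before this commit: params.do_pooling = False
    # params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_NONE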
@@ -587,6 +591,7 @@ class llama_context_params(ctypes.Structure):
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
+        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -602,7 +607,6 @@ class llama_context_params(ctypes.Structure):
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-        do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
         abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
     """
@@ -613,7 +617,8 @@ class llama_context_params(ctypes.Structure):
         ("n_batch", ctypes.c_uint32),
         ("n_threads", ctypes.c_uint32),
         ("n_threads_batch", ctypes.c_uint32),
-        ("rope_scaling_type", ctypes.c_int32),
+        ("rope_scaling_type", ctypes.c_int),
+        ("pooling_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),
@@ -629,7 +634,6 @@ class llama_context_params(ctypes.Structure):
         ("logits_all", ctypes.c_bool),
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
-        ("do_pooling", ctypes.c_bool),
         ("abort_callback", ggml_abort_callback),
         ("abort_callback_data", ctypes.c_void_p),
     ]
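
ctypes assigns offsets strictly in _fields_ declaration order, which is why pooling_type is inserted immediately after rope_scaling_type and do_pooling is dropped at the same relative position as in the C struct. A toy illustration (field names reused for clarity; not the real llama_context_params):

    import ctypes

    # Toy struct: offsets follow _fields_ order, so a misplaced entry
    # would shift every later field and corrupt reads from the C side.
    class Demo(ctypes.Structure):
        _fields_ = [
            ("rope_scaling_type", ctypes.c_int),
            ("pooling_type", ctypes.c_int),
            ("rope_freq_base", ctypes.c_float),
        ]

    print(Demo.pooling_type.offset)  # 4 on typical ABIs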

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (submodule commit pointer updated)
