@@ -320,10 +320,12 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN

# enum llama_pooling_type {
+ #     LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
#     LLAMA_POOLING_TYPE_NONE = 0,
#     LLAMA_POOLING_TYPE_MEAN = 1,
#     LLAMA_POOLING_TYPE_CLS = 2,
# };
+ LLAMA_POOLING_TYPE_UNSPECIFIED = -1
LLAMA_POOLING_TYPE_NONE = 0
LLAMA_POOLING_TYPE_MEAN = 1
LLAMA_POOLING_TYPE_CLS = 2
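
For context, `LLAMA_POOLING_TYPE_UNSPECIFIED = -1` is a sentinel meaning "defer to the model" rather than a concrete strategy. A minimal sketch of how a caller might resolve it (not part of this diff; the helper name is hypothetical):

```python
# Mirror of the constants added above.
LLAMA_POOLING_TYPE_UNSPECIFIED = -1
LLAMA_POOLING_TYPE_NONE = 0
LLAMA_POOLING_TYPE_MEAN = 1
LLAMA_POOLING_TYPE_CLS = 2


def resolve_pooling_type(requested: int, model_default: int) -> int:
    """Hypothetical helper: -1 defers to the model's own default pooling."""
    if requested == LLAMA_POOLING_TYPE_UNSPECIFIED:
        return model_default
    return requested


assert resolve_pooling_type(LLAMA_POOLING_TYPE_UNSPECIFIED, LLAMA_POOLING_TYPE_MEAN) == LLAMA_POOLING_TYPE_MEAN
assert resolve_pooling_type(LLAMA_POOLING_TYPE_CLS, LLAMA_POOLING_TYPE_MEAN) == LLAMA_POOLING_TYPE_CLS
```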
@@ -547,7 +549,10 @@ class llama_model_params(ctypes.Structure):
#     uint32_t n_batch; // prompt processing maximum batch size
#     uint32_t n_threads; // number of threads to use for generation
#     uint32_t n_threads_batch; // number of threads to use for batch processing
- #     int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+
+ #     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+ #     enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
+ #                                           // (ignored if no pooling layer)

#     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
#     float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -569,7 +574,6 @@ class llama_model_params(ctypes.Structure):
#     bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
#     bool embedding; // embedding mode only
#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
- #     bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)

#     // Abort callback
#     // if it returns true, execution of llama_decode() will be aborted
@@ -587,6 +591,7 @@ class llama_context_params(ctypes.Structure):
        n_threads (int): number of threads to use for generation
        n_threads_batch (int): number of threads to use for batch processing
        rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
+        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
        rope_freq_base (float): RoPE base frequency, 0 = from model
        rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
        yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -602,7 +607,6 @@ class llama_context_params(ctypes.Structure):
        logits_all (bool): the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
        embedding (bool): embedding mode only
        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-        do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
        abort_callback (ggml_abort_callback): abort callback; if it returns true, execution of llama_decode() will be aborted
        abort_callback_data (ctypes.c_void_p): data for abort_callback
    """
@@ -613,7 +617,8 @@ class llama_context_params(ctypes.Structure):
        ("n_batch", ctypes.c_uint32),
        ("n_threads", ctypes.c_uint32),
        ("n_threads_batch", ctypes.c_uint32),
-        ("rope_scaling_type", ctypes.c_int32),
+        ("rope_scaling_type", ctypes.c_int),
+        ("pooling_type", ctypes.c_int),
        ("rope_freq_base", ctypes.c_float),
        ("rope_freq_scale", ctypes.c_float),
        ("yarn_ext_factor", ctypes.c_float),
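
Note the field type changes from `c_int32` to `c_int` because, per the header comments above, the upstream fields are now declared as C enums. On the mainstream ABIs these bindings target, a C enum is int-sized, so the width is unchanged and the rename mostly documents intent. A quick check of that assumption:

```python
import ctypes

# On mainstream platforms a C int (and hence an enum) is 4 bytes, so c_int
# and c_int32 describe the same field layout; the change documents that the
# field now maps to `enum llama_rope_scaling_type` / `enum llama_pooling_type`.
assert ctypes.sizeof(ctypes.c_int) == ctypes.sizeof(ctypes.c_int32) == 4
```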
@@ -629,7 +634,6 @@ class llama_context_params(ctypes.Structure):
        ("logits_all", ctypes.c_bool),
        ("embedding", ctypes.c_bool),
        ("offload_kqv", ctypes.c_bool),
-        ("do_pooling", ctypes.c_bool),
        ("abort_callback", ggml_abort_callback),
        ("abort_callback_data", ctypes.c_void_p),
    ]
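
Taken together, callers migrate from the removed `do_pooling` boolean to an explicit `pooling_type`. A minimal usage sketch, assuming the installed `llama_cpp` module re-exports `llama_context_default_params` and the pooling constants (as these bindings conventionally do; verify against your version):

```python
import llama_cpp

params = llama_cpp.llama_context_default_params()

# Before this change: params.do_pooling = True
# After it: pick an explicit strategy, or leave pooling_type unspecified (-1)
# to let the model's metadata decide.
params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_MEAN
```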