@@ -668,30 +668,36 @@ class llama_context_params(ctypes.Structure):
 # // model quantization parameters
 # typedef struct llama_model_quantize_params {
-#     int32_t nthread;              // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-#     enum llama_ftype ftype;       // quantize to this llama_ftype
-#     bool allow_requantize;        // allow quantizing non-f32/f16 tensors
-#     bool quantize_output_tensor;  // quantize output.weight
-#     bool only_copy;               // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-#     bool pure;                    // quantize all tensors to the default type
-#     void * imatrix;               // pointer to importance matrix data
+#     int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+#     enum llama_ftype ftype;              // quantize to this llama_ftype
+#     enum ggml_type output_tensor_type;   // output tensor type
+#     enum ggml_type token_embedding_type; // token embeddings tensor type
+#     bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+#     bool quantize_output_tensor;         // quantize output.weight
+#     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+#     bool pure;                           // quantize all tensors to the default type
+#     void * imatrix;                      // pointer to importance matrix data
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize

     Attributes:
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
+        output_tensor_type (int): output tensor type
+        token_embedding_type (int): token embeddings tensor type
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
-        imatrix (ctypes.ctypes. c_void_p): pointer to importance matrix data
+        imatrix (ctypes.c_void_p): pointer to importance matrix data
     """

     _fields_ = [
         ("nthread", ctypes.c_int32),
         ("ftype", ctypes.c_int),
+        ("output_tensor_type", ctypes.c_int),
+        ("token_embedding_type", ctypes.c_int),
         ("allow_requantize", ctypes.c_bool),
         ("quantize_output_tensor", ctypes.c_bool),
         ("only_copy", ctypes.c_bool),
@@ -2743,6 +2749,48 @@ def llama_beam_search(
 ): ...


+# /// @details Build a split GGUF final path for this chunk.
+# ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+# //  Returns the split_path length.
+# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_path",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_path(
+    split_path: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    path_prefix: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Build a split GGUF final path for this chunk."""
+    ...
+
+
+# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+# ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+# //  Returns the split_prefix length.
+# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_prefix",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_prefix(
+    split_prefix: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    split_path: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Extract the path prefix from the split_path if and only if the split_no and split_count match."""
+    ...
+
+
 # Performance information
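
Editor's note: a round-trip sketch of the two new bindings, not part of the diff; the expected strings come straight from the doc comments above. It relies on standard ctypes behavior that a mutable create_string_buffer may be passed where c_char_p is declared, so the C side can write into the buffer.

import ctypes

# Build the final path of shard 2 of 4 for a given path prefix.
split_path = ctypes.create_string_buffer(1024)
n = llama_split_path(split_path, ctypes.sizeof(split_path), b"/models/ggml-model-q4_0", 2, 4)
assert split_path.value == b"/models/ggml-model-q4_0-00002-of-00004.gguf"

# Recover the prefix; matches only when split_no and split_count agree with the path.
split_prefix = ctypes.create_string_buffer(64)
n = llama_split_prefix(split_prefix, 64, split_path.value, 2, 4)
assert split_prefix.value == b"/models/ggml-model-q4_0"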