feat: Update llama.cpp · mikeFore4/llama-cpp-python@e325a83 · GitHub

Commit e325a83

feat: Update llama.cpp
1 parent c89be28 · commit e325a83

2 files changed: +57 -9 lines changed

llama_cpp/llama_cpp.py

Lines changed: 56 additions & 8 deletions
@@ -668,30 +668,36 @@ class llama_context_params(ctypes.Structure):
 
 # // model quantization parameters
 # typedef struct llama_model_quantize_params {
-#     int32_t nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-#     enum llama_ftype ftype;          // quantize to this llama_ftype
-#     bool allow_requantize;           // allow quantizing non-f32/f16 tensors
-#     bool quantize_output_tensor;     // quantize output.weight
-#     bool only_copy;                  // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-#     bool pure;                       // quantize all tensors to the default type
-#     void * imatrix;                  // pointer to importance matrix data
+#     int32_t nthread;                      // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+#     enum llama_ftype ftype;               // quantize to this llama_ftype
+#     enum ggml_type output_tensor_type;    // output tensor type
+#     enum ggml_type token_embedding_type;  // itoken embeddings tensor type
+#     bool allow_requantize;                // allow quantizing non-f32/f16 tensors
+#     bool quantize_output_tensor;          // quantize output.weight
+#     bool only_copy;                       // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+#     bool pure;                            // quantize all tensors to the default type
+#     void * imatrix;                       // pointer to importance matrix data
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize
 
     Attributes:
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
+        output_tensor_type (int): output tensor type
+        token_embedding_type (int): itoken embeddings tensor type
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
-        imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
+        imatrix (ctypes.c_void_p): pointer to importance matrix data
     """
 
     _fields_ = [
         ("nthread", ctypes.c_int32),
         ("ftype", ctypes.c_int),
+        ("output_tensor_type", ctypes.c_int),
+        ("token_embedding_type", ctypes.c_int),
         ("allow_requantize", ctypes.c_bool),
         ("quantize_output_tensor", ctypes.c_bool),
         ("only_copy", ctypes.c_bool),
@@ -2743,6 +2749,48 @@ def llama_beam_search(
 ): ...
 
 
+# /// @details Build a split GGUF final path for this chunk.
+# ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+# //  Returns the split_path length.
+# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_path",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_path(
+    split_path: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    path_prefix: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Build a split GGUF final path for this chunk."""
+    ...
+
+
+# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+# ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+# //  Returns the split_prefix length.
+# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_prefix",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_prefix(
+    split_prefix: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    split_path: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Extract the path prefix from the split_path if and only if the split_no and split_count match."""
+    ...
+
+
 # Performance information
 
 
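
Both new helpers fill a caller-supplied character buffer and return the length of the resulting string. A small usage sketch against the bindings above; the buffer sizes are arbitrary and the paths mirror the examples quoted in the header comments:

import ctypes
import llama_cpp

# Build the shard file name for split 2 of 4. A writable buffer is passed even
# though the binding's annotation says bytes; ctypes hands a pointer to it.
split_path = ctypes.create_string_buffer(1024)
llama_cpp.llama_split_path(
    split_path, ctypes.sizeof(split_path), b"/models/ggml-model-q4_0", 2, 4
)
# Per the header example: /models/ggml-model-q4_0-00002-of-00004.gguf
print(split_path.value.decode("utf-8"))

# Recover the prefix, which only succeeds when split_no and split_count match.
split_prefix = ctypes.create_string_buffer(64)
n = llama_cpp.llama_split_prefix(
    split_prefix, ctypes.sizeof(split_prefix), split_path.value, 2, 4
)
if n > 0:
    # Per the header example: /models/ggml-model-q4_0
    print(split_prefix.value.decode("utf-8"))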

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (submodule pointer update)

0 commit comments