feat: Update llama.cpp · coderonion/llama-cpp-python@62804ee · GitHub

Commit 62804ee

feat: Update llama.cpp
1 parent 7e20e34 · commit 62804ee

File tree

2 files changed: +71 -3 lines changed


llama_cpp/llama_cpp.py

Lines changed: 70 additions & 2 deletions
@@ -488,6 +488,15 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_POOLING_TYPE_CLS = 2
 LLAMA_POOLING_TYPE_LAST = 3

+# enum llama_attention_type {
+#     LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
+#     LLAMA_ATTENTION_TYPE_CAUSAL = 0,
+#     LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
+# };
+LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1
+LLAMA_ATTENTION_TYPE_CAUSAL = 0
+LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1
+
 # enum llama_split_mode {
 #     LLAMA_SPLIT_MODE_NONE = 0, // single GPU
 #     LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
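These Python constants mirror the llama.cpp C enum value for value, so the integers can be passed anywhere a llama_attention_type is expected. A minimal sketch (not part of the commit; the dict name is made up for illustration) of mapping a user-facing option onto them:

import llama_cpp

# Hypothetical helper mapping strings to the new constants (illustration only).
ATTENTION_TYPES = {
    "unspecified": llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED,  # -1: let llama.cpp pick per model
    "causal": llama_cpp.LLAMA_ATTENTION_TYPE_CAUSAL,            # 0: autoregressive masking
    "non_causal": llama_cpp.LLAMA_ATTENTION_TYPE_NON_CAUSAL,    # 1: bidirectional, e.g. embedding models
}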
@@ -775,6 +784,7 @@ class llama_model_params(ctypes.Structure):
 
 #     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 #     enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
+#     enum llama_attention_type attention_type; // attention type to use for embeddings
 
 #     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 #     float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -817,6 +827,7 @@ class llama_context_params(ctypes.Structure):
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
         pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        attention_type (int): attention type to use for embeddings
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -847,6 +858,7 @@ class llama_context_params(ctypes.Structure):
         n_threads_batch: int
         rope_scaling_type: int
         pooling_type: int
+        attention_type: int
         rope_freq_base: float
         rope_freq_scale: float
         yarn_ext_factor: float
@@ -876,6 +888,7 @@ class llama_context_params(ctypes.Structure):
         ("n_threads_batch", ctypes.c_uint32),
         ("rope_scaling_type", ctypes.c_int),
         ("pooling_type", ctypes.c_int),
+        ("attention_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),
@@ -2642,6 +2655,7 @@ def llama_token_eot(model: llama_model_p, /) -> int: ...
 # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
 # /// @return Returns the number of tokens on success, no more than n_tokens_max
 # /// @return Returns a negative number on failure - the number of tokens that would have been returned
+# /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
 # /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
 # ///                      as plaintext. Does not insert a leading space.
 # LLAMA_API int32_t llama_tokenize(
@@ -2683,7 +2697,7 @@ def llama_tokenize(
         text_len: The length of the text.
         tokens: The tokens pointer must be large enough to hold the resulting tokens.
         n_max_tokens: The maximum number of tokens to return.
-        add_special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.
+        add_special: Allow adding special tokens if the model is configured to do so.
         parse_special: Allow parsing special tokens.

     Returns:
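The docstring change reflects the new upstream semantics: add_special now controls automatic BOS/EOS insertion rather than special-token parsing. A hedged call sketch (the model handle carries over from the previous sketch and the buffer size is an arbitrary assumption, not part of this diff):

import llama_cpp

text = b"Hello, world"
max_tokens = 64  # assumed upper bound for this example
tokens = (llama_cpp.llama_token * max_tokens)()

n_tokens = llama_cpp.llama_tokenize(
    model,        # llama_model_p from the sketch above
    text,
    len(text),
    tokens,
    max_tokens,
    True,         # add_special: insert BOS/EOS if the model's metadata says to
    False,        # parse_special: keep special-token text as plain text
)
if n_tokens < 0:
    raise RuntimeError(f"token buffer too small; need {-n_tokens} tokens")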
@@ -2696,13 +2710,14 @@ def llama_tokenize(
 # // Token Id -> Piece.
 # // Uses the vocabulary in the provided context.
 # // Does not write null terminator to the buffer.
-# // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+# // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
 # // @param special If true, special tokens are rendered in the output.
 # LLAMA_API int32_t llama_token_to_piece(
 #     const struct llama_model * model,
 #     llama_token token,
 #     char * buf,
 #     int32_t length,
+#     int32_t lstrip,
 #     bool special);
 @ctypes_function(
     "llama_token_to_piece",
@@ -2711,6 +2726,7 @@ def llama_tokenize(
         llama_token,
         ctypes.c_char_p,
         ctypes.c_int32,
+        ctypes.c_int32,
         ctypes.c_bool,
     ],
     ctypes.c_int32,
@@ -2720,6 +2736,7 @@ def llama_token_to_piece(
     token: Union[llama_token, int],
     buf: Union[ctypes.c_char_p, bytes, CtypesArray[ctypes.c_char]],
     length: Union[ctypes.c_int, int],
+    lstrip: Union[ctypes.c_int, int],
     special: Union[ctypes.c_bool, bool],
     /,
 ) -> int:
@@ -2733,10 +2750,61 @@ def llama_token_to_piece(
         token: The token to convert.
         buf: The buffer to write the token to.
         length: The length of the buffer.
+        lstrip: The number of leading spaces to skip.
         special: If true, special tokens are rendered in the output."""
     ...


+# /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
+# /// @param text The char pointer must be large enough to hold the resulting text.
+# /// @return Returns the number of chars/bytes on success, no more than text_len_max.
+# /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
+# /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+# /// @param unparse_special If true, special tokens are rendered in the output.
+# LLAMA_API int32_t llama_detokenize(
+#     const struct llama_model * model,
+#     const llama_token * tokens,
+#     int32_t n_tokens,
+#     char * text,
+#     int32_t text_len_max,
+#     bool remove_special,
+#     bool unparse_special);
+@ctypes_function(
+    "llama_detokenize",
+    [
+        llama_model_p_ctypes,
+        ctypes.POINTER(llama_token),
+        ctypes.c_int32,
+        ctypes.c_char_p,
+        ctypes.c_int32,
+        ctypes.c_bool,
+        ctypes.c_bool,
+    ],
+    ctypes.c_int32,
+)
+def llama_detokenize(
+    model: llama_model_p,
+    tokens: CtypesArray[llama_token],
+    n_tokens: Union[ctypes.c_int, int],
+    text: bytes,
+    text_len_max: Union[ctypes.c_int, int],
+    remove_special: Union[ctypes.c_bool, bool],
+    unparse_special: Union[ctypes.c_bool, bool],
+    /,
+) -> int:
+    """Convert the provided tokens into text (inverse of llama_tokenize()).
+
+    Args:
+        model: The model to use for tokenization.
+        tokens: The tokens to convert.
+        n_tokens: The number of tokens.
+        text: The buffer to write the text to.
+        text_len_max: The length of the buffer.
+        remove_special: Allow to remove BOS and EOS tokens if model is configured to do so.
+        unparse_special: If true, special tokens are rendered in the output."""
+    ...
+
+
 # /// Apply chat template. Inspired by hf apply_chat_template() on python.
 # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
 # /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
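The new llama_detokenize binding closes the loop with llama_tokenize, and llama_token_to_piece gains the lstrip argument. A hedged round-trip sketch (model, tokens and n_tokens carry over from the earlier sketches; buffer sizes are arbitrary assumptions, not part of this commit):

import ctypes
import llama_cpp

# Whole-sequence detokenization (inverse of llama_tokenize).
out = ctypes.create_string_buffer(256)
n_chars = llama_cpp.llama_detokenize(
    model, tokens, n_tokens, out, ctypes.sizeof(out),
    True,   # remove_special: drop BOS/EOS if the model adds them
    False,  # unparse_special: do not render other special tokens
)
if n_chars < 0:
    raise RuntimeError(f"text buffer too small; need {-n_chars} bytes")
print(out.raw[:n_chars].decode("utf-8", errors="replace"))

# Per-token piece with the new lstrip parameter: skip up to 1 leading space.
piece = ctypes.create_string_buffer(32)
n_piece = llama_cpp.llama_token_to_piece(
    model, tokens[0], piece, ctypes.sizeof(piece), 1, False
)
print(piece.raw[:n_piece])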

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (submodule pointer updated)

0 commit comments