Update llama.cpp · nivibilla/llama-cpp-python@36048d4 · GitHub

Commit 36048d4

Update llama.cpp
1 parent 4474157 commit 36048d4

File tree

2 files changed: +120 -13 lines changed

llama_cpp/llama_cpp.py

Lines changed: 119 additions & 12 deletions
@@ -273,11 +273,11 @@ class llama_token_data_array(Structure):
 # } llama_batch;
 class llama_batch(Structure):
     """Input data for llama_decode
-
+
     A llama_batch object can contain input about one or many sequences
-
+
     The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
-
+
     Attributes:
         token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
         embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
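For orientation, here is a hedged sketch of what filling a llama_batch by hand looks like through these bindings. It assumes the llama_batch_init/llama_batch_free helpers defined elsewhere in this module and made-up token ids; a real caller would tokenize with llama_tokenize and pass the batch to llama_decode with a valid context:

from llama_cpp import llama_cpp

# Illustrative token ids for one sequence (a real caller would use llama_tokenize).
tokens = [1, 15043, 3186]

# Allocate arrays for len(tokens) tokens, no embeddings (embd=0), 1 seq id per token.
batch = llama_cpp.llama_batch_init(len(tokens), 0, 1)
batch.n_tokens = len(tokens)
for i, tok in enumerate(tokens):
    batch.token[i] = tok                            # token ids (used because embd is NULL)
    batch.pos[i] = i                                # position of the token within its sequence
    batch.n_seq_id[i] = 1                           # the token belongs to one sequence...
    batch.seq_id[i][0] = 0                          # ...namely sequence 0
    batch.logits[i] = 1 if i == len(tokens) - 1 else 0  # request logits only for the final token

# ... llama_cpp.llama_decode(ctx, batch) would go here with a valid ctx ...
llama_cpp.llama_batch_free(batch)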
@@ -890,18 +890,121 @@ def llama_model_apply_lora_from_file(
 # //
 
 
-# // Returns the number of tokens in the KV cache
-# LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-#         "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+# // Information associated with an individual cell in the KV cache view.
+# struct llama_kv_cache_view_cell {
+#     // The position for this cell. Takes KV cache shifts into account.
+#     //   May be negative if the cell is not populated.
+#     llama_pos pos;
+# };
+class llama_kv_cache_view_cell(Structure):
+    _fields_ = [("pos", llama_pos)]
+
+
+# // An updateable view of the KV cache.
+# struct llama_kv_cache_view {
+#     // Number of KV cache cells. This will be the same as the context size.
+#     int32_t n_cells;
+
+#     // Maximum number of sequences that can exist in a cell. It's not an error
+#     // if there are more sequences in a cell than this value, however they will
+#     // not be visible in the view cells_sequences.
+#     int32_t n_max_seq;
+
+#     // Number of tokens in the cache. For example, if there are two populated
+#     // cells, the first with 1 sequence id in it and the second with 2 sequence
+#     // ids then you'll have 3 tokens.
+#     int32_t token_count;
+
+#     // Number of populated cache cells.
+#     int32_t used_cells;
+
+#     // Maximum contiguous empty slots in the cache.
+#     int32_t max_contiguous;
+
+#     // Index to the start of the max_contiguous slot range. Can be negative
+#     // when cache is full.
+#     int32_t max_contiguous_idx;
+
+#     // Information for an individual cell.
+#     struct llama_kv_cache_view_cell * cells;
+
+#     // The sequences for each cell. There will be n_max_seq items per cell.
+#     llama_seq_id * cells_sequences;
+# };
+class llama_kv_cache_view(Structure):
+    _fields_ = [
+        ("n_cells", c_int32),
+        ("n_max_seq", c_int32),
+        ("token_count", c_int32),
+        ("used_cells", c_int32),
+        ("max_contiguous", c_int32),
+        ("max_contiguous_idx", c_int32),
+        ("cells", POINTER(llama_kv_cache_view_cell)),
+        ("cells_sequences", POINTER(llama_seq_id)),
+    ]
+
+
+# // Create an empty KV cache view. (use only for debugging purposes)
+# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+def llama_kv_cache_view_init(
+    ctx: llama_context_p, n_max_seq: Union[c_int32, int]
+) -> llama_kv_cache_view:
+    """Create an empty KV cache view. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_init(ctx, n_max_seq)
+
+
+_lib.llama_kv_cache_view_init.argtypes = [llama_context_p, c_int32]
+_lib.llama_kv_cache_view_init.restype = llama_kv_cache_view
+
+
+# // Free a KV cache view. (use only for debugging purposes)
+# LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+def llama_kv_cache_view_free(view: llama_kv_cache_view):
+    """Free a KV cache view. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_free(view)
+
+
+_lib.llama_kv_cache_view_free.argtypes = [llama_kv_cache_view]
+_lib.llama_kv_cache_view_free.restype = None
+
+
+# // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+# LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+def llama_kv_cache_view_update(ctx: llama_context_p, view: llama_kv_cache_view):
+    """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_update(ctx, view)
+
+
+_lib.llama_kv_cache_view_update.argtypes = [llama_context_p, llama_kv_cache_view]
+_lib.llama_kv_cache_view_update.restype = None
+
+
+# // Returns the number of tokens in the KV cache (slow, use only for debug)
+# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
-    """Returns the number of tokens in the KV cache"""
+    """Returns the number of tokens in the KV cache (slow, use only for debug)
+    If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    """
     return _lib.llama_get_kv_cache_token_count(ctx)
 
 
 _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache_token_count.restype = c_int
 
 
+# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+# LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+def llama_get_kv_cache_used_cells(ctx: llama_context_p) -> int:
+    """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
+    return _lib.llama_get_kv_cache_used_cells(ctx)
+
+
+_lib.llama_get_kv_cache_used_cells.argtypes = [llama_context_p]
+_lib.llama_get_kv_cache_used_cells.restype = c_int
+
+
 # // Clear the KV cache
 # LLAMA_API void llama_kv_cache_clear(
 #         struct llama_context * ctx);
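To make the new debugging surface concrete, here is a minimal sketch of inspecting cache occupancy with the functions added above, assuming `ctx` is a valid llama_context_p created elsewhere. Note the commented C prototypes take the view by pointer; the sketch passes it exactly as the argtypes in this commit declare:

from llama_cpp import llama_cpp

# Assumes `ctx` is a valid llama_context_p from llama_new_context_with_model.
view = llama_cpp.llama_kv_cache_view_init(ctx, 4)  # expose up to 4 seq ids per cell
llama_cpp.llama_kv_cache_view_update(ctx, view)    # snapshot the current cache state

print("cells:", view.n_cells, "used:", view.used_cells, "tokens:", view.token_count)
print("largest free run:", view.max_contiguous, "starting at", view.max_contiguous_idx)

# Walk populated cells; per the struct comments, pos is negative for empty cells
# and cells_sequences holds n_max_seq sequence-id slots per cell.
for i in range(view.n_cells):
    if view.cells[i].pos >= 0:
        seqs = [view.cells_sequences[i * view.n_max_seq + j] for j in range(view.n_max_seq)]
        print(f"cell {i}: pos={view.cells[i].pos} seq slots={seqs}")

llama_cpp.llama_kv_cache_view_free(view)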
@@ -1205,8 +1308,9 @@ def llama_batch_get_one(
     seq_id: llama_seq_id,
 ) -> llama_batch:
     """Return batch for single sequence of tokens starting at pos_0
-
-    NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it"""
+
+    NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    """
     return _lib.llama_batch_get_one(tokens, n_tokens, pos_0, seq_id)
 
 
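For reference, this helper wraps an existing token array into a single-sequence batch without allocating anything new, so there is nothing to free afterwards. A short usage sketch with made-up token ids:

from llama_cpp import llama_cpp

# Illustrative token ids; a real caller would obtain these from llama_tokenize.
toks = (llama_cpp.llama_token * 3)(1, 15043, 3186)

# Batch for one sequence starting at position 0, tagged with sequence id 0.
batch = llama_cpp.llama_batch_get_one(toks, len(toks), 0, 0)

# batch.token aliases `toks`, so keep `toks` alive while the batch is in use.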
@@ -1290,7 +1394,8 @@ def llama_set_n_threads(
 ):
     """Set the number of threads used for decoding
     n_threads is the number of threads used for generation (single token)
-    n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)"""
+    n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    """
     return _lib.llama_set_n_threads(ctx, n_threads, n_threads_batch)
 
 
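As a usage sketch (the thread counts are arbitrary, and `ctx` is assumed to be a valid llama_context_p):

import os
from llama_cpp import llama_cpp

n_cpus = os.cpu_count() or 1
# Fewer threads for single-token generation, the full count for prompt/batch work.
llama_cpp.llama_set_n_threads(ctx, max(1, n_cpus // 2), n_cpus)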
@@ -1540,7 +1645,8 @@ def llama_token_to_piece(
     """Token Id -> Piece.
     Uses the vocabulary in the provided context.
     Does not write null terminator to the buffer.
-    User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens."""
+    User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    """
     return _lib.llama_token_to_piece(model, token, buf, length)
 
 
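To show what the docstring asks of user code, a hedged decoding sketch: it assumes a loaded `model` handle and token ids from llama_tokenize, uses an arbitrary 32-byte buffer, and strips the first piece's leading space as instructed (simplified to ignore a possible BOS token):

import ctypes
from llama_cpp import llama_cpp

def detokenize(model, tokens):
    # Hypothetical helper: turn token ids back into text one piece at a time.
    out = b""
    buf = ctypes.create_string_buffer(32)  # arbitrary size, enough for typical pieces
    for i, tok in enumerate(tokens):
        n = llama_cpp.llama_token_to_piece(model, tok, buf, len(buf))
        piece = buf.raw[:n]  # no null terminator is written, so slice by length
        if i == 0 and piece.startswith(b" "):
            piece = piece[1:]  # user code removes the first piece's leading whitespace
        out += piece
    return out.decode("utf-8", errors="replace")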
@@ -1626,7 +1732,8 @@ def llama_sample_repetition_penalties(
     penalty_present: Union[c_float, float],
 ):
     """Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details."""
+    Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    """
     return _lib.llama_sample_repetition_penalties(
         ctx,
         candidates,
vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (submodule commit pointer bump)
