Merge pull request #3 from mamei16/v0.3.9 · bot08/llama-cpp-python@17dccda · GitHub

Commit 17dccda

Merge pull request #3 from mamei16/v0.3.9
V0.3.9
2 parents 12bac61 + b6e3c89 commit 17dccda


4 files changed: +134 -38 lines changed


llama_cpp/_internals.py

Lines changed: 25 additions & 4 deletions
@@ -13,6 +13,11 @@
 from dataclasses import dataclass, field
 from contextlib import ExitStack
 
+try:
+    from warnings import deprecated
+except ImportError:
+    from ._utils import deprecated
+
 import numpy as np
 import numpy.typing as npt
 
@@ -276,21 +281,37 @@ def n_ctx(self) -> int:
     def pooling_type(self) -> int:
         return llama_cpp.llama_pooling_type(self.ctx)
 
+    @deprecated("Use llama_kv_self_clear")
     def kv_cache_clear(self):
-        llama_cpp.llama_kv_cache_clear(self.ctx)
+        self.llama_kv_self_clear()
 
+    @deprecated("Use kv_self_seq_rm")
     def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
-        llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1)
+        self.kv_self_seq_rm(seq_id, p0, p1)
 
+    @deprecated("Use kv_self_seq_cp")
     def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
-        llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
+        self.kv_self_seq_cp(seq_id_src, seq_id_dst, p0, p1)
 
+    @deprecated("Use kv_self_seq_keep")
     def kv_cache_seq_keep(self, seq_id: int):
-        llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id)
+        self.kv_self_seq_keep(seq_id)
 
     def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
         llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift)
 
+    def llama_kv_self_clear(self):
+        llama_cpp.llama_kv_self_clear(self.ctx)
+
+    def kv_self_seq_rm(self, seq_id: int, p0: int, p1: int):
+        llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1)
+
+    def kv_self_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
+        llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
+
+    def kv_self_seq_keep(self, seq_id: int):
+        llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id)
+
     def get_state_size(self) -> int:
         return llama_cpp.llama_get_state_size(self.ctx)
 
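For orientation, a minimal usage sketch of the wrappers above (not part of the diff): the old kv_cache_* methods keep working but now emit a DeprecationWarning and delegate to the new kv_self_* variants. `internal_ctx` is assumed to be an instance of the internal context class these methods belong to.

import warnings

def clear_cache_old_and_new(internal_ctx, keep_seq_id: int):
    # New-style calls delegate straight to the llama_kv_self_* C bindings.
    internal_ctx.kv_self_seq_keep(keep_seq_id)
    internal_ctx.llama_kv_self_clear()

    # Old-style call still works, but now raises a DeprecationWarning
    # (via warnings.deprecated on Python 3.13+, or the _utils fallback below).
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        internal_ctx.kv_cache_clear()
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)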

llama_cpp/_utils.py

Lines changed: 16 additions & 0 deletions
@@ -1,5 +1,7 @@
 import os
 import sys
+import warnings
+import functools
 
 from typing import Any, Dict
 
@@ -76,3 +78,17 @@ class Singleton(object, metaclass=MetaSingleton):
 
     def __init__(self):
         super(Singleton, self).__init__()
+
+
+def deprecated(reason):
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            warnings.warn(
+                f"Call to deprecated function {func.__name__} ({reason}).",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            return func(*args, **kwargs)
+        return wrapper
+    return decorator
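As a quick, standalone illustration of the fallback decorator above (not part of the diff; it simply re-uses the same definition), applying it to a throwaway function and capturing the emitted warning:

import functools
import warnings

def deprecated(reason):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(
                f"Call to deprecated function {func.__name__} ({reason}).",
                category=DeprecationWarning,
                stacklevel=2,
            )
            return func(*args, **kwargs)
        return wrapper
    return decorator

@deprecated("Use new_api instead")
def old_api(x):
    return x + 1

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert old_api(1) == 2

assert caught and caught[0].category is DeprecationWarning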

llama_cpp/llama_cpp.py

Lines changed: 92 additions & 33 deletions
@@ -165,6 +165,10 @@
 # llama_sampler_p = NewType("llama_sampler_p", int)
 # llama_sampler_p_ctypes = ctypes.c_void_p
 
+# struct llama_kv_cache;
+llama_kv_cache_p = NewType("llama_kv_cache_p", int)
+llama_kv_cache_p_ctypes = ctypes.c_void_p
+
 # typedef int32_t llama_pos;
 llama_pos = ctypes.c_int32
 # typedef int32_t llama_token;
@@ -259,7 +263,9 @@
 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
 LLAMA_VOCAB_PRE_TYPE_GPT4O = 29
-
+LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30
+LLAMA_VOCAB_PRE_TYPE_TRILLION = 31
+LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32
 
 # // note: these values should be synchronized with ggml_rope
 # // TODO: maybe move this enum to ggml.h (ggml_rope_type)
@@ -630,10 +636,29 @@ class llama_model_kv_override(ctypes.Structure):
     value: Union[int, float, bool, bytes]
 
 
+
+# struct llama_model_tensor_buft_override {
+#     const char * pattern;
+#     ggml_backend_buffer_type_t buft;
+#
+# };
+class llama_model_tensor_buft_override(ctypes.Structure):
+    _fields_ = [
+        ("pattern", ctypes.c_char_p),
+        ("buft", ctypes.c_void_p)
+    ]
+
+
+llama_model_tensor_buft_override_p = ctypes.POINTER(llama_model_tensor_buft_override)
+
+
 # struct llama_model_params {
 #     // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
 #     ggml_backend_dev_t * devices;
 
+#     // NULL-terminated list of buffer types to use for tensors that match a pattern
+#     const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
 #     enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
@@ -695,6 +720,7 @@ class llama_model_params(ctypes.Structure):
 
     _fields_ = [
         ("devices", ctypes.c_void_p), # NOTE: unnused
+        ("llama_model_tensor_buft_override", llama_model_tensor_buft_override_p),
         ("n_gpu_layers", ctypes.c_int32),
         ("split_mode", ctypes.c_int),
         ("main_gpu", ctypes.c_int32),
@@ -1316,6 +1342,10 @@ def llama_n_vocab(model: llama_vocab_p, /) -> int:
 def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
     ...
 
+# LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx);
+@ctypes_function("llama_get_kv_self", [llama_context_p_ctypes], llama_kv_cache_p_ctypes)
+def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]:
+    ...
 
 # LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
 @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int)
@@ -1810,7 +1840,19 @@ def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[ll
 
 # // Returns the number of tokens in the KV cache (slow, use only for debug)
 # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+# LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+@ctypes_function(
+    "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32
+)
+def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
+    """Returns the number of tokens in the KV cache (slow, use only for debug)
+    If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    """
+    ...
+
+# // Returns the number of tokens in the KV cache (slow, use only for debug)
+# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), "use llama_kv_self_n_tokens instead");
 @ctypes_function(
     "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32
 )
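A small usage sketch of the renamed debug helper (assuming `ctx` is a valid llama_context pointer obtained elsewhere); the old llama_get_kv_cache_token_count binding remains available but is deprecated upstream:

from llama_cpp import llama_cpp

def debug_kv_usage(ctx) -> int:
    # Slow, debug-only count of tokens currently held in the KV cache.
    n_tokens = llama_cpp.llama_kv_self_n_tokens(ctx)
    print(f"KV cache holds {n_tokens} token cell(s)")
    return n_tokens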
@@ -1832,10 +1874,10 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
 
 
 # // Clear the KV cache - both cell info is erased and KV data is zeroed
-# LLAMA_API void llama_kv_cache_clear(
+# LLAMA_API void llama_kv_self_clear(
 #         struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
-def llama_kv_cache_clear(ctx: llama_context_p, /):
+@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None)
+def llama_kv_self_clear(ctx: llama_context_p, /):
     """Clear the KV cache"""
     ...
 
@@ -1845,13 +1887,13 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
 # // seq_id < 0 : match any sequence
 # // p0 < 0 : [0, p1]
 # // p1 < 0 : [p0, inf)
-# LLAMA_API bool llama_kv_cache_seq_rm(
+# LLAMA_API bool llama_kv_self_seq_rm(
 #         struct llama_context * ctx,
 #         llama_seq_id seq_id,
 #         llama_pos p0,
 #         llama_pos p1);
 @ctypes_function(
-    "llama_kv_cache_seq_rm",
+    "llama_kv_self_seq_rm",
     [
         llama_context_p_ctypes,
         llama_seq_id,
@@ -1860,7 +1902,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
     ],
     ctypes.c_bool,
 )
-def llama_kv_cache_seq_rm(
+def llama_kv_self_seq_rm(
     ctx: llama_context_p,
     seq_id: Union[llama_seq_id, int],
     p0: Union[llama_pos, int],
@@ -1881,14 +1923,14 @@ def llama_kv_cache_seq_rm(
 # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
 # // p0 < 0 : [0, p1]
 # // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_cp(
+# LLAMA_API void llama_kv_self_seq_cp(
 #         struct llama_context * ctx,
 #         llama_seq_id seq_id_src,
 #         llama_seq_id seq_id_dst,
 #         llama_pos p0,
 #         llama_pos p1);
 @ctypes_function(
-    "llama_kv_cache_seq_cp",
+    "llama_kv_self_seq_cp",
     [
         llama_context_p_ctypes,
         llama_seq_id,
@@ -1898,7 +1940,7 @@ def llama_kv_cache_seq_rm(
     ],
     None,
 )
-def llama_kv_cache_seq_cp(
+def llama_kv_self_seq_cp(
     ctx: llama_context_p,
     seq_id_src: Union[llama_seq_id, int],
     seq_id_dst: Union[llama_seq_id, int],
@@ -1914,31 +1956,31 @@ def llama_kv_cache_seq_cp(
 
 
 # // Removes all tokens that do not belong to the specified sequence
-# LLAMA_API void llama_kv_cache_seq_keep(
+# LLAMA_API void llama_kv_self_seq_keep(
 #         struct llama_context * ctx,
 #         llama_seq_id seq_id);
 @ctypes_function(
-    "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
+    "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
 )
-def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
+def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
     """Removes all tokens that do not belong to the specified sequence"""
     ...
 
 
 # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
 # // If the KV cache is RoPEd, the KV data is updated accordingly:
 # //   - lazily on next llama_decode()
-# //   - explicitly with llama_kv_cache_update()
+# //   - explicitly with llama_kv_self_update()
 # // p0 < 0 : [0, p1]
 # // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_add(
+# LLAMA_API void llama_kv_self_seq_add(
 #         struct llama_context * ctx,
 #         llama_seq_id seq_id,
 #         llama_pos p0,
 #         llama_pos p1,
 #         llama_pos delta);
 @ctypes_function(
-    "llama_kv_cache_seq_add",
+    "llama_kv_self_seq_add",
     [
         llama_context_p_ctypes,
         llama_seq_id,
@@ -1948,7 +1990,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in
     ],
     None,
 )
-def llama_kv_cache_seq_add(
+def llama_kv_self_seq_add(
     ctx: llama_context_p,
     seq_id: Union[llama_seq_id, int],
     p0: Union[llama_pos, int],
@@ -1959,7 +2001,7 @@ def llama_kv_cache_seq_add(
     """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     If the KV cache is RoPEd, the KV data is updated accordingly:
     - lazily on next llama_decode()
-    - explicitly with llama_kv_cache_update()
+    - explicitly with llama_kv_self_update()
     p0 < 0 : [0, p1]
     p1 < 0 : [p0, inf)"""
     ...
@@ -1969,14 +2011,14 @@ def llama_kv_cache_seq_add(
 # // If the KV cache is RoPEd, the KV data is updated accordingly
 # // p0 < 0 : [0, p1]
 # // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_div(
+# LLAMA_API void llama_kv_self_seq_div(
 #         struct llama_context * ctx,
 #         llama_seq_id seq_id,
 #         llama_pos p0,
 #         llama_pos p1,
 #         int d);
 @ctypes_function(
-    "llama_kv_cache_seq_div",
+    "llama_kv_self_seq_div",
     [
         llama_context_p_ctypes,
         llama_seq_id,
@@ -1986,7 +2028,7 @@ def llama_kv_cache_seq_add(
     ],
     None,
 )
-def llama_kv_cache_seq_div(
+def llama_kv_self_seq_div(
     ctx: llama_context_p,
     seq_id: Union[llama_seq_id, int],
    p0: Union[llama_pos, int],
@@ -2004,29 +2046,29 @@ def llama_kv_cache_seq_div(
 # // Defragment the KV cache
 # // This will be applied:
 # //   - lazily on next llama_decode()
-# //   - explicitly with llama_kv_cache_update()
-# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None)
-def llama_kv_cache_defrag(ctx: llama_context_p, /):
+# //   - explicitly with llama_kv_self_update()
+# LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None)
+def llama_kv_self_defrag(ctx: llama_context_p, /):
     """Defragment the KV cache
     This will be applied:
     - lazily on next llama_decode()
-    - explicitly with llama_kv_cache_update()"""
+    - explicitly with llama_kv_self_update()"""
     ...
 
 
 # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None)
-def llama_kv_cache_update(ctx: llama_context_p, /):
+# LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
+def llama_kv_self_update(ctx: llama_context_p, /):
     """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)"""
     ...
 
 
 # // Check if the context supports KV cache shifting
-# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
-def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
+# LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx);
+@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
+def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
     """Check if the context supports KV cache shifting"""
     ...
 
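Taken together, the hunks above are a mechanical rename of the KV-cache bindings from llama_kv_cache_* to llama_kv_self_*. A hedged sketch of a typical call site after the rename (assuming `ctx` is a valid llama_context pointer and `seq_id`/`n_discard` describe a sequence whose oldest tokens should be dropped; the helper itself is illustrative, not part of the library):

from llama_cpp import llama_cpp

def shift_history(ctx, seq_id: int, n_discard: int) -> bool:
    if not llama_cpp.llama_kv_self_can_shift(ctx):
        return False
    # Remove the oldest n_discard tokens of this sequence ...
    llama_cpp.llama_kv_self_seq_rm(ctx, seq_id, 0, n_discard)
    # ... then shift the remaining positions down; applied lazily on the next
    # llama_decode() or explicitly via llama_kv_self_update().
    llama_cpp.llama_kv_self_seq_add(ctx, seq_id, n_discard, -1, -n_discard)
    llama_cpp.llama_kv_self_update(ctx)
    return True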

@@ -2547,6 +2589,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /):
     ...
 
 
+# // Set whether the model is in warmup mode or not
+# // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None)
+def llama_set_warmup(ctx: llama_context_p, warmup: bool, /):
+    """Set whether the model is in warmup mode or not
+    If true, all model tensors are activated during llama_decode() to load and cache their weights"""
+    ...
+
+
 # // Set abort callback
 # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 @ctypes_function(
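A brief sketch of how the new warmup toggle might be used (assuming `ctx` and `batch` are a valid llama_context pointer and a prepared llama_batch; the helper name is illustrative):

from llama_cpp import llama_cpp

def warmup_decode(ctx, batch) -> None:
    # Run one decode with warmup enabled so all tensors are touched and cached,
    # then switch back to normal decoding.
    llama_cpp.llama_set_warmup(ctx, True)
    llama_cpp.llama_decode(ctx, batch)
    llama_cpp.llama_set_warmup(ctx, False)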
@@ -3701,6 +3753,13 @@ def llama_sampler_init_mirostat_v2(
     ...
 
 
+
+
+
+# /// @details Intializes a GBNF grammar, see grammars/README.md for details.
+# /// @param vocab The vocabulary that this grammar will be used with.
+# /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
+# /// @param grammar_root The name of the start symbol for the grammar.
 # LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
 #         const struct llama_vocab * vocab,
 #         const char * grammar_str,
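For reference, a hedged sketch of calling the documented grammar sampler constructor from Python (assuming `vocab` is a llama_vocab pointer, for example from llama_model_get_vocab, and that the binding takes bytes for grammar_str and grammar_root as elsewhere in this module):

from llama_cpp import llama_cpp

YES_NO_GRAMMAR = b'root ::= "yes" | "no"'

def make_yes_no_sampler(vocab):
    # Constrain sampling to the tokens allowed by the GBNF grammar above.
    return llama_cpp.llama_sampler_init_grammar(vocab, YES_NO_GRAMMAR, b"root")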

vendor/llama.cpp (submodule pointer update: +1 -1 lines)

0 commit comments
