# llama_sampler_p = NewType("llama_sampler_p", int)
# llama_sampler_p_ctypes = ctypes.c_void_p

+ # struct llama_kv_cache;
+ llama_kv_cache_p = NewType("llama_kv_cache_p", int)
+ llama_kv_cache_p_ctypes = ctypes.c_void_p
+
# typedef int32_t llama_pos;
llama_pos = ctypes.c_int32
# typedef int32_t llama_token;

LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29
-
+ LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30
+ LLAMA_VOCAB_PRE_TYPE_TRILLION = 31
+ LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32

# // note: these values should be synchronized with ggml_rope
# // TODO: maybe move this enum to ggml.h (ggml_rope_type)
@@ -630,10 +636,29 @@ class llama_model_kv_override(ctypes.Structure):
    value: Union[int, float, bool, bytes]


+
+ # struct llama_model_tensor_buft_override {
+ #     const char * pattern;
+ #     ggml_backend_buffer_type_t buft;
+ # };
+ class llama_model_tensor_buft_override(ctypes.Structure):
+     _fields_ = [
+         ("pattern", ctypes.c_char_p),
+         ("buft", ctypes.c_void_p),
+     ]
+
+
+ llama_model_tensor_buft_override_p = ctypes.POINTER(llama_model_tensor_buft_override)
+
+
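A hedged sketch of how these overrides might be wired up (hypothetical pattern and values; assumes the `llama_model_params` field is exposed as `tensor_buft_overrides`, as in the struct below, and that a real `ggml_backend_buffer_type_t` handle would come from ggml rather than the NULL placeholder used here):

    # Hypothetical: pin tensors whose names match "ffn_.*" to some buffer type.
    # The list is NULL-terminated, so the final entry's pattern is None.
    overrides = (llama_model_tensor_buft_override * 2)()
    overrides[0].pattern = b"ffn_.*"
    overrides[0].buft = None  # placeholder; use a real ggml buffer type handle
    overrides[1].pattern = None  # terminator

    params = llama_model_default_params()
    params.tensor_buft_overrides = ctypes.cast(overrides, llama_model_tensor_buft_override_p)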
# struct llama_model_params {
#     // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
#     ggml_backend_dev_t * devices;

+ #     // NULL-terminated list of buffer types to use for tensors that match a pattern
+ #     const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
#     int32_t n_gpu_layers; // number of layers to store in VRAM
#     enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -695,6 +720,7 @@ class llama_model_params(ctypes.Structure):

    _fields_ = [
        ("devices", ctypes.c_void_p),  # NOTE: unused
+         ("tensor_buft_overrides", llama_model_tensor_buft_override_p),
        ("n_gpu_layers", ctypes.c_int32),
        ("split_mode", ctypes.c_int),
        ("main_gpu", ctypes.c_int32),
@@ -1316,6 +1342,10 @@ def llama_n_vocab(model: llama_vocab_p, /) -> int:
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
    ...

+ # LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx);
+ @ctypes_function("llama_get_kv_self", [llama_context_p_ctypes], llama_kv_cache_p_ctypes)
+ def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]:
+     ...

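The returned handle is opaque on the Python side; a minimal usage sketch (assuming a live `ctx` created elsewhere):

    # Hypothetical: grab the context's KV-cache handle. It is an opaque
    # pointer, only meaningful when passed back into the C API.
    kv = llama_get_kv_self(ctx)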
# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int)
@@ -1810,7 +1840,19 @@ def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[ll

# // Returns the number of tokens in the KV cache (slow, use only for debug)
# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
- # LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+ # LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+ @ctypes_function(
+     "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32
+ )
+ def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
+     """Returns the number of tokens in the KV cache (slow, use only for debug)
+     If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+     """
+     ...
+
+ # // Returns the number of tokens in the KV cache (slow, use only for debug)
+ # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+ # DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), "use llama_kv_self_n_tokens instead");
@ctypes_function(
    "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32
)
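A quick debug sketch for the rename above (hypothetical; assumes a live `ctx`, and that both entry points report the same count during the deprecation window):

    # Hypothetical sanity check: new name vs. deprecated alias.
    n_new = llama_kv_self_n_tokens(ctx)
    n_old = llama_get_kv_cache_token_count(ctx)  # deprecated
    assert n_new == n_old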
@@ -1832,10 +1874,10 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:


# // Clear the KV cache - both cell info is erased and KV data is zeroed
- # LLAMA_API void llama_kv_cache_clear(
+ # LLAMA_API void llama_kv_self_clear(
#         struct llama_context * ctx);
- @ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
- def llama_kv_cache_clear(ctx: llama_context_p, /):
+ @ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None)
+ def llama_kv_self_clear(ctx: llama_context_p, /):
    """Clear the KV cache"""
    ...

@@ -1845,13 +1887,13 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
# // seq_id < 0 : match any sequence
# // p0 < 0     : [0, p1]
# // p1 < 0     : [p0, inf)
- # LLAMA_API bool llama_kv_cache_seq_rm(
+ # LLAMA_API bool llama_kv_self_seq_rm(
#     struct llama_context * ctx,
#     llama_seq_id seq_id,
#     llama_pos p0,
#     llama_pos p1);
@ctypes_function(
-     "llama_kv_cache_seq_rm",
+     "llama_kv_self_seq_rm",
    [
        llama_context_p_ctypes,
        llama_seq_id,
@@ -1860,7 +1902,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
    ],
    ctypes.c_bool,
)
- def llama_kv_cache_seq_rm(
+ def llama_kv_self_seq_rm(
    ctx: llama_context_p,
    seq_id: Union[llama_seq_id, int],
    p0: Union[llama_pos, int],
@@ -1881,14 +1923,14 @@ def llama_kv_cache_seq_rm(
# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
- # LLAMA_API void llama_kv_cache_seq_cp(
+ # LLAMA_API void llama_kv_self_seq_cp(
#     struct llama_context * ctx,
#     llama_seq_id seq_id_src,
#     llama_seq_id seq_id_dst,
#     llama_pos p0,
#     llama_pos p1);
@ctypes_function(
-     "llama_kv_cache_seq_cp",
+     "llama_kv_self_seq_cp",
    [
        llama_context_p_ctypes,
        llama_seq_id,
@@ -1898,7 +1940,7 @@ def llama_kv_cache_seq_rm(
    ],
    None,
)
- def llama_kv_cache_seq_cp(
+ def llama_kv_self_seq_cp(
    ctx: llama_context_p,
    seq_id_src: Union[llama_seq_id, int],
    seq_id_dst: Union[llama_seq_id, int],
@@ -1914,31 +1956,31 @@ def llama_kv_cache_seq_cp(


# // Removes all tokens that do not belong to the specified sequence
- # LLAMA_API void llama_kv_cache_seq_keep(
+ # LLAMA_API void llama_kv_self_seq_keep(
#     struct llama_context * ctx,
#     llama_seq_id seq_id);
@ctypes_function(
-     "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
+     "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
)
- def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
+ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
    """Removes all tokens that do not belong to the specified sequence"""
    ...

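Taken together, the renamed sequence calls compose naturally. A hypothetical housekeeping sketch (assumes a live `ctx` and integer sequence ids; negative p0/p1 follow the open-interval conventions documented above):

    # Hypothetical: prune, fork, then isolate a sequence in the KV cache.
    llama_kv_self_seq_rm(ctx, 0, 128, -1)   # drop seq 0 cells from pos 128 to inf
    llama_kv_self_seq_cp(ctx, 0, 1, 0, -1)  # share seq 0's remaining cells with seq 1
    llama_kv_self_seq_keep(ctx, 1)          # evict every cell not owned by seq 1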
# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
# // If the KV cache is RoPEd, the KV data is updated accordingly:
# //   - lazily on next llama_decode()
- # //   - explicitly with llama_kv_cache_update()
+ # //   - explicitly with llama_kv_self_update()
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
- # LLAMA_API void llama_kv_cache_seq_add(
+ # LLAMA_API void llama_kv_self_seq_add(
#     struct llama_context * ctx,
#     llama_seq_id seq_id,
#     llama_pos p0,
#     llama_pos p1,
#     llama_pos delta);
@ctypes_function(
-     "llama_kv_cache_seq_add",
+     "llama_kv_self_seq_add",
    [
        llama_context_p_ctypes,
        llama_seq_id,
@@ -1948,7 +1990,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in
    ],
    None,
)
- def llama_kv_cache_seq_add(
+ def llama_kv_self_seq_add(
    ctx: llama_context_p,
    seq_id: Union[llama_seq_id, int],
    p0: Union[llama_pos, int],
@@ -1959,7 +2001,7 @@ def llama_kv_cache_seq_add(
    """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
    If the KV cache is RoPEd, the KV data is updated accordingly:
    - lazily on next llama_decode()
-     - explicitly with llama_kv_cache_update()
+     - explicitly with llama_kv_self_update()
    p0 < 0 : [0, p1]
    p1 < 0 : [p0, inf)"""
    ...
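A sliding-window sketch built on the call above (hypothetical sizes; assumes a live `ctx` whose cache is RoPEd, so the shift is applied lazily or via `llama_kv_self_update()`):

    # Hypothetical context shift: discard the oldest 32 tokens of seq 0,
    # then slide the survivors left so decoding can continue past the limit.
    n_discard = 32
    llama_kv_self_seq_rm(ctx, 0, 0, n_discard)
    llama_kv_self_seq_add(ctx, 0, n_discard, -1, -n_discard)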
@@ -1969,14 +2011,14 @@ def llama_kv_cache_seq_add(
# // If the KV cache is RoPEd, the KV data is updated accordingly
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
- # LLAMA_API void llama_kv_cache_seq_div(
+ # LLAMA_API void llama_kv_self_seq_div(
#     struct llama_context * ctx,
#     llama_seq_id seq_id,
#     llama_pos p0,
#     llama_pos p1,
#     int d);
@ctypes_function(
-     "llama_kv_cache_seq_div",
+     "llama_kv_self_seq_div",
    [
        llama_context_p_ctypes,
        llama_seq_id,
@@ -1986,7 +2028,7 @@ def llama_kv_cache_seq_add(
    ],
    None,
)
- def llama_kv_cache_seq_div(
+ def llama_kv_self_seq_div(
    ctx: llama_context_p,
    seq_id: Union[llama_seq_id, int],
    p0: Union[llama_pos, int],
@@ -2004,29 +2046,29 @@ def llama_kv_cache_seq_div(
# // Defragment the KV cache
# // This will be applied:
# //   - lazily on next llama_decode()
- # //   - explicitly with llama_kv_cache_update()
- # LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
- @ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None)
- def llama_kv_cache_defrag(ctx: llama_context_p, /):
+ # //   - explicitly with llama_kv_self_update()
+ # LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+ @ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None)
+ def llama_kv_self_defrag(ctx: llama_context_p, /):
    """Defragment the KV cache
    This will be applied:
    - lazily on next llama_decode()
-     - explicitly with llama_kv_cache_update()"""
+     - explicitly with llama_kv_self_update()"""
    ...


# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
- # LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
- @ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None)
- def llama_kv_cache_update(ctx: llama_context_p, /):
+ # LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+ @ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
+ def llama_kv_self_update(ctx: llama_context_p, /):
    """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)"""
    ...


# // Check if the context supports KV cache shifting
- # LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
- @ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
- def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
+ # LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx);
+ @ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
+ def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
    """Check if the context supports KV cache shifting"""
    ...

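A hedged maintenance sketch tying the three calls above together (assumes a live `ctx`; gating on `llama_kv_self_can_shift` mirrors how K-shifts are guarded upstream):

    # Hypothetical: schedule a defrag, then apply pending KV updates
    # (K-shifts, defragmentation) now rather than on the next llama_decode().
    if llama_kv_self_can_shift(ctx):
        llama_kv_self_defrag(ctx)
        llama_kv_self_update(ctx)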
@@ -2547,6 +2589,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /):
    ...


+ # // Set whether the model is in warmup mode or not
+ # // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+ # LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+ @ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None)
+ def llama_set_warmup(ctx: llama_context_p, warmup: bool, /):
+     """Set whether the model is in warmup mode or not
+     If true, all model tensors are activated during llama_decode() to load and cache their weights
+     """
+     ...
+
+
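A short sketch of the intended warm-up pattern (hypothetical; assumes `ctx` and a prepared `batch` exist): toggle warmup on for one decode so every tensor's weights get loaded and cached, then toggle it off before real generation:

    # Hypothetical warm-up pass before serving real requests.
    llama_set_warmup(ctx, True)
    llama_decode(ctx, batch)  # one pass touches all model tensors
    llama_set_warmup(ctx, False)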
# // Set abort callback
# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
@ctypes_function(
@@ -3701,6 +3753,13 @@ def llama_sampler_init_mirostat_v2(
    ...


+
+
+
+ # /// @details Initializes a GBNF grammar, see grammars/README.md for details.
+ # /// @param vocab The vocabulary that this grammar will be used with.
+ # /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
+ # /// @param grammar_root The name of the start symbol for the grammar.
# LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
#     const struct llama_vocab * vocab,
#     const char * grammar_str,