@@ -175,8 +175,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa

# define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN

-# define LLAMA_SESSION_VERSION 4
-LLAMA_SESSION_VERSION = 4
+# define LLAMA_SESSION_VERSION 5
+LLAMA_SESSION_VERSION = 5


# struct llama_model;
@@ -274,6 +274,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
# LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors

# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
# };
@@ -677,6 +678,7 @@ class llama_context_params(ctypes.Structure):
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // quantize all tensors to the default type
# void * imatrix; // pointer to importance matrix data
+# void * kv_overrides; // pointer to vector containing overrides
# } llama_model_quantize_params;
class llama_model_quantize_params(ctypes.Structure):
    """Parameters for llama_model_quantize
@@ -691,6 +693,7 @@ class llama_model_quantize_params(ctypes.Structure):
        only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        pure (bool): quantize all tensors to the default type
        imatrix (ctypes.c_void_p): pointer to importance matrix data
+        kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
    """

    _fields_ = [
@@ -703,6 +706,7 @@ class llama_model_quantize_params(ctypes.Structure):
        ("only_copy", ctypes.c_bool),
        ("pure", ctypes.c_bool),
        ("imatrix", ctypes.c_void_p),
+        ("kv_overrides", ctypes.c_void_p),
    ]

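(Note, not part of the diff: a minimal sketch of how the extended quantize params might be driven through the existing low-level helpers llama_model_quantize_default_params / llama_model_quantize. The file names are placeholders, the ftype value 31 corresponds to the newly added LLAMA_FTYPE_MOSTLY_IQ1_M, and kv_overrides wraps a C++ vector on the llama.cpp side, so from pure Python it is normally left NULL.)

import ctypes
import llama_cpp

# Start from the library defaults and adjust only what we need.
params = llama_cpp.llama_model_quantize_default_params()
params.ftype = 31            # LLAMA_FTYPE_MOSTLY_IQ1_M (added in this update)
params.nthread = 8
params.kv_overrides = None   # new field; keep NULL unless a native
                             # llama_model_kv_override vector is supplied
# IQ1_* quants typically also expect an importance matrix (params.imatrix)
# in practice; omitted here to keep the sketch short.

ret = llama_cpp.llama_model_quantize(
    b"model-f16.gguf", b"model-iq1_m.gguf", ctypes.byref(params)
)
# ret == 0 indicates success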
@@ -1838,9 +1842,9 @@ def llama_synchronize(ctx: llama_context_p, /):


# // Token logits obtained from the last call to llama_decode()
-# // The logits for the last token are stored in the last row
-# // Logits for which llama_batch.logits[i] == 0 are undefined
-# // Rows: n_tokens provided with llama_batch
+# // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+# // in the order they have appeared in the batch.
+# // Rows: number of tokens for which llama_batch.logits[i] != 0
# // Cols: n_vocab
# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
@ctypes_function(
@@ -1859,7 +1863,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:


# // Logits for the ith token. Equivalent to:
-# // llama_get_logits(ctx) + i*n_vocab
+# // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+# // returns NULL for invalid ids.
# LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
@ctypes_function(
    "llama_get_logits_ith",
@@ -1874,8 +1879,12 @@ def llama_get_logits_ith(
    ...


-# // Get all output token embeddings
-# // shape: [n_tokens*n_embd] (1-dimensional)
+# // Get all output token embeddings.
+# // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+# // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+# // in the order they have appeared in the batch.
+# // shape: [n_outputs*n_embd]
+# // Otherwise, returns NULL.
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
@ctypes_function(
    "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -1886,9 +1895,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
    ...


-# // Get the embeddings for the ith token
-# // llama_get_embeddings(ctx) + i*n_embd
+# // Get the embeddings for the ith token. Equivalent to:
+# // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
# // shape: [n_embd] (1-dimensional)
+# // returns NULL for invalid ids.
# LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@ctypes_function(
    "llama_get_embeddings_ith",