Merge branch 'main' into batch-processing · sunnykim1206/llama-cpp-python@456a601 · GitHub

Commit 456a601

Merge branch 'main' into batch-processing

2 parents e1cd61e + f015966

File tree

6 files changed: +86 −22 lines

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.28]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6
+- feat: Add ability to pass in penalize_nl param by @shankinson in #1068
+- fix: print_grammar to stderr by @turian in #1052
+
 ## [0.2.27]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@b3a7c20b5c035250257d2b62851c379b159c899a

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.27"
+__version__ = "0.2.28"

llama_cpp/llama.py

Lines changed: 2 additions & 0 deletions
@@ -518,6 +518,7 @@ def generate(
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
+        penalize_nl: bool = True,
         logits_processor: Optional[LogitsProcessorList] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         grammar: Optional[LlamaGrammar] = None,
@@ -578,6 +579,7 @@ def generate(
                 mirostat_eta=mirostat_eta,
                 logits_processor=logits_processor,
                 grammar=grammar,
+                penalize_nl=penalize_nl,
             )
             if stopping_criteria is not None and stopping_criteria(
                 self._input_ids, self._scores[-1, :]
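For context, a minimal sketch (not part of this commit) of how the penalize_nl flag added to Llama.generate() above might be used from the high-level API; the model path, prompt, and sampling values are illustrative assumptions:

    from llama_cpp import Llama

    # Assumption: a local GGUF model exists at this path.
    llm = Llama(model_path="./models/model.gguf", n_ctx=2048)

    prompt_tokens = llm.tokenize(b"List three fruits:")

    # generate() is a token-level generator; penalize_nl=False is meant to exempt
    # the newline token from the repetition penalty.
    for i, token in enumerate(
        llm.generate(prompt_tokens, top_k=40, top_p=0.95, temp=0.7, penalize_nl=False)
    ):
        if token == llm.token_eos() or i >= 64:
            break
        print(llm.detokenize([token]).decode("utf-8", errors="ignore"), end="", flush=True)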

llama_cpp/llama_cpp.py

Lines changed: 75 additions & 19 deletions
@@ -104,7 +104,7 @@ def _load_shared_library(lib_base_name: str):
 # define LLAMA_MAX_RNG_STATE (64*1024)
 LLAMA_MAX_RNG_STATE = 64 * 1024
 
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+# define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 LLAMA_FILE_MAGIC_GGLA = 0x67676C61
 
 # define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
@@ -179,6 +179,7 @@ def _load_shared_library(lib_base_name: str):
 # LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
 
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -222,11 +223,12 @@ def _load_shared_library(lib_base_name: str):
 # } llama_token_data;
 class llama_token_data(Structure):
     """Used to store token data
-
+
     Attributes:
         id (llama_token): token id
         logit (float): log-odds of the token
         p (float): probability of the token"""
+
     _fields_ = [
         ("id", llama_token),
         ("logit", c_float),
@@ -244,11 +246,12 @@ class llama_token_data(Structure):
 # } llama_token_data_array;
 class llama_token_data_array(Structure):
     """Used to sample tokens given logits
-
+
     Attributes:
         data (ctypes.Array[llama_token_data]): token data
         size (int): size of the array
         sorted (bool): whether the array is sorted"""
+
     _fields_ = [
         ("data", llama_token_data_p),
         ("size", c_size_t),
@@ -303,7 +306,8 @@ class llama_batch(Structure):
         token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
         embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
         pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence
-        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs"""
+        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs
+    """
 
     _fields_ = [
         ("n_tokens", c_int32),
@@ -318,6 +322,7 @@ class llama_batch(Structure):
         ("all_seq_id", llama_seq_id),
     ]
 
+
 # enum llama_model_kv_override_type {
 # LLAMA_KV_OVERRIDE_INT,
 # LLAMA_KV_OVERRIDE_FLOAT,
@@ -327,6 +332,7 @@ class llama_batch(Structure):
 LLAMA_KV_OVERRIDE_FLOAT = 1
 LLAMA_KV_OVERRIDE_BOOL = 2
 
+
 # struct llama_model_kv_override {
 # char key[128];
 # enum llama_model_kv_override_type tag;
@@ -343,13 +349,15 @@ class llama_model_kv_override_value(CtypesUnion):
         ("bool_value", c_bool),
     ]
 
+
 class llama_model_kv_override(Structure):
     _fields_ = [
         ("key", ctypes.c_char * 128),
         ("tag", c_int),
         ("value", llama_model_kv_override_value),
     ]
 
+
 # struct llama_model_params {
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
@@ -365,14 +373,15 @@ class llama_model_kv_override(Structure):
 # // override key-value pairs of the model meta data
 # const struct llama_model_kv_override * kv_overrides;
 
+
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool vocab_only; // only load the vocabulary, no weights
 # bool use_mmap; // use mmap if possible
 # bool use_mlock; // force system to keep model in RAM
 # };
 class llama_model_params(Structure):
     """Parameters for llama_model
-
+
     Attributes:
         n_gpu_layers (int): number of layers to store in VRAM
         main_gpu (int): the GPU that is used for scratch and small tensors
@@ -383,6 +392,7 @@ class llama_model_params(Structure):
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM"""
+
     _fields_ = [
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
@@ -416,6 +426,7 @@ class llama_model_params(Structure):
 # enum ggml_type type_k; // data type for K cache
 # enum ggml_type type_v; // data type for V cache
 
+
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
@@ -424,7 +435,7 @@
 # };
 class llama_context_params(Structure):
     """Parameters for llama_context
-
+
     Attributes:
         seed (int): RNG seed, -1 for random
         n_ctx (int): text context, 0 = from model
@@ -444,7 +455,9 @@
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
-        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU"""
+        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+    """
+
     _fields_ = [
         ("seed", c_uint32),
         ("n_ctx", c_uint32),
@@ -493,14 +506,16 @@ class llama_context_params(Structure):
 # } llama_model_quantize_params;
 class llama_model_quantize_params(Structure):
     """Parameters for llama_model_quantize
-
+
     Attributes:
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        pure (bool): disable k-quant mixtures and quantize all tensors to the same type"""
+        pure (bool): disable k-quant mixtures and quantize all tensors to the same type
+    """
+
     _fields_ = [
         ("nthread", c_int32),
         ("ftype", c_int),
@@ -745,13 +760,16 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
 _lib.llama_n_ctx.argtypes = [llama_context_p]
 _lib.llama_n_ctx.restype = c_uint32
 
+
 # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
 def llama_n_batch(ctx: llama_context_p) -> int:
     return _lib.llama_n_batch(ctx)
 
+
 _lib.llama_n_batch.argtypes = [llama_context_p]
 _lib.llama_n_batch.restype = c_uint32
 
+
 # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 def llama_vocab_type(model: llama_model_p) -> int:
     return _lib.llama_vocab_type(model)
@@ -1080,7 +1098,7 @@ def llama_kv_cache_view_init(
 
 # // Free a KV cache view. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore
+def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"):  # type: ignore
     """Free a KV cache view. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_free(view)
 
@@ -1091,7 +1109,7 @@ def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore
 
 # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore
+def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes.pointer[llama_kv_cache_view]"):  # type: ignore
     """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_update(ctx, view)
 
@@ -1251,6 +1269,40 @@ def llama_kv_cache_seq_shift(
 ]
 _lib.llama_kv_cache_seq_shift.restype = None
 
+
+# // Integer division of the positions by factor of `d > 1`
+# // If the KV cache is RoPEd, the KV data is updated accordingly
+# // p0 < 0 : [0, p1]
+# // p1 < 0 : [p0, inf)
+# LLAMA_API void llama_kv_cache_seq_div(
+# struct llama_context * ctx,
+# llama_seq_id seq_id,
+# llama_pos p0,
+# llama_pos p1,
+# int d);
+def llama_kv_cache_seq_div(
+    ctx: llama_context_p,
+    seq_id: Union[llama_seq_id, int],
+    p0: Union[llama_pos, int],
+    p1: Union[llama_pos, int],
+    d: Union[c_int, int],
+):
+    """Integer division of the positions by factor of `d > 1`
+    If the KV cache is RoPEd, the KV data is updated accordingly
+    p0 < 0 : [0, p1]
+    p1 < 0 : [p0, inf)"""
+    return _lib.llama_kv_cache_seq_div(ctx, seq_id, p0, p1, d)
+
+
+_lib.llama_kv_cache_seq_div.argtypes = [
+    llama_context_p,
+    llama_seq_id,
+    llama_pos,
+    llama_pos,
+    c_int,
+]
+_lib.llama_kv_cache_seq_div.restype = None
+
 # //
 # // State / sessions
 # //
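A minimal sketch (not part of this commit) of how the new llama_kv_cache_seq_div binding above might be called through the low-level ctypes API; the model path, sequence id, and position range are illustrative assumptions:

    import llama_cpp

    llama_cpp.llama_backend_init(numa=False)

    model_params = llama_cpp.llama_model_default_params()
    model = llama_cpp.llama_load_model_from_file(b"./models/model.gguf", model_params)

    ctx_params = llama_cpp.llama_context_default_params()
    ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

    # ... llama_decode() calls would populate the KV cache for sequence 0 here ...

    # Divide the positions of sequence 0 in the range [0, 64) by d=2; per the
    # comment above, a RoPEd KV cache is updated to match the new positions.
    llama_cpp.llama_kv_cache_seq_div(ctx, 0, 0, 64, 2)

    llama_cpp.llama_free(ctx)
    llama_cpp.llama_free_model(model)
    llama_cpp.llama_backend_free()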
@@ -2063,10 +2115,11 @@ def llama_sample_temp(
     temp: Union[c_float, float],
 ):
     """Temperature sampling described in academic paper "Generating Long Sequences with Sparse Transformers" https://arxiv.org/abs/1904.10509
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text."""
+        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    """
     return _lib.llama_sample_temp(ctx, candidates, temp)
 
 
@@ -2111,10 +2164,11 @@ def llama_sample_grammar(
     grammar, # type: llama_grammar_p
 ):
     """Apply constraints from grammar
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        grammar: A grammar object containing the rules and constraints to apply to the generated text."""
+        grammar: A grammar object containing the rules and constraints to apply to the generated text.
+    """
     return _lib.llama_sample_grammar(ctx, candidates, grammar)
 
 
@@ -2148,13 +2202,14 @@ def llama_sample_token_mirostat(
     mu, # type: _Pointer[c_float]
 ) -> int:
     """Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
         m: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
 
 
@@ -2188,12 +2243,13 @@ def llama_sample_token_mirostat_v2(
     mu, # type: _Pointer[c_float]
 ) -> int:
     """Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)

llama_cpp/llama_grammar.py

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar":
             )
         if verbose:
             print(f"{cls.from_string.__name__} grammar:", file=sys.stderr)
-            print_grammar(sys.stdout, parsed_grammar)
+            print_grammar(sys.stderr, parsed_grammar)
             print(file=sys.stderr)
         return cls(parsed_grammar)
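To illustrate the fix above, a small sketch (not part of this commit; the grammar string is an illustrative assumption): with verbose=True the parsed-grammar dump now goes to stderr alongside the other debug prints, leaving stdout free for program output.

    from llama_cpp.llama_grammar import LlamaGrammar

    grammar_text = 'root ::= "yes" | "no"'

    # The verbose dump is written via print_grammar(sys.stderr, ...), so piping or
    # redirecting stdout no longer captures it.
    grammar = LlamaGrammar.from_string(grammar_text, verbose=True)
    print("grammar parsed")  # only this line reaches stdout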

vendor/llama.cpp (submodule updated)
