ckpt · abetlen/llama-cpp-python@bcc3faf
Commit bcc3faf
ckpt
1 parent 23e11a6 commit bcc3faf

File tree: 4 files changed, +102 −25 lines changed

llama_cpp/llama.py

Lines changed: 24 additions & 8 deletions
@@ -294,12 +294,28 @@ def apply_lora_from_file(
             self.model,
             lora_path.encode("utf-8"),
             scale,
-            path_base_model.encode("utf-8")
-            if path_base_model is not None
-            else llama_cpp.c_char_p(0),
+            (
+                path_base_model.encode("utf-8")
+                if path_base_model is not None
+                else llama_cpp.c_char_p(0)
+            ),
             n_threads,
         )

+    # def load_control_vector(self, strength: float, fname: str):
+    #     return llama_cpp.llama_load_control_vector(load_infos)
+
+    # def apply_control_vector(
+    #     self,
+    #     cvec: Union[List[float], np.ndarray],
+    #     n_embd: int,
+    #     il_start: int,
+    #     il_end: int,
+    # ):
+    #     return llama_cpp.llama_control_vector_apply(
+    #         self.model._ctx, cvec, n_embd, il_start, il_end
+    #     )
+
     # Vocab

     def token_get_text(self, token: int) -> str:
@@ -1486,7 +1502,7 @@ def logit_bias_processor(
             stopping_criteria=stopping_criteria,
             logits_processor=logits_processor,
             grammar=grammar,
-            reset=False, # reset is managed by PersistantStateManager
+            reset=False,  # reset is managed by PersistantStateManager
         ):
             if token == self._token_eos:
                 text = self.detokenize(completion_tokens)
@@ -2237,19 +2253,19 @@ def __setstate__(self, state):

     def save_state(self) -> LlamaState:
         assert self._ctx.ctx is not None
-
+
         if self.verbose:
             print("Llama.save_state: saving llama state", file=sys.stderr)
         state_size = llama_cpp.llama_get_state_size(self._ctx.ctx)
-
+
         if self.verbose:
             print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr)
         llama_state = (llama_cpp.c_uint8 * int(state_size))()
-
+
         if self.verbose:
             print("Llama.save_state: allocated state", file=sys.stderr)
         n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state)
-
+
         if self.verbose:
             print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr)
         if int(n_bytes) > int(state_size):
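
The save_state hunk above only normalizes blank lines; the verbose logging it touches is exercised through the high-level Llama.save_state / Llama.load_state API. A minimal usage sketch, assuming a local GGUF model at a placeholder path:

from llama_cpp import Llama

# Hypothetical model path, for illustration only.
llm = Llama(model_path="./models/example.gguf", n_ctx=2048, verbose=True)

llm("Once upon a time", max_tokens=8)  # short completion to build up llama state
state = llm.save_state()               # snapshot; prints state sizes when verbose

llm("Something else entirely", max_tokens=8)
llm.load_state(state)                  # restore the earlier snapshot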

llama_cpp/llama_cpp.py

Lines changed: 72 additions & 15 deletions
@@ -213,11 +213,12 @@ def _load_shared_library(lib_base_name: str):
 # } llama_token_data;
 class llama_token_data(Structure):
     """Used to store token data
-
+
     Attributes:
         id (llama_token): token id
         logit (float): log-odds of the token
         p (float): probability of the token"""
+
     _fields_ = [
         ("id", llama_token),
         ("logit", c_float),
@@ -235,11 +236,12 @@ class llama_token_data(Structure):
 # } llama_token_data_array;
 class llama_token_data_array(Structure):
     """Used to sample tokens given logits
-
+
     Attributes:
         data (ctypes.Array[llama_token_data]): token data
         size (int): size of the array
         sorted (bool): whether the array is sorted"""
+
     _fields_ = [
         ("data", llama_token_data_p),
         ("size", c_size_t),
@@ -283,6 +285,8 @@ class llama_token_data_array(Structure):
 #     llama_pos all_pos_1; // used if pos == NULL
 #     llama_seq_id all_seq_id; // used if seq_id == NULL
 # } llama_batch;
+
+
 class llama_batch(Structure):
     """Input data for llama_decode

@@ -294,7 +298,8 @@ class llama_batch(Structure):
         token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
         embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
         pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence
-        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs"""
+        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs
+    """

     _fields_ = [
         ("n_tokens", c_int32),
@@ -309,6 +314,7 @@ class llama_batch(Structure):
         ("all_seq_id", llama_seq_id),
     ]

+
 # enum llama_model_kv_override_type {
 #     LLAMA_KV_OVERRIDE_INT,
 #     LLAMA_KV_OVERRIDE_FLOAT,
@@ -321,6 +327,7 @@ class llama_model_kv_override_type(Structure):
         ("LLAMA_KV_OVERRIDE_BOOL", c_int),
     ]

+
 # struct llama_model_kv_override {
 #     char key[128];
 #     enum llama_model_kv_override_type tag;
@@ -339,6 +346,7 @@ class llama_model_kv_override(Structure):
         ("bool_value", c_bool),
     ]

+
 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
 #     int32_t main_gpu; // the GPU that is used for scratch and small tensors
@@ -352,14 +360,15 @@ class llama_model_kv_override(Structure):
 #     // override key-value pairs of the model meta data
 #     const struct llama_model_kv_override * kv_overrides;

+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
 #     bool use_mmap; // use mmap if possible
 #     bool use_mlock; // force system to keep model in RAM
 # };
 class llama_model_params(Structure):
     """Parameters for llama_model
-
+
     Attributes:
         n_gpu_layers (int): number of layers to store in VRAM
         main_gpu (int): the GPU that is used for scratch and small tensors
@@ -370,6 +379,7 @@ class llama_model_params(Structure):
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM"""
+
     _fields_ = [
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
@@ -403,6 +413,7 @@ class llama_model_params(Structure):
 #     enum ggml_type type_k; // data type for K cache
 #     enum ggml_type type_v; // data type for V cache

+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 #     bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -411,7 +422,7 @@ class llama_model_params(Structure):
 # };
 class llama_context_params(Structure):
     """Parameters for llama_context
-
+
     Attributes:
         seed (int): RNG seed, -1 for random
         n_ctx (int): text context, 0 = from model
@@ -432,6 +443,7 @@ class llama_context_params(Structure):
         f16_kv (bool): use fp16 for KV cache, fp32 otherwise
         logits_all (bool): the llama_eval() call computes all logits, not just the last one
         embedding (bool): embedding mode only"""
+
     _fields_ = [
         ("seed", c_uint32),
         ("n_ctx", c_uint32),
@@ -455,6 +467,15 @@ class llama_context_params(Structure):
     ]


+class LlamaControlVectorLoadInfo(Structure):
+    _fields_ = [("strength", c_float), ("fname", c_char_p)]
+
+
+class LlamaControlVectorData(Structure):
+    # Placeholder: replace with actual fields based on C++ definition
+    _fields_ = [("n_embd", c_int), ("data", POINTER(c_float))]
+
+
 # // Signature for logging events
 # // Note that text includes the new line character at the end for most events.
 # // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
@@ -480,14 +501,16 @@ class llama_context_params(Structure):
 # } llama_model_quantize_params;
 class llama_model_quantize_params(Structure):
     """Parameters for llama_model_quantize
-
+
     Attributes:
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        pure (bool): disable k-quant mixtures and quantize all tensors to the same type"""
+        pure (bool): disable k-quant mixtures and quantize all tensors to the same type
+    """
+
     _fields_ = [
         ("nthread", c_int),
         ("ftype", c_int),
@@ -969,6 +992,36 @@ def llama_model_apply_lora_from_file(
     )


+def llama_load_control_vector(strength: float, fname: str):
+    load_infos = [LlamaControlVectorLoadInfo(strength, fname)]
+
+    infos_array = (LlamaControlVectorLoadInfo * len(load_infos))(*load_infos)
+    result = _lib.llama_control_vector_load(infos_array, len(load_infos))
+
+    data_list = []  # How to populate this depends on your data structure
+
+    return {"n_embd": result.n_embd, "data": data_list}
+
+
+def llama_control_vector_apply(lctx, data, n_embd, il_start, il_end):
+    """
+    A Python wrapper to invoke llama_control_vector_apply C++ function.
+
+    :param lctx: The llama_context object
+    :param data: The data array to be applied
+    :param n_embd: Number of embeddings
+    :param il_start: Start layer index
+    :param il_end: End layer index
+    :return: Result of the C++ function call
+    """
+    # Convert Python list or numpy array to ctypes array if not already
+    data_array = (c_float * len(data))(*data)
+
+    return _lib.llama_control_vector_apply(
+        ctypes.byref(lctx), data_array, len(data), n_embd, il_start, il_end
+    )
+
+
 _lib.llama_model_apply_lora_from_file.argtypes = [
     llama_model_p,
     c_char_p,
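
The two wrappers in the hunk above are the heart of this checkpoint. A hypothetical usage sketch follows: the model path, control-vector file, strength, and layer range are placeholders, it assumes the loaded shared library actually exports llama_control_vector_load and llama_control_vector_apply, and llama_load_control_vector as written returns an empty data list, so the applied vector is still a stub:

import llama_cpp
from llama_cpp import Llama

# Hypothetical model and control-vector paths, for illustration only.
llm = Llama(model_path="./models/mistral-7b-instruct.gguf", n_ctx=4096)

# LlamaControlVectorLoadInfo holds a strength and a C string, so the path is passed as bytes.
cvec = llama_cpp.llama_load_control_vector(0.8, b"./vectors/steering.gguf")

# Apply to the model's raw llama_context over a placeholder layer range.
llama_cpp.llama_control_vector_apply(
    llm._ctx.ctx,    # underlying llama_context (same handle save_state uses)
    cvec["data"],    # float values to apply (empty in this stub)
    cvec["n_embd"],  # embedding width reported by the loader
    1,               # il_start: first layer to steer
    32,              # il_end: last layer to steer
)
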
@@ -2036,10 +2089,11 @@ def llama_sample_temp(
     temp: Union[c_float, float],
 ):
     """Temperature sampling described in academic paper "Generating Long Sequences with Sparse Transformers" https://arxiv.org/abs/1904.10509
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text."""
+        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    """
     return _lib.llama_sample_temp(ctx, candidates, temp)


@@ -2084,10 +2138,11 @@ def llama_sample_grammar(
     grammar, # type: llama_grammar_p
 ):
     """Apply constraints from grammar
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        grammar: A grammar object containing the rules and constraints to apply to the generated text."""
+        grammar: A grammar object containing the rules and constraints to apply to the generated text.
+    """
     return _lib.llama_sample_grammar(ctx, candidates, grammar)


@@ -2121,13 +2176,14 @@ def llama_sample_token_mirostat(
     mu, # type: _Pointer[c_float]
 ) -> int:
     """Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
         m: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)


@@ -2161,12 +2217,13 @@ def llama_sample_token_mirostat_v2(
     mu, # type: _Pointer[c_float]
 ) -> int:
     """Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)


scripts/start.sh

Lines changed: 5 additions & 1 deletion
@@ -1 +1,5 @@
-python llama_cpp/server --model "/Users/sengwee.ngui/Library/CloudStorage/OneDrive-TemusPte.Ltd/Documents/projects/SuperAdapters/data/llms/mistral-fwd-john-doe-ckpt-158-200.gguf" --n_gpu_layers 64 --n_ctx 8192 --n_batch 2048 --last_n_tokens_size 4000
+python llama_cpp/server --model "/Users/sengwee.ngui/Library/CloudStorage/OneDrive-TemusPte.Ltd/Documents/projects/SuperAdapters/data/llms/mistral-fwd-john-doe-ckpt-158-200.gguf" --n_gpu_layers 64 --n_ctx 8192 --n_batch 2048 --last_n_tokens_size 4000
+
+python llama_cpp/server --model "/Users/sengwee.ngui/Library/CloudStorage/OneDrive-TemusPte.Ltd/Documents/projects/SuperAdapters/data/llms/mistral-fwd-instruct-v0.2-v0.0.1.gguf" --n_gpu_layers 64 --n_ctx 8192 --n_batch 2048 --last_n_tokens_size 4000
+
+python3 llama_cpp/server --model "../data/mistral-fwd-john-doe-ckpt-158-200.gguf" --n_gpu_layers 64 --n_ctx 8192 --n_batch 2048 --last_n_tokens_size 4000
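
Each of these commands launches llama_cpp.server, which exposes an OpenAI-compatible HTTP API. A minimal sketch for querying it once it is up, assuming the server's default address of http://localhost:8000 (prompt and sampling values are placeholders):

import json
import urllib.request

payload = {
    "prompt": "Q: What does a control vector do? A:",
    "max_tokens": 64,
    "temperature": 0.7,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)
print(body["choices"][0]["text"])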

vendor/llama.cpp

0 commit comments