Update llama.cpp · sjanaX01/llama-cpp-python@5502ac8 · GitHub


Commit 5502ac8

committed
Update llama.cpp
1 parent 359ae73 commit 5502ac8

2 files changed: +36 -9 lines changed

llama_cpp/llama_cpp.py

Lines changed: 35 additions & 8 deletions
@@ -229,6 +229,7 @@ def _load_shared_library(lib_base_name: str):
 LLAMA_SPLIT_LAYER = 1
 LLAMA_SPLIT_ROW = 2
 
+
 # typedef struct llama_token_data {
 #     llama_token id; // token id
 #     float logit;    // log-odds of the token
@@ -395,6 +396,7 @@ class llama_model_kv_override(Structure):
 #     // override key-value pairs of the model meta data
 #     const struct llama_model_kv_override * kv_overrides;
 
+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
 #     bool use_mmap;   // use mmap if possible
@@ -407,7 +409,7 @@ class llama_model_params(Structure):
         n_gpu_layers (int): number of layers to store in VRAM
         split_mode (int): how to split the model across multiple GPUs
         main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
-        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
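
The attribute list above spells out how a model is spread across GPUs. As a point of reference only, a minimal sketch of filling these fields through the low-level bindings might look like the following; the model path, the 35-layer offload, the 0.60/0.40 split, and the 16-slot buffer size are illustrative assumptions, not taken from this commit.

import ctypes

import llama_cpp.llama_cpp as llama_cpp

# (llama_backend_init is assumed to have been called already)
params = llama_cpp.llama_model_default_params()
params.n_gpu_layers = 35                         # layers to keep in VRAM (illustrative)
params.split_mode = llama_cpp.LLAMA_SPLIT_LAYER  # split whole layers across GPUs
params.main_gpu = 0                              # ignored when splitting by layer

# One proportion per device; unused slots stay at 0. The buffer must outlive
# model loading, since llama_model_params only stores a pointer to it.
split = (ctypes.c_float * 16)(0.60, 0.40)
params.tensor_split = ctypes.cast(split, llama_cpp.c_float_p)

model = llama_cpp.llama_load_model_from_file(b"model.gguf", params)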
@@ -1960,14 +1962,39 @@ def llama_sample_repetition_penalties(
 
 
 # /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-# LLAMA_API void llama_sample_classifier_free_guidance(
-#           struct llama_context * ctx,
+# /// @param logits Logits extracted from the original generation context.
+# /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# LLAMA_API void llama_sample_apply_guidance(
+#           struct llama_context * ctx,
+#                         float * logits,
+#                         float * logits_guidance,
+#                           float   scale);
+def llama_sample_apply_guidance(
+    ctx: llama_context_p,
+    logits,  # type: _Pointer[c_float]
+    logits_guidance,  # type: _Pointer[c_float]
+    scale: Union[c_float, float],
+):
+    """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806"""
+    return _lib.llama_sample_apply_guidance(ctx, logits, logits_guidance, scale)
+
+
+_lib.llama_sample_apply_guidance.argtypes = [
+    llama_context_p,
+    c_float_p,
+    c_float_p,
+    c_float,
+]
+_lib.llama_sample_apply_guidance.restype = None
+
+
+# LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
+#           struct llama_context * ctx,
 #         llama_token_data_array * candidates,
-#          struct llama_context * guidance_ctx,
-#          float   scale);
+#           struct llama_context * guidance_ctx,
+#           float   scale),
+#         "use llama_sample_apply_guidance() instead");
 def llama_sample_classifier_free_guidance(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
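
Functionally, this hunk swaps the candidates-based CFG sampler for llama_sample_apply_guidance, which blends the raw logits of the main context with those of a guidance (negative-prompt) context in place. A rough usage sketch against these low-level bindings follows; the model path, the elided prompt evaluation, the guidance scale of 1.5, and the numa argument to llama_backend_init are assumptions for illustration, not part of the commit.

import ctypes

import llama_cpp.llama_cpp as llama_cpp

llama_cpp.llama_backend_init(False)  # numa=False; assumed signature at this point in the bindings

model = llama_cpp.llama_load_model_from_file(
    b"model.gguf", llama_cpp.llama_model_default_params()
)
ctx_params = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)       # main context
guidance = llama_cpp.llama_new_context_with_model(model, ctx_params)  # negative-prompt context

# ... evaluate the prompt in `ctx` and the negative prompt in `guidance` ...

n_vocab = llama_cpp.llama_n_vocab(model)

# Snapshot both logit vectors: llama_get_logits returns a pointer into internal
# buffers, and llama_sample_apply_guidance rewrites `logits` in place.
logits = (ctypes.c_float * n_vocab)(*llama_cpp.llama_get_logits(ctx)[:n_vocab])
logits_guidance = (
    ctypes.c_float * n_vocab
)(*llama_cpp.llama_get_logits(guidance)[:n_vocab])

# scale 1.0 means no guidance; larger values pull harder away from the negative prompt.
llama_cpp.llama_sample_apply_guidance(ctx, logits, logits_guidance, 1.5)
# `logits` now holds the guided values and can be fed into the usual sampling path.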

vendor/llama.cpp

0 commit comments
