Merge branch main into custom_rope · Freed-Wu/llama-cpp-python@f0797a6
Commit f0797a6

Merge branch main into custom_rope

1 parent 3f8f276 · commit f0797a6

File tree: 8 files changed, +212 -69 lines


CHANGELOG.md
Lines changed: 10 additions & 0 deletions

```diff
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.71]
+
+### Added
+
+- (llama.cpp) Update llama.cpp
+
+### Fixed
+
+- (server) Fix several pydantic v2 migration bugs
+
 ## [0.1.70]
 
 ### Fixed
```

README.md
Lines changed: 1 addition & 0 deletions

````diff
@@ -135,6 +135,7 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python).
 ```bash
 docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
 ```
+[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389)
 
 ## Low-level API
 
````

llama_cpp/llama.py
Lines changed: 15 additions & 1 deletion

```diff
@@ -19,14 +19,14 @@
 from collections import deque, OrderedDict
 
 import diskcache
+import ctypes
 
 from . import llama_cpp
 from .llama_types import *
 
 import numpy as np
 import numpy.typing as npt
 
-
 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""
 
@@ -222,6 +222,7 @@ def __init__(
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
         low_vram: bool = False,
+        tensor_split: Optional[List[float]] = None,
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -244,6 +245,7 @@ def __init__(
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
+            tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split.
             verbose: Print verbose output to stderr.
 
         Raises:
@@ -252,6 +254,7 @@ def __init__(
         Returns:
            A Llama instance.
         """
+
         self.verbose = verbose
         self.model_path = model_path
 
@@ -269,6 +272,15 @@ def __init__(
         self.params.embedding = embedding
         self.params.low_vram = low_vram
 
+        self.tensor_split = tensor_split
+        self._c_tensor_split = None
+
+        if self.tensor_split is not None:
+            #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
+            FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value
+            self._c_tensor_split = FloatArray(*tensor_split)  # keep a reference to the array so it is not gc'd
+            self.params.tensor_split = self._c_tensor_split
+
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
 
@@ -1509,6 +1521,7 @@ def __getstate__(self):
             n_threads=self.n_threads,
             lora_base=self.lora_base,
             lora_path=self.lora_path,
+            tensor_split=self.tensor_split,
             ### DEPRECATED ###
             n_parts=self.n_parts,
             ### DEPRECATED ###
@@ -1533,6 +1546,7 @@ def __setstate__(self, state):
             last_n_tokens_size=state["last_n_tokens_size"],
             lora_base=state["lora_base"],
             lora_path=state["lora_path"],
+            tensor_split=state["tensor_split"],
             verbose=state["verbose"],
         )
 
```
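
The practical effect of the llama.py change is that multi-GPU tensor splitting is now reachable from the high-level API. A minimal usage sketch, assuming a GPU-enabled (CUDA/cuBLAS) build with at least two devices; the model path, layer count, and split ratios are illustrative, not from this commit:

```python
from llama_cpp import Llama

# Hypothetical model path and split ratios, for illustration only.
llm = Llama(
    model_path="/models/ggml-model-q4_0.bin",
    n_gpu_layers=35,            # offload layers to the GPUs (requires a CUDA/cuBLAS build)
    tensor_split=[0.6, 0.4],    # proportion of tensors placed on GPU 0 and GPU 1
)

# Internally the list is copied into a ctypes float array of length LLAMA_MAX_DEVICES
# (remaining slots zero-filled) and kept alive on the instance so it is not gc'd.
out = llm("Q: What is the capital of France? A:", max_tokens=16, stop=["\n"])
print(out["choices"][0]["text"])
```

Because the array is sized to `LLAMA_MAX_DEVICES`, passing more ratios than the build supports would fail, which is why this only makes sense on a multi-GPU build.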

llama_cpp/llama_cpp.py
Lines changed: 128 additions & 9 deletions

```diff
@@ -165,12 +165,16 @@ class llama_token_data_array(Structure):
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
 # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+# float rope_freq_base; // RoPE base frequency
+# float rope_freq_scale; // RoPE frequency scaling factor
+
 # // called with a progress value between 0 and 1, pass NULL to disable
 # llama_progress_callback progress_callback;
 # // context pointer passed to the progress callback
 # void * progress_callback_user_data;
 
-
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram; // if true, reduce VRAM usage at the cost of performance
 # bool f16_kv; // use fp16 for KV cache
@@ -190,6 +194,8 @@ class llama_context_params(Structure):
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
+        ("rope_freq_base", c_float),
+        ("rope_freq_scale", c_float),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("low_vram", c_bool),
@@ -328,13 +334,23 @@ def llama_mlock_supported() -> bool:
 # // Initialize the llama + ggml backend
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend(bool numa);
-def llama_init_backend(numa: c_bool):
-    return _lib.llama_init_backend(numa)
+# LLAMA_API void llama_backend_init(bool numa);
+def llama_backend_init(numa: c_bool):
+    return _lib.llama_backend_init(numa)
+
+
+_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.restype = None
+
 
+# // Call once at the end of the program - currently only used for MPI
+# LLAMA_API void llama_backend_free();
+def llama_backend_free():
+    return _lib.llama_backend_free()
 
-_lib.llama_init_backend.argtypes = [c_bool]
-_lib.llama_init_backend.restype = None
+
+_lib.llama_backend_free.argtypes = []
+_lib.llama_backend_free.restype = None
 
 
 # LLAMA_API struct llama_model * llama_load_model_from_file(
@@ -648,6 +664,22 @@ def llama_tokenize(
 _lib.llama_tokenize.restype = c_int
 
 
+# LLAMA_API int llama_tokenize_with_model(
+#     const struct llama_model * model,
+#     const char * text,
+#     llama_token * tokens,
+#     int n_max_tokens,
+#     bool add_bos);
+def llama_tokenize_with_model(
+    model: llama_model_p,
+    text: bytes,
+    tokens,  # type: Array[llama_token]
+    n_max_tokens: c_int,
+    add_bos: c_bool,
+) -> int:
+    return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos)
+
+
 # LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
 def llama_n_vocab(ctx: llama_context_p) -> int:
     return _lib.llama_n_vocab(ctx)
@@ -675,6 +707,33 @@ def llama_n_embd(ctx: llama_context_p) -> int:
 _lib.llama_n_embd.restype = c_int
 
 
+# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+def llama_n_vocab_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_vocab_from_model(model)
+
+
+_lib.llama_n_vocab_from_model.argtypes = [llama_model_p]
+_lib.llama_n_vocab_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+def llama_n_ctx_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_ctx_from_model(model)
+
+
+_lib.llama_n_ctx_from_model.argtypes = [llama_model_p]
+_lib.llama_n_ctx_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+def llama_n_embd_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_embd_from_model(model)
+
+
+_lib.llama_n_embd_from_model.argtypes = [llama_model_p]
+_lib.llama_n_embd_from_model.restype = c_int
+
+
 # // Get the vocabulary as output parameters.
 # // Returns number of results.
 # LLAMA_API int llama_get_vocab(
@@ -695,6 +754,20 @@ def llama_get_vocab(
 _lib.llama_get_vocab.restype = c_int
 
 
+# LLAMA_API int llama_get_vocab_from_model(
+#     const struct llama_model * model,
+#     const char * * strings,
+#     float * scores,
+#     int capacity);
+def llama_get_vocab_from_model(
+    model: llama_model_p,
+    strings,  # type: Array[c_char_p] # type: ignore
+    scores,  # type: Array[c_float] # type: ignore
+    capacity: c_int,
+) -> int:
+    return _lib.llama_get_vocab_from_model(model, strings, scores, capacity)
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
@@ -724,15 +797,28 @@ def llama_get_embeddings(
 _lib.llama_get_embeddings.restype = c_float_p
 
 
-# Token Id -> String. Uses the vocabulary in the provided context
-# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+# // Token Id -> String. Uses the vocabulary in the provided context
+# LLAMA_API const char * llama_token_to_str(
+#     const struct llama_context * ctx,
+#     llama_token token);
 def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
     return _lib.llama_token_to_str(ctx, token)
 
 
 _lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]
 _lib.llama_token_to_str.restype = c_char_p
 
+
+# LLAMA_API const char * llama_token_to_str_with_model(
+#     const struct llama_model * model,
+#     llama_token token);
+def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes:
+    return _lib.llama_token_to_str_with_model(model, token)
+
+
+_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token]
+_lib.llama_token_to_str_with_model.restype = c_char_p
+
 # Special tokens
 
 
@@ -821,6 +907,39 @@ def llama_sample_frequency_and_presence_penalties(
 _lib.llama_sample_frequency_and_presence_penalties.restype = None
 
 
+# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+# LLAMA_API void llama_sample_classifier_free_guidance(
+#     struct llama_context * ctx,
+#     llama_token_data_array * candidates,
+#     struct llama_context * guidance_ctx,
+#     float scale,
+#     float smooth_factor);
+def llama_sample_classifier_free_guidance(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    guidance_ctx: llama_context_p,
+    scale: c_float,
+    smooth_factor: c_float,
+):
+    return _lib.llama_sample_classifier_free_guidance(
+        ctx, candidates, guidance_ctx, scale, smooth_factor
+    )
+
+
+_lib.llama_sample_classifier_free_guidance.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_context_p,
+    c_float,
+    c_float,
+]
+_lib.llama_sample_classifier_free_guidance.restype = None
+
+
 # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_softmax(
@@ -1065,5 +1184,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False
 
 if not _llama_initialized:
-    llama_init_backend(c_bool(False))
+    llama_backend_init(c_bool(False))
     _llama_initialized = True
```
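
At the low-level bindings, the renamed backend functions and the new RoPE fields can be exercised directly through ctypes. The sketch below is not part of this commit; it assumes `llama_context_default_params`, `llama_load_model_from_file`, `llama_new_context_with_model`, `llama_free`, and `llama_free_model` are bound as in upstream llama.cpp of this period, and uses a hypothetical model path:

```python
import ctypes
import llama_cpp

# Module import already calls llama_backend_init once; repeated here for completeness.
llama_cpp.llama_backend_init(ctypes.c_bool(False))  # numa=False

params = llama_cpp.llama_context_default_params()
params.rope_freq_base = 10000.0   # new fields from this merge (ref: llama.cpp PR #2054)
params.rope_freq_scale = 0.5      # e.g. scale RoPE frequencies to stretch the context

model = llama_cpp.llama_load_model_from_file(b"/models/ggml-model-q4_0.bin", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)

# The *_from_model / *_with_model helpers operate on the model alone, without a context.
print(llama_cpp.llama_n_vocab_from_model(model))
print(llama_cpp.llama_token_to_str_with_model(model, llama_cpp.llama_token(1)))

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()
```

`llama_tokenize_with_model` follows the same pattern, taking the model pointer plus a preallocated ctypes token buffer instead of a context.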

0 commit comments