Update llama.cpp · coderonion/llama-cpp-python@9f1e565 · GitHub

Commit 9f1e565

Update llama.cpp

1 parent 213cc5c · commit 9f1e565

2 files changed: +40 additions, -9 deletions

llama_cpp/llama_cpp.py

Lines changed: 39 additions & 8 deletions
@@ -1,9 +1,21 @@
 import sys
 import os
 import ctypes
-from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array, c_uint8, c_size_t
+from ctypes import (
+    c_int,
+    c_float,
+    c_char_p,
+    c_void_p,
+    c_bool,
+    POINTER,
+    Structure,
+    Array,
+    c_uint8,
+    c_size_t,
+)
 import pathlib
 
+
 # Load the library
 def _load_shared_library(lib_base_name):
     # Determine the file extension based on the platform
@@ -22,10 +34,10 @@ def _load_shared_library(lib_base_name):
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths = [
         _base_path / f"lib{lib_base_name}{lib_ext}",
-        _base_path / f"{lib_base_name}{lib_ext}"
+        _base_path / f"{lib_base_name}{lib_ext}",
     ]
 
-    if ("LLAMA_CPP_LIB" in os.environ):
+    if "LLAMA_CPP_LIB" in os.environ:
         lib_base_name = os.environ["LLAMA_CPP_LIB"]
         _lib = pathlib.Path(lib_base_name)
         _base_path = _lib.parent.resolve()
@@ -43,7 +55,10 @@ def _load_shared_library(lib_base_name):
         except Exception as e:
             raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
 
-    raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found")
+    raise FileNotFoundError(
+        f"Shared library with base name '{lib_base_name}' not found"
+    )
+
 
 # Specify the base name of the shared library to load
 _lib_base_name = "llama"
@@ -95,6 +110,10 @@ class llama_context_params(Structure):
 
 llama_context_params_p = POINTER(llama_context_params)
 
+LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0)
+LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1)  # except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2)  # except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3)  # except 1d tensors
 
 # Functions
 
@@ -106,18 +125,23 @@ def llama_context_default_params() -> llama_context_params:
 _lib.llama_context_default_params.argtypes = []
 _lib.llama_context_default_params.restype = llama_context_params
 
+
 def llama_mmap_supported() -> c_bool:
     return _lib.llama_mmap_supported()
 
+
 _lib.llama_mmap_supported.argtypes = []
 _lib.llama_mmap_supported.restype = c_bool
 
+
 def llama_mlock_supported() -> c_bool:
     return _lib.llama_mlock_supported()
 
+
 _lib.llama_mlock_supported.argtypes = []
 _lib.llama_mlock_supported.restype = c_bool
 
+
 # Various functions for loading a ggml llama model.
 # Allocate (almost) all memory needed for the model.
 # Return NULL on failure
@@ -142,42 +166,49 @@ def llama_free(ctx: llama_context_p):
 
 # TODO: not great API - very likely to change
 # Returns 0 on success
-def llama_model_quantize(
-    fname_inp: bytes, fname_out: bytes, itype: c_int
-) -> c_int:
+def llama_model_quantize(fname_inp: bytes, fname_out: bytes, itype: c_int) -> c_int:
     return _lib.llama_model_quantize(fname_inp, fname_out, itype)
 
 
 _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int]
 _lib.llama_model_quantize.restype = c_int
 
+
 # Returns the KV cache that will contain the context for the
 # ongoing prediction with the model.
 def llama_get_kv_cache(ctx: llama_context_p):
     return _lib.llama_get_kv_cache(ctx)
 
+
 _lib.llama_get_kv_cache.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache.restype = POINTER(c_uint8)
 
+
 # Returns the size of the KV cache
 def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t:
     return _lib.llama_get_kv_cache_size(ctx)
 
+
 _lib.llama_get_kv_cache_size.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache_size.restype = c_size_t
 
+
 # Returns the number of tokens in the KV cache
 def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int:
     return _lib.llama_get_kv_cache_token_count(ctx)
 
+
 _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache_token_count.restype = c_int
 
 
 # Sets the KV cache containing the current context for the model
-def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int):
+def llama_set_kv_cache(
+    ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int
+):
     return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count)
 
+
 _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int]
 _lib.llama_set_kv_cache.restype = None
 

vendor/llama.cpp (1 addition, 1 deletion: submodule commit pointer updated)
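
For reference, a minimal usage sketch of the bindings touched in this commit. It assumes the package is importable as llama_cpp.llama_cpp and uses only functions and constants defined in llama_cpp.py above; the library and model file paths are hypothetical placeholders.

import ctypes
import os

# Optional: point the loader at a specific shared library before importing the
# bindings; as the diff shows, _load_shared_library checks LLAMA_CPP_LIB first.
# os.environ["LLAMA_CPP_LIB"] = "/path/to/libllama.so"  # hypothetical path

import llama_cpp.llama_cpp as llama_cpp

# Feature probes exposed by the bindings (restype c_bool -> Python bool).
print("mmap supported: ", bool(llama_cpp.llama_mmap_supported()))
print("mlock supported:", bool(llama_cpp.llama_mlock_supported()))

# Quantize an F16 GGML model to Q4_0 using one of the new LLAMA_FTYPE_*
# constants; llama_model_quantize returns 0 on success. Both paths are
# hypothetical and must point at real files.
ret = llama_cpp.llama_model_quantize(
    b"./models/7B/ggml-model-f16.bin",
    b"./models/7B/ggml-model-q4_0.bin",
    llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_0,
)
if ret != 0:
    raise RuntimeError(f"quantization failed with status {ret}")

# KV-cache snapshot/restore round trip, assuming ctx is an existing
# llama_context_p created elsewhere (context creation is not part of this
# diff), so the lines below are left commented:
# n_size = llama_cpp.llama_get_kv_cache_size(ctx)
# n_tokens = llama_cpp.llama_get_kv_cache_token_count(ctx)
# snapshot = (llama_cpp.c_uint8 * n_size)()
# ctypes.memmove(snapshot, llama_cpp.llama_get_kv_cache(ctx), n_size)
# ... run some evaluation, then restore the saved cache ...
# llama_cpp.llama_set_kv_cache(ctx, snapshot, n_size, n_tokens)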
