feat: Update llama.cpp · iamlemec/llama-cpp-python@2292af5 · GitHub

Commit 2292af5

feat: Update llama.cpp
1 parent 221edb9 commit 2292af5

File tree

4 files changed: 41 additions and 37 deletions

llama_cpp/llama.py

Lines changed: 6 additions & 6 deletions
@@ -65,7 +65,7 @@ def __init__(
         *,
         # Model Params
         n_gpu_layers: int = 0,
-        split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
         vocab_only: bool = False,
@@ -78,7 +78,7 @@ def __init__(
         n_batch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
-        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED,
+        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         rope_freq_base: float = 0.0,
         rope_freq_scale: float = 0.0,
         yarn_ext_factor: float = -1.0,
@@ -238,13 +238,13 @@ def __init__(
             for i, (k, v) in enumerate(kv_overrides.items()):
                 self._kv_overrides_array[i].key = k.encode("utf-8")
                 if isinstance(v, bool):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
                     self._kv_overrides_array[i].value.bool_value = v
                 elif isinstance(v, int):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
                     self._kv_overrides_array[i].value.int_value = v
                 elif isinstance(v, float):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
                     self._kv_overrides_array[i].value.float_value = v
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
@@ -270,7 +270,7 @@ def __init__(
         self.context_params.rope_scaling_type = (
             rope_scaling_type
             if rope_scaling_type is not None
-            else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
+            else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
         )
         self.context_params.rope_freq_base = (
             rope_freq_base if rope_freq_base != 0.0 else 0
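
For callers, the change is mechanical: each old constant maps one-to-one to its new _MODE_/_TYPE_ spelling. A minimal sketch of constructing a model with the renamed constants; the model path and override keys below are placeholders, not values from this commit:

import llama_cpp
from llama_cpp import Llama

# Sketch only: "./model.gguf" and the kv_overrides keys are hypothetical.
llm = Llama(
    model_path="./model.gguf",
    n_gpu_layers=-1,
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,  # formerly LLAMA_SPLIT_LAYER
    rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
    # bool/int/float values get tagged with LLAMA_KV_OVERRIDE_TYPE_* internally
    kv_overrides={"some.bool.key": True, "some.int.key": 42},
)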

llama_cpp/llama_cpp.py

Lines changed: 28 additions & 28 deletions
@@ -279,35 +279,35 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]
 LLAMA_FTYPE_GUESSED = 1024

 # enum llama_rope_scaling_type {
-#     LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
-#     LLAMA_ROPE_SCALING_NONE = 0,
-#     LLAMA_ROPE_SCALING_LINEAR = 1,
-#     LLAMA_ROPE_SCALING_YARN = 2,
-#     LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
+#     LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
+#     LLAMA_ROPE_SCALING_TYPE_NONE = 0,
+#     LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
+#     LLAMA_ROPE_SCALING_TYPE_YARN = 2,
+#     LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
 # };
-LLAMA_ROPE_SCALING_UNSPECIFIED = -1
-LLAMA_ROPE_SCALING_NONE = 0
-LLAMA_ROPE_SCALING_LINEAR = 1
-LLAMA_ROPE_SCALING_YARN = 2
-LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
+LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1
+LLAMA_ROPE_SCALING_TYPE_NONE = 0
+LLAMA_ROPE_SCALING_TYPE_LINEAR = 1
+LLAMA_ROPE_SCALING_TYPE_YARN = 2
+LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN

 # enum llama_pooling_type {
-#     LLAMA_POOLING_NONE = 0,
-#     LLAMA_POOLING_MEAN = 1,
-#     LLAMA_POOLING_CLS = 2,
+#     LLAMA_POOLING_TYPE_NONE = 0,
+#     LLAMA_POOLING_TYPE_MEAN = 1,
+#     LLAMA_POOLING_TYPE_CLS = 2,
 # };
-LLAMA_POOLING_NONE = 0
-LLAMA_POOLING_MEAN = 1
-LLAMA_POOLING_CLS = 2
+LLAMA_POOLING_TYPE_NONE = 0
+LLAMA_POOLING_TYPE_MEAN = 1
+LLAMA_POOLING_TYPE_CLS = 2

 # enum llama_split_mode {
-#     LLAMA_SPLIT_NONE = 0,  // single GPU
-#     LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
-#     LLAMA_SPLIT_ROW = 2,   // split rows across GPUs
+#     LLAMA_SPLIT_MODE_NONE = 0,  // single GPU
+#     LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+#     LLAMA_SPLIT_MODE_ROW = 2,   // split rows across GPUs
 # };
-LLAMA_SPLIT_NONE = 0
-LLAMA_SPLIT_LAYER = 1
-LLAMA_SPLIT_ROW = 2
+LLAMA_SPLIT_MODE_NONE = 0
+LLAMA_SPLIT_MODE_LAYER = 1
+LLAMA_SPLIT_MODE_ROW = 2


 # typedef struct llama_token_data {
@@ -420,13 +420,13 @@ class llama_batch(ctypes.Structure):


 # enum llama_model_kv_override_type {
-#     LLAMA_KV_OVERRIDE_INT,
-#     LLAMA_KV_OVERRIDE_FLOAT,
-#     LLAMA_KV_OVERRIDE_BOOL,
+#     LLAMA_KV_OVERRIDE_TYPE_INT,
+#     LLAMA_KV_OVERRIDE_TYPE_FLOAT,
+#     LLAMA_KV_OVERRIDE_TYPE_BOOL,
 # };
-LLAMA_KV_OVERRIDE_INT = 0
-LLAMA_KV_OVERRIDE_FLOAT = 1
-LLAMA_KV_OVERRIDE_BOOL = 2
+LLAMA_KV_OVERRIDE_TYPE_INT = 0
+LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1
+LLAMA_KV_OVERRIDE_TYPE_BOOL = 2


 # struct llama_model_kv_override {
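
These constants are module-level names on llama_cpp's public surface, so code still importing the old spellings will fail with an AttributeError after this update. A hypothetical migration shim (not part of this commit) is just a set of aliases:

import llama_cpp

# Hypothetical back-compat aliases (not in this commit): keep the old
# spellings working while downstream code migrates to the new names.
LLAMA_SPLIT_LAYER = llama_cpp.LLAMA_SPLIT_MODE_LAYER
LLAMA_POOLING_MEAN = llama_cpp.LLAMA_POOLING_TYPE_MEAN
LLAMA_ROPE_SCALING_UNSPECIFIED = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
LLAMA_KV_OVERRIDE_BOOL = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL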

llama_cpp/server/settings.py

Lines changed: 6 additions & 2 deletions
@@ -29,7 +29,7 @@ class ModelSettings(BaseSettings):
         description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
     )
     split_mode: int = Field(
-        default=llama_cpp.LLAMA_SPLIT_LAYER,
+        default=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         description="The split mode to use.",
     )
     main_gpu: int = Field(
@@ -74,7 +74,7 @@ class ModelSettings(BaseSettings):
         ge=0,
         description="The number of threads to use when batch processing.",
     )
-    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED)
+    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)
     rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
     rope_freq_scale: float = Field(
         default=0.0, description="RoPE frequency scaling factor"
@@ -143,6 +143,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The HuggingFace repo_id to use to load model files from",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,
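
The new hf_model_repo_id field gives the server a way to point at a Hugging Face repository for model files; how the field is consumed lies outside this diff. A minimal sketch of building the settings with the new field and a renamed constant; the repo id and filename below are placeholders:

import llama_cpp
from llama_cpp.server.settings import ModelSettings

# Sketch only: the repo id and filename are placeholder values, and
# ModelSettings (a pydantic BaseSettings) can also pick these fields up
# from environment variables.
settings = ModelSettings(
    model="llama-2-7b.Q4_K_M.gguf",
    hf_model_repo_id="TheBloke/Llama-2-7B-GGUF",
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
)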

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (git submodule pointer updated)

0 commit comments