Update llama.cpp · chiensen/llama-cpp-python@fa83cc5

Commit fa83cc5

Update llama.cpp
* Fix build examples
* Exclude examples directory
* Revert cmake changes
* Try actions/checkout@v4
* Try to update submodules
* Revert
1 parent ddbd10c commit fa83cc5

File tree

5 files changed: +145 −39 lines

.github/workflows/test.yaml

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ jobs:
         python-version: ["3.8", "3.9", "3.10", "3.11"]
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
       with:
         submodules: "true"
     - name: Set up Python ${{ matrix.python-version }}

llama_cpp/llama.py

Lines changed: 47 additions & 11 deletions
@@ -230,8 +230,14 @@ def __init__(
         n_batch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
+        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED,
         rope_freq_base: float = 0.0,
         rope_freq_scale: float = 0.0,
+        yarn_ext_factor: float = float("nan"),
+        yarn_attn_factor: float = 1.0,
+        yarn_beta_fast: float = 32.0,
+        yarn_beta_slow: float = 1.0,
+        yarn_orig_ctx: int = 0,
         mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
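Taken together, these new keyword arguments expose RoPE scaling control through the high-level API. A minimal sketch of how they might be used; the model path and the concrete values below are illustrative assumptions, not part of this commit:

    import llama_cpp
    from llama_cpp import Llama

    # Hypothetical usage: request YaRN-style RoPE scaling for an extended context.
    llm = Llama(
        model_path="./models/example-7b.Q4_K_M.gguf",  # placeholder path
        n_ctx=8192,                                    # requested context size
        rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_YARN,
        yarn_orig_ctx=4096,    # assumed original training context of the model
        yarn_ext_factor=1.0,   # explicit value; the default float("nan") defers to the model
        yarn_attn_factor=1.0,
        yarn_beta_fast=32.0,
        yarn_beta_slow=1.0,
    )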
@@ -255,30 +261,30 @@ def __init__(
 
         Args:
             model_path: Path to the model.
-            seed: Random seed. -1 for random.
-            n_ctx: Maximum context size.
-            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
-            main_gpu: Main GPU to use.
-            tensor_split: Optional list of floats to split the model across multiple GPUs. If None, the model is not split.
+            main_gpu: The GPU that is used for scratch and small tensors.
+            tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            vocab_only: Only load the vocabulary no weights.
+            use_mmap: Use mmap if possible.
+            use_mlock: Force the system to keep the model in RAM.
+            seed: Random seed. -1 for random.
+            n_ctx: Context size.
+            n_batch: Batch size for prompt processing (must be >= 32 to use BLAS)
+            n_threads: Number of threads to use. If None, the number of threads is automatically determined.
+            n_threads_batch: Number of threads to use for batch processing. If None, use n_threads.
+            rope_scaling_type: Type of rope scaling to use.
             rope_freq_base: Base frequency for rope sampling.
             rope_freq_scale: Scale factor for rope sampling.
-            low_vram: Use low VRAM mode.
             mul_mat_q: if true, use experimental mul_mat_q kernels
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
-            vocab_only: Only load the vocabulary no weights.
-            use_mmap: Use mmap if possible.
-            use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. If None, the number of threads is automatically determined.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
             numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             chat_format: String specifying the chat format to use when calling create_chat_completion.
             verbose: Print verbose output to stderr.
-            kwargs: Unused keyword arguments (for additional backwards compatibility).
 
         Raises:
             ValueError: If the model path does not exist.
@@ -332,12 +338,30 @@ def __init__(
         self.context_params.n_batch = self.n_batch
         self.context_params.n_threads = self.n_threads
         self.context_params.n_threads_batch = self.n_threads_batch
+        self.context_params.rope_scaling_type = (
+            rope_scaling_type if rope_scaling_type is not None else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
+        )
         self.context_params.rope_freq_base = (
             rope_freq_base if rope_freq_base != 0.0 else 0
         )
         self.context_params.rope_freq_scale = (
             rope_freq_scale if rope_freq_scale != 0.0 else 0
         )
+        self.context_params.yarn_ext_factor = (
+            yarn_ext_factor if yarn_ext_factor != 0.0 else 0
+        )
+        self.context_params.yarn_attn_factor = (
+            yarn_attn_factor if yarn_attn_factor != 0.0 else 0
+        )
+        self.context_params.yarn_beta_fast = (
+            yarn_beta_fast if yarn_beta_fast != 0.0 else 0
+        )
+        self.context_params.yarn_beta_slow = (
+            yarn_beta_slow if yarn_beta_slow != 0.0 else 0
+        )
+        self.context_params.yarn_orig_ctx = (
+            yarn_orig_ctx if yarn_orig_ctx != 0 else 0
+        )
         self.context_params.mul_mat_q = mul_mat_q
         self.context_params.f16_kv = f16_kv
         self.context_params.logits_all = logits_all
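Note that yarn_ext_factor defaults to float("nan") in the new signature, and the struct comment in llama_cpp.py below documents NaN as "from model", so leaving it unset defers to the model file while an explicit float overrides it. A small sketch of that distinction (path and value are illustrative):

    import math
    from llama_cpp import Llama

    # With the default yarn_ext_factor=float("nan") the value is taken from the model;
    # passing a number such as 0.5 would override it.
    llm = Llama(model_path="./models/example-7b.Q4_K_M.gguf")
    print(math.isnan(llm.context_params.yarn_ext_factor))  # True unless overridden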
@@ -1671,8 +1695,14 @@ def __getstate__(self):
             n_batch=self.n_batch,
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
+            rope_scaling_type=self.context_params.rope_scaling_type,
             rope_freq_base=self.context_params.rope_freq_base,
             rope_freq_scale=self.context_params.rope_freq_scale,
+            yarn_ext_factor=self.context_params.yarn_ext_factor,
+            yarn_attn_factor=self.context_params.yarn_attn_factor,
+            yarn_beta_fast=self.context_params.yarn_beta_fast,
+            yarn_beta_slow=self.context_params.yarn_beta_slow,
+            yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             mul_mat_q=self.context_params.mul_mat_q,
             f16_kv=self.context_params.f16_kv,
             logits_all=self.context_params.logits_all,
@@ -1709,6 +1739,12 @@ def __setstate__(self, state):
             n_threads_batch=state["n_threads_batch"],
             rope_freq_base=state["rope_freq_base"],
             rope_freq_scale=state["rope_freq_scale"],
+            rope_scaling_type=state["rope_scaling_type"],
+            yarn_ext_factor=state["yarn_ext_factor"],
+            yarn_attn_factor=state["yarn_attn_factor"],
+            yarn_beta_fast=state["yarn_beta_fast"],
+            yarn_beta_slow=state["yarn_beta_slow"],
+            yarn_orig_ctx=state["yarn_orig_ctx"],
             mul_mat_q=state["mul_mat_q"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
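Since __getstate__ and __setstate__ now carry the RoPE/YaRN fields, pickling a Llama instance round-trips them. A sketch, reusing the llm object from the earlier example and assuming model_path is still readable (unpickling rebuilds the wrapper by reloading the model):

    import pickle

    # Serialize and restore; the restored instance is reconstructed with the same
    # rope_scaling_type and YaRN parameters captured in __getstate__.
    restored = pickle.loads(pickle.dumps(llm))
    assert restored.context_params.rope_scaling_type == llm.context_params.rope_scaling_type
    assert restored.context_params.yarn_orig_ctx == llm.context_params.yarn_orig_ctx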

llama_cpp/llama_cpp.py

Lines changed: 26 additions & 2 deletions
@@ -192,6 +192,18 @@ def _load_shared_library(lib_base_name: str):
 LLAMA_FTYPE_MOSTLY_Q6_K = 18
 LLAMA_FTYPE_GUESSED = 1024
 
+# enum llama_rope_scaling_type {
+#     LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+#     LLAMA_ROPE_SCALING_NONE = 0,
+#     LLAMA_ROPE_SCALING_LINEAR = 1,
+#     LLAMA_ROPE_SCALING_YARN = 2,
+#     LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
+# };
+LLAMA_ROPE_SCALING_UNSPECIFIED = -1
+LLAMA_ROPE_SCALING_NONE = 0
+LLAMA_ROPE_SCALING_LINEAR = 1
+LLAMA_ROPE_SCALING_YARN = 2
+LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
 
 # typedef struct llama_token_data {
 #     llama_token id; // token id
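These module-level constants mirror the C enum one-to-one, so callers can map user-facing names onto them. A hypothetical helper (the dictionary and its string keys are illustrative, not part of this commit):

    import llama_cpp

    # Hypothetical mapping from config strings to the new enum values.
    ROPE_SCALING_TYPES = {
        "none": llama_cpp.LLAMA_ROPE_SCALING_NONE,
        "linear": llama_cpp.LLAMA_ROPE_SCALING_LINEAR,
        "yarn": llama_cpp.LLAMA_ROPE_SCALING_YARN,
    }
    scaling_type = ROPE_SCALING_TYPES.get("yarn", llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED)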
@@ -308,10 +320,16 @@ class llama_model_params(Structure):
 #     uint32_t n_batch;         // prompt processing maximum batch size
 #     uint32_t n_threads;       // number of threads to use for generation
 #     uint32_t n_threads_batch; // number of threads to use for batch processing
+#     int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
 #     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-#     float rope_freq_base;  // RoPE base frequency, 0 = from model
-#     float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+#     float    rope_freq_base;   // RoPE base frequency, 0 = from model
+#     float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+#     float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+#     float    yarn_attn_factor; // YaRN magnitude scaling factor
+#     float    yarn_beta_fast;   // YaRN low correction dim
+#     float    yarn_beta_slow;   // YaRN high correction dim
+#     uint32_t yarn_orig_ctx;    // YaRN original context size
 
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
@@ -327,8 +345,14 @@ class llama_context_params(Structure):
         ("n_batch", c_uint32),
         ("n_threads", c_uint32),
         ("n_threads_batch", c_uint32),
+        ("rope_scaling_type", c_int8),
         ("rope_freq_base", c_float),
         ("rope_freq_scale", c_float),
+        ("yarn_ext_factor", c_float),
+        ("yarn_attn_factor", c_float),
+        ("yarn_beta_fast", c_float),
+        ("yarn_beta_slow", c_float),
+        ("yarn_orig_ctx", c_uint32),
         ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),

llama_cpp/server/app.py

Lines changed: 70 additions & 24 deletions
@@ -41,11 +41,7 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
-    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
-    n_ctx: int = Field(default=2048, ge=1, description="The context size.")
-    n_batch: int = Field(
-        default=512, ge=1, description="The batch size to use per eval."
-    )
+    # Model Params
     n_gpu_layers: int = Field(
         default=0,
         ge=-1,
@@ -60,17 +56,6 @@ class Settings(BaseSettings):
         default=None,
         description="Split layers across multiple GPUs in proportion.",
     )
-    rope_freq_base: float = Field(
-        default=0.0, description="RoPE base frequency"
-    )
-    rope_freq_scale: float = Field(
-        default=0.0, description="RoPE frequency scaling factor"
-    )
-    mul_mat_q: bool = Field(
-        default=True, description="if true, use experimental mul_mat_q kernels"
-    )
-    f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
-    logits_all: bool = Field(default=True, description="Whether to return logits.")
     vocab_only: bool = Field(
         default=False, description="Whether to only return the vocabulary."
     )
@@ -82,17 +67,59 @@ class Settings(BaseSettings):
         default=llama_cpp.llama_mlock_supported(),
         description="Use mlock.",
     )
-    embedding: bool = Field(default=True, description="Whether to use embeddings.")
+    # Context Params
+    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
+    n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_batch: int = Field(
+        default=512, ge=1, description="The batch size to use per eval."
+    )
     n_threads: int = Field(
         default=max(multiprocessing.cpu_count() // 2, 1),
         ge=1,
         description="The number of threads to use.",
     )
+    n_threads_batch: int = Field(
+        default=max(multiprocessing.cpu_count() // 2, 1),
+        ge=0,
+        description="The number of threads to use when batch processing.",
+    )
+    rope_scaling_type: int = Field(
+        default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
+    )
+    rope_freq_base: float = Field(
+        default=0.0, description="RoPE base frequency"
+    )
+    rope_freq_scale: float = Field(
+        default=0.0, description="RoPE frequency scaling factor"
+    )
+    yarn_ext_factor: float = Field(
+        default=float("nan")
+    )
+    yarn_attn_factor: float = Field(
+        default=1.0
+    )
+    yarn_beta_fast: float = Field(
+        default=32.0
+    )
+    yarn_beta_slow: float = Field(
+        default=1.0
+    )
+    yarn_orig_ctx: int = Field(
+        default=0
+    )
+    mul_mat_q: bool = Field(
+        default=True, description="if true, use experimental mul_mat_q kernels"
+    )
+    f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    embedding: bool = Field(default=True, description="Whether to use embeddings.")
+    # Sampling Params
     last_n_tokens_size: int = Field(
         default=64,
         ge=0,
         description="Last n tokens to keep for repeat penalty calculation.",
     )
+    # LoRA Params
     lora_base: Optional[str] = Field(
         default=None,
         description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
@@ -101,14 +128,17 @@ class Settings(BaseSettings):
         default=None,
         description="Path to a LoRA file to apply to the model.",
     )
+    # Backend Params
     numa: bool = Field(
         default=False,
         description="Enable NUMA support.",
     )
+    # Chat Format Params
     chat_format: str = Field(
         default="llama-2",
         description="Chat format to use.",
     )
+    # Cache Params
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
@@ -121,9 +151,11 @@ class Settings(BaseSettings):
         default=2 << 30,
         description="The size of the cache in bytes. Only used if cache is True.",
     )
+    # Misc
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
    )
+    # Server Params
     host: str = Field(default="localhost", description="Listen address")
     port: int = Field(default=8000, description="Listen port")
     interrupt_requests: bool = Field(
@@ -345,27 +377,41 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
-        seed=settings.seed,
-        n_ctx=settings.n_ctx,
-        n_batch=settings.n_batch,
+        # Model Params
         n_gpu_layers=settings.n_gpu_layers,
         main_gpu=settings.main_gpu,
         tensor_split=settings.tensor_split,
+        vocab_only=settings.vocab_only,
+        use_mmap=settings.use_mmap,
+        use_mlock=settings.use_mlock,
+        # Context Params
+        seed=settings.seed,
+        n_ctx=settings.n_ctx,
+        n_batch=settings.n_batch,
+        n_threads=settings.n_threads,
+        n_threads_batch=settings.n_threads_batch,
+        rope_scaling_type=settings.rope_scaling_type,
         rope_freq_base=settings.rope_freq_base,
         rope_freq_scale=settings.rope_freq_scale,
+        yarn_ext_factor=settings.yarn_ext_factor,
+        yarn_attn_factor=settings.yarn_attn_factor,
+        yarn_beta_fast=settings.yarn_beta_fast,
+        yarn_beta_slow=settings.yarn_beta_slow,
+        yarn_orig_ctx=settings.yarn_orig_ctx,
         mul_mat_q=settings.mul_mat_q,
         f16_kv=settings.f16_kv,
         logits_all=settings.logits_all,
-        vocab_only=settings.vocab_only,
-        use_mmap=settings.use_mmap,
-        use_mlock=settings.use_mlock,
         embedding=settings.embedding,
-        n_threads=settings.n_threads,
+        # Sampling Params
         last_n_tokens_size=settings.last_n_tokens_size,
+        # LoRA Params
         lora_base=settings.lora_base,
         lora_path=settings.lora_path,
+        # Backend Params
         numa=settings.numa,
+        # Chat Format Params
         chat_format=settings.chat_format,
+        # Misc
         verbose=settings.verbose,
     )
     if settings.cache:
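With the reorganized Settings class, the new parameters can also be supplied when building the server app programmatically. A sketch under assumptions: the model path and values are illustrative, and serving via uvicorn is assumed, as that is how this server is normally run:

    import uvicorn
    from llama_cpp.server.app import Settings, create_app

    # Build the app with explicit RoPE/YaRN settings and serve it.
    settings = Settings(
        model="./models/example-7b.Q4_K_M.gguf",  # placeholder path
        n_ctx=8192,
        rope_scaling_type=2,  # LLAMA_ROPE_SCALING_YARN
        yarn_orig_ctx=4096,
    )
    app = create_app(settings)
    uvicorn.run(app, host=settings.host, port=settings.port)

The same field names should also be accepted by the python -m llama_cpp.server entry point, which builds its command-line options from the Settings fields (an assumption based on how the server is usually invoked).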

vendor/llama.cpp

Submodule reference updated (the pinned llama.cpp commit is bumped to the version that provides the rope_scaling_type and YaRN context parameters used above).

0 commit comments