Merge pull request #460 from shouyiwang/tensor_split · callMeMakerRen/llama-cpp-python@82b11c8 · GitHub

Commit 82b11c8

Merge pull request abetlen#460 from shouyiwang/tensor_split
Add support for llama.cpp's --tensor-split parameter
2 parents 6705f9b + 579f526 commit 82b11c8

File tree

llama_cpp/llama.py
llama_cpp/server/app.py

2 files changed, +20 -2 lines changed

llama_cpp/llama.py

Lines changed: 12 additions & 1 deletion
@@ -19,14 +19,14 @@
 from collections import deque, OrderedDict
 
 import diskcache
+import ctypes
 
 from . import llama_cpp
 from .llama_types import *
 
 import numpy as np
 import numpy.typing as npt
 
-
 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""
 

@@ -207,6 +207,7 @@ def __init__(
         n_ctx: int = 512,
         n_parts: int = -1,
         n_gpu_layers: int = 0,
+        tensor_split: list[float] = None,
         seed: int = 1337,
         f16_kv: bool = True,
         logits_all: bool = False,

@@ -248,12 +249,20 @@ def __init__(
         Returns:
             A Llama instance.
         """
+        if tensor_split is None:
+            tensor_split = [0.0] * llama_cpp.LLAMA_MAX_DEVICES.value
+
+        #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
+        FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value
+        c_tensor_split = FloatArray(*tensor_split)
+
         self.verbose = verbose
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
         self.params.n_gpu_layers = n_gpu_layers
+        self.params.tensor_split = c_tensor_split
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
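
For intuition, the conversion in the hunk above leans on a ctypes detail: a fixed-size c_float array initialized with fewer values than its declared length is zero-filled for the remaining slots. A minimal standalone sketch of that behavior, assuming a device count of 2 as an illustrative stand-in for LLAMA_MAX_DEVICES.value (the real value is exported by the compiled llama.cpp library the package wraps):

import ctypes

# Stand-in for llama_cpp.LLAMA_MAX_DEVICES.value (illustrative only).
MAX_DEVICES = 2
FloatArray = ctypes.c_float * MAX_DEVICES

print(list(FloatArray(*[0.75])))        # [0.75, 0.0] -- a short list is zero-padded
print(list(FloatArray(*[0.75, 0.25])))  # [0.75, 0.25] -- one proportion per device
# Passing more values than MAX_DEVICES raises an error at construction time,
# so callers should keep tensor_split at or below the device limit.
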
@@ -1490,6 +1499,7 @@ def __getstate__(self):
             model_path=self.model_path,
             n_ctx=self.params.n_ctx,
             n_gpu_layers=self.params.n_gpu_layers,
+            tensor_split=self.params.tensor_split,
             seed=self.params.seed,
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,

@@ -1514,6 +1524,7 @@ def __setstate__(self, state):
             n_ctx=state["n_ctx"],
             n_parts=state["n_parts"],
             n_gpu_layers=state["n_gpu_layers"],
+            tensor_split=state["tensor_split"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
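
Taken together, the llama.py changes let a caller split offloaded tensors across multiple GPUs directly from the Python API. A hedged usage sketch, assuming a two-GPU machine; the model path, layer count, and proportions below are placeholders, not values from the commit:

from llama_cpp import Llama

llm = Llama(
    model_path="./models/7B/ggml-model.bin",  # placeholder path to a local GGML model
    n_gpu_layers=40,                           # offload layers to the GPUs
    tensor_split=[0.75, 0.25],                 # relative share of tensors for GPU 0 and GPU 1
)
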

llama_cpp/server/app.py

Lines changed: 8 additions & 1 deletion
@@ -31,7 +31,13 @@ class Settings(BaseSettings):
         ge=0,
         description="The number of layers to put on the GPU. The rest will be on the CPU.",
     )
-    seed: int = Field(default=1337, description="Random seed. -1 for random.")
+    tensor_split: List[float] = Field(
+        default=None,
+        description="Split layers across multiple GPUs in proportion.",
+    )
+    seed: int = Field(
+        default=1337, description="Random seed. -1 for random."
+    )
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )

@@ -111,6 +117,7 @@ def create_app(settings: Optional[Settings] = None):
     llama = llama_cpp.Llama(
         model_path=settings.model,
         n_gpu_layers=settings.n_gpu_layers,
+        tensor_split=settings.tensor_split,
         seed=settings.seed,
         f16_kv=settings.f16_kv,
         use_mlock=settings.use_mlock,
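
On the server side, the new tensor_split setting is forwarded straight into the Llama constructor above. A minimal sketch of driving it programmatically, assuming a placeholder model path and split values (the same field can also be supplied through the server's settings mechanism, since Settings is a pydantic BaseSettings):

from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="./models/7B/ggml-model.bin",  # placeholder path; required by Settings
    n_gpu_layers=40,
    tensor_split=[0.75, 0.25],           # proportion of tensors per GPU
)
app = create_app(settings)
# Serve `app` with any ASGI server, e.g. uvicorn.
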

0 commit comments