Add split_mode option. Closes #1085 · sjanaX01/llama-cpp-python@84615ad · GitHub

Commit 84615ad

Add split_mode option. Closes abetlen#1085

1 parent 76aafa6 · commit 84615ad

File tree

2 files changed: +10 -1 lines changed


llama_cpp/llama.py

Lines changed: 6 additions & 1 deletion
@@ -730,6 +730,7 @@ def __init__(
         *,
         # Model Params
         n_gpu_layers: int = 0,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
         vocab_only: bool = False,
@@ -799,7 +800,8 @@ def __init__(
         Args:
             model_path: Path to the model.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
-            main_gpu: The GPU that is used for scratch and small tensors.
+            split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
+            main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
@@ -850,6 +852,7 @@ def __init__(
         self.model_params.n_gpu_layers = (
             0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+        self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
         self.tensor_split = tensor_split
         self._c_tensor_split = None
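
The three hunks above complete the constructor side of the change: the new keyword argument, its docstring entry, and the assignment into model_params. A minimal usage sketch, assuming a multi-GPU build of llama.cpp; the model path and the two-way tensor_split are hypothetical, and only parameters visible in this diff are used:

import llama_cpp
from llama_cpp import Llama

llm = Llama(
    model_path="models/7b.Q4_K_M.gguf",    # hypothetical path
    n_gpu_layers=-1,                       # offload all layers to GPU
    split_mode=llama_cpp.LLAMA_SPLIT_ROW,  # split tensors row-wise across GPUs
    main_gpu=0,               # with LLAMA_SPLIT_ROW: GPU for small tensors
                              # and intermediate results
    tensor_split=[0.5, 0.5],  # assumed two-GPU machine, even split
)

Per the new docstring, the default LLAMA_SPLIT_LAYER distributes whole layers across GPUs and ignores main_gpu, while LLAMA_SPLIT_NONE loads the entire model on main_gpu.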
@@ -2173,6 +2176,7 @@ def __getstate__(self):
             model_path=self.model_path,
             # Model Params
             n_gpu_layers=self.model_params.n_gpu_layers,
+            split_mode=self.model_params.split_mode,
             main_gpu=self.model_params.main_gpu,
             tensor_split=self.tensor_split,
             vocab_only=self.model_params.vocab_only,
@@ -2216,6 +2220,7 @@ def __setstate__(self, state):
             model_path=state["model_path"],
             # Model Params
             n_gpu_layers=state["n_gpu_layers"],
+            split_mode=state["split_mode"],
             main_gpu=state["main_gpu"],
             tensor_split=state["tensor_split"],
             vocab_only=state["vocab_only"],
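
These two hunks thread split_mode through __getstate__/__setstate__, so a pickled Llama is reconstructed with the same split strategy. A round-trip sketch, assuming the llm instance from the earlier sketch; note that unpickling re-runs __init__ and therefore reloads the model from model_path:

import pickle

# Serializing captures the constructor arguments, including split_mode;
# deserializing passes the saved state back through __init__.
state = pickle.dumps(llm)
llm2 = pickle.loads(state)
assert llm2.model_params.split_mode == llm.model_params.split_mode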

llama_cpp/server/settings.py

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,10 @@ class ModelSettings(BaseSettings):
         ge=-1,
         description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
     )
+    split_mode: int = Field(
+        default=llama_cpp.LLAMA_SPLIT_LAYER,
+        description="The split mode to use.",
+    )
     main_gpu: int = Field(
         default=0,
         ge=0,
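
The server mirrors the new constructor argument as a ModelSettings field, so it can be supplied the same way as n_gpu_layers or main_gpu. A hedged sketch of configuring it programmatically; the model path is hypothetical, ModelSettings' required model field is not shown in this hunk, and supplying the field as a CLI flag or environment variable is an assumption based on how the other pydantic settings fields behave:

import llama_cpp
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="models/7b.Q4_K_M.gguf",           # hypothetical path
    n_gpu_layers=-1,                         # existing field shown above
    split_mode=llama_cpp.LLAMA_SPLIT_LAYER,  # new field; the default split mode
)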
