llama_cpp/llama.py (6 additions, 1 deletion)
@@ -730,6 +730,7 @@ def __init__(
         *,
         # Model Params
         n_gpu_layers: int = 0,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
         vocab_only: bool = False,
@@ -799,7 +800,8 @@ def __init__(
         Args:
             model_path: Path to the model.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
-            main_gpu: The GPU that is used for scratch and small tensors.
+            split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
+            main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
@@ -850,6 +852,7 @@ def __init__(
         self.model_params.n_gpu_layers = (
             0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+        self.model_params.split_mode = split_mode
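
For context, a minimal usage sketch of the new parameter, assuming a hypothetical local GGUF path; llama_cpp.LLAMA_SPLIT_NONE, LLAMA_SPLIT_LAYER, and LLAMA_SPLIT_ROW are the split-mode constants the docstring refers to:

import llama_cpp
from llama_cpp import Llama

# Offload all layers and split them across GPUs layer by layer
# (the new default). main_gpu is ignored in LLAMA_SPLIT_LAYER mode.
llm = Llama(
    model_path="./models/model.gguf",  # hypothetical path
    n_gpu_layers=-1,
    split_mode=llama_cpp.LLAMA_SPLIT_LAYER,
)

# Alternatively, keep the entire model on one device: with
# LLAMA_SPLIT_NONE, main_gpu selects the GPU used for the whole model.
llm = Llama(
    model_path="./models/model.gguf",
    n_gpu_layers=-1,
    split_mode=llama_cpp.LLAMA_SPLIT_NONE,
    main_gpu=0,
)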