Update llama.cpp · YWUTJS/llama-cpp-python@50f5c74 · GitHub
Commit 50f5c74

Update llama.cpp
1 parent 43ba152 commit 50f5c74

File tree

2 files changed: +7 -1 lines changed

llama_cpp/llama_cpp.py

Lines changed: 6 additions & 0 deletions
@@ -648,6 +648,9 @@ class llama_model_kv_override(ctypes.Structure):
 # // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
 # const float * tensor_split;
 
+# // comma separated list of RPC servers to use for offloading
+# const char * rpc_servers;
+
 # // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 # // If the provided progress_callback returns true, model loading continues.
 # // If it returns false, model loading is immediately aborted.
@@ -674,6 +677,7 @@ class llama_model_params(ctypes.Structure):
         split_mode (int): how to split the model across multiple GPUs
         main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
         tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
+        rpc_servers (ctypes.c_char_p): comma separated list of RPC servers to use for offloading
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -687,6 +691,7 @@ class llama_model_params(ctypes.Structure):
    split_mode: int
    main_gpu: int
    tensor_split: CtypesArray[ctypes.c_float]
+   rpc_servers: ctypes.c_char_p
    progress_callback: Callable[[float, ctypes.c_void_p], bool]
    progress_callback_user_data: ctypes.c_void_p
    kv_overrides: CtypesArray[llama_model_kv_override]
@@ -700,6 +705,7 @@ class llama_model_params(ctypes.Structure):
        ("split_mode", ctypes.c_int),
        ("main_gpu", ctypes.c_int32),
        ("tensor_split", ctypes.POINTER(ctypes.c_float)),
+       ("rpc_servers", ctypes.c_char_p),
        ("progress_callback", llama_progress_callback),
        ("progress_callback_user_data", ctypes.c_void_p),
        ("kv_overrides", ctypes.POINTER(llama_model_kv_override)),

vendor/llama.cpp (submodule pointer updated: +1 -1)

0 commit comments