cuda : implement load_tensor with cuFile · ggml-org/llama.cpp@e672602 · GitHub

Commit e672602

cuda : implement load_tensor with cuFile
1 parent d330a0f commit e672602

File tree: 4 files changed, +93 −11 lines

ggml/src/ggml-cuda/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -104,10 +104,10 @@ if (CUDAToolkit_FOUND)
             # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
             target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
         else ()
-            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static CUDA::cuFile_static)
         endif()
     else()
-        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
+        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cuFile)
     endif()
 
     if (GGML_CUDA_NO_VMM)
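
With CUDA::cuFile (or CUDA::cuFile_static) now linked into ggml-cuda, the cuFile header and library must be present in the installed CUDA Toolkit; cuFile is a Linux-only component of the toolkit. A quick standalone probe, not part of this commit and with include/library paths assumed for a typical toolkit layout, can confirm the dependency is available:

    // cufile_probe.cpp — check that cufile.h and libcufile, which the new link
    // dependency requires, are present and that the driver can be opened.
    // Possible build line: g++ cufile_probe.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcufile
    #include <cufile.h>
    #include <cstdio>

    int main() {
        CUfileError_t err = cuFileDriverOpen();
        if (err.err != CU_FILE_SUCCESS) {
            printf("cuFile present but driver open failed (err=%d)\n", (int) err.err);
            return 1;
        }
        printf("cuFile driver opened successfully\n");
        cuFileDriverClose();
        return 0;
    }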

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 71 additions & 0 deletions
@@ -60,6 +60,12 @@
 #include <stdlib.h>
 #include <string>
 #include <vector>
+#include <cufile.h>
+#ifdef _WIN32
+#else
+#include <fcntl.h>
+#include <unistd.h>
+#endif
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
@@ -3410,6 +3416,68 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
     GGML_UNUSED(reg);
 }
 
+static bool ggml_backend_cuda_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
+#ifdef _WIN32
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(tensor);
+    GGML_UNUSED(path);
+    GGML_UNUSED(file_offset);
+    GGML_UNUSED(tensor_offset);
+    GGML_UNUSED(size);
+    return false;
+#else
+    static bool initialized = false;
+    static bool use_cufile = false;
+    if (!initialized) {
+        CUfileError_t err = cuFileDriverOpen();
+        initialized = true;
+        if (err.err != CU_FILE_SUCCESS) {
+            use_cufile = false;
+            return false;
+        }
+        CUfileDrvProps_t props;
+        err = cuFileDriverGetProperties(&props);
+        if (err.err != CU_FILE_SUCCESS) {
+            use_cufile = false;
+            return false;
+        }
+        if (props.nvfs.dcontrolflags & (1 << CU_FILE_ALLOW_COMPAT_MODE)) {
+            // do not use CUfile if the driver is in compatibility mode
+            // as we have faster mechanisms in llama-model-loader
+            use_cufile = false;
+            return false;
+        }
+        use_cufile = true;
+    }
+    if (!use_cufile) {
+        return false;
+    }
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+    ggml_cuda_set_device(ctx->device);
+
+    int fd = open(path, O_RDONLY | O_DIRECT);
+    if (fd < 0) {
+        return false;
+    }
+    CUfileDescr_t cf_descr;
+    CUfileHandle_t cf_handle;
+    memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
+    cf_descr.handle.fd = fd;
+    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
+    CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
+    if (status.err != CU_FILE_SUCCESS) {
+        return false;
+    }
+    ssize_t ret = cuFileRead(cf_handle, (char *)tensor->data, size, file_offset, tensor_offset);
+    if (ret < 0) {
+        return false;
+    }
+    cuFileHandleDeregister(cf_handle);
+    close(fd);
+    return true;
+#endif
+}
+
 static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     GGML_UNUSED(reg);
     if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@@ -3424,6 +3492,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     if (strcmp(name, "ggml_backend_get_features") == 0) {
         return (void *)ggml_backend_cuda_get_features;
     }
+    if (strcmp(name, "ggml_backend_tensor_load") == 0) {
+        return (void *)ggml_backend_cuda_buffer_load_tensor;
+    }
     return nullptr;
 }
 
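
For context, the cuFile sequence introduced above — open the driver once, refuse compatibility mode, register an O_DIRECT file descriptor, then read straight into device memory — can be exercised outside the backend. The following is a minimal standalone sketch, not part of this commit; the file path and read size are placeholders, error handling is reduced to early exits, and it must be built against the CUDA runtime and libcufile (e.g. with nvcc and -lcufile).

    // cufile_read_sketch.cu — standalone sketch of the load path used by
    // ggml_backend_cuda_buffer_load_tensor: read a byte range from a file
    // directly into device memory with GPUDirect Storage (cuFile).
    #include <cufile.h>
    #include <cuda_runtime.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdio>
    #include <cstring>

    int main(int argc, char ** argv) {
        const char * path = argc > 1 ? argv[1] : "model.gguf"; // placeholder path
        const size_t size = 1u << 20;                          // bytes to read (placeholder)

        // 1. open the cuFile driver once per process
        if (cuFileDriverOpen().err != CU_FILE_SUCCESS) {
            fprintf(stderr, "cuFile driver unavailable\n");
            return 1;
        }

        // 2. refuse compatibility mode, as the backend above does: in compat mode
        //    cuFile falls back to POSIX reads, which is slower than llama.cpp's own
        //    host-side loading path.
        CUfileDrvProps_t props;
        if (cuFileDriverGetProperties(&props).err != CU_FILE_SUCCESS ||
            (props.nvfs.dcontrolflags & (1 << CU_FILE_ALLOW_COMPAT_MODE))) {
            fprintf(stderr, "cuFile not usable (error or compatibility mode)\n");
            return 1;
        }

        // 3. open the file with O_DIRECT and register the descriptor with cuFile
        int fd = open(path, O_RDONLY | O_DIRECT);
        if (fd < 0) { perror("open"); return 1; }

        CUfileDescr_t descr;
        memset(&descr, 0, sizeof(descr));
        descr.handle.fd = fd;
        descr.type      = CU_FILE_HANDLE_TYPE_OPAQUE_FD;

        CUfileHandle_t handle;
        if (cuFileHandleRegister(&handle, &descr).err != CU_FILE_SUCCESS) {
            fprintf(stderr, "cuFileHandleRegister failed\n");
            close(fd);
            return 1;
        }

        // 4. read directly into device memory, no host bounce buffer
        void * dev_ptr = nullptr;
        cudaMalloc(&dev_ptr, size);
        ssize_t n = cuFileRead(handle, dev_ptr, size, /*file_offset=*/0, /*devPtr_offset=*/0);
        printf("cuFileRead returned %zd bytes\n", n);

        // 5. tear down
        cuFileHandleDeregister(handle);
        close(fd);
        cudaFree(dev_ptr);
        cuFileDriverClose();
        return 0;
    }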

src/llama-model-loader.cpp

Lines changed: 19 additions & 9 deletions
@@ -688,9 +688,13 @@ llama_model_loader::llama_model_loader(
 
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
-    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
-    if (rpc_reg) {
-        rpc_load_tensor_fn = (ggml_backend_tensor_load_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_tensor_load");
+    ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
+    if (reg) {
+        rpc_load_tensor_fn = (ggml_backend_tensor_load_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_tensor_load");
+    }
+    reg = ggml_backend_reg_by_name("CUDA");
+    if (reg) {
+        cuda_load_tensor_fn = (ggml_backend_tensor_load_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_tensor_load");
     }
 }
 
@@ -895,15 +899,21 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
 }
 
 bool llama_model_loader::load_tensor(ggml_tensor * cur, const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
-    if (!rpc_load_tensor_fn) {
-        return false;
-    }
     ggml_backend_buffer_t buf = cur->view_src ? cur->view_src->buffer : cur->buffer;
     const char * buf_name = ggml_backend_buffer_name(buf);
-    if (strncmp(buf_name, "RPC", 3) != 0) {
-        return false;
+    if (strncmp(buf_name, "RPC", 3) == 0) {
+        if (!rpc_load_tensor_fn) {
+            return false;
+        }
+        return rpc_load_tensor_fn(buf, cur, path, file_offset, tensor_offset, size);
+    }
+    if (strncmp(buf_name, "CUDA", 4) == 0) {
+        if (!cuda_load_tensor_fn) {
+            return false;
+        }
+        return cuda_load_tensor_fn(buf, cur, path, file_offset, tensor_offset, size);
     }
-    return rpc_load_tensor_fn(buf, cur, path, file_offset, tensor_offset, size);
+    return false;
 }
 
 bool llama_model_loader::load_all_data(
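
The loader now keys the fast path on the buffer name ("RPC" or "CUDA") and on whether the backend registry exposed the "ggml_backend_tensor_load" entry point; when load_tensor returns false, load_all_data keeps using the regular read path. A minimal sketch of that runtime lookup outside the loader is shown below; it assumes the ggml_backend_tensor_load_t typedef and the registry functions from the ggml headers, and is illustrative rather than the exact loader code.

    // Sketch: resolve the optional "ggml_backend_tensor_load" capability of the CUDA
    // backend at runtime and use it for one tensor; returning false lets the caller
    // fall back to the conventional read + ggml_backend_tensor_set path.
    #include "ggml-backend.h"
    #include <cstring>

    static bool try_direct_load(ggml_tensor * cur, const char * path,
                                size_t file_offset, size_t tensor_offset, size_t size) {
        ggml_backend_reg_t reg = ggml_backend_reg_by_name("CUDA");
        if (!reg) {
            return false; // CUDA backend not registered
        }
        auto load_fn = (ggml_backend_tensor_load_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_tensor_load");
        if (!load_fn) {
            return false; // backend does not expose the direct-load entry point
        }
        ggml_backend_buffer_t buf = cur->view_src ? cur->view_src->buffer : cur->buffer;
        if (strncmp(ggml_backend_buffer_name(buf), "CUDA", 4) != 0) {
            return false; // tensor does not live in a CUDA buffer
        }
        return load_fn(buf, cur, path, file_offset, tensor_offset, size);
    }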

src/llama-model-loader.h

Lines changed: 1 addition & 0 deletions
@@ -156,6 +156,7 @@ struct llama_model_loader {
     void load_data_for(struct ggml_tensor * cur) const;
 
     ggml_backend_tensor_load_t rpc_load_tensor_fn = nullptr;
+    ggml_backend_tensor_load_t cuda_load_tensor_fn = nullptr;
     bool load_tensor(ggml_tensor * cur, const char * path, size_t file_offset, size_t tensor_offset, size_t size);
 
     // Returns false if cancelled by progress_callback
