llama : try loading tensors with pre-computed hashes by rgerganov · Pull Request #13106 · ggml-org/llama.cpp · GitHub

llama : try loading tensors with pre-computed hashes #13106


Open · wants to merge 4 commits into master
7 changes: 7 additions & 0 deletions ggml/include/ggml-backend.h
@@ -111,6 +111,13 @@ extern "C" {
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

    typedef bool (*ggml_backend_tensor_load_t)(ggml_backend_buffer_t buffer,
                                                struct ggml_tensor * tensor,
                                                const char * path,
                                                size_t file_offset,
                                                size_t tensor_offset,
                                                size_t size);

GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);

//
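As a hedged sketch of how the loader side might consume this new callback type (not code from this PR): the existing ggml_backend_reg_get_proc_address() mechanism can be queried for a function exposed under a name such as "ggml_backend_tensor_load" (the name the CUDA backend registers further down in this diff), and the caller falls back to the regular read/mmap path when nothing is found. The helper name and control flow below are illustrative, and the typedef is assumed to come from the patched ggml-backend.h.

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// Illustrative helper (not from the PR): attempt a backend-side direct load for one tensor,
// returning false so the caller can fall back to the existing read/mmap path.
static bool try_direct_tensor_load(ggml_backend_buffer_t buf, struct ggml_tensor * t,
                                   const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
    ggml_backend_dev_t dev = ggml_backend_buft_get_device(ggml_backend_buffer_get_type(buf));
    if (dev == nullptr) {
        return false;
    }
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto load_fn = (ggml_backend_tensor_load_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_tensor_load");
    if (load_fn == nullptr) {
        return false; // backend does not provide direct file loading
    }
    return load_fn(buf, t, path, file_offset, tensor_offset, size);
}
```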
4 changes: 2 additions & 2 deletions ggml/include/ggml-rpc.h
@@ -8,7 +8,7 @@ extern "C" {
#endif

#define RPC_PROTO_MAJOR_VERSION 2
-#define RPC_PROTO_MINOR_VERSION 0
+#define RPC_PROTO_MINOR_VERSION 1
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16

@@ -21,7 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                                    const char * cache_dir,
+                                                    const char * model_file, const char * cache_dir,
                                                     size_t free_mem, size_t total_mem);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
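The new model_file argument presumably gives the RPC server a local copy of the model so it can load tensor data from it directly instead of receiving it over the network, which is also why the minor protocol version is bumped. A minimal, hedged usage sketch under that assumption; the backend selection, endpoint, and paths below are illustrative and not taken from the PR.

```cpp
#include "ggml-backend.h"
#include "ggml-rpc.h"

int main() {
    // pick whatever backend is available locally (GPU if present, CPU otherwise)
    ggml_backend_t backend = ggml_backend_init_best();

    // report the memory of the backing device, as the existing rpc-server example does
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_dev_memory(ggml_backend_get_device(backend), &free_mem, &total_mem);

    // new in this PR: pass the path of a local copy of the model; this call runs the server loop
    ggml_backend_rpc_start_server(backend, "0.0.0.0:50052",
                                  "/path/to/model.gguf",  // model_file (hypothetical path)
                                  "/path/to/cache",       // cache_dir  (hypothetical path)
                                  free_mem, total_mem);

    ggml_backend_free(backend);
    return 0;
}
```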
8 changes: 6 additions & 2 deletions ggml/src/ggml-cuda/CMakeLists.txt
@@ -104,10 +104,14 @@ if (CUDAToolkit_FOUND)
            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
        else ()
-           target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+           target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static CUDA::cuFile_static)
        endif()
    else()
-       target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
+       if (WIN32)
+           target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
+       else()
+           target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cuFile)
+       endif()
    endif()

    if (GGML_CUDA_NO_VMM)
73 changes: 73 additions & 0 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Collaborator: From what I can tell, you are never explicitly registering the buffers that you load data to.

Collaborator (Author): We can register the buffer in ggml_backend_cuda_buffer_type_alloc_buffer(), but it would be nice to have some performance data with the internal buffers first. The doc also suggests that registering the buffer is needed for small IO sizes, which is generally not the case here.
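Purely as a hedged illustration of the registration being discussed (the PR deliberately relies on cuFile's internal buffers for now): cuFileBufRegister()/cuFileBufDeregister() are the relevant cuFile calls, and helpers like the ones below could be paired with the buffer allocation and free paths if the performance data ends up justifying it. The helper names are made up.

```cpp
#include <cstddef>
#include <cufile.h>

// Hypothetical helpers (not part of the PR): pin a CUDA device buffer with cuFile so
// GDS can DMA into it directly, and unpin it again before the buffer is freed.
static bool cuda_buffer_cufile_register(void * dev_ptr, size_t size) {
    CUfileError_t err = cuFileBufRegister(dev_ptr, size, 0 /*flags*/);
    return err.err == CU_FILE_SUCCESS;
}

static void cuda_buffer_cufile_deregister(void * dev_ptr) {
    cuFileBufDeregister(dev_ptr);
}
```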

@@ -60,6 +60,14 @@
#include <stdlib.h>
#include <string>
#include <vector>
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && !defined(_WIN32)
# define GGML_USE_CUFILE
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && !defined(_WIN32)
#ifdef GGML_USE_CUFILE
#include <cufile.h>
#include <fcntl.h>
#include <unistd.h>
#endif // GGML_USE_CUFILE

static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

@@ -3410,6 +3418,68 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
    GGML_UNUSED(reg);
}

static bool ggml_backend_cuda_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
Collaborator: If I understand this function correctly, it loads a single tensor from a file. The driver is being re-used but the file handle is not; I don't know how much overhead that causes, though.

Collaborator (Author): Yes, file handles are re-created every time and I am not sure how to address this with the current design. Adding a cache is going to leak both handles and descriptors, which is not great.

#ifdef GGML_USE_CUFILE
    static bool initialized = false;
    static bool use_cufile = false;
    if (!initialized) {
        CUfileError_t err = cuFileDriverOpen();
        initialized = true;
        if (err.err != CU_FILE_SUCCESS) {
            use_cufile = false;
            return false;
        }
        CUfileDrvProps_t props;
        err = cuFileDriverGetProperties(&props);
        if (err.err != CU_FILE_SUCCESS) {
            use_cufile = false;
            return false;
        }
        if (props.nvfs.dcontrolflags & (1 << CU_FILE_ALLOW_COMPAT_MODE)) {
            // do not use CUfile if the driver is in compatibility mode
            // as we have faster mechanisms in llama-model-loader
            use_cufile = false;
            return false;
        }
        use_cufile = true;
    }
    if (!use_cufile) {
        return false;
    }
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
    ggml_cuda_set_device(ctx->device);

    int fd = open(path, O_RDONLY | O_DIRECT);
    if (fd < 0) {
        return false;
    }
    CUfileDescr_t cf_descr;
    CUfileHandle_t cf_handle;
    memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
    cf_descr.handle.fd = fd;
    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
    if (status.err != CU_FILE_SUCCESS) {
        return false;
    }
    ssize_t ret = cuFileRead(cf_handle, (char *)tensor->data, size, file_offset, tensor_offset);
    if (ret < 0) {
        return false;
    }
    cuFileHandleDeregister(cf_handle);
    close(fd);
    return true;
#else
    GGML_UNUSED(buffer);
    GGML_UNUSED(tensor);
    GGML_UNUSED(path);
    GGML_UNUSED(file_offset);
    GGML_UNUSED(tensor_offset);
    GGML_UNUSED(size);
    return false;
#endif // GGML_USE_CUFILE
}

static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(reg);
    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@@ -3424,6 +3494,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
    if (strcmp(name, "ggml_backend_get_features") == 0) {
        return (void *)ggml_backend_cuda_get_features;
    }
    if (strcmp(name, "ggml_backend_tensor_load") == 0) {
        return (void *)ggml_backend_cuda_buffer_load_tensor;
    }
    return nullptr;
}
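To make the callback contract concrete: as it is used in the cuFileRead() call above, file_offset addresses the source file while tensor_offset is the byte offset inside the destination tensor's data. Below is a hedged, host-side illustration of the same contract using plain POSIX I/O; the function name is hypothetical and nothing like it appears in the PR.

```cpp
#include <fcntl.h>
#include <unistd.h>
#include <vector>

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical host-side implementation of the ggml_backend_tensor_load_t contract,
// shown only to illustrate the parameters: read `size` bytes at `file_offset` from `path`
// and store them at byte `tensor_offset` inside `tensor`.
static bool example_host_tensor_load(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                     const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
    (void) buffer; // not needed for a plain host copy

    const int fd = open(path, O_RDONLY);
    if (fd < 0) {
        return false;
    }
    std::vector<char> tmp(size);
    const ssize_t n = pread(fd, tmp.data(), size, (off_t) file_offset);
    close(fd);
    if (n != (ssize_t) size) {
        return false;
    }
    // ggml_backend_tensor_set() copies into whatever buffer the tensor actually lives in
    ggml_backend_tensor_set(tensor, tmp.data(), tensor_offset, size);
    return true;
}
```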
