From d330a0f54e5d288a44bec91913bb3d585ff2c910 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Wed, 7 May 2025 12:29:37 +0300 Subject: [PATCH 1/4] llama : load tensors using pre-computed model hash --- ggml/include/ggml-backend.h | 7 + ggml/include/ggml-rpc.h | 4 +- ggml/src/ggml-rpc/ggml-rpc.cpp | 234 ++++++++++++++++++++++++++++++++- src/llama-mmap.cpp | 4 +- src/llama-mmap.h | 1 + src/llama-model-loader.cpp | 21 ++- src/llama-model-loader.h | 3 + tools/rpc/rpc-server.cpp | 33 +++-- 8 files changed, 288 insertions(+), 19 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 778927f68217a..c7fe8485ab0cc 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -111,6 +111,13 @@ extern "C" { // automatic fallback to sync copy if async is not supported GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); + typedef bool (*ggml_backend_tensor_load_t)(ggml_backend_buffer_t buffer, + struct ggml_tensor * tensor, + const char * path, + size_t file_offset, + size_t tensor_offset, + size_t size); + GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend); // diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h index 1e674112767c9..3743d5c5d711f 100644 --- a/ggml/include/ggml-rpc.h +++ b/ggml/include/ggml-rpc.h @@ -8,7 +8,7 @@ extern "C" { #endif #define RPC_PROTO_MAJOR_VERSION 2 -#define RPC_PROTO_MINOR_VERSION 0 +#define RPC_PROTO_MINOR_VERSION 1 #define RPC_PROTO_PATCH_VERSION 0 #define GGML_RPC_MAX_SERVERS 16 @@ -21,7 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, - const char * cache_dir, + const char * model_file, const char * cache_dir, size_t free_mem, size_t total_mem); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void); diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 4f0abb5a60f48..b7f3c48bd17ae 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -93,6 +93,7 @@ enum rpc_cmd { RPC_CMD_INIT_TENSOR, RPC_CMD_GET_ALLOC_SIZE, RPC_CMD_HELLO, + RPC_CMD_LOAD_TENSOR, RPC_CMD_COUNT, }; @@ -161,6 +162,18 @@ struct rpc_msg_set_tensor_hash_rsp { uint8_t result; }; +struct rpc_msg_load_tensor_req { + uint64_t model_hash; + rpc_tensor tensor; + uint64_t file_offset; + uint64_t tensor_offset; + uint64_t size; +}; + +struct rpc_msg_load_tensor_rsp { + uint8_t result; +}; + struct rpc_msg_get_tensor_req { rpc_tensor tensor; uint64_t offset; @@ -213,6 +226,24 @@ struct ggml_backend_rpc_buffer_context { // RPC helper functions +typedef uint64_t fnv_ctx_t; + +static void fnv_init(fnv_ctx_t * ctx) { + *ctx = 0xcbf29ce484222325ULL; +} + +static void fnv_update(fnv_ctx_t * ctx, const uint8_t * data, size_t len) { + const uint64_t fnv_prime = 0x100000001b3ULL; + for (size_t i = 0; i < len; ++i) { + *ctx ^= data[i]; + *ctx *= fnv_prime; + } +} + +static void fnv_final(fnv_ctx_t * ctx, uint64_t * digest) { + *digest = *ctx; +} + // Computes FNV-1a hash of the data static uint64_t fnv_hash(const uint8_t * data, size_t len) { const uint64_t fnv_prime = 0x100000001b3ULL; @@ -225,6 +256,87 @@ static uint64_t fnv_hash(const uint8_t * data, size_t len) { return hash; } +static bool 
get_model_hash_from_file(const char * model_file, uint64_t * hash) { + // try loading the hash from model_file + '.rpc' + std::string rpc_file = std::string(model_file) + ".rpc"; + // the hash file must exist, must be exactly 16 bytes and must be a valid hash written in hex + if (!fs::exists(rpc_file)) { + return false; + } + std::ifstream file(rpc_file, std::ios::binary); + if (!file.is_open()) { + return false; + } + std::string hash_str; + file.seekg(0, std::ios::end); + size_t file_size = file.tellg(); + if (file_size != 16) { + return false; + } + file.seekg(0, std::ios::beg); + hash_str.resize(file_size); + file.read(&hash_str[0], file_size); + if ((size_t)file.gcount() != file_size) { + return false; + } + if (hash_str.find_first_not_of("0123456789abcdefABCDEF") != std::string::npos) { + return false; + } + *hash = std::stoull(hash_str, nullptr, 16); + return true; +} + +static bool get_model_hash(const char * model_file, uint64_t * hash) { + // model path -> (hash_exist, hash_value) + static std::unordered_map> model_hashes; + if (model_hashes.find(model_file) != model_hashes.end()) { + *hash = model_hashes[model_file].second; + return model_hashes[model_file].first; + } + if (get_model_hash_from_file(model_file, hash)) { + model_hashes[model_file] = {true, *hash}; + return true; + } + model_hashes[model_file] = {false, 0}; + return false; +} + +static bool generate_model_hash(const char * model_file, uint64_t * hash) { + ggml_context * ctx = nullptr; + struct gguf_init_params params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, + }; + gguf_context_ptr ctx_gguf { gguf_init_from_file(model_file, params) }; + if (!ctx_gguf) { + return false; + } + fnv_ctx_t fnv_ctx; + fnv_init(&fnv_ctx); + size_t data_offset = gguf_get_data_offset(ctx_gguf.get()); + fnv_update(&fnv_ctx, (const uint8_t*)&data_offset, sizeof(data_offset)); + const int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); + ggml_tensor * cur = ggml_get_tensor(ctx, name); + auto n_bytes = ggml_nbytes(cur); + fnv_update(&fnv_ctx, (const uint8_t*)cur->data, n_bytes); + } + fnv_final(&fnv_ctx, hash); + // save the model hash to model_file + '.rpc' in hex format + std::string hash_file = std::string(model_file) + ".rpc"; + std::ofstream file(hash_file, std::ios::binary); + if (!file.is_open()) { + return false; + } + file << std::hex << std::setfill('0') << std::setw(16) << *hash; + if (!file) { + return false; + } + file.close(); + return true; +} + static std::shared_ptr make_socket(sockfd_t fd) { #ifdef _WIN32 if (fd == INVALID_SOCKET) { @@ -605,6 +717,24 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con return response.result; } +static bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) { + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + uint64_t hash; + if (!get_model_hash(path, &hash)) { + return false; + } + rpc_msg_load_tensor_req request; + request.model_hash = hash; + request.tensor = serialize_tensor(tensor); + request.file_offset = file_offset; + request.tensor_offset = tensor_offset; + request.size = size; + rpc_msg_load_tensor_rsp response; + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_LOAD_TENSOR, &request, sizeof(request), &response, sizeof(response)); + GGML_ASSERT(status); + return response.result; +} + static 
void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value}; @@ -854,8 +984,8 @@ void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, si class rpc_server { public: - rpc_server(ggml_backend_t backend, const char * cache_dir) - : backend(backend), cache_dir(cache_dir) { + rpc_server(ggml_backend_t backend, const std::unordered_map & model_hashes, const char * cache_dir) + : backend(backend), cache_dir(cache_dir), model_hashes(model_hashes) { } ~rpc_server(); @@ -868,6 +998,7 @@ class rpc_server { bool buffer_clear(const rpc_msg_buffer_clear_req & request); bool set_tensor(const std::vector & input); bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response); + bool load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response); bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector & response); bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response); bool graph_compute(const std::vector & input, rpc_msg_graph_compute_rsp & response); @@ -886,6 +1017,7 @@ class rpc_server { ggml_backend_t backend; const char * cache_dir; std::unordered_set buffers; + const std::unordered_map & model_hashes; }; void rpc_server::hello(rpc_msg_hello_rsp & response) { @@ -1104,6 +1236,18 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector & data) { return true; } +static bool read_model_data(const char * path, size_t file_offset, size_t size, std::vector & data) { + FILE * f = fopen(path, "rb"); + if (f == nullptr) { + return false; + } + fseek(f, file_offset, SEEK_SET); + data.resize(size); + size_t read_size = fread(data.data(), 1, size, f); + fclose(f); + return read_size == size; +} + bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response) { std::vector cached_file; @@ -1146,6 +1290,50 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp return true; } +bool rpc_server::load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response) { + if (model_hashes.find(request.model_hash) == model_hashes.end()) { + response.result = 0; + return true; + } + std::string path = model_hashes.at(request.model_hash); + std::vector model_data; + if (!read_model_data(path.c_str(), request.file_offset, request.size, model_data)) { + response.result = 0; + return true; + } + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context_ptr ctx_ptr { ggml_init(params) }; + GGML_ASSERT(ctx_ptr != nullptr); + ggml_context * ctx = ctx_ptr.get(); + ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); + if (tensor == nullptr) { + GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__); + return false; + } + GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, request.tensor_offset, request.size); + + // sanitize tensor->data + { + const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer); + const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); + + if (request.tensor.data + request.tensor_offset < p0 + || request.tensor.data + request.tensor_offset >= p1 + || request.size > (p1 - 
request.tensor.data - request.tensor_offset)) { + GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%" PRIu64 ") out of buffer bounds [0x%zx, 0x%zx)\n", + __func__, request.tensor.data, request.tensor_offset, request.size, p0, p1); + return false; + } + } + ggml_backend_tensor_set(tensor, model_data.data(), request.tensor_offset, request.size); + response.result = 1; + return true; +} + bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) { struct ggml_init_params params { /*.mem_size =*/ ggml_tensor_overhead(), @@ -1368,9 +1556,11 @@ rpc_server::~rpc_server() { } } -static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir, +static void rpc_serve_client(ggml_backend_t backend, + const std::unordered_map & model_hashes, + const char * cache_dir, sockfd_t sockfd, size_t free_mem, size_t total_mem) { - rpc_server server(backend, cache_dir); + rpc_server server(backend, model_hashes, cache_dir); uint8_t cmd; if (!recv_data(sockfd, &cmd, 1)) { return; @@ -1514,6 +1704,20 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir, } break; } + case RPC_CMD_LOAD_TENSOR: { + rpc_msg_load_tensor_req request; + if (!recv_msg(sockfd, &request, sizeof(request))) { + return; + } + rpc_msg_load_tensor_rsp response; + if (!server.load_tensor(request, response)) { + return; + } + if (!send_msg(sockfd, &response, sizeof(response))) { + return; + } + break; + } case RPC_CMD_INIT_TENSOR: { rpc_msg_init_tensor_req request; if (!recv_msg(sockfd, &request,sizeof(request))) { @@ -1590,7 +1794,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir, } void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, - const char * cache_dir, + const char * model_file, const char * cache_dir, size_t free_mem, size_t total_mem) { printf("Starting RPC server v%d.%d.%d\n", RPC_PROTO_MAJOR_VERSION, @@ -1600,6 +1804,21 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint printf(" local cache : %s\n", cache_dir ? 
cache_dir : "n/a"); printf(" backend memory : %zu MB\n", free_mem / (1024 * 1024)); + std::unordered_map model_hashes; + if (model_file != nullptr) { + uint64_t model_hash; + if (!get_model_hash(model_file, &model_hash)) { + printf("Generating model hash for file: %s\n", model_file); + if (!generate_model_hash(model_file, &model_hash)) { + fprintf(stderr, "Failed to generate model hash for file: %s\n", model_file); + return; + } + } + printf(" model file : %s\n", model_file); + printf(" model hash : %" PRIx64 "\n", model_hash); + model_hashes[model_hash] = model_file; + } + std::string host; int port; if (!parse_endpoint(endpoint, host, port)) { @@ -1628,7 +1847,7 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint } printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem); fflush(stdout); - rpc_serve_client(backend, cache_dir, client_socket->fd, free_mem, total_mem); + rpc_serve_client(backend, model_hashes, cache_dir, client_socket->fd, free_mem, total_mem); printf("Client connection closed\n"); fflush(stdout); } @@ -1762,6 +1981,9 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) { return (void *)ggml_backend_rpc_start_server; } + if (std::strcmp(name, "ggml_backend_tensor_load") == 0) { + return (void *)ggml_backend_rpc_buffer_load_tensor; + } return NULL; GGML_UNUSED(reg); diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 9da97f1bc5057..55f3284a63fb9 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -159,7 +159,7 @@ struct llama_file::impl { } } #else - impl(const char * fname, const char * mode) { + impl(const char * fname, const char * mode) : fname(fname) { fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -237,6 +237,7 @@ struct llama_file::impl { } #endif + const char * fname; FILE * fp; size_t size; }; @@ -246,6 +247,7 @@ llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } +const char * llama_file::fname() const { return pimpl->fname; } int llama_file::file_id() const { #ifdef _WIN32 diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 4e5aec3f440d7..f7bc609ac5842 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -19,6 +19,7 @@ struct llama_file { size_t tell() const; size_t size() const; + const char * fname() const; int file_id() const; // fileno overload void seek(size_t offset, int whence) const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ddb1b03675b28..5c5cc2e4e8012 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -688,6 +688,10 @@ llama_model_loader::llama_model_loader( this->use_mmap = use_mmap; this->check_tensors = check_tensors; + ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); + if (rpc_reg) { + rpc_load_tensor_fn = (ggml_backend_tensor_load_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_tensor_load"); + } } std::string llama_model_loader::get_arch_name() const { @@ -890,6 +894,18 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { } } +bool llama_model_loader::load_tensor(ggml_tensor * cur, const char * path, size_t file_offset, size_t tensor_offset, size_t size) { + if (!rpc_load_tensor_fn) { + return false; + } + ggml_backend_buffer_t buf = cur->view_src ? 
cur->view_src->buffer : cur->buffer; + const char * buf_name = ggml_backend_buffer_name(buf); + if (strncmp(buf_name, "RPC", 3) != 0) { + return false; + } + return rpc_load_tensor_fn(buf, cur, path, file_offset, tensor_offset, size); +} + bool llama_model_loader::load_all_data( struct ggml_context * ctx, llama_buf_map & bufs, @@ -1031,7 +1047,10 @@ bool llama_model_loader::load_all_data( mmap_used.first = std::min(mmap_used.first, weight->offs); mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); } else { - ggml_backend_tensor_set(cur, data, 0, n_size); + const auto & file = files.at(weight->idx); + if (!load_tensor(cur, file->fname(), weight->offs, 0, n_size)) { + ggml_backend_tensor_set(cur, data, 0, n_size); + } } } else { const auto & file = files.at(weight->idx); diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 0f52b011b6986..32e57f0ec5172 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -155,6 +155,9 @@ struct llama_model_loader { // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const; + ggml_backend_tensor_load_t rpc_load_tensor_fn = nullptr; + bool load_tensor(ggml_tensor * cur, const char * path, size_t file_offset, size_t tensor_offset, size_t size); + // Returns false if cancelled by progress_callback bool load_all_data( struct ggml_context * ctx, diff --git a/tools/rpc/rpc-server.cpp b/tools/rpc/rpc-server.cpp index 581c74018c877..eb64839e3de06 100644 --- a/tools/rpc/rpc-server.cpp +++ b/tools/rpc/rpc-server.cpp @@ -137,18 +137,20 @@ struct rpc_server_params { bool use_cache = false; int n_threads = std::max(1U, std::thread::hardware_concurrency()/2); std::string device; + std::string model_file; }; static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) { fprintf(stderr, "Usage: %s [options]\n\n", argv[0]); fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -t, --threads number of threads for the CPU backend (default: %d)\n", params.n_threads); - fprintf(stderr, " -d DEV, --device device to use\n"); - fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str()); - fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port); - fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n"); - fprintf(stderr, " -c, --cache enable local file cache\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -t, --threads N number of threads for the CPU backend (default: %d)\n", params.n_threads); + fprintf(stderr, " -d, --device DEV device to use\n"); + fprintf(stderr, " -H, --host HOST host to bind to (default: %s)\n", params.host.c_str()); + fprintf(stderr, " -p, --port PORT port to bind to (default: %d)\n", params.port); + fprintf(stderr, " -mm, --mem MEM backend memory size (in MB)\n"); + fprintf(stderr, " -m, --model FNAME model path\n"); + fprintf(stderr, " -c, --cache enable local file cache\n"); fprintf(stderr, "\n"); } @@ -196,11 +198,16 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & } } else if (arg == "-c" || arg == "--cache") { params.use_cache = true; - } else if (arg == "-m" || arg == "--mem") { + } else if (arg == "-mm" || arg == "--mem") { if (++i >= argc) { return false; } params.backend_mem = std::stoul(argv[i]) * 1024 * 1024; + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + return false; + } + 
params.model_file = argv[i]; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv, params); exit(0); @@ -302,6 +309,14 @@ int main(int argc, char * argv[]) { } cache_dir = cache_dir_str.c_str(); } + const char * model_file = nullptr; + if (!params.model_file.empty()) { + model_file = params.model_file.c_str(); + if (!fs::exists(model_file)) { + fprintf(stderr, "Model file does not exist: %s\n", model_file); + return 1; + } + } ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC"); if (!reg) { @@ -315,7 +330,7 @@ int main(int argc, char * argv[]) { return 1; } - start_server_fn(backend, endpoint.c_str(), cache_dir, free_mem, total_mem); + start_server_fn(backend, endpoint.c_str(), model_file, cache_dir, free_mem, total_mem); ggml_backend_free(backend); return 0; From e6726020d2e24e80d1db341caaa303dc5aea70fb Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Tue, 20 May 2025 14:18:23 +0300 Subject: [PATCH 2/4] cuda : implement load_tensor with cuFile --- ggml/src/ggml-cuda/CMakeLists.txt | 4 +- ggml/src/ggml-cuda/ggml-cuda.cu | 71 +++++++++++++++++++++++++++++++ src/llama-model-loader.cpp | 28 ++++++++---- src/llama-model-loader.h | 1 + 4 files changed, 93 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index c9ff4aa321b8b..71fe680a7e8f8 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -104,10 +104,10 @@ if (CUDAToolkit_FOUND) # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt) else () - target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static CUDA::cuFile_static) endif() else() - target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt) + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cuFile) endif() if (GGML_CUDA_NO_VMM) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 02dc8c12dbd8c..eb78233bf47f0 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -60,6 +60,12 @@ #include #include #include +#include +#ifdef _WIN32 +#else + #include + #include +#endif static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -3410,6 +3416,68 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t GGML_UNUSED(reg); } +static bool ggml_backend_cuda_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) { +#ifdef _WIN32 + GGML_UNUSED(buffer); + GGML_UNUSED(tensor); + GGML_UNUSED(path); + GGML_UNUSED(file_offset); + GGML_UNUSED(tensor_offset); + GGML_UNUSED(size); + return false; +#else + static bool initialized = false; + static bool use_cufile = false; + if (!initialized) { + CUfileError_t err = cuFileDriverOpen(); + initialized = true; + if (err.err != CU_FILE_SUCCESS) { + use_cufile = false; + return false; + } + CUfileDrvProps_t props; + err = cuFileDriverGetProperties(&props); + if (err.err != CU_FILE_SUCCESS) { + use_cufile = false; + return false; + } + if (props.nvfs.dcontrolflags & (1 << CU_FILE_ALLOW_COMPAT_MODE)) { + // do not use CUfile if the driver is in compatibility mode + // as we have faster mechanisms in llama-model-loader + use_cufile = 
false; + return false; + } + use_cufile = true; + } + if (!use_cufile) { + return false; + } + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + ggml_cuda_set_device(ctx->device); + + int fd = open(path, O_RDONLY | O_DIRECT); + if (fd < 0) { + return false; + } + CUfileDescr_t cf_descr; + CUfileHandle_t cf_handle; + memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t)); + cf_descr.handle.fd = fd; + cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr); + if (status.err != CU_FILE_SUCCESS) { + return false; + } + ssize_t ret = cuFileRead(cf_handle, (char *)tensor->data, size, file_offset, tensor_offset); + if (ret < 0) { + return false; + } + cuFileHandleDeregister(cf_handle); + close(fd); + return true; +#endif +} + static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { @@ -3424,6 +3492,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con if (strcmp(name, "ggml_backend_get_features") == 0) { return (void *)ggml_backend_cuda_get_features; } + if (strcmp(name, "ggml_backend_tensor_load") == 0) { + return (void *)ggml_backend_cuda_buffer_load_tensor; + } return nullptr; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 5c5cc2e4e8012..def76d5f73b3b 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -688,9 +688,13 @@ llama_model_loader::llama_model_loader( this->use_mmap = use_mmap; this->check_tensors = check_tensors; - ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); - if (rpc_reg) { - rpc_load_tensor_fn = (ggml_backend_tensor_load_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_tensor_load"); + ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC"); + if (reg) { + rpc_load_tensor_fn = (ggml_backend_tensor_load_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_tensor_load"); + } + reg = ggml_backend_reg_by_name("CUDA"); + if (reg) { + cuda_load_tensor_fn = (ggml_backend_tensor_load_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_tensor_load"); } } @@ -895,15 +899,21 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { } bool llama_model_loader::load_tensor(ggml_tensor * cur, const char * path, size_t file_offset, size_t tensor_offset, size_t size) { - if (!rpc_load_tensor_fn) { - return false; - } ggml_backend_buffer_t buf = cur->view_src ? 
cur->view_src->buffer : cur->buffer; const char * buf_name = ggml_backend_buffer_name(buf); - if (strncmp(buf_name, "RPC", 3) != 0) { - return false; + if (strncmp(buf_name, "RPC", 3) == 0) { + if (!rpc_load_tensor_fn) { + return false; + } + return rpc_load_tensor_fn(buf, cur, path, file_offset, tensor_offset, size); + } + if (strncmp(buf_name, "CUDA", 4) == 0) { + if (!cuda_load_tensor_fn) { + return false; + } + return cuda_load_tensor_fn(buf, cur, path, file_offset, tensor_offset, size); } - return rpc_load_tensor_fn(buf, cur, path, file_offset, tensor_offset, size); + return false; } bool llama_model_loader::load_all_data( diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 32e57f0ec5172..65e984abae1b5 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -156,6 +156,7 @@ struct llama_model_loader { void load_data_for(struct ggml_tensor * cur) const; ggml_backend_tensor_load_t rpc_load_tensor_fn = nullptr; + ggml_backend_tensor_load_t cuda_load_tensor_fn = nullptr; bool load_tensor(ggml_tensor * cur, const char * path, size_t file_offset, size_t tensor_offset, size_t size); // Returns false if cancelled by progress_callback From 260e19275a7a7a647732cf7b4fbb7014c08a8ff3 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Wed, 21 May 2025 13:47:25 +0300 Subject: [PATCH 3/4] fix build --- ggml/src/ggml-cuda/CMakeLists.txt | 6 +++++- ggml/src/ggml-cuda/ggml-cuda.cu | 22 ++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index 71fe680a7e8f8..acd3446c7a4d3 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -107,7 +107,11 @@ if (CUDAToolkit_FOUND) target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static CUDA::cuFile_static) endif() else() - target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cuFile) + if (WIN32) + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt) + else() + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cuFile) + endif() endif() if (GGML_CUDA_NO_VMM) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index eb78233bf47f0..dfb0ad9c0b328 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -60,7 +60,9 @@ #include #include #include -#include +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && !defined(_WIN32) +# include +#endif #ifdef _WIN32 #else #include @@ -3417,15 +3419,7 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t } static bool ggml_backend_cuda_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) { -#ifdef _WIN32 - GGML_UNUSED(buffer); - GGML_UNUSED(tensor); - GGML_UNUSED(path); - GGML_UNUSED(file_offset); - GGML_UNUSED(tensor_offset); - GGML_UNUSED(size); - return false; -#else +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && !defined(_WIN32) static bool initialized = false; static bool use_cufile = false; if (!initialized) { @@ -3475,6 +3469,14 @@ static bool ggml_backend_cuda_buffer_load_tensor(ggml_backend_buffer_t buffer, g cuFileHandleDeregister(cf_handle); close(fd); return true; +#else + GGML_UNUSED(buffer); + GGML_UNUSED(tensor); + GGML_UNUSED(path); + GGML_UNUSED(file_offset); + GGML_UNUSED(tensor_offset); + GGML_UNUSED(size); + 
return false; #endif } From abfdc6001780f34631d03a0cca41ca1018199551 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Sun, 25 May 2025 11:21:27 +0300 Subject: [PATCH 4/4] address review comments --- ggml/src/ggml-cuda/ggml-cuda.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index dfb0ad9c0b328..5cb7fde129149 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -61,13 +61,13 @@ #include #include #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && !defined(_WIN32) -# include -#endif -#ifdef _WIN32 -#else +# define GGML_USE_CUFILE +#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && !defined(_WIN32) +#ifdef GGML_USE_CUFILE + #include #include #include -#endif +#endif // GGML_USE_CUFILE static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -3419,7 +3419,7 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t } static bool ggml_backend_cuda_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) { -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && !defined(_WIN32) +#ifdef GGML_USE_CUFILE static bool initialized = false; static bool use_cufile = false; if (!initialized) { @@ -3477,7 +3477,7 @@ static bool ggml_backend_cuda_buffer_load_tensor(ggml_backend_buffer_t buffer, g GGML_UNUSED(tensor_offset); GGML_UNUSED(size); return false; -#endif +#endif // GGML_USE_CUFILE } static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
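
---

Notes for testing: the model hash used by the new RPC_CMD_LOAD_TENSOR path is plain 64-bit FNV-1a computed over the GGUF data offset followed by the raw bytes of every tensor in declaration order, and it is stored as 16 hex characters in a sidecar file named after the model with a ".rpc" suffix. The helper below is a minimal standalone sketch, not part of the patches above: the file name make-rpc-hash.cpp and its build setup are assumptions, and it presumes a ggml checkout where gguf.h is a separate public header. It pre-computes the sidecar with the same gguf calls as generate_model_hash(), so the hash can be produced once and copied next to the model on both the client and the server.

// make-rpc-hash.cpp (hypothetical helper, not part of the patches): pre-compute
// the FNV-1a model hash that rpc-server expects to find in "<model>.rpc".
// It mirrors generate_model_hash() above: hash the GGUF data offset, then the
// raw bytes of every tensor in declaration order, and write 16 hex characters.
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <string>

#include "ggml.h"
#include "gguf.h"

int main(int argc, char ** argv) {
    if (argc != 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }
    const char * model_file = argv[1];

    ggml_context * ctx = nullptr;
    gguf_init_params params = {
        /* .no_alloc = */ false, // tensor data must be loaded so it can be hashed
        /* .ctx      = */ &ctx,
    };
    gguf_context * ctx_gguf = gguf_init_from_file(model_file, params);
    if (!ctx_gguf) {
        fprintf(stderr, "failed to load %s\n", model_file);
        return 1;
    }

    uint64_t hash = 0xcbf29ce484222325ULL;       // FNV-1a offset basis
    const uint64_t fnv_prime = 0x100000001b3ULL;
    auto update = [&](const uint8_t * data, size_t len) {
        for (size_t i = 0; i < len; ++i) {
            hash = (hash ^ data[i]) * fnv_prime; // FNV-1a: xor, then multiply
        }
    };

    // same input order as generate_model_hash(): data offset first, then tensors
    const size_t data_offset = gguf_get_data_offset(ctx_gguf);
    update((const uint8_t *) &data_offset, sizeof(data_offset));

    const int n_tensors = (int) gguf_get_n_tensors(ctx_gguf);
    for (int i = 0; i < n_tensors; ++i) {
        const char  * name = gguf_get_tensor_name(ctx_gguf, i);
        ggml_tensor * cur  = ggml_get_tensor(ctx, name);
        update((const uint8_t *) cur->data, ggml_nbytes(cur));
    }

    // both the server and the client look for this sidecar next to the model
    std::ofstream out(std::string(model_file) + ".rpc", std::ios::binary);
    if (!out) {
        fprintf(stderr, "failed to write %s.rpc\n", model_file);
        return 1;
    }
    out << std::hex << std::setfill('0') << std::setw(16) << hash;

    printf("model hash: %016" PRIx64 "\n", hash);

    gguf_free(ctx_gguf);
    ggml_free(ctx);
    return 0;
}

With the sidecar in place, the server can be started with the new flag, e.g. rpc-server -m model.gguf (the old memory flag moves from -m to -mm), and a client loading the same GGUF through the RPC backend sends RPC_CMD_LOAD_TENSOR with the matching hash, so the server reads tensor data from its local copy of the file instead of receiving it over the socket; the CUDA backend takes the analogous shortcut through cuFile when the driver is not in compatibility mode. Note that the client only ever reads the ".rpc" file, it never generates it, so a missing or stale sidecar silently falls back to the existing set-tensor path; the RPC protocol version is bumped to 2.1.0 for the new command.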