[Draft] Tensor Parallel support to llama.cpp by ClarkChin08 · Pull Request #9648 · ggml-org/llama.cpp · GitHub

[Draft] Tensor Parallel support to llama.cpp #9648


Open · wants to merge 2 commits into master

Changes from 1 commit

add tensor parallel support
Signed-off-by: Chen Xi <xi2chen@intel.com>
Chen Xi committed Sep 26, 2024
commit c9ae1916ec19b2840cb404d6f9ca288c15cb63e4
2 changes: 2 additions & 0 deletions ggml/include/ggml-sycl.h
@@ -29,6 +29,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API int ggml_backend_sycl_rank(void);
GGML_API int ggml_backend_sycl_world_size(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
8 changes: 4 additions & 4 deletions ggml/include/ggml.h
@@ -599,8 +599,8 @@ extern "C" {
TENSOR_NO_CHANGE,
TENSOR_SPLIT_BY_ROW,
TENSOR_SPLIT_BY_COLUMN,
TENSOR_KEEPED_ON_MASTER,
}
TENSOR_KEEPED_ON_MASTER
};

// n-dimensional tensor
struct ggml_tensor {
@@ -637,9 +637,9 @@ extern "C" {

void * extra; // extra things e.g. for ggml-cuda.cu

enum tensor_parallel_mode split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
enum tensor_parallel_mode split_mode; // {tensor_parallel_mode::TENSOR_NO_CHANGE};

// char padding[4];
char padding[12];
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
11 changes: 11 additions & 0 deletions ggml/src/CMakeLists.txt
@@ -566,6 +566,17 @@ if (GGML_SYCL)
list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
endif()

set(oneCCL_DIR "/opt/intel/oneapi/ccl/latest/lib/cmake/oneCCL")
@NeoZhangJianyu (Collaborator) · Sep 26, 2024:
The real oneAPI path is not always /opt/intel/oneapi/.
Please use $ENV{ONEAPI_ROOT}, which is a mandatory environment variable, in the CMake file.

The same applies to the following script.
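For illustration, a minimal CMake sketch of this suggestion, reusing the subdirectory layout the PR currently hard-codes; the error message wording is an assumption, not part of this PR:

# Sketch only: derive the hard-coded /opt/intel/oneapi paths from the ONEAPI_ROOT env variable.
if (NOT DEFINED ENV{ONEAPI_ROOT})
    message(FATAL_ERROR "ONEAPI_ROOT is not set; please source the oneAPI environment (setvars.sh) first")
endif()
set(ONEAPI_ROOT         "$ENV{ONEAPI_ROOT}")
set(oneCCL_DIR          "${ONEAPI_ROOT}/ccl/latest/lib/cmake/oneCCL")
set(MPI_INCLUDE_PATH    "${ONEAPI_ROOT}/mpi/latest/include")
set(MPI_LIBRARY_PATH    "${ONEAPI_ROOT}/mpi/latest/lib")
set(ONECCL_INCLUDE_PATH "${ONEAPI_ROOT}/ccl/latest/include")
set(ONECCL_LIBRARY_PATH "${ONEAPI_ROOT}/ccl/latest/lib")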

set(MPI_INCLUDE_PATH "/opt/intel/oneapi/mpi/latest/include")
set(MPI_LIBRARY_PATH "/opt/intel/oneapi/mpi/latest/lib/")
set(ONECCL_INCLUDE_PATH "/opt/intel/oneapi/ccl/latest/include")
set(ONECCL_LIBRARY_PATH "/opt/intel/oneapi/ccl/latest/lib/")
include_directories(${MPI_INCLUDE_PATH} ${ONECCL_INCLUDE_PATH})
find_library(MPI_LIBRARY mpi HINTS ${MPI_LIBRARY_PATH})
find_library(ONECCL_LIBRARY ccl HINTS ${ONECCL_LIBRARY_PATH})
# find_package(oneCCL REQUIRED)
message("-- oneCCL found")
Collaborator:

Add handling for the case where oneCCL is not found.

oneCCL is not included in the oneAPI Base Toolkit, so please print a message that guides the user on how to install it.
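As a sketch of the kind of diagnostic being asked for (the exact wording and install guidance are assumptions, not part of this PR); MPI_LIBRARY and ONECCL_LIBRARY come from the find_library calls above:

# Sketch only: report a clear, actionable error when the libraries were not located.
if (NOT MPI_LIBRARY OR NOT ONECCL_LIBRARY)
    message(FATAL_ERROR
        "Intel MPI / oneCCL not found. oneCCL is not part of the oneAPI Base Toolkit; "
        "please install it and make sure ONEAPI_ROOT points to your installation.")
endif()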

set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${MPI_LIBRARY_PATH} ${ONECCL_LIBRARY_PATH})
Collaborator:

GGML_EXTRA_LIBS was recently split into GGML_EXTRA_LIBS_PUBLIC and GGML_EXTRA_LIBS_PRIVATE, so I think the line above will no longer work.
Also, why does this variable contain paths to the lib directories instead of the MPI/oneCCL libraries found by find_library?
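A possible fix along the lines of this suggestion, as a sketch only (GGML_EXTRA_LIBS_PRIVATE, MPI_LIBRARY, and ONECCL_LIBRARY are taken from the surrounding CMake script):

# Sketch only: append the libraries located by find_library to the private list
# that replaced GGML_EXTRA_LIBS, instead of appending directory paths.
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${MPI_LIBRARY} ${ONECCL_LIBRARY})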

if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
16 changes: 8 additions & 8 deletions ggml/src/ggml-sycl.cpp
@@ -1747,12 +1747,12 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)

int ggml_backend_sycl_rank() {
// use ccl rank as main gpu
return dpct::dev_mgr::instance().get_ccl_rank();
return dpct::dev_mgr::instance().get_rank();
}

int ggml_backend_sycl_world_size() {
// use ccl world size
return dpct::dev_mgr::instance().get_ccl_world_size();
return dpct::dev_mgr::instance().get_world_size();
}

void ggml_backend_sycl_print_sycl_devices() {
@@ -4237,9 +4237,9 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

static bool split_tensor(const struct ggml_tensor * src, void* dst, void* data, int split_mode) {
int rank = ggml_backend_sycl_rank()
int world_size = ggml_backend_sycl_world_size()
static bool split_tensor(const struct ggml_tensor * src, void* dst, const void* data, enum tensor_parallel_mode split_mode) {
int rank = ggml_backend_sycl_rank();
int world_size = ggml_backend_sycl_world_size();
auto type_traits = ggml_internal_get_type_traits(src->type);
size_t element_size = type_traits.type_size / type_traits.blck_size;
const int64_t dst_size = ggml_nelements(src) * element_size / world_size;
@@ -4288,7 +4288,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
if (tensor->split_mode == tensor_parallel_mode::TENSOR_NO_CHANGE) {
memcpy(host_buf, data, size);
} else {
if (!split_tensor(tensor, host_buf, data, size, tensor->split_mode)) {
if (!split_tensor(tensor, ((void*)host_buf), data, tensor->split_mode)) {
std::cerr << "split tensor failed!" << std::endl;
}
}
@@ -4505,8 +4505,8 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
if (dpct::dev_mgr::instance().world_size() > 1) {
auto rank = dpct::dev_mgr::instance().get_rank();
if (ggml_backend_sycl_world_size() > 1) {
auto rank = ggml_backend_sycl_rank();
auto & device_tp = dpct::dev_mgr::instance().get_device(rank);
queue_ptr stream = &(device_tp.default_queue());
// TODO(xi): buffer_types always use 0 to avoid changes on public code
17 changes: 11 additions & 6 deletions ggml/src/ggml-sycl/dpct/helper.hpp
@@ -13,11 +13,13 @@
#ifndef GGML_SYCL_DPCT_HELPER_HPP
#define GGML_SYCL_DPCT_HELPER_HPP

#include <stdlib.h>
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <oneapi/ccl.hpp>
#include <oneapi/mkl.hpp>
#include <map>
#include <mpi.h>

#include "ggml.h"

@@ -480,8 +482,6 @@ namespace dpct
int _max_nd_range_size_i[3];
uint32_t _device_id;
std::array<unsigned char, 16> _uuid;
uint32_t _rank;
uint32_t _world_size;
};

static int get_major_version(const sycl::device &dev)
@@ -873,8 +873,8 @@
}
return -1;
}
inline int get_ccl_rank() { return _rank; }
inline int get_ccl_world_size() { return _world_size; }
inline int get_rank() { return _rank; }
Collaborator:

These new functions have no relationship with DPCT.
It would be better to move them to ggml-sycl/src.
We recommend reducing the dependence on the DPCT code.

inline int get_world_size() { return _world_size; }
inline ccl::communicator create_ccl_communicator(ccl::device dev, ccl::context ctx) {
return ccl::create_communicator(_world_size, _rank, dev, ctx, _kvs);

@@ -1002,7 +1002,13 @@ namespace dpct
return convert_backend_index(backend1) < convert_backend_index(backend2);
}

static void init_ccl() {
static void mpi_finalize() {
static int is_finalized = 0;
MPI_Finalized(&is_finalized);
if (!is_finalized) MPI_Finalize();
}

void init_ccl() {
ccl::init();
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &_world_size);
@@ -1018,7 +1024,6 @@
MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
_kvs = ccl::create_kvs(main_addr);
}

}

dev_mgr()
18 changes: 8 additions & 10 deletions src/llama.cpp
@@ -4368,8 +4368,6 @@ struct llama_model_loader {
int n_created = 0;
// For tensor parallelism
int world_size = 1;
int rank = 0;
bool enable_tp = false;

int64_t n_elements = 0;
size_t n_bytes = 0;
@@ -4630,7 +4628,6 @@ struct llama_model_loader {
this->use_mmap = use_mmap;
this->check_tensors = check_tensors;
world_size = ggml_backend_get_world_size();
rank = ggml_backend_get_rank();
}

~llama_model_loader() {
@@ -4859,12 +4856,12 @@
ggml_set_name(tensor, ggml_get_name(cur));
if (flags == TENSOR_SPLIT_BY_ROW) {
tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_ROW;
}
if (flags == TENSOR_SPLIT_BY_COLUMN) {
} else if (flags == TENSOR_SPLIT_BY_COLUMN) {
tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN;
}
if (flags == TENSOR_KEEPED_ON_MASTER) {
} else if (flags == TENSOR_KEEPED_ON_MASTER) {
tensor->split_mode = tensor_parallel_mode::TENSOR_KEEPED_ON_MASTER;
} else {
tensor->split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
}

if (flags == TENSOR_DUPLICATED) {
@@ -7023,8 +7020,9 @@ static bool llm_load_tensors(
if (n_expert > 0 && hparams.n_expert_used == 0) {
throw std::runtime_error("model has expert layers but no expert layers are used");
}

bool enable_tp = false;
if (split_mode == LLAMA_SPLIT_MODE_TENSOR) {
int world_size = ggml_backend_get_world_size();
if (world_size > 1) {
enable_tp = true;
// need to change the size before load tensor
@@ -7078,7 +7076,7 @@
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, llama_model_loader::TENSOR_SPLIT_BY_COLUMN);

// optional bias tensors
auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, bias_split_mode);
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, bias_split_mode);
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, bias_split_mode);
@@ -7109,7 +7107,7 @@
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_SPLIT_BY_ROW);

// optional MLP bias
auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, bias_split_mode);
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_KEEPED_ON_MASTER);
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, bias_split_mode);