[Draft] Tensor Parallel support to llama.cpp by ClarkChin08 · Pull Request #9648 · ggml-org/llama.cpp · GitHub

[Draft] Tensor Parallel support to llama.cpp #9648

Open · ClarkChin08 wants to merge 2 commits into master

Changes from 1 commit

add tensor parallelism support to SYCL
Signed-off-by: Chen Xi <xi2chen@intel.com>
Chen Xi committed Sep 26, 2024
commit cb8507b3b4db33b7afaf043da3c3ff7441c774a1
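
In broad terms, this commit splits weight tensors across ranks when they are uploaded to the device and sums the partial mat-mul results with a oneCCL allreduce. The toy program below is a minimal single-process sketch of the column-split idea only — it is not part of the commit and uses no SYCL or oneCCL — with two simulated ranks, each owning half of the columns of one weight row:

// Toy illustration of TENSOR_SPLIT_BY_COLUMN + allreduce, simulated on one process.
#include <cstdio>
#include <vector>

int main() {
    const int world_size = 2;
    std::vector<float> w = {1, 2, 3, 4};   // one weight row (K = 4 columns)
    std::vector<float> x = {1, 1, 1, 1};   // activation vector
    float partial[2] = {0, 0};
    const int k_per_rank = (int) w.size() / world_size;
    for (int rank = 0; rank < world_size; ++rank) {
        for (int i = 0; i < k_per_rank; ++i) {   // each rank multiplies only its column slice
            partial[rank] += w[rank * k_per_rank + i] * x[rank * k_per_rank + i];
        }
    }
    float reduced = partial[0] + partial[1];     // stands in for ccl::allreduce(..., sum, ...)
    printf("partials %.0f + %.0f -> full dot product %.0f\n", partial[0], partial[1], reduced);
    return 0;
}
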
23 changes: 23 additions & 0 deletions ggml/include/ggml.h
@@ -581,6 +581,27 @@ extern "C" {
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
};

// ggml object
struct ggml_object {
size_t offs;
size_t size;

struct ggml_object * next;

enum ggml_object_type type;

char padding[4];
};

static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);

enum tensor_parallel_mode {
TENSOR_NO_CHANGE,
TENSOR_SPLIT_BY_ROW,
TENSOR_SPLIT_BY_COLUMN,
TENSOR_KEEPED_ON_MASTER,
};

// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
@@ -616,6 +637,8 @@ extern "C" {

void * extra; // extra things e.g. for ggml-cuda.cu

enum tensor_parallel_mode split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;

// char padding[4];
};

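The split_mode field added to ggml_tensor above is what the SYCL backend later inspects. How the llama.cpp loader is supposed to set it is not shown in this commit; a hypothetical caller would tag a weight before uploading it, e.g.:

// Hypothetical loader-side tagging (not in this diff); `cur` is a weight tensor.
cur->split_mode = TENSOR_SPLIT_BY_COLUMN;      // partial sums -> allreduce after mul_mat
// cur->split_mode = TENSOR_SPLIT_BY_ROW;      // each rank keeps a contiguous row block
// cur->split_mode = TENSOR_KEEPED_ON_MASTER;  // only rank 0 keeps the data, others zero it
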
113 changes: 105 additions & 8 deletions ggml/src/ggml-sycl.cpp
@@ -1239,6 +1239,15 @@ static void relu_f32_sycl(const float *x, float *dst, const int k,
});
}

static void allreduce_f32_sycl(const float *x, float *dst, const int k,
queue_ptr stream) {
auto dev = ccl::create_device(stream->get_device());
auto ctx = ccl::create_context(stream->get_context());
auto comm = dpct::dev_mgr::instance().create_ccl_communicator(dev, ctx);
auto ccl_stream = ccl::create_stream(*stream);
ccl::allreduce(x, dst, k, ccl::reduction::sum, comm, ccl_stream).wait();
}

static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
queue_ptr stream) {
const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
@@ -1736,6 +1745,16 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
}

int ggml_backend_sycl_rank() {
// use ccl rank as main gpu
return dpct::dev_mgr::instance().get_ccl_rank();
}

int ggml_backend_sycl_world_size() {
// use ccl world size as the number of participating devices
return dpct::dev_mgr::instance().get_ccl_world_size();
}

void ggml_backend_sycl_print_sycl_devices() {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
int device_count = dpct::dev_mgr::instance().device_count();
@@ -2270,6 +2289,21 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor
(void) src1_dd;
}

inline void ggml_sycl_op_allreduce(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
ggml_tensor *dst, const float *src0_dd,
const float *src1_dd, float *dst_dd,
const queue_ptr &main_stream) {

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);

allreduce_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

(void) src1;
(void) dst;
(void) src1_dd;
}

static void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst,
const float *src0_dd, const float *src1_dd,
@@ -3179,6 +3213,13 @@ static void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *
GGML_SYCL_DEBUG("call %s done\n", __func__);
}

static void ggml_sycl_allreduce(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_allreduce);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}


static void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardsigmoid);
@@ -3530,6 +3571,9 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
} else {
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
}
if (src0->split_mode == tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN) {
ggml_sycl_allreduce(ctx, dst, src1, dst);
}
}


@@ -4193,6 +4237,41 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

static bool split_tensor(const struct ggml_tensor * src, void* dst, void* data, int split_mode) {
int rank = ggml_backend_sycl_rank();
int world_size = ggml_backend_sycl_world_size();
auto type_traits = ggml_internal_get_type_traits(src->type);
size_t element_size = type_traits.type_size / type_traits.blck_size;
const int64_t dst_size = ggml_nelements(src) * element_size / world_size;
switch (split_mode) {
case tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN: {
const int64_t nr = ggml_nrows(src);
const int64_t nc = src->ne[0];
const int64_t ndr = nr;
const int64_t ndc = nc / world_size;
for (size_t i = 0; i < nr; ++i) {
memcpy(((char*)dst) + i * ndc * element_size,
((char*)data) + i * nc * element_size + ndc * rank * element_size, ndc * element_size);
}
} break;
case tensor_parallel_mode::TENSOR_SPLIT_BY_ROW: {
memcpy(((char*)dst), ((char*)data) + dst_size * rank, dst_size);
} break;
case tensor_parallel_mode::TENSOR_KEEPED_ON_MASTER: {
if (rank == 0) {
memcpy(((char*)dst), ((char*)data), dst_size);
} else {
memset(((char*)dst), 0, dst_size);
}
} break;
default: {
return false;
} break;
}
return true;
}
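
For TENSOR_SPLIT_BY_COLUMN, each rank copies a contiguous ndc-element chunk out of every source row, offset by ndc * rank. A small standalone example of that indexing (not part of the commit), assuming world_size = 2 and a 2×4 float weight:

// Illustrates which elements rank 1 receives from the column split above.
#include <cstdio>
#include <cstring>

int main() {
    const int world_size = 2, rank = 1;
    const int nr = 2, nc = 4, ndc = nc / world_size;    // rows, columns, columns per rank
    float data[2][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}};    // full weight as stored on the host
    float dst[2][2];                                    // this rank's column slice
    for (int i = 0; i < nr; ++i) {
        memcpy(dst[i], &data[i][ndc * rank], ndc * sizeof(float));
    }
    printf("rank %d holds: %g %g / %g %g\n", rank, dst[0][0], dst[0][1], dst[1][0], dst[1][1]);
    // prints: rank 1 holds: 2 3 / 6 7
    return 0;
}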


static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor,
const void *data, size_t offset,
@@ -4205,7 +4284,14 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
SYCL_CHECK(
CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
char* host_buf = (char*)malloc(size);
memcpy(host_buf, data, size);

if (tensor->split_mode == tensor_parallel_mode::TENSOR_NO_CHANGE) {
memcpy(host_buf, data, size);
} else {
if (!split_tensor(tensor, host_buf, data, tensor->split_mode)) {
std::cerr << "split tensor failed!" << std::endl;
}
}
SYCL_CHECK(
CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
.wait()));
@@ -4419,14 +4505,25 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
if (dpct::dev_mgr::instance().get_ccl_world_size() > 1) {
auto rank = dpct::dev_mgr::instance().get_ccl_rank();
auto & device_tp = dpct::dev_mgr::instance().get_device(rank);
queue_ptr stream = &(device_tp.default_queue());
// TODO(xi): buffer_types always uses index 0 to avoid changes to the public code
ggml_backend_sycl_buffer_types[0] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{rank, GGML_SYCL_NAME + std::to_string(rank), stream},
};
} else {
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
auto & device_i = dpct::dev_mgr::instance().get_device(i);
queue_ptr stream = &(device_i.default_queue());
ggml_backend_sycl_buffer_types[i] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
};
}
}
ggml_backend_sycl_buffer_type_initialized = true;
}
return &ggml_backend_sycl_buffer_types[device];
35 changes: 34 additions & 1 deletion ggml/src/ggml-sycl/dpct/helper.hpp
@@ -15,6 +15,7 @@

#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <oneapi/ccl.hpp>
#include <oneapi/mkl.hpp>
#include <map>

@@ -479,6 +480,8 @@ namespace dpct
int _max_nd_range_size_i[3];
uint32_t _device_id;
std::array<unsigned char, 16> _uuid;
uint32_t _rank;
uint32_t _world_size;
};

static int get_major_version(const sycl::device &dev)
@@ -870,7 +873,12 @@ namespace dpct
}
return -1;
}

inline int get_ccl_rank() { return _rank; }
inline int get_ccl_world_size() { return _world_size; }
inline ccl::communicator create_ccl_communicator(ccl::device dev, ccl::context ctx) {
return ccl::create_communicator(_world_size, _rank, dev, ctx, _kvs);
}
inline std::string get_preferred_gpu_platform_name() {
std::string result;

@@ -993,6 +1001,26 @@ namespace dpct
static bool compare_backend(std::string &backend1, std::string &backend2) {
return convert_backend_index(backend1) < convert_backend_index(backend2);
}

void init_ccl() {
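// Bootstrap overview: rank 0 creates the main oneCCL key-value store (KVS) and
// broadcasts its address over MPI; every other rank receives that address and
// attaches to the same KVS, so all processes can later join one communicator
// via create_ccl_communicator().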
ccl::init();
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &_world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &_rank);
atexit(mpi_finalize);
ccl::kvs::address_type main_addr;
if (_rank == 0) {
_kvs = ccl::create_main_kvs();
main_addr = _kvs->get_address();
MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
}
else {
MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
_kvs = ccl::create_kvs(main_addr);
}

}

dev_mgr()
{
sycl::device default_device =
@@ -1050,6 +1078,7 @@ namespace dpct
_cpu_device = _devs.size() - 1;
}
}
init_ccl();
@NeoZhangJianyu (Collaborator) commented on Sep 26, 2024:
Move this init() function to ggml-sycl/src.

}
void check_id(unsigned int id) const
{
@@ -1066,6 +1095,10 @@ namespace dpct
/// thread-id to device-id map.
std::map<unsigned int, unsigned int> _thread2dev_map;
int _cpu_device = -1;
// For tensor parallelism
int _rank = 0;
int _world_size = 1;
ccl::shared_ptr_class<ccl::kvs> _kvs;
};

static inline sycl::queue &get_default_queue()
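
Because _rank and _world_size come straight from MPI_Comm_rank/MPI_Comm_size, a build of this branch would presumably be launched with one MPI process per GPU (for example under mpirun). Inside each process, backend code can then ask which slice it owns through the two accessors added above; a hypothetical helper, not part of the diff:

// Sketch only: maps this process onto its column range, mirroring ndc = nc / world_size.
#include <cstdint>

int ggml_backend_sycl_rank();         // declared for illustration; added in ggml-sycl.cpp above
int ggml_backend_sycl_world_size();   // declared for illustration; added in ggml-sycl.cpp above

static void tp_column_range(int64_t n_cols, int64_t & begin, int64_t & end) {
    const int64_t per_rank = n_cols / ggml_backend_sycl_world_size();
    begin = per_rank * ggml_backend_sycl_rank();
    end   = begin + per_rank;
}
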
1 change: 1 addition & 0 deletions include/llama.h
@@ -204,6 +204,7 @@ extern "C" {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
LLAMA_SPLIT_MODE_TENSOR = 3, // split tensors across GPUs
};

// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
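
How the new mode is selected is not shown in this commit; presumably it travels through the existing llama_model_params.split_mode field. A hypothetical caller, assuming the loader wires LLAMA_SPLIT_MODE_TENSOR through to the SYCL backend:

// Sketch only: every MPI rank would run this and keep just its own tensor slices.
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_TENSOR;   // value added by this PR
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);  // placeholder path
    if (model) {
        llama_free_model(model);
    }
    return model != nullptr ? 0 : 1;
}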