8000 SYCL: Implement few same quantized type copy kernels by qnixsynapse · Pull Request #13739 · ggml-org/llama.cpp · GitHub
[go: up one dir, main page]

Skip to content

SYCL: Implement few same quantized type copy kernels #13739

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 142 additions & 2 deletions ggml/src/ggml-sycl/cpy.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#include "cpy.hpp"

#include <float.h>
#include <string>

#include "dequantize.hpp"
#include "ggml-sycl/common.hpp"
#include "ggml.h"

static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
if (x <= val[0]) {
Expand Down Expand Up @@ -116,6 +119,38 @@ static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
}
}

/* quantized type same copy */
static void cpy_block_q8_0_q8_0(const char * cxi, char * cdsti) {
const block_q8_0 * xi = (const block_q8_0 *) cxi;
block_q8_0 * dsti = (block_q8_0 *) cdsti;
*dsti = *xi;
}

static void cpy_block_q5_0_q5_0(const char * cxi, char * cdsti) {
const block_q5_0 * xi = (const block_q5_0 *) cxi;
block_q5_0 * dsti = (block_q5_0 *) cdsti;
*dsti = *xi;
}


static void cpy_block_q5_1_q5_1(const char * cxi, char * cdsti) {
const block_q5_1 * xi = (const block_q5_1 *) cxi;
block_q5_1 * dsti = (block_q5_1 *) cdsti;
*dsti = *xi;
}

static void cpy_block_q4_0_q4_0(const char * cxi, char * cdsti) {
const block_q4_0 * xi = (const block_q4_0 *) cxi;
block_q4_0 * dsti = (block_q4_0 *) cdsti;
*dsti = *xi;
}

static void cpy_block_q4_1_q4_1(const char * cxi, char * cdsti) {
const block_q4_1 * xi = (const block_q4_1 *) cxi;
block_q4_1 * dsti = (block_q4_1 *) cdsti;
*dsti = *xi;
}

static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
float * cdstf = (float *) (cdsti);

Expand Down Expand Up @@ -311,6 +346,34 @@ template <dequantize_kernel_t dequant, int qk> static void cpy_blck_q_f32(const
}
}


template <cpy_kernel_t cpy_blck, int qk>
static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
const sycl::nd_item<3> & item_ct1) {
const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk;

if (i >= ne) {
return;
}

const int i03 = i / (ne00 * ne01 * ne02);
const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;


const int i13 = i / (ne10 * ne11 * ne12);
const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;

cpy_blck(cx + x_offset, cdst + dst_offset);
}

template <cpy_kernel_t cpy_blck, int qk>
static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
Expand All @@ -322,6 +385,7 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00
return;
}


const int i03 = i / (ne00 * ne01 * ne02);
const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
Expand Down Expand Up @@ -615,6 +679,70 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
}
}

static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13, queue_ptr stream) {

const int num_blocks = ne;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
cpy_q_q<cpy_block_q8_0_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
});
}


static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13, queue_ptr stream) {

const int num_blocks = ne;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
cpy_q_q<cpy_block_q5_0_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
});
}


static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13, queue_ptr stream) {

const int num_blocks = ne;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
cpy_q_q<cpy_block_q5_1_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
});
}


static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13, queue_ptr stream) {

const int num_blocks = ne;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
cpy_q_q<cpy_block_q4_0_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
});
}


static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13, queue_ptr stream) {

const int num_blocks = ne;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
cpy_q_q<cpy_block_q4_1_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
});
}

void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
// Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
Expand All @@ -632,8 +760,10 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co

char * src0_ddc = (char *) src0->data;
char * src1_ddc = (char *) src1->data;

if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
if ((src0->type == src1->type) && (ggml_is_contiguous(src0) && ggml_is_contiguous(src1))) {
GGML_SYCL_DEBUG("%s: memcpy path\n", __func__);
main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0));
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
Expand Down Expand Up @@ -684,6 +814,16 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_Q8_0) {
ggml_cpy_q8_0_q8_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_Q5_0) {
ggml_cpy_q5_0_q5_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_Q5_1) {
ggml_cpy_q5_1_q5_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_Q4_0) {
ggml_cpy_q4_0_q4_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_Q4_1) {
ggml_cpy_q4_1_q4_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type),
ggml_type_name(src1->type));
Expand Down
18 changes: 18 additions & 0 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4165,6 +4165,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
{
ggml_type src0_type = op->src[0]->type;
ggml_type src1_type = op->src[1]->type;
if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]))) {
return true;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return true;
}
Expand Down Expand Up @@ -4210,6 +4213,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
return true;
}
if(src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_Q8_0) {
return true;
}
if(src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_Q5_0) {
return true;
}
if(src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_Q5_1) {
return true;
}
if(src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_Q4_0) {
return true;
}
if(src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_Q4_1) {
return true;
}
return false;
}
case GGML_OP_CONCAT:
Expand Down
Loading
0