sycl: reordered Q4_K MMVQ (#13109) · ggml-org/llama.cpp@64bb51c · GitHub
Commit 64bb51c

sycl: reordered Q4_K MMVQ (#13109)
1 parent 9c404ed commit 64bb51c

7 files changed: +280 additions, −84 deletions

ggml/src/ggml-sycl/convert.cpp

Lines changed: 29 additions & 2 deletions
@@ -183,6 +183,24 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
     }
 }
 
+template <typename dst_t>
+static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    const size_t local_size = 32;
+    const size_t global_size = nb * local_size;
+
+    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+    stream->submit([&](sycl::handler & cgh) {
+        sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
+
+        cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
+                         [=](sycl::nd_item<1> item_ct1) {
+                             dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
+                         });
+    });
+}
+
 template <typename dst_t>
 static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
                                      dpct::queue_ptr stream) {
@@ -504,7 +522,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
         case GGML_TYPE_Q3_K:
             return dequantize_row_q3_K_sycl;
         case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_sycl;
+            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_K_sycl_reorder;
+            } else {
+                return dequantize_row_q4_K_sycl;
+            }
         case GGML_TYPE_Q5_K:
             return dequantize_row_q5_K_sycl;
         case GGML_TYPE_Q6_K:
@@ -556,7 +578,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
         case GGML_TYPE_Q3_K:
             return dequantize_row_q3_K_sycl;
         case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_sycl;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_K_sycl_reorder;
+            } else {
+                return dequantize_row_q4_K_sycl;
+            }
         case GGML_TYPE_Q5_K:
             return dequantize_row_q5_K_sycl;
         case GGML_TYPE_Q6_K:

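The new dequantize_row_q4_K_sycl_reorder launcher maps one 32-work-item work-group to each Q4_K super-block and stages the 12 packed scale bytes in a local accessor. Below is a minimal host-side sketch of that launch arithmetic; QK_K == 256 is assumed from ggml's usual configuration, and the tensor size is made up for illustration.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Sketch only: mirrors the launch shape used by dequantize_row_q4_K_sycl_reorder.
// QK_K == 256 is assumed; k is a hypothetical tensor size.
int main() {
    const int64_t QK_K = 256;              // values per Q4_K super-block
    const int64_t k    = 4096 * 32;        // hypothetical number of values to dequantize
    const int64_t nb   = k / QK_K;         // one work-group per super-block
    const size_t  local_size  = 32;        // work-items per work-group
    const size_t  global_size = nb * local_size;

    std::printf("%lld super-blocks -> nd_range(global = %zu, local = %zu), 12 scale bytes staged in local memory\n",
                (long long) nb, global_size, local_size);
    return 0;
}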
ggml/src/ggml-sycl/dequantize.hpp

Lines changed: 59 additions & 21 deletions
@@ -357,6 +357,28 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
 }
 #endif
 
+template <typename dst_t>
+inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall,
+                                   const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) {
+    const int is = 2 * il;
+    constexpr int n = 4;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, scales_local, sc, m);
+    const float d1 = dall * sc;
+    const float m1 = dmin * m;
+
+    get_scale_min_k4(is + 1, scales_local, sc, m);
+    const float d2 = dall * sc;
+    const float m2 = dmin * m;
+
+    sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(qs_ptr + 32 * il + n * ir);
+    for (int l = 0; l < n; ++l) {
+        y[l + 0]  = d1 * (q_vec[l] & 0xF) - m1;
+        y[l + 32] = d2 * (q_vec[l] >> 4) - m2;
+    }
+}
+
 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                   uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
@@ -365,36 +387,22 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
     const int64_t i = item_ct1.get_group(2);
 
 #if QK_K == 256
-    // assume 32 threads
     const int64_t tid = item_ct1.get_local_id(2);
-    const int64_t il = tid/8;
-    const int64_t ir = tid%8;
-    const int64_t is = 2*il;
-    const int64_t n  = 4;
+    const int64_t il = tid / 8;
+    const int64_t ir = tid % 8;
 
-    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+    dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
 
     const sycl::half2 dm = x[i].dm;
     const float dall = dm[0];
     const float dmin = dm[1];
 
-    if (tid < 12)
+    if (tid < 12) {
         scales_local[tid] = x[i].scales[tid];
-    item_ct1.barrier(sycl::access::fence_space::local_space);
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, scales_local, sc, m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, scales_local, sc, m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
-        y[l +32] = d2 * (q_vec[l] >> 4) - m2;
     }
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir);
 #else
     const int64_t tid = item_ct1.get_local_id(2);
     const uint8_t * q = x[i].qs;
@@ -406,6 +414,36 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
 #endif
 }
 
+template <typename dst_t>
+static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local,
+                                          const sycl::nd_item<1> & item_ct1, int64_t nb) {
+    const int64_t i   = item_ct1.get_group(0);     // block index
+    const int64_t tid = item_ct1.get_local_id(0);  // thread index within block
+    const int64_t il  = tid / 8;
+    const int64_t ir  = tid % 8;
+
+    dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
+
+    const uint8_t * base          = static_cast<const uint8_t *>(vx);
+    const size_t    qs_offset     = i * (QK_K / 2);
+    const size_t    scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
+    const size_t    dm_offset     = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2);
+
+    const uint8_t * qs_ptr     = base + qs_offset;
+    const uint8_t * scales_ptr = base + scales_offset;
+    ggml_half2      dm_values  = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
+
+    const float dall = dm_values.x();
+    const float dmin = dm_values.y();
+
+    if (tid < 12) {
+        scales_local[tid] = scales_ptr[tid];
+    }
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir);
+}
+
 template<typename dst_t>
 static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                   const sycl::nd_item<3> &item_ct1) {

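Both the original and the reordered kernels now share dequantize_q4_K_common, which keeps the original thread mapping: with il = tid / 8 and ir = tid % 8, each of the 32 work-items writes four low-nibble and four high-nibble values. The standalone sketch below checks that this mapping covers all 256 values of a super-block exactly once; QK_K == 256 is assumed, and the check itself is not part of the commit.

#include <cassert>
#include <cstdio>

// Sketch only: verifies the 32-work-item mapping used by dequantize_q4_K_common
// (base = 64*il + 4*ir, writes at base+l and base+l+32 for l in [0, 4)).
int main() {
    const int QK_K = 256;
    int hits[256] = {0};

    for (int tid = 0; tid < 32; ++tid) {
        const int il   = tid / 8;
        const int ir   = tid % 8;
        const int base = 64 * il + 4 * ir;   // offset of y within the super-block
        for (int l = 0; l < 4; ++l) {
            ++hits[base + l];        // low nibble:  d1 * (q & 0xF) - m1
            ++hits[base + l + 32];   // high nibble: d2 * (q >> 4)  - m2
        }
    }

    for (int k = 0; k < QK_K; ++k) {
        assert(hits[k] == 1);        // every value written exactly once
    }
    std::printf("32 work-items x 8 values = %d values per block, no overlap\n", QK_K);
    return 0;
}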
ggml/src/ggml-sycl/dmmv.cpp

Lines changed: 7 additions & 1 deletion
@@ -1129,7 +1129,13 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
             dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                // reorder is currently not supported for dmmv
+                GGML_ABORT("Unimplemented dequantize case case for q4_k reorder");
+            } else {
+                dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            }
             break;
         case GGML_TYPE_Q5_K:
             dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 65 additions & 15 deletions
@@ -352,7 +352,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
         assert(tensor->view_src->buffer->buft == buffer->buft);
         return GGML_STATUS_SUCCESS;
     }
-    if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
+    if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
         ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
         tensor->extra = extra;
         ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
@@ -2900,6 +2900,8 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
             return true;
+        case GGML_TYPE_Q4_K:
+            return !g_ggml_sycl_prioritize_dmmv;
         default:
             return false;
     }
@@ -2917,6 +2919,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
 inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_K:
             return true;
         default:
             return false;
@@ -2942,16 +2945,16 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                       size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
+                            dpct::queue_ptr stream) {
+    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
     SYCL_CHECK(
         CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
                             .wait()));
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
 
     stream->parallel_for(
@@ -2965,18 +2968,59 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
             *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
         }
         *(d_ptr + ib) = x[ib].d;
-    });
+        }).wait_and_throw();
+
+    sycl::free(tmp_buf, *stream);
+}
+
+static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
+    GGML_ASSERT(size % sizeof(block_q4_K) == 0);
+    GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
+
+    const int nblocks = size / sizeof(block_q4_K);
+
+    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+
+    auto * qs_ptr     = data_device;
+    auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
+    auto * dm_ptr     = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
+
+    stream->parallel_for(nblocks, [=](auto i) {
+        const block_q4_K * x = (const block_q4_K *) tmp_buf;
+        const int ib = i;
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
+        }
+
+        for (int j = 0; j < K_SCALE_SIZE; ++j) {
+            scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
+        }
+
+        dm_ptr[ib] = x[ib].dm;
+    }).wait_and_throw();
 
     sycl::free(tmp_buf, *stream);
 }
 
 static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
+    uint8_t * data_device = (uint8_t *) src0->data;
     size_t ncols = src0->ne[0];
     size_t nrows = src0->ne[1];
     size_t size = ggml_nbytes(src0);
 
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            reorder_qw_q4_k(data_device, size, 0, stream);
+            break;
+        default:
+            GGML_ABORT("reorder_qw() called with unsupported type");
+            break;
+    }
 }
 
 static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
@@ -3019,8 +3063,18 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0,
     extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
 }
 
-static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
+static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+           src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+}
+
+static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    return ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+           src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+}
+
+static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
     int64_t min_compute_capability = INT_MAX;
 
@@ -3043,13 +3097,9 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
     }
 
     // check data types and tensor shapes for custom matrix multiplication kernels:
-    bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+    bool use_dequantize_mul_mat_vec = can_use_dequantize_mul_mat_vec(src0, src1, dst);
 
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+    bool use_mul_mat_vec_q = can_use_mul_mat_vec_q(src0, src1, dst);
 
     bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

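reorder_qw_q4_k rewrites the tensor in place, from an array of block_q4_K into three contiguous regions: all qs bytes, then all scale groups, then all dm pairs. The sketch below illustrates why the rewrite fits in the original allocation; QK_K == 256, K_SCALE_SIZE == 12 and a 4-byte ggml_half2 are assumptions about ggml's usual Q4_K layout rather than values taken from this diff, and the block count is made up for the example.

#include <cassert>
#include <cstddef>
#include <cstdio>

// Sketch only: region offsets of the reordered Q4_K layout and a check that
// the struct-of-arrays form occupies exactly sizeof(block_q4_K) per block.
int main() {
    const size_t QK_K = 256, K_SCALE_SIZE = 12, HALF2_SIZE = 4;          // assumed ggml constants
    const size_t block_bytes = QK_K / 2 + K_SCALE_SIZE + HALF2_SIZE;     // 128 + 12 + 4 = 144

    const size_t nblocks = 1024;  // hypothetical tensor with 1024 super-blocks

    // Region base offsets used by both reorder_qw_q4_k and dequantize_block_q4_K_reorder:
    const size_t qs_base     = 0;
    const size_t scales_base = nblocks * (QK_K / 2);
    const size_t dm_base     = scales_base + nblocks * K_SCALE_SIZE;
    const size_t total       = dm_base + nblocks * HALF2_SIZE;

    assert(total == nblocks * block_bytes);  // same footprint as the original array-of-blocks layout
    std::printf("qs @ %zu, scales @ %zu, dm @ %zu, total %zu bytes (%zu per block)\n",
                qs_base, scales_base, dm_base, total, block_bytes);
    return 0;
}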
ggml/src/ggml-sycl/mmvq.cpp

Lines changed: 29 additions & 2 deletions
@@ -24,6 +24,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __restrict__ vy,
     const int blocks_per_row = ncols / block_traits::qk;
     constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
     constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
+    const int nblocks = nrows * (ncols / block_traits::qk);
 
     static_assert(blocks_per_subgroup > 0);
     static_assert(block_elements_per_subgroup > 0);
@@ -45,7 +46,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __restrict__ vy,
             // x block quant index when casting the quants to int
             const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
 
-            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
+            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
         }
     }
 
@@ -739,6 +740,27 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
     }
 }
 
+static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
+                                               const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+
+    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
+    constexpr size_t num_subgroups = 16;
+    GGML_ASSERT(block_num_y % num_subgroups == 0);
+
+    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
+    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
+
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
+                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
+                                                                                           nrows, nd_item);
+                         });
+    });
+}
+
+
 static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
@@ -1035,7 +1057,12 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
             mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            } else {
+                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            }
            break;
         case GGML_TYPE_Q5_K:
             mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);

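reorder_mul_mat_vec_q4_k_q8_1_sycl launches a 3-D nd_range whose third dimension packs 16 subgroups per work-group. A sketch of that arithmetic follows; WARP_SIZE == 32 and GGML_SYCL_MMV_Y == 1 are assumed backend defaults, not values taken from this diff, and under those assumptions the launch works out to roughly one subgroup per output row.

#include <cstdio>

// Sketch only: launch arithmetic mirroring reorder_mul_mat_vec_q4_k_q8_1_sycl.
// The constants below are assumptions for illustration.
constexpr int WARP_SIZE       = 32;
constexpr int GGML_SYCL_MMV_Y = 1;
constexpr int num_subgroups   = 16;

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
    const int nrows       = 4096;                              // hypothetical number of output rows
    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);  // must be a multiple of num_subgroups

    // global range (1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE)
    // local  range (1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE)
    const int work_groups      = block_num_y / num_subgroups;
    const int subgroups_per_wg = GGML_SYCL_MMV_Y * num_subgroups;
    const int total_subgroups  = work_groups * subgroups_per_wg;  // == block_num_y * GGML_SYCL_MMV_Y

    std::printf("%d work-groups x %d subgroups = %d subgroups for %d rows\n",
                work_groups, subgroups_per_wg, total_subgroups, nrows);
    return 0;
}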