CANN: Support MOE Model MUL_MAT_ID (#13042) · robbiemu/llama.cpp@33d7aed · GitHub
Commit 33d7aed

CANN: Support MOE Model MUL_MAT_ID (ggml-org#13042)

Signed-off-by: noemotiovon <757486878@qq.com>

1 parent 6a2bc8b commit 33d7aed

File tree: 3 files changed, +183 -2 lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 147 additions & 0 deletions
@@ -65,6 +65,7 @@
 #include <aclnnop/aclnn_eq_tensor.h>
 #include <aclnnop/aclnn_gt_scalar.h>
 #include <aclnnop/aclnn_pow.h>
+#include <aclnnop/aclnn_grouped_matmul_v2.h>
 #include <float.h>
 
 #include <cmath>
@@ -2587,3 +2588,149 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
     ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
 }
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * floating-point precision using the CANN backend.
+ *
+ * This function executes a matrix multiplication operation tailored for
+ * Mixture of Experts (MoE) models, where the input tensor is multiplied
+ * with expert-specific weight matrices. It uses the CANN backend for
+ * efficient computation and stores the result in the destination tensor `dst`.
+ * The operation may leverage identity-based optimizations or routing masks
+ * as part of sparse expert selection.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the MoE multiplication result
+ * will be stored.
+ *
+ * @note This function assumes floating-point data types and is designed for
+ * MoE architectures, possibly involving sparse expert routing.
+ */
+static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    // dst [M, K, N, 1]
+    ggml_tensor * src0 = dst->src[0];  // src0 [D, M, A, 1]
+    ggml_tensor * src1 = dst->src[1];  // src1 [D, B, N, 1], B = K or B = 1
+    ggml_tensor * ids  = dst->src[2];  // ids  [K, N]
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // copy index from npu to cpu
+    int64_t n_as  = ne02;        // A
+    int64_t n_ids = ids->ne[0];  // K
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+                           ACL_MEMCPY_DEVICE_TO_HOST);
+    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *) dst->data;
+    size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
+
+    // src0 is F16, src1 is F32, dst is F32
+    ggml_cann_pool_alloc src0_cast_allocator;
+    if (src0->type == GGML_TYPE_F16) {
+        src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
+        void* src0_cast_buf = src0_cast_allocator.get();
+
+        size_t cast_nb[GGML_MAX_DIMS];
+        cast_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
+        }
+
+        aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
+        aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
+            ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
+        GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
+        ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
+
+        src0_original = (char *) src0_cast_buf;
+        memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
+    }
+
+    std::vector<aclTensor*> src0_tensor_vec;
+    std::vector<aclTensor*> src1_tensor_vec;
+    std::vector<aclTensor*> dst_tensor_vec;
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // src0_row [M, D] -> weight && permute
+            int64_t src0_ne[2] = {ne01, ne00};
+            size_t  src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
+            // src1_row [D, 1] -> input
+            int64_t src1_ne[2] = {ne10, 1};
+            size_t  src1_nb[2] = {nb10, nb11};
+            // dst_row [M, 1] -> out
+            int64_t dst_ne[2] = {ne0, 1};
+            size_t  dst_nb[2] = {nb0, nb1};
+
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
+
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
+                    ACL_FLOAT, sizeof(float),
+                    src0_ne, src0_nb, 2);
+            aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
+                    ACL_FLOAT, sizeof(float),
+                    src1_ne, src1_nb, 2);
+            aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
+                    ACL_FLOAT, sizeof(float),
+                    dst_ne, dst_nb, 2);
+
+            src0_tensor_vec.push_back(acl_src0);
+            src1_tensor_vec.push_back(acl_src1);
+            dst_tensor_vec.push_back(acl_dst);
+        }
+    }
+
+    // GroupedMatmulV2 requires tensor_list.size < 128
+    size_t GROUP_SIZE = 128;
+    std::vector<std::vector<aclTensor*>> src0_tensor_vec_vec;
+    std::vector<std::vector<aclTensor*>> src1_tensor_vec_vec;
+    std::vector<std::vector<aclTensor*>> dst_tensor_vec_vec;
+
+    // split and call GroupedMatmulV2
+    for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
+        size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
+        std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
+        std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
+        std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
+
+        aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
+        aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
+        aclTensorList* dst_tensor_list  = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
+
+        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+            nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
+
+        ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
+    }
+    return;
+}
+
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    const enum ggml_type type = dst->src[0]->type;
+    switch (type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+            ggml_cann_mul_mat_id_fp(ctx, dst);
+            break;
+        default:
+            GGML_ABORT("Unsupported type for mul_mat_id");
+            break;
+    }
+}
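
The inner loops above build one [M, D] × [D, 1] product per (token, slot) pair, look up that pair's expert index on the host, and dispatch the per-row products to the NPU in batches of at most 128 through GroupedMatmulV2. For reference, a naive CPU sketch of the same MUL_MAT_ID semantics (assuming contiguous buffers and using a hypothetical mul_mat_id_reference helper written only for illustration, not part of this commit) looks like this:

#include <cassert>
#include <cstdint>

// Naive reference for MUL_MAT_ID (illustration only, assumes contiguous tensors):
//   src0: [D, M, A] expert weights (A experts, each a [D, M] matrix)
//   src1: [D, B, N] activations, with B == K or B == 1 (broadcast)
//   ids : [K, N]    expert chosen for slot k of token t
//   dst : [M, K, N]
static void mul_mat_id_reference(const float * src0, const float * src1,
                                 const int32_t * ids, float * dst,
                                 int64_t D, int64_t M, int64_t A,
                                 int64_t B, int64_t K, int64_t N) {
    for (int64_t t = 0; t < N; t++) {              // token
        for (int64_t k = 0; k < K; k++) {          // routing slot
            int64_t e = ids[t*K + k];              // expert index for this slot
            assert(e >= 0 && e < A);               // mirrors the GGML_ASSERT above
            int64_t b = (B == 1) ? 0 : k;          // broadcast input row when B == 1
            const float * w = src0 + e*M*D;        // weights of expert e, [D, M]
            const float * x = src1 + (t*B + b)*D;  // input column for (token t, slot b)
            float * y = dst + (t*K + k)*M;         // output column for (token t, slot k)
            for (int64_t m = 0; m < M; m++) {
                float acc = 0.0f;
                for (int64_t d = 0; d < D; d++) {
                    // dst[m, k, t] = sum_d src0[d, m, e] * src1[d, b, t]
                    acc += w[m*D + d] * x[d];
                }
                y[m] = acc;
            }
        }
    }
}

The CANN path produces the same result, but casts F16 weights to F32 once up front and replaces the two inner loops with a single grouped matmul per batch of at most 128 row products.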

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 27 additions & 0 deletions
@@ -978,6 +978,33 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
     }
 }
 
+/**
+ * @brief Performs sparse expert-based matrix multiplication using the CANN backend.
+ *
+ * @details This function implements a MoE-style batched matrix multiplication, where each input token
+ * is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
+ * in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
+ *
+ * For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
+ * performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
+ * and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
+ *
+ * Dimensions:
+ * - src0: [D, M, A, 1], where A is the number of experts
+ * - src1: [D, B, N, 1], where N is the batch size and B is the slot count per sample
+ * - ids : [K, N], where K is the number of experts each token is routed to
+ * - dst : [M, K, N, 1], output tensor storing the result of the expert × token multiplication
+ *
+ * The function handles two main modes:
+ * - If `ne12 == 1`, a simpler per-token loop is used.
+ * - TODO: If `ne12 > 1`, grouped multiplication and memory copying are used for efficiency.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the expert-weighted token outputs are stored.
+ *            Expected to be of shape [M, K, N, 1].
+ */
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
  * @brief Applies a element-wise operation to two input tensors using the CANN
  * backend.
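
As a worked example of the dimension legend above (the concrete sizes are illustrative, not taken from the commit): with A = 8 experts, top-K routing of K = 2, hidden size D = 4096, output width M = 11008, and a batch of N = 4 tokens, the operands are src0 = [4096, 11008, 8, 1], ids = [2, 4], src1 = [4096, 2, 4, 1] (or [4096, 1, 4, 1] when the input row is broadcast), and dst = [11008, 2, 4, 1]. Each output column then satisfies dst[m, k, t] = sum_d src0[d, m, ids[k, t]] * src1[d, b, t], with b = 0 when B = 1 and b = k otherwise.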

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 9 additions & 2 deletions
@@ -1672,7 +1672,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_mul_mat(ctx, dst);
             break;
         case GGML_OP_MUL_MAT_ID:
-            return false;
+            ggml_cann_mul_mat_id(ctx, dst);
+            break;
         case GGML_OP_SCALE:
             ggml_cann_scale(ctx, dst);
             break;
@@ -2030,7 +2031,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
         }
         case GGML_OP_MUL_MAT_ID:
-            return false;
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
         // embedding
         case GGML_OP_GET_ROWS: {
             switch (op->src[0]->type) {
