@@ -65,6 +65,7 @@
 #include <aclnnop/aclnn_eq_tensor.h>
 #include <aclnnop/aclnn_gt_scalar.h>
 #include <aclnnop/aclnn_pow.h>
+#include <aclnnop/aclnn_grouped_matmul_v2.h>
 #include <float.h>

 #include <cmath>
@@ -2587,3 +2588,149 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){

     ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
 }
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * floating-point precision using the CANN backend.
+ *
+ * This function executes a matrix multiplication operation tailored for
+ * Mixture of Experts (MoE) models, where the input tensor is multiplied
+ * with expert-specific weight matrices. It uses the CANN backend for
+ * efficient computation and stores the result in the destination tensor `dst`.
+ * The operation may leverage identity-based optimizations or routing masks
+ * as part of sparse expert selection.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the MoE multiplication result
+ *            will be stored.
+ *
+ * @note This function assumes floating-point data types and is designed for
+ *       MoE architectures, possibly involving sparse expert routing.
+ */
+static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    // dst  [M, K, N, 1]
+    ggml_tensor * src0 = dst->src[0];  // src0 [D, M, A, 1]
+    ggml_tensor * src1 = dst->src[1];  // src1 [D, B, N, 1], B = K or B = 1
+    ggml_tensor * ids  = dst->src[2];  // ids  [K, N]
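+    // D = input features, M = output features per expert, A = number of experts,
+    // N = number of tokens, K = experts selected per token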
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // copy the routing indices from NPU to CPU
+    int64_t n_as  = ne02;        // A
+    int64_t n_ids = ids->ne[0];  // K
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+        ACL_MEMCPY_DEVICE_TO_HOST);
+    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
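+    // the loop below reads these indices on the host to locate, for every
+    // (token, expert) pair, the matching rows of src0, src1 and dst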
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *) dst->data;
+    size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
+
+    // if src0 is F16 (src1 and dst are F32), cast it to F32 so every operand
+    // below is handled as F32
+    ggml_cann_pool_alloc src0_cast_allocator;
+    if (src0->type == GGML_TYPE_F16) {
+        src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
+        void* src0_cast_buf = src0_cast_allocator.get();
+
+        size_t cast_nb[GGML_MAX_DIMS];
+        cast_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
+        }
+
+        aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
+        aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
+            ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
+        GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
+        ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
+
+        src0_original = (char *) src0_cast_buf;
+        memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
+    }
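+    // from this point on, src0_original and ori_src0_nb always describe F32
+    // data: either the original tensor or the freshly cast copy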
+
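+    // build one 2D view (a matrix-vector product) per (token, selected expert)
+    // pair; the views are batched through GroupedMatmulV2 further below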
+    std::vector<aclTensor*> src0_tensor_vec;
+    std::vector<aclTensor*> src1_tensor_vec;
+    std::vector<aclTensor*> dst_tensor_vec;
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // src0_row [M, D] -> weight && permute
+            int64_t src0_ne[2] = {ne01, ne00};
+            size_t  src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
+            // src1_row [D, 1] -> input
+            int64_t src1_ne[2] = {ne10, 1};
+            size_t  src1_nb[2] = {nb10, nb11};
+            // dst_row [M, 1] -> out
+            int64_t dst_ne[2] = {ne0, 1};
+            size_t  dst_nb[2] = {nb0, nb1};
+
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr  = dst_original  + i1*nb1 + i2*nb2;
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
+                ACL_FLOAT, sizeof(float), src0_ne, src0_nb, 2);
+            aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
+                ACL_FLOAT, sizeof(float), src1_ne, src1_nb, 2);
+            aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
+                ACL_FLOAT, sizeof(float), dst_ne, dst_nb, 2);
+
+            src0_tensor_vec.push_back(acl_src0);
+            src1_tensor_vec.push_back(acl_src1);
+            dst_tensor_vec.push_back(acl_dst);
+        }
+    }
+
+    // GroupedMatmulV2 requires tensor_list.size < 128
+    size_t GROUP_SIZE = 128;
+
+    // split and call GroupedMatmulV2
+    for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
+        size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
+        std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
+        std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
+        std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
+
+        aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
+        aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
+        aclTensorList* dst_tensor_list  = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
+
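+        // src1 provides the grouped input rows and src0 the per-group weight
+        // matrices; the remaining optional arguments of GroupedMatmulV2 are unused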
+        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+            nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
+
+        ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
+    }
+    return;
+}
+
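+/**
+ * @brief Entry point for MUL_MAT_ID on the CANN backend. Dispatches to the
+ * implementation that matches the data type of the expert weights (src0).
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor; dst->src[0] holds the expert weights,
+ *            dst->src[1] the input tokens and dst->src[2] the selected expert ids.
+ */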
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    const enum ggml_type type = dst->src[0]->type;
+    switch (type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+            ggml_cann_mul_mat_id_fp(ctx, dst);
+            break;
+        default:
+            GGML_ABORT("Unsupported type for mul_mat_id");
+            break;
+    }
+}