ggml : group all experts in a single ggml_mul_mat_id by slaren · Pull Request #6505 · ggml-org/llama.cpp · GitHub

ggml : group all experts in a single ggml_mul_mat_id #6505


Merged
merged 22 commits on Apr 18, 2024
Changes from 1 commit
cleanup
slaren committed Apr 17, 2024
commit bf56fdecb3deaf404bcccf7a9d12bc9fa307c028
ggml.c: 2 changes (1 addition & 1 deletion)
@@ -4588,7 +4588,7 @@ void ggml_mul_mat_set_prec(

in b, n_experts_used can be broadcasted to match the n_expert_used of ids

c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e in ids
c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
*/
struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
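
The new comment line (the second of the pair above) quantifies the formula over both e and t: every entry ids[e,t] selects the expert matrix as[:,:,i] and applies it to the matching column of b, with b optionally broadcast along its expert dimension to cover all of ids. As a minimal reference sketch of that semantics (plain row-major float buffers, illustrative names, and an assumed broadcast rule of e modulo b's expert dimension; not the actual ggml kernel):

#include <cstdint>

// Hypothetical reference loop for the grouped ggml_mul_mat_id semantics above.
// as  : [n_expert][n_out][n_k]             one weight matrix per expert
// b   : [n_tokens][n_eb][n_k]              n_eb is 1 (broadcast) or n_expert_used
// ids : [n_tokens][n_expert_used]          selected expert id per slot and token
// c   : [n_tokens][n_expert_used][n_out]   one output row per slot and token
static void mul_mat_id_ref(const float * as, const float * b, const int32_t * ids, float * c,
                           int n_k, int n_out, int n_expert_used, int n_eb, int n_tokens) {
    for (int t = 0; t < n_tokens; ++t) {
        for (int e = 0; e < n_expert_used; ++e) {
            const int i  = ids[t*n_expert_used + e]; // i = ids[e,t]
            const int eb = e % n_eb;                 // assumed broadcast over b's expert dimension
            for (int o = 0; o < n_out; ++o) {
                float sum = 0.0f;
                for (int k = 0; k < n_k; ++k) {
                    sum += as[(i*n_out + o)*n_k + k] * b[(t*n_eb + eb)*n_k + k];
                }
                c[(t*n_expert_used + e)*n_out + o] = sum; // c[:,e,t] ~= as[:,:,i] @ b[:,eb,t]
            }
        }
    }
}

With n_eb == 1 the same input column b[:,0,t] is fed to every selected expert of token t, which is the broadcast case the comment describes.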
ggml.h: 1 change (0 additions & 1 deletion)
@@ -1161,7 +1161,6 @@ extern "C" {
enum ggml_prec prec);

// indirect matrix multiplication
// TODO: document
GGML_API struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
struct ggml_tensor * as,
llama.cpp: 2 changes (2 additions & 0 deletions)
@@ -7365,6 +7365,7 @@ struct llm_build_context {
n_expert, n_expert_used,
LLM_FFN_SILU, true,
cb, il);
cb(cur, "ffn_moe_out", il);

cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
@@ -8694,6 +8695,7 @@ struct llm_build_context {
n_expert, n_expert_used,
LLM_FFN_SILU, false,
cb, il);
cb(cur, "ffn_moe_out", il);

// FFN shared expert
{
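
Both llama.cpp hunks add cb(cur, "ffn_moe_out", il) immediately after the MoE FFN is built (its trailing arguments are visible above), passing the grouped expert output through the graph-build callback so the tensor gets a per-layer name. As a rough sketch of what such a naming callback typically does, assuming the llm_build_cb signature void(ggml_tensor * cur, const char * name, int il); a simplified illustration, not the full callback from llama_build_graph:

// Simplified naming callback: append the layer index when one is given.
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
    if (il >= 0) {
        ggml_format_name(cur, "%s-%d", name, il); // e.g. "ffn_moe_out-12"
    } else {
        ggml_set_name(cur, name);
    }
};

The existing cb(cur, "ffn_out", il) a couple of lines later names the residual-added result the same way.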
scripts/compare-commits.sh: 14 changes (1 addition & 13 deletions)
@@ -12,19 +12,7 @@ bench_args="${@:3}"

rm -f llama-bench.sqlite

backend="cpu"

if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi

make_opts=""

#if [[ "$backend" == "cuda" ]]; then
# make_opts="LLAMA_CUDA=1"
#fi
# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)

git checkout $1
make clean && make -j32 $make_opts llama-bench
tests/test-backend-ops.cpp: 91 changes (0 additions & 91 deletions)
@@ -1613,7 +1613,6 @@ struct test_llm : public test_case {
}
};


// Llama
struct test_llama : public test_llm {
static constexpr float freq_base = 10000.0f;
@@ -1860,90 +1859,6 @@ struct test_falcon : public test_llm {
}
};


// Mixtral MOE
struct test_moe : public test_case {
const int n_expert;
const int n_expert_used;
const int n_tokens;
const int n_embd;
const int n_ff;

std::string op_desc(ggml_tensor * t) override {
return "MOE";

GGML_UNUSED(t);
}

std::string vars() override {
return VARS_TO_STR5(n_expert, n_expert_used, n_tokens, n_embd, n_ff);
}

test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
: n_expert(n_experts), n_expert_used(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
}

ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_type wtype = GGML_TYPE_F32;
ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_expert);

ggml_tensor * ffn_gate_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);
ggml_tensor * ffn_down_exps = ggml_new_tensor_3d(ctx, wtype, n_ff, n_embd, n_expert);
ggml_tensor * ffn_up_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);

ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); // [n_expert, n_tokens]

//ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);

// select experts
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]

ggml_tensor * weights = ggml_get_rows(ctx,
ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]

weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);

ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]

weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]

cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
ggml_tensor * up = ggml_mul_mat_id(ctx, ffn_up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]

ggml_tensor * gate = ggml_mul_mat_id(ctx, ffn_gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]

gate = ggml_silu(ctx, gate);

ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]

ggml_tensor * experts = ggml_mul_mat_id(ctx, ffn_down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]

experts = ggml_mul(ctx, experts,
ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens));

// aggregate experts
ggml_tensor * moe_out = nullptr;
for (int i = 0; i < n_expert_used; ++i) {
ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
experts->nb[2], i*experts->nb[1]);
cur_expert = ggml_cont(ctx, cur_expert);
if (i == 0) {
moe_out = cur_expert;
} else {
moe_out = ggml_add(ctx, moe_out, cur_expert);
}
}

cur = moe_out;

return cur;
}
};


static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
std::vector<std::unique_ptr<test_case>> test_cases;
std::default_random_engine rng(0);
@@ -2031,10 +1946,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
};

// mul: src0: 4096 2 32 1
// mul: src1: 1 2 32 1
add_test_bin_bcast(GGML_TYPE_F32, {1, 2, 32, 1}, {4096, 1, 1, 1});

add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
@@ -2194,8 +2105,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_llama(2));
test_cases.emplace_back(new test_falcon(1));
test_cases.emplace_back(new test_falcon(2));
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
test_cases.emplace_back(new test_moe(8, 2, 32, 4096, 8*1024));
#endif

// run tests