ggml : group all experts in a single ggml_mul_mat_id by slaren · Pull Request #6505 · ggml-org/llama.cpp · GitHub

ggml : group all experts in a single ggml_mul_mat_id #6505


Merged
merged 22 commits on Apr 18, 2024
Changes from 1 commit
cleanup
slaren committed Apr 17, 2024
commit bf56fdecb3deaf404bcccf7a9d12bc9fa307c028
ggml.c: 2 changes (1 addition & 1 deletion)
@@ -4588,7 +4588,7 @@ void ggml_mul_mat_set_prec(

in b, n_experts_used can be broadcasted to match the n_expert_used of ids

c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e in ids
c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
*/
struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
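
The new comment line (the second of the pair above) quantifies the formula over both e and t: every entry ids[e,t] selects the expert matrix as[:,:,i] and applies it to the matching column of b, with b optionally broadcast along its expert dimension to cover all of ids. As a minimal reference sketch of that semantics (plain row-major float buffers, illustrative names, and an assumed broadcast rule of e modulo b's expert dimension; not the actual ggml kernel):

#include <cstdint>

// Hypothetical reference loop for the grouped ggml_mul_mat_id semantics above.
// as  : [n_expert][n_out][n_k]             one weight matrix per expert
// b   : [n_tokens][n_eb][n_k]              n_eb is 1 (broadcast) or n_expert_used
// ids : [n_tokens][n_expert_used]          selected expert id per slot and token
// c   : [n_tokens][n_expert_used][n_out]   one output row per slot and token
static void mul_mat_id_ref(const float * as, const float * b, const int32_t * ids, float * c,
                           int n_k, int n_out, int n_expert_used, int n_eb, int n_tokens) {
    for (int t = 0; t < n_tokens; ++t) {
        for (int e = 0; e < n_expert_used; ++e) {
            const int i  = ids[t*n_expert_used + e]; // i = ids[e,t]
            const int eb = e % n_eb;                 // assumed broadcast over b's expert dimension
            for (int o = 0; o < n_out; ++o) {
                float sum = 0.0f;
                for (int k = 0; k < n_k; ++k) {
                    sum += as[(i*n_out + o)*n_k + k] * b[(t*n_eb + eb)*n_k + k];
                }
                c[(t*n_expert_used + e)*n_out + o] = sum; // c[:,e,t] ~= as[:,:,i] @ b[:,eb,t]
            }
        }
    }
}

With n_eb == 1 the same input column b[:,0,t] is fed to every selected expert of token t, which is the broadcast case the comment describes.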
ggml.h: 1 change (0 additions & 1 deletion)
@@ -1161,7 +1161,6 @@ extern "C" {
enum ggml_prec prec);

// indirect matrix multiplication
// TODO: document
GGML_API struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
struct ggml_tensor * as,
llama.cpp: 2 changes (2 additions & 0 deletions)
@@ -7365,6 +7365,7 @@ struct llm_build_context {
n_expert, n_expert_used,
LLM_FFN_SILU, true,
cb, il);
cb(cur, "ffn_moe_out", il);

cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
@@ -8694,6 +8695,7 @@ struct llm_build_context {
n_expert, n_expert_used,
LLM_FFN_SILU, false,
cb, il);
cb(cur, "ffn_moe_out", il);

// FFN shared expert
{
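
Both llama.cpp hunks add cb(cur, "ffn_moe_out", il) immediately after the MoE FFN is built (its trailing arguments are visible above), passing the grouped expert output through the graph-build callback so the tensor gets a per-layer name. As a rough sketch of what such a naming callback typically does, assuming the llm_build_cb signature void(ggml_tensor * cur, const char * name, int il); a simplified illustration, not the full callback from llama_build_graph:

// Simplified naming callback: append the layer index when one is given.
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
    if (il >= 0) {
        ggml_format_name(cur, "%s-%d", name, il); // e.g. "ffn_moe_out-12"
    } else {
        ggml_set_name(cur, name);
    }
};

The existing cb(cur, "ffn_out", il) a couple of lines later names the residual-added result the same way.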
scripts/compare-commits.sh: 14 changes (1 addition & 13 deletions)
@@ -12,19 +12,7 @@ bench_args="${@:3}"

rm -f llama-bench.sqlite

backend="cpu"

if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi

make_opts=""

#if [[ "$backend" == "cuda" ]]; then
# make_opts="LLAMA_CUDA=1"
#fi
# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)

git checkout $1
make clean && make -j32 $make_opts llama-bench
tests/test-backend-ops.cpp: 91 changes (0 additions & 91 deletions)
@@ -1613,7 +1613,6 @@ struct test_llm : public test_case {
}
};


// Llama
struct test_llama : public test_llm {
static constexpr float freq_base = 10000.0f;
@@ -1860,90 +1859,6 @@ struct test_falcon : public test_llm {
}
};


// Mixtral MOE
struct test_moe : public test_case {
const int n_expert;
const int n_expert_used;
const int n_tokens;
const int n_embd;
const int n_ff;

std::string op_desc(ggml_tensor * t) override {
return "MOE";

GGML_UNUSED(t);
}

std::string vars() override {
return VARS_TO_STR5(n_expert, n_expert_used, n_tokens, n_embd, n_ff);
}

test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
: n_expert(n_experts), n_expert_used(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
}

ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_type wtype = GGML_TYPE_F32;
ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_expert);

ggml_tensor * ffn_gate_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);
ggml_tensor * ffn_down_exps = ggml_new_tensor_3d(ctx, wtype, n_ff, n_embd, n_expert);
ggml_tensor * ffn_up_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);

ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); // [n_expert, n_tokens]

//ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);

// select experts
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]

ggml_tensor * weights = ggml_get_rows(ctx,
ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]

weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);

ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]

weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]

cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
ggml_tensor * up = ggml_mul_mat_id(ctx, ffn_up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]

ggml_tensor * gate = ggml_mul_mat_id(ctx, ffn_gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]

gate = ggml_silu(ctx, gate);

ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]

ggml_tensor * experts = ggml_mul_mat_id(ctx, ffn_down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]

experts = ggml_mul(ctx, experts,
ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens));

// aggregate experts
ggml_tensor * moe_out = nullptr;
for (int i = 0; i < n_expert_used; ++i) {
ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
experts->nb[2], i*experts->nb[1]);
cur_expert = ggml_cont(ctx, cur_expert);
if (i == 0) {
moe_out = cur_expert;
} else {
moe_out = ggml_add(ctx, moe_out, cur_expert);
}
}

cur = moe_out;

return cur;
}
};


static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
std::vector<std::unique_ptr<test_case>> test_cases;
std::default_random_engine rng(0);
@@ -2031,10 +1946,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
};

// mul: src0: 4096 2 32 1
// mul: src1: 1 2 32 1
add_test_bin_bcast(GGML_TYPE_F32, {1, 2, 32, 1}, {4096, 1, 1, 1});

add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
@@ -2194,8 +2105,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_llama(2));
test_cases.emplace_back(new test_falcon(1));
test_cases.emplace_back(new test_falcon(2));
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
test_cases.emplace_back(new test_moe(8, 2, 32, 4096, 8*1024));
#endif

// run tests