8000 gemma : use more bits for the token_embd.weight tensor (#5650) · ggml-org/llama.cpp@96633ee · GitHub
Commit 96633ee

gemma : use more bits for the token_embd.weight tensor (#5650)
* gemma : use Q8_0 for the token_embd.weight tensor
* llama : quantize token_embd.weight using output type
    1 parent 847eedb commit 96633ee

File tree

1 file changed: +4 −1 lines changed

    llama.cpp

    Lines changed: 4 additions & 1 deletion
@@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type
         return std::make_pair(i_layer, n_layer);
     };
 
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
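The added condition handles architectures such as Gemma whose output projection is tied to the token embeddings: when `LLM_TENSOR_NAMES.at(arch)` contains no `LLM_TENSOR_OUTPUT` entry, the shared `token_embd.weight` tensor is routed through the same higher-bit quantization path as a dedicated output tensor. Below is a minimal, self-contained C++ sketch of that selection logic; the `llm_tensor` enum values, the `arch_tensor_names` map, and the `quantize_as_output` helper are illustrative stand-ins for llama.cpp's actual tables, not its API.

    // Sketch of the tied-embedding check introduced by this commit.
    // All names here are hypothetical stand-ins for llama.cpp internals.
    #include <iostream>
    #include <map>
    #include <set>
    #include <string>

    enum llm_tensor { TENSOR_OUTPUT, TENSOR_TOKEN_EMBD };

    // Per-architecture set of tensors the model defines. Gemma ties its
    // output projection to the token embeddings, so it declares no
    // separate TENSOR_OUTPUT.
    static const std::map<std::string, std::set<llm_tensor>> arch_tensor_names = {
        { "llama", { TENSOR_OUTPUT, TENSOR_TOKEN_EMBD } },
        { "gemma", { TENSOR_TOKEN_EMBD } },
    };

    // Returns true when `name` should receive the output tensor's
    // (higher-bit) quantization type: either it is the output tensor
    // itself, or the architecture has no separate output tensor and
    // `name` is the shared token embedding.
    bool quantize_as_output(const std::string & arch, const std::string & name) {
        const auto & tensors = arch_tensor_names.at(arch);
        const bool has_output = tensors.count(TENSOR_OUTPUT) > 0;
        return name == "output.weight" ||
               (!has_output && name == "token_embd.weight");
    }

    int main() {
        std::cout << std::boolalpha
                  << quantize_as_output("llama", "token_embd.weight") << '\n'  // false
                  << quantize_as_output("gemma", "token_embd.weight") << '\n'; // true
    }

With this check in place, Gemma's `token_embd.weight` falls into the same branch that assigns output tensors a higher-precision type (e.g. GGML_TYPE_Q8_0 for Falcon or when the row size is not a multiple of QK_K), which is what the commit title means by "use more bits".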
