mtmd : support InternVL 3 38B and 78B mmproj (#13443) · ggml-org/llama.cpp@3eac209
Commit 3eac209

mtmd : support InternVL 3 38B and 78B mmproj (#13443)
* Support InternVL 3 38B and 78B mmproj
* Swap norms in clip.cpp
* Group variables together
1 parent a634d75 commit 3eac209

File tree

4 files changed: +31 -0 lines changed

gguf-py/gguf/constants.py

Lines changed: 6 additions & 0 deletions
@@ -483,7 +483,9 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_EMBD_PATCH = auto()
     V_ENC_EMBD_POS = auto()
     V_ENC_ATTN_Q = auto()
+    V_ENC_ATTN_Q_NORM = auto()
     V_ENC_ATTN_K = auto()
+    V_ENC_ATTN_K_NORM = auto()
     V_ENC_ATTN_V = auto()
     V_ENC_INPUT_NORM = auto()
     V_ENC_OUTPUT = auto()
@@ -742,7 +744,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
     MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
+    MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm",
     MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",
+    MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm",
     MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v",
     MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1",
     MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out",
@@ -782,7 +786,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_EMBD_PATCH,
     MODEL_TENSOR.V_ENC_EMBD_POS,
     MODEL_TENSOR.V_ENC_ATTN_Q,
+    MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
     MODEL_TENSOR.V_ENC_ATTN_K,
+    MODEL_TENSOR.V_ENC_ATTN_K_NORM,
     MODEL_TENSOR.V_ENC_ATTN_V,
     MODEL_TENSOR.V_ENC_INPUT_NORM,
     MODEL_TENSOR.V_ENC_OUTPUT,
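
For orientation: the new enum members pair with the name templates above, and filling in the block index yields the concrete GGUF tensor names the runtime looks up. A minimal standalone sketch that mirrors (but does not import) the gguf-py definitions:

from enum import IntEnum, auto

# Mirrors only the two MODEL_TENSOR additions from this commit (standalone sketch)
class MODEL_TENSOR(IntEnum):
    V_ENC_ATTN_Q_NORM = auto()
    V_ENC_ATTN_K_NORM = auto()

TENSOR_NAMES = {
    MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm",
    MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm",
}

# Concrete names for block 0, as they appear in the GGUF file (with a .weight suffix)
for t, fmt in TENSOR_NAMES.items():
    print(t.name, "->", fmt.format(bid=0) + ".weight")
# V_ENC_ATTN_Q_NORM -> v.blk.0.attn_q_norm.weight
# V_ENC_ATTN_K_NORM -> v.blk.0.attn_k_norm.weight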

gguf-py/gguf/tensor_mapping.py

Lines changed: 8 additions & 0 deletions
@@ -938,6 +938,10 @@ class TensorNameMap:
             "visual.blocks.{bid}.attn.q", # qwen2vl, generated
         ),

+        MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
+        ),
+
         MODEL_TENSOR.V_ENC_ATTN_K: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
@@ -946,6 +950,10 @@ class TensorNameMap:
            "visual.blocks.{bid}.attn.k", # qwen2vl, generated
         ),

+        MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
+        ),
+
         MODEL_TENSOR.V_ENC_ATTN_V: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "vpm.encoder.layers.{bid}.self_attn.v_proj",

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,8 @@
 #define TN_ATTN_Q        "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V        "%s.blk.%d.attn_v.%s"
 #define TN_ATTN_OUTPUT   "%s.blk.%d.attn_out.%s"
+#define TN_ATTN_K_NORM   "%s.blk.%d.attn_k_norm.%s"
+#define TN_ATTN_Q_NORM   "%s.blk.%d.attn_q_norm.%s"
 #define TN_FFN_DOWN      "%s.blk.%d.ffn_down.%s"
 #define TN_FFN_GATE      "%s.blk.%d.ffn_gate.%s"
 #define TN_FFN_UP        "%s.blk.%d.ffn_up.%s"
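
These macros are printf-style templates; the loader's string_format fills in the modality prefix ("v" for vision), the block index, and the suffix. The equivalent expansion, sketched in Python:

# Equivalent of string_format(TN_ATTN_Q_NORM, "v", il, "weight") in clip.cpp
TN_ATTN_Q_NORM = "%s.blk.%d.attn_q_norm.%s"
TN_ATTN_K_NORM = "%s.blk.%d.attn_k_norm.%s"

il = 0  # block (layer) index
print(TN_ATTN_Q_NORM % ("v", il, "weight"))  # v.blk.0.attn_q_norm.weight
print(TN_ATTN_K_NORM % ("v", il, "weight"))  # v.blk.0.attn_k_norm.weight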

tools/mtmd/clip.cpp

Lines changed: 15 additions & 0 deletions
@@ -205,6 +205,9 @@ struct clip_layer {
     ggml_tensor * o_w = nullptr;
     ggml_tensor * o_b = nullptr;

+    ggml_tensor * k_norm = nullptr;
+    ggml_tensor * q_norm = nullptr;
+
     // layernorm 1
     ggml_tensor * ln_1_w = nullptr;
     ggml_tensor * ln_1_b = nullptr;
@@ -1363,6 +1366,16 @@ struct clip_graph {
             Vcur = ggml_add(ctx0, Vcur, layer.v_b);
         }

+        if (layer.q_norm) {
+            Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+            cb(Qcur, "Qcur_norm", il);
+        }
+
+        if (layer.k_norm) {
+            Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+            cb(Kcur, "Kcur_norm", il);
+        }
+
         Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
         Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
         Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
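
Conceptually, the new branches normalize the query and key projections before they are split into heads, as the larger InternVL 3 vision encoders expect; layers whose GGUF file lacks the tensors skip the step entirely. A rough numpy sketch of that order of operations (an RMS-style norm is assumed here purely for illustration; clip.cpp selects the actual norm type via norm_t):

import numpy as np

def rms_norm(x: np.ndarray, weight: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # Normalize over the embedding dimension, then scale by the learned weight
    return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps) * weight

n_pos, n_head, d_head = 16, 4, 8
n_embd = n_head * d_head

q = np.random.randn(n_pos, n_embd).astype(np.float32)  # Qcur after the Q projection
q_norm_w = np.ones(n_embd, dtype=np.float32)           # plays the role of v.blk.N.attn_q_norm.weight

q = rms_norm(q, q_norm_w)             # norm applied on the full embedding dim...
q = q.reshape(n_pos, n_head, d_head)  # ...before the reshape into heads, matching the graph above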
@@ -1988,6 +2001,8 @@ struct clip_model_loader {
             layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
             layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
             layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
+            layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
+            layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
             layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
             layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
             layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
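
The trailing false marks these lookups as optional, so mmproj files from models without q/k norms still load, and the graph code only inserts the norm when the tensor was found. A minimal sketch of that required/optional pattern (hypothetical Python stand-in, not the loader's actual implementation):

# Sketch of the get_tensor(..., required=false) pattern used by the loader
tensors = {"v.blk.0.attn_q.weight": "...", "v.blk.0.attn_q_norm.weight": "..."}

def get_tensor(name: str, required: bool = True):
    t = tensors.get(name)
    if t is None and required:
        raise KeyError(f"missing required tensor: {name}")
    return t  # None for an absent optional tensor

q_norm = get_tensor("v.blk.0.attn_q_norm.weight", required=False)  # found
k_norm = get_tensor("v.blk.0.attn_k_norm.weight", required=False)  # None, no error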
