mpt : do not duplicate token_embd.weight on disk · ggml-org/llama.cpp@549fe80 · GitHub
[go: up one dir, main page]

Skip to content

Commit 549fe80

Browse files
committed
mpt : do not duplicate token_embd.weight on disk
Previous attempt was #3626
1 parent 201294a commit 549fe80

File tree

2 files changed

+4
-6
lines changed

2 files changed

+4
-6
lines changed

convert-hf-to-gguf.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -618,11 +618,6 @@ def write_tensors(self):
618618

619619
self.gguf_writer.add_tensor(new_name, data)
620620

621-
# note: MPT output is tied to (same as) wte in original model;
622-
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
623-
if new_name == "token_embd.weight":
624-
self.gguf_writer.add_tensor("output.weight", data)
625-
626621

627622
class OrionModel(Model):
628623
def set_vocab(self):

llama.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4056,7 +4056,10 @@ static bool llm_load_tensors(
40564056
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
40574057
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
40584058

4059-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4059+
// same as tok_embd, duplicated to allow offloading
4060+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4061+
ml.n_created--; // artificial tensor
4062+
ml.size_data += ggml_nbytes(model.output);
40604063
}
40614064

40624065
for (int i = 0; i < n_layer; ++i) {

0 commit comments

Comments
 (0)
0