Skip to content

Commit 15499eb

Browse files
authored
mpt : do not duplicate token_embd.weight on disk (#5670)
1 parent 96633ee commit 15499eb

File tree

2 files changed

+4
-7
lines changed

2 files changed

+4
-7
lines changed

convert-hf-to-gguf.py

-5
Original file line number | Diff line number | Diff line change
@@ -622,11 +622,6 @@ def write_tensors(self):
622622

623623
self.gguf_writer.add_tensor(new_name, data)
624624

625-
# note: MPT output is tied to (same as) wte in original model;
626-
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
627-
if new_name == "token_embd.weight":
628-
self.gguf_writer.add_tensor("output.weight", data)
629-
630625

631626
class OrionModel(Model):
632627
def set_vocab(self):

llama.cpp

+4 -2
Original file line number | Diff line number | Diff line change
@@ -509,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
509509
{
510510
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
511511
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
512-
{ LLM_TENSOR_OUTPUT, "output" },
513512
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
514513
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
515514
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -4056,7 +4055,10 @@ static bool llm_load_tensors(
40564055
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
40574056
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
40584057

4059-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4058+
// same as tok_embd, duplicated to allow offloading
4059+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4060+
ml.n_created--; // artificial tensor
4061+
ml.size_data += ggml_nbytes(model.output);
40604062
}
40614063

40624064
for (int i = 0; i < n_layer; ++i) {

0 commit comments

Comments
 (0)