Commit 531b317

Support BailingMoE
1 parent 0d4c4a7 commit 531b317

File tree

2 files changed: 106 additions, 0 deletions

convert_hf_to_gguf.py

Lines changed: 105 additions & 0 deletions
@@ -708,6 +708,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
             # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
             res = "superbpe"
+        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
+            # ref: https://huggingface.co/inclusionAI/Ling-lite
+            res = "bailingmoe"

         if res is None:
             logger.warning("\n")
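
Note: the `chkhsh` literal matched above follows llama.cpp's usual scheme: `get_vocab_base_pre` encodes a fixed probe string with the model's tokenizer and hashes the resulting token-ID list. A minimal sketch of that computation (the real probe text `chktxt` lives in convert_hf_to_gguf_update.py and is elided here):

from hashlib import sha256
from transformers import AutoTokenizer

chktxt = "..."  # fixed probe text from convert_hf_to_gguf_update.py, elided here

tokenizer = AutoTokenizer.from_pretrained("inclusionAI/Ling-lite")
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()  # compared against the literals above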
@@ -5130,6 +5133,108 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         return super().modify_tensors(data_torch, name, bid)


+@Model.register("BailingMoeForCausalLM")
+class BailingMoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_embd = self.hparams["hidden_size"]
+        head_dim = self.hparams.get("head_dim", n_embd // n_head)
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+
+        if name.endswith("attention.dense.weight"):
+            return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
+        elif name.endswith("query_key_value.weight"):
+            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
+
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
+            ]
+        elif name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == output_name and self.hparams.get("norm_head"):
+            data_torch = data_torch.float()
+            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("ChameleonForConditionalGeneration")
 @Model.register("ChameleonForCausalLM")  # obsolete
 class ChameleonModel(Model):
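
Two conversion details in the diff above are worth spelling out. First, BailingMoE checkpoints fuse the attention projections into a single `query_key_value.weight`, so `modify_tensors` splits it along the output (row) dimension into Q, K, and V, then permutes Q and K into the rotary layout GGUF expects. A standalone sketch of the split, using made-up dimensions (n_head, n_kv_head, head_dim below are illustrative, not Ling-lite's actual config):

import torch

n_head, n_kv_head, head_dim = 16, 4, 128  # hypothetical values for illustration
n_embd = n_head * head_dim                # 2048

# Fused [Q | K | V] projection, stacked along the output (row) dimension.
qkv = torch.randn((n_head + 2 * n_kv_head) * head_dim, n_embd)

q, k, v = qkv.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
print(q.shape, k.shape, v.shape)  # (2048, 2048), (512, 2048), (512, 2048)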
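Second, the `mlp.experts` branch buffers each expert's `gate_proj`/`up_proj`/`down_proj` weights per layer and, once all `num_experts * 3` tensors for that layer have arrived, stacks each projection kind into a single 3D tensor indexed by expert, which is how GGUF stores MoE weights. The stacking step in isolation (toy shapes, hypothetical layer 0):

import torch

n_experts, rows, cols = 4, 8, 16  # toy sizes for illustration
buffered = {f"model.layers.0.mlp.experts.{i}.up_proj.weight": torch.randn(rows, cols)
            for i in range(n_experts)}

merged = torch.stack([buffered[f"model.layers.0.mlp.experts.{i}.up_proj.weight"]
                      for i in range(n_experts)], dim=0)
print(merged.shape)  # torch.Size([4, 8, 16]) -> [expert, rows, cols]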

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -111,6 +111,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
     {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
     {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
+    {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
 ]
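
With both files patched, the update script can regenerate the `get_vocab_base_pre` hash checks from this tokenizer list, and a Ling-lite checkpoint should then convert like any other supported architecture. A hedged example invocation (the local path and f16 output type are illustrative):

# Regenerate the tokenizer hash checks (downloads each listed tokenizer;
# the script takes a Hugging Face token as its argument).
python convert_hf_to_gguf_update.py <hf_token>

# Convert a local Ling-lite checkout to GGUF.
python convert_hf_to_gguf.py /path/to/Ling-lite --outfile ling-lite-f16.gguf --outtype f16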