@@ -708,6 +708,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
        if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
            # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
            res = "superbpe"
+        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
+            # ref: https://huggingface.co/inclusionAI/Ling-lite
+            res = "bailingmoe"

        if res is None:
            logger.warning("\n")
@@ -5130,6 +5133,108 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
        return super().modify_tensors(data_torch, name, bid)


+@Model.register("BailingMoeForCausalLM")
+class BailingMoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_embd = self.hparams["hidden_size"]
+        head_dim = self.hparams.get("head_dim", n_embd // n_head)
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+
+        if name.endswith("attention.dense.weight"):
+            return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
+        elif name.endswith("query_key_value.weight"):
+            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
+
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
+            ]
+        elif name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == output_name and self.hparams.get("norm_head"):
+            data_torch = data_torch.float()
+            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
@Model.register("ChameleonForConditionalGeneration")
@Model.register("ChameleonForCausalLM")  # obsolete
class ChameleonModel(Model):