@@ -149,9 +149,9 @@ def model_info(self):
         info['expert_inter_size'] = cfg['moe_intermediate_size']
         info['experts_per_token'] = cfg['num_experts_per_tok']
         info['inter_size'] = cfg['shared_expert_intermediate_size']
-        info['moe_shared_gate'] = info['inter_size'] > 0
+        info['moe_shared_gate'] = True
         info['norm_topk_prob'] = cfg['norm_topk_prob']
-        info['attn_bias'] = cfg.get('attention_bias', 1)
+        info['attn_bias'] = cfg.get('qkv_bias', 1)
         return info


@@ -170,8 +170,9 @@ class Qwen3Model(LlamaModel):
     Reader = Qwen3Reader

     def model_info(self):
+        cfg = self.model_config
         info = super().model_info()
-        info['qk_norm'] = True
+        info.update(qk_norm=True, attn_bias=cfg.get('attention_bias', 0))
         return info


@@ -186,10 +187,18 @@ def qk_norm(self, i: int):

 @INPUT_MODELS.register_module(name='qwen3-moe')
-class Qwen3MoeModel(Qwen2MoeModel):
+class Qwen3MoeModel(LlamaModel):

     Reader = Qwen3MoeReader

     def model_info(self):
+        cfg = self.model_config
         info = super().model_info()
-        info['qk_norm'] = True
+        info.update(
+            qk_norm=True,
+            expert_num=cfg.get('num_experts', 128),
+            experts_per_token=cfg.get('num_experts_per_tok', 8),
+            expert_inter_size=cfg.get('moe_intermediate_size', 768),
+            attn_bias=cfg.get('attention_bias', 0),
+            inter_size=0,  # no shared expert
+            norm_topk_prob=cfg.get('norm_topk_prob', False))
         return info
0 commit comments