@@ -27,7 +27,7 @@ struct Qwen3Attention {
27
27
impl Qwen3Attention {
28
28
pub fn load ( vb : VarBuilder , config : & Qwen3Config ) -> Result < Self > {
29
29
if config. use_sliding_window {
30
- candle:: bail!( "Sliding window is not supported for Qwen3" , ) ;
30
+ candle:: bail!( "Sliding window is not supported for Qwen3" ) ;
31
31
}
32
32
33
33
let num_attention_heads = config. num_attention_heads ;
@@ -143,8 +143,8 @@ impl Qwen3Attention {
143
143
) ?;
144
144
145
145
// Apply normalization layers
146
- let ( q, _res ) = self . q_norm . forward ( & q, None ) ?;
147
- let ( k, _res ) = self . k_norm . forward ( & k, None ) ?;
146
+ let ( q, _ ) = self . q_norm . forward ( & q, None ) ?;
147
+ let ( k, _ ) = self . k_norm . forward ( & k, None ) ?;
148
148
149
149
apply_rotary_inplace ( & q, & k, & cos, & sin, true ) ?;
150
150
@@ -158,7 +158,7 @@ impl Qwen3Attention {
158
158
max_s,
159
159
max_s,
160
160
self . softmax_scale ,
161
- false ,
161
+ true ,
162
162
None ,
163
163
None ,
164
164
) ?;
@@ -215,8 +215,8 @@ impl Qwen3MLP {
215
215
let up_states = gate_up_states. narrow ( 1 , self . intermediate_size , self . intermediate_size ) ?;
216
216
217
217
let gate_states = self . act . forward ( & gate_states) ?;
218
- let r = self . down_proj . forward ( & ( gate_states * up_states ) ? ) ;
219
- r
218
+
219
+ self . down_proj . forward ( & ( gate_states * up_states ) ? )
220
220
}
221
221
}
222
222
@@ -266,12 +266,15 @@ impl Qwen3Layer {
266
266
let _enter = self . span . enter ( ) ;
267
267
268
268
let ( normed_hidden_states, res) = self . input_layer_norm . forward ( hidden_states, residual) ?;
269
+
269
270
let attn_output =
270
271
self . attention
271
272
. forward ( & normed_hidden_states, cu_seqlens, cos, sin, max_s) ?;
273
+
272
274
let ( normed_attn_res_output, attn_res) = self
273
275
. post_attention_layer_norm
274
276
. forward ( & attn_output, Some ( & res) ) ?;
277
+
275
278
let mlp_output = self . mlp . forward ( & normed_attn_res_output) ?;
276
279
277
280
Ok ( ( mlp_output, attn_res) )
0 commit comments