
Commit 7892744: Add Qwen3
Parent: 4d4b6b0

File tree: 10 files changed, +359 −0 lines

MaxText/common_types.py

Lines changed: 1 addition & 0 deletions
@@ -81,3 +81,4 @@ class DecoderBlockType(enum.Enum):
   SIMPLE = "simple"
   SIMPLE_MLP = "simple_mlp"
   LLAMA4 = "llama4"
+  QWEN3 = "qwen3"
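For context, DecoderBlockType is a string-valued enum, which is what lets the decoder_block: "qwen3" field in the configs below resolve to the new member. A minimal sketch (enum abbreviated here for illustration only):

import enum

class DecoderBlockType(enum.Enum):
  # Abbreviated for illustration; MaxText/common_types.py defines many more members.
  SIMPLE = "simple"
  SIMPLE_MLP = "simple_mlp"
  LLAMA4 = "llama4"
  QWEN3 = "qwen3"

# A value-based lookup turns the raw config string into the enum member:
assert DecoderBlockType("qwen3") is DecoderBlockType.QWEN3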

MaxText/configs/models/qwen3-0.6b.yml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for qwen3-0.6b

base_emb_dim: 1024
base_num_query_heads: 16
base_num_kv_heads: 8
base_num_decoder_layers: 28
base_mlp_dim: 3072
head_dim: 128
mlp_activations: ["silu","linear"]
vocab_size: 151936
enable_dropout: False
logits_via_embedding: True
normalization_layer_epsilon: 1.0e-6
rope_max_timescale: 1_000_000
decoder_block: "qwen3"
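As a sanity check on the model name, a rough parameter count follows from these fields. This is a back-of-the-envelope sketch, assuming the gated-SiLU MLP implied by mlp_activations: ["silu","linear"] (gate, up, and down projections) and ignoring norm weights; it is an estimate, not MaxText code:

def approx_param_count(emb, q_heads, kv_heads, layers, mlp, head_dim, vocab, tied):
  # Q, K, V projections plus the attention output projection (grouped-query attention).
  attn = emb * head_dim * (q_heads + 2 * kv_heads) + q_heads * head_dim * emb
  # Gated-SiLU MLP: gate, up, and down projections.
  ffn = 3 * emb * mlp
  # Token embedding, shared with the output head when logits_via_embedding is True.
  embed = vocab * emb * (1 if tied else 2)
  return layers * (attn + ffn) + embed

# Values from the qwen3-0.6b config above:
print(approx_param_count(1024, 16, 8, 28, 3072, 128, 151936, tied=True))  # ~5.96e8

which lands close enough to 0.6B to confirm the config is self-consistent.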

MaxText/configs/models/qwen3-1.7b.yml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for qwen3-1.7b

base_emb_dim: 2048
base_num_query_heads: 16
base_num_kv_heads: 8
base_num_decoder_layers: 28
base_mlp_dim: 6144
head_dim: 128
mlp_activations: ["silu","linear"]
vocab_size: 151936
enable_dropout: False
logits_via_embedding: True
normalization_layer_epsilon: 1.0e-6
rope_max_timescale: 1_000_000
decoder_block: "qwen3"

MaxText/configs/models/qwen3-14b.yml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for qwen3-14b

base_emb_dim: 5120
base_num_query_heads: 40
base_num_kv_heads: 8
base_num_decoder_layers: 40
base_mlp_dim: 17408
head_dim: 128
mlp_activations: ["silu","linear"]
vocab_size: 151936
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
rope_max_timescale: 1_000_000
decoder_block: "qwen3"

MaxText/configs/models/qwen3-32b.yml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for qwen3-32b

base_emb_dim: 5120
base_num_query_heads: 64
base_num_kv_heads: 8
base_num_decoder_layers: 64
base_mlp_dim: 25600
head_dim: 128
mlp_activations: ["silu","linear"]
vocab_size: 151936
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
rope_max_timescale: 1_000_000
decoder_block: "qwen3"

MaxText/configs/models/qwen3-4b.yml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for qwen3-4b

base_emb_dim: 2560
base_num_query_heads: 32
base_num_kv_heads: 8
base_num_decoder_layers: 36
base_mlp_dim: 9728
head_dim: 128
mlp_activations: ["silu","linear"]
vocab_size: 151936
enable_dropout: False
logits_via_embedding: True
normalization_layer_epsilon: 1.0e-6
rope_max_timescale: 1_000_000
decoder_block: "qwen3"

MaxText/configs/models/qwen3-8b.yml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for qwen3-8b

base_emb_dim: 4096
base_num_query_heads: 32
base_num_kv_heads: 8
base_num_decoder_layers: 36
base_mlp_dim: 12288
head_dim: 128
mlp_activations: ["silu","linear"]
vocab_size: 151936
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-6
rope_max_timescale: 1_000_000
decoder_block: "qwen3"
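For quick reference, the six configs differ only in the fields below; all of them share base_num_kv_heads: 8, head_dim: 128, vocab_size: 151936, normalization_layer_epsilon: 1.0e-6, rope_max_timescale: 1_000_000, and decoder_block: "qwen3".

Model       emb dim  query heads  layers  mlp dim  logits_via_embedding
qwen3-0.6b  1024     16           28      3072     True
qwen3-1.7b  2048     16           28      6144     True
qwen3-4b    2560     32           36      9728     True
qwen3-8b    4096     32           36      12288    False
qwen3-14b   5120     40           40      17408    False
qwen3-32b   5120     64           64      25600    False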

MaxText/layers/models.py

Lines changed: 5 additions & 0 deletions
@@ -362,6 +362,10 @@ def get_decoder_layers(self):
         return [llama4.Llama4ScannableBlock]
       else:
         return [llama4.Llama4DecoderLayer]
+    elif self.config.decoder_block == DecoderBlockType.QWEN3:
+      from MaxText.layers import qwen3  # pylint: disable=import-outside-toplevel
+
+      return [qwen3.Qwen3DecoderLayer]
     else:
       raise ValueError(f"Incorrect decoder_block name {self.config.decoder_block.value=}")

@@ -379,6 +383,7 @@ def get_norm_layer(self):
         DecoderBlockType.SIMPLE,
         DecoderBlockType.SIMPLE_MLP,
         DecoderBlockType.LLAMA4,
+        DecoderBlockType.QWEN3,
     ):
       return RMSNorm
     elif self.config.decoder_block == DecoderBlockType.GPT3:
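Note that the qwen3 module is imported inside its branch, so it is only loaded when that decoder block is actually selected. A standalone sketch of the same lazy-dispatch pattern, using a hypothetical pick_decoder_layer helper rather than MaxText's actual method:

from MaxText.common_types import DecoderBlockType

def pick_decoder_layer(decoder_block: DecoderBlockType):
  # Hypothetical helper mirroring get_decoder_layers above.
  if decoder_block == DecoderBlockType.QWEN3:
    from MaxText.layers import qwen3  # deferred: loaded only when selected
    return qwen3.Qwen3DecoderLayer
  raise ValueError(f"Incorrect decoder_block name {decoder_block.value=}")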

MaxText/layers/qwen3.py

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
"""
Copyright 2023 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""Transformer model definition."""
# pylint: disable=arguments-differ
# pylint: disable=no-name-in-module

from typing import Optional

import jax.numpy as jnp
from jax.ad_checkpoint import checkpoint_name
from jax.sharding import Mesh
# from jax.experimental.pallas.ops.tpu import flash_attention

from flax import linen as nn

from MaxText.inference import page_manager
from MaxText.layers import linears
from MaxText.layers import models
from MaxText.layers import quantizations
from MaxText.layers.attentions import Attention
from MaxText.layers.quantizations import AqtQuantization as Quant
from MaxText.layers.normalizations import RMSNorm


# -----------------------------------------
# The Decoder Layer specific for Qwen3
# -----------------------------------------


class Qwen3DecoderLayer(nn.Module):
  """Transformer decoder layer that attends to the encoder."""

  config: models.Config
  mesh: Mesh
  quant: Optional[Quant] = None

  @nn.compact
  def __call__(
      self,
      inputs,
      decoder_segment_ids,
      decoder_positions,
      deterministic,
      model_mode,
      slot: Optional[int] = None,
      page_state: Optional[page_manager.PageState] = None,
      previous_chunk=None,
  ):
    cfg = self.config
    mesh = self.mesh

    inputs = nn.with_logical_constraint(inputs, ("activation_batch", "activation_norm_length", "activation_embed"))
    inputs = checkpoint_name(inputs, "decoder_layer_input")
    lnx_rms = RMSNorm(
        dtype=cfg.dtype,
        weight_dtype=cfg.weight_dtype,
        name="pre_self_attention_layer_norm",
        kernel_axes=("norm",),
        epsilon=cfg.normalization_layer_epsilon,
    )
    lnx = lnx_rms(inputs)

    lnx = nn.with_logical_constraint(lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
    # Instead of scaling the query values in the checkpoint conversion
    # we'll do it dynamically in the forward pass of Attention
    query_pre_attn_scalar = cfg.head_dim**-0.5

    # Self-attention block
    attention_layer = Attention(
        config=cfg,
        num_query_heads=cfg.num_query_heads,
        num_kv_heads=cfg.num_kv_heads,
        head_dim=cfg.head_dim,
        max_target_length=cfg.max_target_length,
        max_prefill_predict_length=cfg.max_prefill_predict_length,
        attention_kernel=cfg.attention,
        mesh=mesh,
        dtype=cfg.dtype,
        weight_dtype=cfg.weight_dtype,
        dropout_rate=cfg.dropout_rate,
        name="self_attention",
        float32_qk_product=cfg.float32_qk_product,
        float32_logits=cfg.float32_logits,
        quant=self.quant,
        kv_quant=quantizations.configure_kv_quant(cfg),
        prefill_cache_axis_order=tuple(map(int, cfg.prefill_cache_axis_order.split(","))),
        ar_cache_axis_order=tuple(map(int, cfg.ar_cache_axis_order.split(","))),
        compute_axis_order=tuple(map(int, cfg.compute_axis_order.split(","))),
        reshape_q=cfg.reshape_q,
        use_ragged_attention=cfg.use_ragged_attention,
        ragged_block_size=cfg.ragged_block_size,
        use_qk_norm=cfg.use_qk_norm,
        query_pre_attn_scalar=query_pre_attn_scalar,
    )

    attention_lnx = attention_layer(
        lnx,
        lnx,
        decoder_positions,
        decoder_segment_ids=decoder_segment_ids,
        deterministic=deterministic,
        model_mode=model_mode,
        slot=slot,
        page_state=page_state,
        previous_chunk=previous_chunk,
    )

    attention_lnx = nn.with_logical_constraint(
        attention_lnx, ("activation_batch", "activation_norm_length", "activation_embed")
    )
    intermediate_inputs = inputs + attention_lnx

    # Fully Connected
    hidden_states = models.RMSNorm(
        dtype=cfg.dtype,
        weight_dtype=cfg.weight_dtype,
        name="post_self_attention_layer_norm",
        kernel_axes=("norm",),
        epsilon=cfg.normalization_layer_epsilon,
    )(intermediate_inputs)
    hidden_states = nn.with_logical_constraint(
        hidden_states, ("activation_batch", "activation_norm_length", "activation_embed")
    )

    # MLP block.
    mlp_lnx = linears.MlpBlock(
        intermediate_dim=cfg.mlp_dim,
        activations=cfg.mlp_activations,
        intermediate_dropout_rate=cfg.dropout_rate,
        dtype=cfg.dtype,
        weight_dtype=cfg.weight_dtype,
        name="mlp",
        config=cfg,
        quant=self.quant,
    )(hidden_states, deterministic=deterministic)
    mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))

    layer_output = mlp_lnx + intermediate_inputs

    layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)

    layer_output = nn.with_logical_constraint(
        layer_output,
        ("activation_batch", "activation_norm_length", "activation_embed"),
    )

    if cfg.record_internal_nn_metrics:
      self.sow("intermediates", "activation_mean", jnp.mean(layer_output))
      self.sow("intermediates", "activation_stdev", jnp.std(layer_output))
      self.sow(
          "intermediates",
          "activation_fraction_zero",
          jnp.sum(layer_output == 0) / jnp.size(layer_output),
      )

    if cfg.scan_layers:
      return layer_output, None
    else:
      return layer_output
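Structurally, Qwen3DecoderLayer is a standard pre-norm residual block, with the query scaling head_dim**-0.5 (about 0.0884 for head_dim: 128) applied inside Attention rather than baked into converted checkpoints. A minimal dataflow sketch, with stand-in attention_fn and mlp_fn callables and the learned RMSNorm gain, dropout, and sharding constraints omitted:

import jax
import jax.numpy as jnp

def rms_norm(x, eps=1e-6):
  # RMSNorm without the learned scale, for illustration only.
  return x * jax.lax.rsqrt(jnp.mean(jnp.square(x), axis=-1, keepdims=True) + eps)

def qwen3_block(x, attention_fn, mlp_fn):
  h = x + attention_fn(rms_norm(x))  # pre_self_attention_layer_norm + residual
  return h + mlp_fn(rms_norm(h))     # post_self_attention_layer_norm + residual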

MaxText/pyconfig.py

Lines changed: 6 additions & 0 deletions
@@ -301,6 +301,12 @@ def validate_model_name(s: str) -> bool:
       "gpt3-52k",
       "llama4-17b-16e",
       "llama4-17b-128e",
+      "qwen3-0.6b",
+      "qwen3-1.7b",
+      "qwen3-4b",
+      "qwen3-8b",
+      "qwen3-14b",
+      "qwen3-32b",
   )
   if s not in valid_model_names:
     raise ValueError(f"Invalid model name was passed. Got {s}, Valid options {valid_model_names}")
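With the registry updated, the validator accepts the six new names and still rejects anything unregistered. A small usage sketch, assuming the function is importable as shown (the import path follows the repo layout above):

from MaxText.pyconfig import validate_model_name

validate_model_name("qwen3-8b")   # passes: listed in valid_model_names
validate_model_name("qwen3-72b")  # raises ValueError: not a registered model name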
