# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module for fundamental transformer building blocks."""

from typing import Optional

import jax.numpy as jnp
from jax.ad_checkpoint import checkpoint_name
from jax.sharding import Mesh

from flax import linen as nn

from MaxText.common_types import Config
from MaxText.inference import page_manager
from MaxText.layers import linears, quantizations
from MaxText.layers.attentions import Attention
from MaxText.layers.normalizations import RMSNorm

# Type alias for cleaner type hints
Quant = quantizations.AqtQuantization


class DecoderLayer(nn.Module):
  """Generic transformer decoder layer (self-attention only, no encoder cross-attention).

  This is the core, reusable building block for both the main model's
  decoder stack and the auxiliary MTP (multi-token prediction) layers.
  """

  config: Config
  mesh: Mesh
  quant: Optional[Quant] = None

  @nn.compact
  def __call__(
      self,
      inputs,
      decoder_segment_ids,
      decoder_positions,
      deterministic,
      model_mode,
      previous_chunk=None,
      slot: Optional[int] = None,
      page_state: Optional[page_manager.PageState] = None,
  ):
    cfg = self.config
    mesh = self.mesh

    inputs = nn.with_logical_constraint(inputs, ("activation_batch", "activation_length", "activation_embed"))
    inputs = checkpoint_name(inputs, "decoder_layer_input")
    # inputs: embedded inputs to the decoder with shape [batch, length, emb_dim]
    lnx = RMSNorm(
        dtype=cfg.dtype,
        weight_dtype=cfg.weight_dtype,
        name="pre_self_attention_norm",
        epsilon=cfg.normalization_layer_epsilon,
        kernel_axes=("norm",),
    )(inputs)
    lnx = nn.with_logical_constraint(lnx, ("activation_batch", "activation_length", "activation_embed"))

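    # Self-attention block. The cache and compute axis orders arrive as
    # comma-separated strings in the config and are parsed into integer tuples below.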
    attention_layer = Attention(
        config=self.config,
        num_query_heads=cfg.num_query_heads,
        num_kv_heads=cfg.num_kv_heads,
        head_dim=cfg.head_dim,
        max_target_length=cfg.max_target_length,
        max_prefill_predict_length=cfg.max_prefill_predict_length,
        attention_kernel=cfg.attention,
        mesh=mesh,
        dtype=cfg.dtype,
        weight_dtype=cfg.weight_dtype,
        dropout_rate=cfg.dropout_rate,
        name="self_attention",
        float32_qk_product=cfg.float32_qk_product,
        float32_logits=cfg.float32_logits,
        quant=self.quant,
        kv_quant=quantizations.configure_kv_quant(cfg),
        prefill_cache_axis_order=tuple(map(int, cfg.prefill_cache_axis_order.split(","))),
        ar_cache_axis_order=tuple(map(int, cfg.ar_cache_axis_order.split(","))),
        compute_axis_order=tuple(map(int, cfg.compute_axis_order.split(","))),
        reshape_q=cfg.reshape_q,
    )

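    # Self-attention call: the same normalized activations (lnx) serve as both the
    # query and the key/value inputs.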
    attention_lnx = attention_layer(
        lnx,
        lnx,
        decoder_positions,
        decoder_segment_ids=decoder_segment_ids,
        deterministic=deterministic,
        model_mode=model_mode,
    )

    attention_lnx = nn.with_logical_constraint(attention_lnx, ("activation_batch", "activation_length", "activation_embed"))

    # MLP block.
    mlp_lnx = linears.MlpBlock(
        intermediate_dim=cfg.mlp_dim,
        activations=cfg.mlp_activations,
        intermediate_dropout_rate=cfg.dropout_rate,
        dtype=cfg.dtype,
        weight_dtype=cfg.weight_dtype,
        name="mlp",
        config=cfg,
        quant=self.quant,
    )(lnx, deterministic=deterministic)
    mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_length", "activation_embed"))

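    # Parallel-block formulation: the attention and MLP branches both consume the same
    # normalized input (lnx); their outputs are summed, dropped out, and then added to
    # the residual stream.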
    next_layer_addition = mlp_lnx + attention_lnx

    next_layer_addition_dropped_out = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(
        next_layer_addition, deterministic=deterministic
    )

    layer_output = next_layer_addition_dropped_out + inputs
    layer_output = nn.with_logical_constraint(
        layer_output,
        ("activation_batch", "activation_length", "activation_embed"),
    )

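    # Optionally record activation statistics (mean, standard deviation, and the
    # fraction of exact zeros) for debugging and monitoring.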
    if cfg.record_internal_nn_metrics:
      self.sow("intermediates", "activation_mean", jnp.mean(layer_output))
      self.sow("intermediates", "activation_stdev", jnp.std(layer_output))
      self.sow(
          "intermediates",
          "activation_fraction_zero",
          jnp.sum(layer_output == 0) / jnp.size(layer_output),
      )

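    # nn.scan expects a (carry, per-layer output) pair, so the second element is None
    # when layers are scanned; otherwise the output is simply returned in both positions.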
    return layer_output, None if cfg.scan_layers else layer_output
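

# Example usage (illustrative sketch only, not part of the module's API). It assumes
# `cfg` is a fully initialized MaxText Config, `mesh` is a jax.sharding.Mesh consistent
# with cfg's parallelism settings, the call runs under MaxText's logical axis-rules
# context, `embedded` has shape [batch, length, emb_dim], and `segment_ids` / `positions`
# have shape [batch, length]. The model_mode string is one of the mode constants defined
# in MaxText.common_types (e.g. "train").
#
#   layer = DecoderLayer(config=cfg, mesh=mesh, quant=None)
#   variables = layer.init(
#       jax.random.PRNGKey(0),
#       embedded,
#       segment_ids,
#       positions,
#       deterministic=True,
#       model_mode="train",
#   )
#   layer_output, _ = layer.apply(
#       variables,
#       embedded,
#       segment_ids,
#       positions,
#       deterministic=True,
#       model_mode="train",
#   )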