@@ -142,6 +142,67 @@ def __call__(
     return inputs


+class OutputHead(nn.Module):
+  """
+  The final logit projection pipeline.
+  This module encapsulates Normalization, Dropout, and the final Logit Head
+  to ensure architectural consistency between the main model and auxiliary heads.
+  """
+
+  config: Config
+  shared_embedding: nn.Module
+
+  @nn.compact
+  def __call__(self, hidden_states: jnp.ndarray, deterministic: bool, model_mode: str) -> jnp.ndarray:
+    cfg = self.config
+
+    # 1. Final Normalization
+    y = RMSNorm(
+        dtype=cfg.dtype,
+        weight_dtype=cfg.weight_dtype,
+        name="decoder_norm",
+        epsilon=cfg.normalization_layer_epsilon,
+        kernel_axes=("norm",),
+    )(hidden_states)
+
+    # 2. Final Dropout
+    y = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(y, deterministic=deterministic)
+
+    # 3. Logit Projection (handles both methods)
+    if cfg.logits_via_embedding:
+      logits = self.shared_embedding.attend(y)
+      if cfg.normalize_embedding_logits:
+        logits = logits / jnp.sqrt(y.shape[-1])
+      if cfg.final_logits_soft_cap:
+        logits = jnp.tanh(logits / cfg.final_logits_soft_cap) * cfg.final_logits_soft_cap
+    else:
+      dense_layer = linears.dense_general(
+          inputs_shape=y.shape,
+          features=cfg.vocab_size,
+          weight_dtype=cfg.weight_dtype,
+          dtype=jnp.float32 if cfg.logits_dot_in_fp32 else cfg.dtype,
+          kernel_axes=("embed", "vocab"),
+          name="logits_dense",
+          matmul_precision=self.config.matmul_precision,
+      )
+      # Then, call the instance with the input tensor.
+      logits = dense_layer(y)
+
+    # 4. Final Casting
+    if cfg.cast_logits_to_fp32:
+      logits = logits.astype(jnp.float32)
+
+    # 5. Logical Constraints
+    if model_mode in (MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE):
+      logits = nn.with_logical_constraint(logits, (None, None, "activation_vocab"))
+    else:
+      logits = nn.with_logical_constraint(
+          logits, ("activation_embed_and_logits_batch", "activation_length", "activation_vocab")
+      )
+
+    return logits
+
+
 class Decoder(nn.Module):
   """A stack of decoder layers as a part of an encoder-decoder architecture."""

@@ -540,53 +601,7 @@ def __call__(
           **layer_call_kwargs,
       )
     # After the final transformer layer, `y` holds the raw, un-normalized hidden state.
-    final_hidden_state = y
-    y = self.get_norm_layer()(
-        dtype=cfg.dtype,
-        weight_dtype=cfg.weight_dtype,
-        name="decoder_norm",
-        epsilon=cfg.normalization_layer_epsilon,
-        kernel_axes=("norm",),
-        parameter_memory_host_offload=cfg.parameter_memory_host_offload,
-    )(y)
-    y = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(y, deterministic=deterministic)
-
-    # [batch, length, emb_dim] -> [batch, length, vocab_size]
-    if cfg.logits_via_embedding:
-      # Use the transpose of embedding matrix for logit transform.
-      logits = self.shared_embedding.attend(y)
-      if self.config.normalize_embedding_logits:
-        # Correctly normalize pre-softmax logits for this shared case.
-        logits = logits / jnp.sqrt(y.shape[-1])
-      if cfg.final_logits_soft_cap:
-        logits = logits / cfg.final_logits_soft_cap
-        logits = jnp.tanh(logits) * cfg.final_logits_soft_cap
-    else:
-      logits = linears.dense_general(
-          inputs_shape=y.shape,
-          features=cfg.vocab_size,
-          weight_dtype=cfg.weight_dtype,
-          dtype=jnp.float32 if cfg.logits_dot_in_fp32 else cfg.dtype,  # for logit training stability
-          kernel_axes=("embed", "vocab"),
-          name="logits_dense",
-          matmul_precision=self.config.matmul_precision,
-          parameter_memory_host_offload=cfg.parameter_memory_host_offload,
-      )(
-          y
-      )  # We do not quantize the logits matmul.
-
-    if model_mode in (MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE):
-      logits = nn.with_logical_constraint(logits, (None, None, "activation_vocab"))
-    else:
-      logits = nn.with_logical_constraint(
-          logits, ("activation_embed_and_logits_batch", "activation_length", "activation_vocab")
-      )
-
-    if self.config.cast_logits_to_fp32:
-      logits = logits.astype(jnp.float32)
-    # The API of the Decoder is now a tuple, providing both the main output
-    # and the raw hidden state needed for auxiliary tasks.
-    return logits, final_hidden_state
+    return y


 class VisionEncoder(nn.Module):
@@ -662,6 +677,8 @@ def setup(self):
           config=self.config, mesh=self.mesh, name="mtp_block", transformer_layer_module=mtp_layer
       )
     self.vision_encoder = VisionEncoder(config=cfg, mesh=mesh) if cfg.use_multimodal else None
+    # Instantiate ONE OutputHead, which will be shared by the main path and MTP.
+    self.output_head = OutputHead(config=cfg, shared_embedding=self.shared_embedding)
     self.decoder = Decoder(config=cfg, shared_embedding=self.shared_embedding, mesh=mesh, quant=self.quant)

   def __call__(
@@ -702,7 +719,7 @@ def __call__(
     if self.config.decoder_block == DecoderBlockType.GEMMA3:
       bidirectional_mask = decoder_input_tokens == multimodal_utils.GEMMA_TOKEN_PLACEHOLDER

-    logits, final_hidden_state = self.decoder(
+    final_hidden_state = self.decoder(
         decoder_input_tokens=decoder_input_tokens,
         decoder_positions=decoder_positions,
         decoder_segment_ids=decoder_segment_ids,
@@ -715,6 +732,9 @@ def __call__(
         image_embeddings=image_embeddings,
     )

+    # The main logits are now computed by calling the dedicated OutputHead.
+    logits = self.output_head(hidden_states=final_hidden_state, deterministic=not enable_dropout, model_mode=model_mode)
+
     # If we are initializing the model AND MTP is enabled, we must create
     # dummy target tensors. This allows Flax to trace the MTPBlock and create
     # all its necessary parameters, without requiring the main training pipeline
@@ -736,8 +756,9 @@ def __call__(
     if self.config.mtp_num_layers > 0 and model_mode == MODEL_MODE_TRAIN:
       self.mtp_block(
           main_hidden_state=final_hidden_state,
-          input_ids=decoder_input_tokens,
           shared_embedding=self.shared_embedding,
+          output_head=self.output_head,
+          input_ids=decoder_input_tokens,
           target_ids=decoder_target_tokens,
           target_mask=decoder_target_mask,
           position_ids=decoder_positions,
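
For readers skimming the diff, here is a minimal standalone sketch of the tied-embedding logit path that `OutputHead` follows when `logits_via_embedding` is set: project hidden states onto the transposed token-embedding table, normalize by `sqrt(d_model)`, and apply an optional tanh soft cap. The names `TiedLogitHead`, `token_embed`, and `soft_cap` are illustrative placeholders, not MaxText APIs, and the real head also applies RMSNorm, dropout, fp32 casting, and sharding constraints as shown in the hunks above.

```python
# Illustrative sketch only -- hypothetical names, not repository code.
import jax
import jax.numpy as jnp
from flax import linen as nn


class TiedLogitHead(nn.Module):
  """Projects hidden states to vocab logits by reusing an embedding table."""

  vocab_size: int
  d_model: int
  soft_cap: float = 30.0  # assumed value; plays the role of final_logits_soft_cap

  @nn.compact
  def __call__(self, hidden_states: jnp.ndarray) -> jnp.ndarray:
    # Token-embedding table of shape (vocab_size, d_model); in the PR this is the
    # `shared_embedding` passed into OutputHead rather than created locally.
    embedding = self.param(
        "token_embed", nn.initializers.normal(stddev=1.0), (self.vocab_size, self.d_model)
    )
    # Same math as `shared_embedding.attend(y)`: multiply by the transposed table.
    logits = jnp.einsum("...d,vd->...v", hidden_states, embedding)
    # Mirrors `normalize_embedding_logits`.
    logits = logits / jnp.sqrt(hidden_states.shape[-1])
    # Mirrors `final_logits_soft_cap`: squashes logits into (-cap, cap).
    logits = jnp.tanh(logits / self.soft_cap) * self.soft_cap
    return logits


if __name__ == "__main__":
  head = TiedLogitHead(vocab_size=256, d_model=64)
  x = jnp.ones((2, 8, 64))  # [batch, length, emb_dim]
  params = head.init(jax.random.PRNGKey(0), x)
  print(head.apply(params, x).shape)  # (2, 8, 256)
```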