float8 moe training conversion API prototype

danielvegamyhre · danielvegamyhre · commit eeabd9c67f23 · 2025-06-05T13:30:03.000-07:00
stack-info: PR: #2275, branch: danielvegamyhre/stack/1 migrate to quantize and add test work on moe training test
diff --git a/test/prototype/scaled_grouped_mm/test_moe_training_conversion.py b/test/prototype/scaled_grouped_mm/test_moe_training_conversion.py
@@ -0,0 +1,178 @@
+import pytest
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from torchao.quantization.quant_api import quantize_
+from torchao.prototype.scaled_grouped_mm.conversion_utils import MoETrainingConfig
+from torchao.float8.float8_utils import compute_error
+
+# model definition from torchtitan:
+# https://github.com/pytorch/torchtitan/blob/768cde131105bde624160029d808e94649faf0f4/torchtitan/experiments/llama4/model/moe.py#L14
+class GroupedExperts(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        num_experts: int,
+        use_grouped_mm: bool,
+    ):
+        super().__init__()
+        self.num_experts = num_experts
+        self.w1 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
+        self.w2 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
+        self.w3 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
+        self.use_grouped_mm = use_grouped_mm
+        self.init_weights()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        num_local_tokens_per_expert: torch.Tensor | list[int] | None = None,
+    ) -> torch.Tensor:
+        # TODO: the for-loop implementation below is kept for comparison
+        #       and readability; remove it later
+        if not self.use_grouped_mm:
+            if num_local_tokens_per_expert is not None:
+                # split x into a tuple of per-expert tensors,
+                # each with shape (tokens_per_expert (varying), dim)
+                x = torch.split(
+                    x,
+                    split_size_or_sections=num_local_tokens_per_expert,
+                    dim=0,
+                )
+                out_experts_splits = []
+                for expert_idx, x_expert in enumerate(x):
+                    w1, w2, w3 = (
+                        self.w1[expert_idx],
+                        self.w2[expert_idx],
+                        self.w3[expert_idx],
+                    )
+                    h = F.silu(torch.matmul(x_expert, w1))
+                    h = h * torch.matmul(x_expert, w3)
+                    h = torch.matmul(h, w2)
+                    # h shape (tokens_per_expert(varying), dim)
+                    out_experts_splits.append(h)
+                out = torch.cat(out_experts_splits, dim=0)
+            else:
+                # x shape (num_experts, tokens_per_expert, dim)
+                h = F.silu(torch.bmm(x, self.w1))
+                h = h * torch.bmm(x, self.w3)
+                # out shape (num_experts, tokens_per_expert, dim)
+                out = torch.bmm(h, self.w2)
+
+            return out
+
+        # grouped mm implementation
+        if num_local_tokens_per_expert is not None:
+            # https://github.com/pytorch/pytorch/pull/150374
+            # NOTE: torch._grouped_mm requires bf16 dtypes
+            #       and shapes to be multiples of 8
+            offsets = torch.cumsum(
+                num_local_tokens_per_expert, dim=0, dtype=torch.int32
+            )
+            # grouped mm between a 2D tensor and a 3D tensor
+            assert x.dim() == 2
+        else:
+            offsets = None
+            # no offsets: torch._grouped_mm is still called below, on two 3D tensors
+            assert x.dim() == 3
+
+        assert (
+            x.dtype == self.w1.dtype == self.w2.dtype == self.w3.dtype == torch.bfloat16
+        ), "torch._grouped_mm only supports bf16 dtypes"
+
+        h = F.silu(torch._grouped_mm(x, self.w1, offs=offsets))
+        h = h * torch._grouped_mm(x, self.w3, offs=offsets)
+        out = torch._grouped_mm(h, self.w2, offs=offsets)
+
+        return out
+
+    def init_weights(self, init_std: float = 0.02):
+        nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02)  # NOTE(review): w1 ignores init_std; presumably inherited from the torchtitan reference, confirm intentional
+        nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std)
+        nn.init.trunc_normal_(self.w3, mean=0.0, std=init_std)
+
+class MoE(nn.Module):
+    """Minimal MoE wrapper for this test; it is not a complete MoE layer.
+
+    It owns a gate (nn.Linear) and a GroupedExperts instance, but `forward` only
+    calls the experts: the gate is created and initialized yet never applied.
+    """
+
+    def __init__(self, dim: int, hidden_dim: int, num_experts: int, use_grouped_mm: bool):
+        super().__init__()
+        # Registration order (gate, then experts) determines parameter iteration order.
+        self.gate = nn.Linear(dim, num_experts)
+        self.experts = GroupedExperts(dim, hidden_dim, num_experts, use_grouped_mm)
+        self.init_weights()
+
+    def forward(self, x: torch.Tensor, num_local_tokens_per_expert: torch.Tensor) -> torch.Tensor:
+        """Pass `x` and the per-expert token counts straight through to the experts."""
+        token_counts = num_local_tokens_per_expert
+        return self.experts(x, num_local_tokens_per_expert=token_counts)
+
+    def init_weights(self, init_std: float = 0.02):
+        """Fill the gate weight from a truncated normal with std `init_std`."""
+        gate_weight = self.gate.weight
+        nn.init.trunc_normal_(gate_weight, mean=0.0, std=init_std)
+
+@pytest.mark.parametrize(
+    "model_class,target_fqns", [
+        # (MoE, ["experts"]),     # calling quantize_ on higher level module
+        (GroupedExperts, [""]), # calling quantize_ on experts directly
+    ])
+def test_moe_float8_training(model_class: type[nn.Module], target_fqns: list[str]):
+    """Check a MoETrainingConfig-converted model against an unconverted bf16 copy."""
+    batch, seq, dim = 1, 8192, 4096
+    num_experts, top_k = 2, 1
+
+    def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
+        # Substring match on the FQN; the empty target "" matches every module.
+        return any(target_fqn in cur_fqn for target_fqn in target_fqns)
+
+    # define MoE layer
+    torch.manual_seed(42)
+    model = model_class(dim=dim, hidden_dim=4*dim, num_experts=num_experts, use_grouped_mm=True).to(torch.bfloat16).cuda()
+    torch.manual_seed(42)
+    ref_model = model_class(dim=dim, hidden_dim=4*dim, num_experts=num_experts, use_grouped_mm=True).to(torch.bfloat16).cuda()
+    for param1, param2 in zip(model.parameters(), ref_model.parameters()):
+        assert torch.equal(param1, param2)
+
+    # convert MoE to float8 training
+    config = MoETrainingConfig()
+    quantize_(model, config=config, filter_fn=moe_module_filter_fn)
+
+    # inputs: allocate directly on the GPU. Calling .cuda() on a CPU tensor created with
+    # requires_grad=True returns a non-leaf copy, so x.grad would be None after backward.
+    torch.manual_seed(42)
+    x = torch.randn(batch*seq*top_k, dim, dtype=torch.bfloat16, device="cuda", requires_grad=True)
+    torch.manual_seed(42)
+    ref_x = torch.randn(batch*seq*top_k, dim, dtype=torch.bfloat16, device="cuda", requires_grad=True)
+
+    # per-expert token counts (the model turns these into offsets via cumsum)
+    num_tokens_per_expert = (batch * seq * top_k) // num_experts
+    tokens_per_expert_tensor = torch.tensor([num_tokens_per_expert], dtype=torch.int32).repeat(num_experts).cuda()
+    ref_tokens_per_expert_tensor = tokens_per_expert_tensor.clone()
+
+    # forward pass
+    out = model(x, num_local_tokens_per_expert=tokens_per_expert_tensor)
+    ref_out = ref_model(ref_x, num_local_tokens_per_expert=ref_tokens_per_expert_tensor)
+
+    # validate SQNR is acceptable.
+    # a single fp8 gemm uses SQNR >= 25.0 for testing, so for a full MoE layer
+    # we'll use a slightly lower threshold.
+    out_sqnr = compute_error(out, ref_out)
+    assert out_sqnr.item() >= 23.0, f"SQNR must be >= 23.0, got {out_sqnr.item()}."
+
+    # backward pass
+    out.sum().backward()
+    ref_out.sum().backward()
+
+    # validate input gradients (NOTE(review): default allclose tolerances may be too strict for fp8; confirm)
+    assert torch.allclose(x.grad, ref_x.grad)
+
+    # validate param gradients
+    for param1, param2 in zip(model.parameters(), ref_model.parameters()):
+        assert torch.allclose(param1.grad, param2.grad)
diff --git a/test/prototype/scaled_grouped_mm/test_scaled_grouped_mm.py b/test/prototype/scaled_grouped_mm/test_scaled_grouped_mm.py
@@ -23,7 +23,7 @@
     Float8LinearConfig,
     Float8LinearRecipeName,
 )
-from torchao.float8.float8_linear import matmul_with_hp_or_float8_args
+from torchao.float8.float8_linear import _matmul_with_hp_or_float8_args
 from torchao.float8.float8_tensor import LinearMMConfig
 from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated
 from torchao.prototype.scaled_grouped_mm.scaled_grouped_mm import (
@@ -183,7 +183,7 @@ def compute_reference_forward(
 
     # Validate each actual result group from the _scaled_grouped_mm is equal to:
     # 1. A manual _scaled_mm for the group.
-    # 2. A matmul_with_hp_or_float8_args for the group (which is differentiable, and thus used to validate gradients).
+    # 2. A _matmul_with_hp_or_float8_args for the group (which is differentiable, and thus used to validate gradients).
     outputs = []
     list1 = list(zip(A_list_fp8, B_t_fp8, A_scale_list, B_t_scales, result_list))
     list2 = list(zip(A_list, B_t, result_list))
@@ -199,7 +199,7 @@ def compute_reference_forward(
             use_fast_accum=float8_config.gemm_config_output.use_fast_accum,
         )
         a2, b2, result2 = list2[i]
-        ref_group_result2 = matmul_with_hp_or_float8_args.apply(
+        ref_group_result2 = _matmul_with_hp_or_float8_args.apply(
             a2,
             b2,
             LinearMMConfig(),
diff --git a/torchao/prototype/scaled_grouped_mm/conversion_utils.py b/torchao/prototype/scaled_grouped_mm/conversion_utils.py
@@ -0,0 +1,93 @@
+from typing import Callable, Optional
+
+from torch import nn
+
+from torchao.core.config import AOBaseConfig
+from torchao.quantization.transform_module import (
+    register_quantize_module_handler,
+)
+from torchao.prototype.scaled_grouped_mm.tensor import ScaledGroupedMMTensor
+
+
+class MoETrainingConfig(AOBaseConfig):
+    """Field-less config type; `_moe_training_transform` is registered as its handler."""
+
+
+@register_quantize_module_handler(MoETrainingConfig)
+def _moe_training_transform(
+    module: nn.Module,
+    config: MoETrainingConfig,
+) -> nn.Module:
+    """
+    Handler registered for MoETrainingConfig: delegates to `swap_params` so the
+    parameters of `module` become ScaledGroupedMMTensor-backed parameters.
+
+    Args:
+        module: Module whose parameters are swapped.
+        config: The MoETrainingConfig instance; it is not read by this handler.
+
+    Returns:
+        nn.Module: Whatever `swap_params(module)` returns.
+    """
+    return swap_params(module)
+
+def swap_params(
+    module: nn.Module,
+    *,
+    module_filter_fn: Optional[Callable[[nn.Module, str], bool]] = None,
+) -> nn.Module:
+    """
+    Walks the nn.Module tree in post-order and replaces every nn.Parameter
+    registered directly on a module accepted by module_filter_fn with a new
+    nn.Parameter wrapping a ScaledGroupedMMTensor. Parameters whose data is
+    already a ScaledGroupedMMTensor are left untouched.
+
+    Args:
+        module: Module to modify in place. A bare nn.Parameter is also accepted;
+            it is returned wrapped (or unchanged if filtered out / already wrapped).
+        module_filter_fn: If specified, only parameters owned by modules for which
+            the filter returns True are swapped. The inputs to the filter
+            function are the module instance and its FQN ("" for the root).
+
+    Returns:
+        nn.Module: The root module that was passed in, with swapped parameters.
+
+    Note:
+        Each swap installs a new nn.Parameter object via setattr and is announced
+        with print(). Objects that captured the old parameters before the call
+        keep referring to those old objects.
+    """
+    if isinstance(module, nn.Parameter):
+        # A parameter is a tensor, not a module: it has no children()/named_children(),
+        # so it must be fully handled here and never reach the module traversal.
+        rejected = module_filter_fn is not None and not module_filter_fn(module, "")
+        if rejected or isinstance(module.data, ScaledGroupedMMTensor):
+            return module
+        new_data = ScaledGroupedMMTensor(module.data)
+        return nn.Parameter(new_data, requires_grad=module.requires_grad)
+
+    def post_order_traversal(cur_module: nn.Module, cur_fqn: str = "") -> None:
+        # Visit children first so nested modules are handled before their parent.
+        for child_name, child_module in cur_module.named_children():
+            child_fqn = f"{cur_fqn}.{child_name}" if cur_fqn else child_name
+            post_order_traversal(child_module, child_fqn)
+
+        if module_filter_fn is not None and not module_filter_fn(cur_module, cur_fqn):
+            return
+
+        # Snapshot the (name, param) pairs: setattr below writes to the same
+        # parameter dict that named_parameters() iterates over.
+        for param_name, param in list(cur_module.named_parameters(recurse=False)):
+            if isinstance(param.data, ScaledGroupedMMTensor):
+                continue
+            # Wrap param.data, matching how a root nn.Parameter is wrapped above.
+            new_param = nn.Parameter(
+                ScaledGroupedMMTensor(param.data), requires_grad=param.requires_grad
+            )
+            setattr(cur_module, param_name, new_param)
+            # Avoid a leading "." in the message when the owner is the root module.
+            param_fqn = f"{cur_fqn}.{param_name}" if cur_fqn else param_name
+            print(f"Swapped {param_fqn} to ScaledGroupedMMTensor")
+
+    post_order_traversal(module)
+    return module
diff --git a/torchao/prototype/scaled_grouped_mm/scaled_grouped_mm.py b/torchao/prototype/scaled_grouped_mm/scaled_grouped_mm.py
@@ -83,7 +83,10 @@ def forward(
         assert not _is_column_major(A), "A must be row-major"
 
         # Due to hardware requirements, the right operand in a scaled grouped GEMM must be column-major.
-        assert _is_column_major(B_t), "B must be column-major"
+        if not _is_column_major(B_t):
+            # FSDP will complain if B_t (weights) is not contiguous, we can't require B_t to be column-major.
+            # TODO: figure out better solution than transposing for each forward pass.
+            B_t = B_t.transpose(-2, -1).contiguous().transpose(-2, -1)
 
         # Convert high precision input tensor to float8, row-major for left operand of grouped GEMM.
         # A shape: (M, K)
diff --git a/torchao/prototype/scaled_grouped_mm/tensor.py b/torchao/prototype/scaled_grouped_mm/tensor.py
@@ -0,0 +1,35 @@
+import torch
+
+from torchao.prototype.scaled_grouped_mm import _scaled_grouped_mm
+
+
+class ScaledGroupedMMTensor(torch.Tensor):
+    """
+    ScaledGroupedMMTensor is a simple tensor subclass that wraps a regular tensor
+    and overrides the torch._grouped_mm op by dispatching to the
+    differentiable _scaled_grouped_mm autograd function.
+    """
+
+    grouped_mm_func_name = "_grouped_mm"
+    offs_arg_name = "offs"
+
+    def __init__(self, data: torch.Tensor):
+        self._data = data
+
+    @classmethod
+    def __torch_function__(cls, func, types, args, kwargs=None):
+        # Normalize kwargs: avoids a shared mutable default and a None.get() crash.
+        kwargs = {} if kwargs is None else kwargs
+        if func.__name__ == cls.grouped_mm_func_name:
+            # Use torchao scaled grouped mm with dynamic quant for the
+            # "2d x 3d with offsets" case (used for routed experts).
+            # Every other call falls through to the default handling below.
+            #
+            # TODO: support the "3d x 3d without offsets" case, which is
+            # used for shared experts. This is basically the grouped_mm
+            # kernel handling a bmm.
+            A, B = args[0], args[1]
+            has_offs = kwargs.get(cls.offs_arg_name) is not None
+            if A.dim() == 2 and B.dim() == 3 and has_offs:
+                return _scaled_grouped_mm(*args, **kwargs)
+        return super().__torch_function__(func, types, args, kwargs)