
Commit a71744c

float8 moe training conversion API prototype
stack-info: PR: #2275, branch: danielvegamyhre/stack/1
1 parent 1017c7e commit a71744c

File tree: 3 files changed, +130 -1 lines changed
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
from typing import Callable, Optional

from torch import nn

from torchao.prototype.scaled_grouped_mm.tensor import ScaledGroupedMMTensor


def swap_params(
    module: nn.Module,
    *,
    module_filter_fn: Optional[Callable[[nn.Module, str], bool]] = None,
) -> nn.Module:
    """
    Recurses through the nn.Module, recursively swapping the data tensor of
    each nn.Parameter with a ScaledGroupedMMTensor. Only applies if the module
    passes the module_filter_fn, if specified.

    Args:
        module: Module to modify.
        module_filter_fn: If specified, only the `torch.nn.Parameter` subclasses
            that pass the filter function will be swapped. The inputs to the
            filter function are the module instance and the FQN.

    Returns:
        nn.Module: The modified module with swapped parameters.
    """
    if isinstance(module, nn.Parameter) and (
        module_filter_fn is None or module_filter_fn(module, "")
    ):
        if len(list(module.children())) > 0:
            raise AssertionError(
                f"Does not support a root nn.Parameter with children: {module}"
            )
        if not isinstance(module.data, ScaledGroupedMMTensor):
            new_data = ScaledGroupedMMTensor(module.data)
            return nn.Parameter(new_data, requires_grad=module.requires_grad)
        return module

    root_module = module

    def post_order_traversal(
        module: nn.Module,
        cur_fqn: Optional[str] = None,
        parent_module: Optional[nn.Module] = None,
    ):
        if cur_fqn is None:
            cur_fqn = ""

        for child_module_name, child_module in module.named_children():
            if cur_fqn == "":
                new_fqn = child_module_name
            else:
                new_fqn = f"{cur_fqn}.{child_module_name}"

            post_order_traversal(child_module, new_fqn, module)

        if module_filter_fn is None or module_filter_fn(module, cur_fqn):
            for param_name, param in module.named_parameters(recurse=False):
                if not isinstance(param.data, ScaledGroupedMMTensor):
                    new_param = nn.Parameter(
                        ScaledGroupedMMTensor(param), requires_grad=param.requires_grad
                    )
                    setattr(module, param_name, new_param)
                    print(f"Swapped {cur_fqn}.{param_name} to ScaledGroupedMMTensor")

    post_order_traversal(root_module)
    return root_module


def convert_moe_to_float8_training(
    module: nn.Module,
    *,
    module_filter_fn: Optional[Callable[[nn.Module, str], bool]] = None,
) -> nn.Module:
    """
    Swaps the data tensor of each `torch.nn.Parameter` with a ScaledGroupedMMTensor.

    Args:
        module: Module to modify.
        module_filter_fn: If specified, only the `torch.nn.Parameter` instances of
            modules that pass the filter function will be swapped. The inputs to the
            filter function are the module instance and the FQN.

    Returns:
        nn.Module: The modified module with swapped parameters.
    """
    out = swap_params(
        module,
        module_filter_fn=module_filter_fn,
    )
    return out
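
A rough usage sketch of the conversion API above, assuming a toy routed-experts module. The diff view shows the new file's contents but not its name, so the import path used below is a hypothetical placeholder:

import torch
from torch import nn

# Hypothetical import path: the new conversion file's name is not shown in
# this diff view, so "conversion_utils" is a placeholder.
from torchao.prototype.scaled_grouped_mm.conversion_utils import (
    convert_moe_to_float8_training,
)


class ToyExperts(nn.Module):
    """Stand-in for a routed-experts module holding a 3d expert weight."""

    def __init__(self, num_experts: int = 4, dim: int = 64, hidden_dim: int = 128):
        super().__init__()
        self.w1 = nn.Parameter(torch.randn(num_experts, dim, hidden_dim))


model = nn.Sequential(nn.Linear(64, 64), ToyExperts())


# Only swap parameters owned by the expert module; leave the nn.Linear alone.
def expert_filter(module: nn.Module, fqn: str) -> bool:
    return isinstance(module, ToyExperts)


convert_moe_to_float8_training(model, module_filter_fn=expert_filter)

# Expect "1.w1" to be wrapped in ScaledGroupedMMTensor while "0.weight" and
# "0.bias" stay plain parameters.
for name, param in model.named_parameters():
    print(name, type(param).__name__)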

torchao/prototype/scaled_grouped_mm/scaled_grouped_mm.py

Lines changed: 4 additions & 1 deletion
@@ -83,7 +83,10 @@ def forward(
         assert not _is_column_major(A), "A must be row-major"
 
         # Due to hardware requirements, the right operand in a scaled grouped GEMM must be column-major.
-        assert _is_column_major(B_t), "B must be column-major"
+        if not _is_column_major(B_t):
+            # FSDP will complain if B_t (weights) is not contiguous, so we can't require B_t to be column-major.
+            # TODO: figure out a better solution than transposing on each forward pass.
+            B_t = B_t.transpose(-2, -1).contiguous().transpose(-2, -1)
 
         # Convert high precision input tensor to float8, row-major for left operand of grouped GEMM.
         # A shape: (M, K)
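
For reference, a small sketch (not from this commit) of what the double transpose accomplishes: it leaves the values untouched but rebuilds the memory layout so the last two dims are column-major, i.e. stride(-2) == 1, which is presumably what _is_column_major checks:

import torch

# Arbitrary (num_groups, K, N) weight shape; the values don't matter here.
B_t = torch.randn(4, 64, 128)
print(B_t.stride())  # (8192, 128, 1): row-major, stride(-1) == 1

# Transpose the last two dims, materialize the result, transpose back.
B_col = B_t.transpose(-2, -1).contiguous().transpose(-2, -1)
print(B_col.stride())           # (8192, 1, 64): stride(-2) == 1, i.e. column-major
print(torch.equal(B_t, B_col))  # True: same values, different memory layout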
torchao/prototype/scaled_grouped_mm/tensor.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
import torch

from torchao.prototype.scaled_grouped_mm import _scaled_grouped_mm


class ScaledGroupedMMTensor(torch.Tensor):
    """
    ScaledGroupedMMTensor is a simple tensor subclass that wraps a regular tensor
    and overrides the torch._grouped_mm op by dispatching to the
    differentiable _scaled_grouped_mm autograd function.
    """

    grouped_mm_func_name = "_grouped_mm"
    offs_arg_name = "offs"

    def __init__(self, data: torch.Tensor):
        self._data = data

    @classmethod
    def __torch_function__(cls, func, types, args, kwargs={}):
        if func.__name__ == cls.grouped_mm_func_name:
            # Use torchao scaled grouped mm with dynamic quant for
            # "2d x 3d with offsets" case (used for routed experts).
            # Otherwise, fall back to regular grouped mm.
            #
            # TODO: support "3d x 3d without offsets" case, which is
            # used for shared experts. This is basically the grouped_mm
            # kernel handling a bmm.
            A, B = args[0], args[1]
            A_is_2d = A.dim() == 2
            B_is_3d = B.dim() == 3
            has_offs = kwargs.get(cls.offs_arg_name) is not None
            if A_is_2d and B_is_3d and has_offs:
                return _scaled_grouped_mm(*args, **kwargs)
        return super().__torch_function__(func, types, args, kwargs)
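
For context, a hypothetical dispatch sketch showing how the subclass is expected to intercept the op for routed experts. torch._grouped_mm is a private op whose availability and hardware requirements depend on the PyTorch build, so the snippet is guarded and illustrative rather than guaranteed to run:

import torch

from torchao.prototype.scaled_grouped_mm.tensor import ScaledGroupedMMTensor

num_experts, total_tokens, dim, hidden_dim = 4, 256, 64, 128

# Only attempt the call on builds that expose the private op and have a GPU.
if hasattr(torch, "_grouped_mm") and torch.cuda.is_available():
    # Routed tokens (2d) and stacked expert weights (3d).
    A = torch.randn(total_tokens, dim, device="cuda", dtype=torch.bfloat16)
    B = ScaledGroupedMMTensor(
        torch.randn(num_experts, dim, hidden_dim, device="cuda", dtype=torch.bfloat16)
    )
    # End offset of each expert's contiguous token group along dim 0 of A.
    offs = torch.arange(
        total_tokens // num_experts,
        total_tokens + 1,
        total_tokens // num_experts,
        device="cuda",
        dtype=torch.int32,
    )
    # __torch_function__ sees func.__name__ == "_grouped_mm" with a 2d A,
    # a 3d B, and an offs kwarg, so it routes to _scaled_grouped_mm.
    out = torch._grouped_mm(A, B, offs=offs)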
