Commit 5cfc4c7

Remove int_scaled_mm's dependency on triton for cpu (#128)
Parent: baa78f2

File tree: 2 files changed, +8 -7 lines

torchao/kernel/intmm.py (+8, -1)

@@ -2,7 +2,7 @@
 import os
 import torch
 
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_2
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_2, TORCH_VERSION_AT_LEAST_2_6
 
 try:
     # Only works for torch2.2 or newer.
@@ -134,6 +134,13 @@ def int_scaled_matmul(a: torch.Tensor, b: torch.Tensor, scales1: torch.Tensor) -> torch.Tensor:
     assert scales1.is_contiguous()
     scales1 = scales1.expand((M, N))
     assert scales1.dim() == 2
+
+    if scales1.device.type == "cpu" and TORCH_VERSION_AT_LEAST_2_6:
+        # CPU prefers decomposed version of int_scaled_matmul
+        # to leverage the fusion capability of Inductor
+        c = torch._int_mm(a, b)
+        return c.to(scales1.dtype) * scales1
+
     if intmm_triton is not None and AUTOTUNER_ENABLE:
         return torch.ops.torchao.int_scaled_matmul(a, b, scales1)
 

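For context, the new branch is just a decomposition into plain ATen ops. Below is a minimal standalone sketch of that path; the shapes and the tensor names `a`, `b`, `scales1` are illustrative, and it assumes a PyTorch build whose `torch._int_mm` has a CPU kernel (which is why the diff gates on 2.6+):

```python
import torch

# Illustrative shapes; torch._int_mm is an int8 x int8 -> int32 matmul.
M, K, N = 32, 64, 16
a = torch.randint(-128, 128, (M, K), dtype=torch.int8)
b = torch.randint(-128, 128, (K, N), dtype=torch.int8)
# Per-row scales expanded to (M, N), as int_scaled_matmul does above.
scales1 = torch.rand(M, 1).expand(M, N).contiguous()

# The decomposed path: two ordinary ops instead of the triton-backed
# custom op, so Inductor can fuse the cast and multiply on CPU.
c = torch._int_mm(a, b)              # int32 accumulator
out = c.to(scales1.dtype) * scales1  # apply scales in scales1's dtype
print(out.shape, out.dtype)          # torch.Size([32, 16]) torch.float32
```

Because the branch returns before the AUTOTUNER_ENABLE check, CPU tensors never reach the triton path at all.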
torchao/kernel/intmm_triton.py (-6)

@@ -356,9 +356,3 @@ def int_scaled_matmul_cuda(a, b, scales1):
         int_scaled_matmul_kernel, [a, b, scales1, c], int8_mm_kernel_configs
     )
     return int_scaled_matmul_kernel(a, b, scales1, c, best_config)
-
-
-@torch.library.impl(lib, "int_scaled_matmul", "CPU")
-def int_scaled_matmul_cpu(a, b, scales1):
-    c = torch._int_mm(a, b)
-    return c.to(scales1.dtype) * scales1
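The removed lines were the op's CPU backend registration: `torch.library.impl` binds a function as the implementation of a custom op for a single dispatch key, which kept this triton module on the import path even for CPU-only use. A self-contained sketch of that registration mechanism, using a hypothetical `demo` namespace in place of torchao's actual `lib` object:

```python
import torch

# Hypothetical "demo" namespace; torchao's real code uses its own lib object.
lib = torch.library.Library("demo", "DEF")
lib.define("int_scaled_matmul(Tensor a, Tensor b, Tensor scales1) -> Tensor")

# Bind a CPU implementation for the op, mirroring the deleted registration.
@torch.library.impl(lib, "int_scaled_matmul", "CPU")
def int_scaled_matmul_cpu(a, b, scales1):
    c = torch._int_mm(a, b)
    return c.to(scales1.dtype) * scales1

# The op is then reachable through the dispatcher:
#   torch.ops.demo.int_scaled_matmul(a, b, scales1)
```

With the registration gone, intmm.py handles CPU inline (see above), so triton is no longer required to import or run the CPU path.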
