
Commit 0117572

Register codebook quant ops

Summary: Register the codebook quant / dequant ops as custom ops so they can be recognized after export

Test Plan: python test/prototype/test_codebook_quant.py -k test_export

Reviewers:

Subscribers:

Tasks:

Tags:

1 parent 9516764 commit 0117572
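
For context on what the commit enables: ops defined through `torch.library` and registered as `CompositeImplicitAutograd` are preserved by pre-dispatch export (`torch.export.export_for_training`) as single `torch.ops.<ns>.<name>.default` graph nodes, rather than being traced through into aten ops. Below is a minimal, self-contained sketch of that pattern with a hypothetical `mylib::scale_add` op, assuming PyTorch >= 2.5; the commit applies the same pattern to `quantize_codebook` / `dequantize_codebook` in a "quant" namespace.

```python
import torch
from torch.library import infer_schema  # public in PyTorch >= 2.5

# hypothetical example library; the commit uses Library("quant", "FRAGMENT")
lib = torch.library.Library("mylib", "FRAGMENT")

def scale_add(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # plain Python/ATen body; registered as a composite op below
    return x + alpha * y

# derive the schema from the type hints, then register the implementation
lib.define("scale_add" + infer_schema(scale_add, mutates_args={}))
lib.impl("scale_add", scale_add, "CompositeImplicitAutograd")

class M(torch.nn.Module):
    def forward(self, x, y):
        return torch.ops.mylib.scale_add(x, y, 0.5)

ep = torch.export.export_for_training(M(), (torch.randn(2), torch.randn(2)))
# the op should survive pre-dispatch export as a single graph node
assert any(
    n.target is torch.ops.mylib.scale_add.default
    for n in ep.module().graph.nodes
)
```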

File tree

4 files changed: +37 −9 lines changed


test/prototype/test_codebook_quant.py

+15 −1

```diff
@@ -20,7 +20,7 @@ class TestCodebookQuantization(unittest.TestCase):
     def setUp(self):
         torch.manual_seed(123)
         self.input = torch.randn(100, 256, dtype=torch.float32)
-        self.block_size = (1, 1)
+        self.block_size = (2, 2)
         self.scale_block_size = 64
         self.code_dtype = torch.uint8
         self.chunk_size = 1024
@@ -74,6 +74,20 @@ def test_quantize_api(self):
         quantize_(m, codebook_weight_only())
         assert type(m[0].weight) == CodebookQuantizedTensor
 
+    def test_export(self):
+        m = torch.nn.Sequential(torch.nn.Linear(128, 64)).to(
+            dtype=torch.bfloat16, device="cuda"
+        )
+        quantize_(m, codebook_weight_only())
+        # quantize_(m, int4_weight_only(group_size=16))
+        example_inputs = (torch.randn(1, 128, dtype=torch.bfloat16, device="cuda"),)
+        print("m:", m)
+        # torchao.utils.unwrap_tensor_subclass(m)
+        m = torch.export.export_for_training(m, example_inputs).module()
+        print("m:", m)
+        targets = [n.target for n in m.graph.nodes]
+        self.assertTrue(torch.ops.quant.quantize_codebook.default in targets)
+
 
 if __name__ == "__main__":
     unittest.main()
```
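
The assertion at the end of the new test is the generic way to check for an op in an exported graph. A small hypothetical helper (not part of the commit) that generalizes the same check:

```python
import torch

# list which custom "quant" ops an exported module's graph contains
def quant_ops_in_graph(gm: torch.fx.GraphModule):
    return [
        n.target
        for n in gm.graph.nodes
        if n.op == "call_function" and "quant" in str(n.target)
    ]

# e.g. after `gm = torch.export.export_for_training(m, inputs).module()`,
# quant_ops_in_graph(gm) should include torch.ops.quant.quantize_codebook.default
```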

torchao/prototype/quantization/codebook/codebook_ops.py

+15 −4

```diff
@@ -11,8 +11,13 @@
     _DTYPE_TO_QVALUE_BOUNDS,
     _SUB_BYTE_UINT_BOUNDS,
 )
+from torchao.utils import _register_custom_op
 
+quant_lib = torch.library.Library("quant", "FRAGMENT")
+register_custom_op = _register_custom_op(quant_lib)
 
+
+@register_custom_op
 def quantize_codebook(
     input: torch.Tensor,
     codebook: torch.Tensor,
@@ -25,7 +30,8 @@ def quantize_codebook(
 
     Args:
         input (torch.Tensor): Input tensor to quantize, shape (d1, d2, ..., dN).
-        codebook (torch.Tensor): Codebook tensor for quantization, shape (k, b1, b2, ..., bN) where b_i are block sizes.
+        codebook (torch.Tensor): Codebook tensor for quantization, shape (k, b1, b2, ..., bN) where b_i are block sizes and k is the codebook size, e.g. for uint4 (4 bit) the codebook size is 2**4:
+            one dequantized vector of dimension (b1, b2, ..., bN) for each uint4 integer value from 0 to 15.
         scales (torch.Tensor): Scales, shape (d1, d2, ..., dN // scale_block_size, 1).
         chunk_size (int): Number of elements to process per chunk to control memory usage.
         code_dtype (torch.dtype): dtype for the codes.
@@ -95,20 +101,24 @@ def quantize_codebook(
     return codes.to(code_dtype)
 
 
+@register_custom_op
 def dequantize_codebook(
     codes: torch.Tensor,
     codebook: torch.Tensor,
+    input_dtype: torch.dtype,
     scales: torch.Tensor,
     output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
     """
     Reconstructs the original tensor from codes and the codebook.
 
     Args:
-        codes (torch.Tensor): Indices of codebook entries for each block,
-            shape (d1//b1, d2//b2, ..., dN//bN).
+        codes (torch.Tensor): torch.int32 dtype, indices of codebook entries for each block,
+            shape (d1//b1, d2//b2, ..., dN//bN).
         codebook (torch.Tensor): Codebook tensor used for quantization,
             shape (k, b1, b2, ..., bN) where b_i are block sizes.
+        input_dtype (torch.dtype): Input dtype for `codes`, used for downstream pattern matching
+            and not enforced on `codes`; can be a sub-byte dtype like torch.uint4.
         scales (torch.Tensor): Scales, shape (d1, d2, ..., dN // scale_block_size, 1).
         output_dtype (torch.dtype): dtype for the output tensor.
 
@@ -142,7 +152,7 @@ def dequantize_codebook(
     dequant = dequant.view(
         *new_shape
     )  # (d1, d2, ..., num_scale_blocks, scale_block_size)
-    dequant.mul_(scales)
+    dequant = dequant * scales
 
     dequant = dequant.view(*original_shape)
 
@@ -172,6 +182,7 @@ def choose_qparams_codebook(
     Returns:
         torch.Tensor: The codebook tensor, shape (codebook_size, *block_size).
     """
+    breakpoint()
    if code_dtype == torch.int32:
        codebook_size = 2**16
    else:
```
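
Two details in this diff are worth spelling out. First, `dequant.mul_(scales)` becomes an out-of-place multiply: the op is registered with `infer_schema(fn, mutates_args={})`, i.e. declared functional, so its body should not mutate intermediates. Second, the docstring's shape contract can be illustrated with a standalone sketch of the dequantization math below (a simplified 2-D illustration under assumed shapes, not the torchao implementation).

```python
import torch

def dequant_sketch(codes, codebook, scales, output_dtype=torch.float32):
    # codes:    (d1//b1, d2//b2) integer indices into the codebook
    # codebook: (k, b1, b2) -- one dequantized block per code value
    # scales:   (d1, d2 // scale_block_size, 1)
    m, n = codes.shape
    k, b1, b2 = codebook.shape
    blocks = codebook[codes.long()]  # look up blocks: (m, n, b1, b2)
    # stitch the blocks back into the full (d1, d2) tensor
    dequant = blocks.permute(0, 2, 1, 3).reshape(m * b1, n * b2)
    d1, d2 = dequant.shape
    num_scale_blocks = scales.shape[1]
    # apply per-group scales along the last dimension
    dequant = dequant.view(d1, num_scale_blocks, d2 // num_scale_blocks)
    dequant = dequant * scales  # out-of-place, as in this commit
    return dequant.reshape(d1, d2).to(output_dtype)

# toy shapes matching the test's setUp: (100, 256) input, (2, 2) blocks,
# scale_block_size 64 (16-entry codebook shown for brevity)
codes = torch.randint(0, 16, (50, 128), dtype=torch.int32)
codebook = torch.randn(16, 2, 2)
scales = torch.rand(100, 4, 1)
out = dequant_sketch(codes, codebook, scales)  # -> (100, 256)
```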

torchao/prototype/quantization/codebook/codebook_quantized_tensor.py

+3 −0

```diff
@@ -96,12 +96,15 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor
             codes = self.codes.get_plain()
         else:
             codes = self.codes
+
         if codes.dtype != torch.int32:
             # TODO: Investigate and support not casting to torch.int32 for indexing to improve performance
             codes = codes.to(torch.int32)
+
         return dequantize_codebook(
             codes,
             self.codebook,
+            self.codes.dtype,
             self.scales,
             output_dtype=output_dtype,
         )
```
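
After this change, `dequantize_codebook` takes the pre-cast code dtype as a new positional argument, so the exported graph records whether the codes were originally, e.g., uint8 or a sub-byte dtype, even though indexing itself still uses int32. A hedged sketch of the updated call shape (names like `original_code_dtype` are illustrative):

```python
# illustrative call, mirroring the diff above
original_code_dtype = codes.dtype  # e.g. torch.uint8, or sub-byte torch.uint4
dequant = dequantize_codebook(
    codes.to(torch.int32),   # indexing requires int32 codes
    codebook,
    original_code_dtype,     # recorded for downstream pattern matching only
    scales,
    output_dtype=torch.bfloat16,
)
```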

torchao/utils.py

+4 −4

```diff
@@ -210,13 +210,13 @@ def decorator(fn):
 
         # expecting fn.__name__ starts with `_` and we want to take the rest
         # to be the name of the custom op
-        assert (
-            fn.__name__[0] == "_"
-        ), f"Expecting function name starts with `_`, got {fn.__name__}"
         assert not any(
             c in fn.__name__ for c in ".<>"
         ), f"Expecting op to be defined in normal functions, not lambda or local: {fn.__name__}"
-        op_name = fn.__name__[1:]
+        op_name = fn.__name__
+        if op_name[0] == "_":
+            op_name = op_name[1:]
+
         schema = op_name + infer_schema(fn, mutates_args={})
         lib.define(schema)
         lib.impl(op_name, fn, "CompositeImplicitAutograd")
```
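
For reference, a condensed sketch of how the surrounding `_register_custom_op` helper behaves after this change (paraphrased; version gating and other details in torchao/utils.py are omitted, and the public `torch.library.infer_schema` is assumed here):

```python
import torch
from torch.library import infer_schema  # public in PyTorch >= 2.5

def _register_custom_op_sketch(lib: torch.library.Library):
    def decorator(fn):
        op_name = fn.__name__
        if op_name[0] == "_":  # after this commit, the leading underscore is optional
            op_name = op_name[1:]
        # derive the schema from fn's type hints; mutates_args={} declares
        # the op functional (no argument mutation)
        lib.define(op_name + infer_schema(fn, mutates_args={}))
        lib.impl(op_name, fn, "CompositeImplicitAutograd")
        # hand back the dispatcher-registered op so call sites go through torch.ops
        return getattr(getattr(torch.ops, lib.ns), op_name)
    return decorator
```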
