diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py
index 8c36f5ac7a..b9488a0617 100644
--- a/test/dtypes/test_affine_quantized_float.py
+++ b/test/dtypes/test_affine_quantized_float.py
@@ -3,24 +3,21 @@
 #
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
-import pytest
+import unittest
 
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
 )
 
 if not TORCH_VERSION_AT_LEAST_2_5:
-    pytest.skip("Unsupported PyTorch version", allow_module_level=True)
+    unittest.skip("Unsupported PyTorch version")
 
 import copy
 import io
 import random
-import unittest
-from contextlib import nullcontext
 from functools import partial
 from typing import Tuple
 
-import pytest
 import torch
 from torch._inductor.test_case import TestCase as InductorTestCase
 from torch.testing._internal import common_utils
@@ -95,68 +92,68 @@ def test_fp8_linear_variants(
                     "Static quantization only supports PerTensor granularity"
                 )
 
-        error_context = (
-            pytest.raises(AssertionError, match=error_message)
-            if error_message
-            else nullcontext()
+        if error_message:
+            with self.assertRaisesRegex(AssertionError, error_message):
+                self._run_fp8_linear_variant(dtype, mode, compile, sizes, granularity)
+        else:
+            self._run_fp8_linear_variant(dtype, mode, compile, sizes, granularity)
+
+    def _run_fp8_linear_variant(self, dtype, mode, compile, sizes, granularity):
+        M, N, K = sizes
+        input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
+        # Get a "reasonable" scale for the input tensor even though
+        # we use the same scale for multiple activations
+        scale, _ = choose_qparams_affine(
+            input_tensor,
+            MappingType.SYMMETRIC,
+            input_tensor.shape,
+            torch.float8_e4m3fn,
+            scale_dtype=torch.float32,
         )
+        mode_map = {
+            "dynamic": partial(
+                float8_dynamic_activation_float8_weight, granularity=granularity
+            ),
+            "weight-only": float8_weight_only,
+            "static": partial(
+                float8_static_activation_float8_weight,
+                scale=scale,
+                granularity=granularity,
+            ),
+        }
 
-        with error_context:
-            M, N, K = sizes
-            input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
-            # Get a "reasonable" scale for the input tensor even though
-            # we use the same scale for multiple activations
-            scale, _ = choose_qparams_affine(
-                input_tensor,
-                MappingType.SYMMETRIC,
-                input_tensor.shape,
-                torch.float8_e4m3fn,
-                scale_dtype=torch.float32,
-            )
-            mode_map = {
-                "dynamic": partial(
-                    float8_dynamic_activation_float8_weight, granularity=granularity
-                ),
-                "weight-only": float8_weight_only,
-                "static": partial(
-                    float8_static_activation_float8_weight,
-                    scale=scale,
-                    granularity=granularity,
-                ),
-            }
-
-            # Create a linear layer with bfloat16 dtype
-            model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
-
-            quantized_model = copy.deepcopy(model)
-            factory = mode_map[mode]()
-            quantize_(quantized_model, factory)
-
-            if compile:
-                quantized_model = torch.compile(quantized_model, fullgraph=True)
-
-            output_original = model(input_tensor)
-            output_quantized = quantized_model(input_tensor)
-
-            error = compute_error(output_original, output_quantized)
-            assert compute_error(output_original, output_quantized) > 20, (
-                f"Quantization error is too high got a SQNR of {error}"
-            )
+        # Create a linear layer with bfloat16 dtype
+        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+
+        quantized_model = copy.deepcopy(model)
+        factory = mode_map[mode]()
+        quantize_(quantized_model, factory)
+
+        if compile:
+            quantized_model = torch.compile(quantized_model, fullgraph=True)
+
+        output_original = model(input_tensor)
+        output_quantized = quantized_model(input_tensor)
+
+        error = compute_error(output_original, output_quantized)
+        self.assertGreater(
+            error, 20, f"Quantization error is too high got a SQNR of {error}"
+        )
 
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
     def test_invalid_granularity(self):
-        with pytest.raises(ValueError, match="Invalid granularity specification"):
+        with self.assertRaisesRegex(ValueError, "Invalid granularity specification"):
            float8_dynamic_activation_float8_weight(granularity="invalid")
 
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
     def test_mismatched_granularity(self):
-        with pytest.raises(
+        with self.assertRaisesRegex(
             ValueError,
-            match="Different granularities for activation and weight are not supported",
+            "Different granularities for activation and weight are not supported",
         ):
             float8_dynamic_activation_float8_weight(granularity=(PerTensor(), PerRow()))
 
@@ -167,7 +164,7 @@ def test_unsupported_granularity(self):
         class UnsupportedGranularity:
             pass
 
-        with pytest.raises(ValueError, match="Invalid granularity types"):
+        with self.assertRaisesRegex(ValueError, "Invalid granularity types"):
             float8_dynamic_activation_float8_weight(
                 granularity=(UnsupportedGranularity(), UnsupportedGranularity())
             )
@@ -177,9 +174,9 @@ class UnsupportedGranularity:
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
     def test_per_row_with_float32(self):
-        with pytest.raises(
+        with self.assertRaisesRegex(
             AssertionError,
-            match="PerRow quantization only works for bfloat16 precision",
+            "PerRow quantization only works for bfloat16 precision",
         ):
             model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
             quantize_(
@@ -317,4 +314,4 @@ def test_mm_float8dq(self):
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)
 
 if __name__ == "__main__":
-    pytest.main([__file__])
+    unittest.main()
diff --git a/test/dtypes/test_bitpacking.py b/test/dtypes/test_bitpacking.py
index 0ed4462d5d..e91ad627d2 100644
--- a/test/dtypes/test_bitpacking.py
+++ b/test/dtypes/test_bitpacking.py
@@ -3,8 +3,10 @@
 #
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
-import pytest
+import unittest
+
 import torch
+from torch.testing._internal import common_utils
 from torch.utils._triton import has_triton
 
 from torchao.dtypes.uintx.bitpacking import pack, pack_cpu, unpack, unpack_cpu
@@ -13,68 +15,102 @@
 dimensions = (0, -1, 1)
 
 
-@pytest.fixture(autouse=True)
-def run_before_and_after_tests():
-    yield
-    torch._dynamo.reset()  # reset cache between tests
-
-
-@pytest.mark.parametrize("bit_width", bit_widths)
-@pytest.mark.parametrize("dim", dimensions)
-def test_CPU(bit_width, dim):
-    test_tensor = torch.randint(
-        0, 2**bit_width, (32, 32, 32), dtype=torch.uint8, device="cpu"
-    )
-    packed = pack_cpu(test_tensor, bit_width, dim=dim)
-    unpacked = unpack_cpu(packed, bit_width, dim=dim)
-    assert unpacked.allclose(test_tensor)
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.parametrize("bit_width", bit_widths)
-@pytest.mark.parametrize("dim", dimensions)
-def test_GPU(bit_width, dim):
-    test_tensor = torch.randint(0, 2**bit_width, (32, 32, 32), dtype=torch.uint8).cuda()
-    packed = pack(test_tensor, bit_width, dim=dim)
-    unpacked = unpack(packed, bit_width, dim=dim)
-    assert unpacked.allclose(test_tensor)
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@pytest.mark.parametrize("bit_width", bit_widths)
-@pytest.mark.parametrize("dim", dimensions)
-def test_compile(bit_width, dim):
-    torch._dynamo.config.specialize_int = True
-    torch.compile(pack, fullgraph=True)
-    torch.compile(unpack, fullgraph=True)
-    test_tensor = torch.randint(0, 2**bit_width, (32, 32, 32), dtype=torch.uint8).cuda()
-    packed = pack(test_tensor, bit_width, dim=dim)
-    unpacked = unpack(packed, bit_width, dim=dim)
-    assert unpacked.allclose(test_tensor)
-
-
-# these test cases are for the example pack walk through in the bitpacking.py file
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-def test_pack_example():
-    test_tensor = torch.tensor(
-        [0x30, 0x29, 0x17, 0x5, 0x20, 0x16, 0x9, 0x22], dtype=torch.uint8
-    ).cuda()
-    shard_4, shard_2 = pack(test_tensor, 6)
-    print(shard_4, shard_2)
-    assert torch.tensor([0, 105, 151, 37], dtype=torch.uint8).cuda().allclose(shard_4)
-    assert torch.tensor([39, 146], dtype=torch.uint8).cuda().allclose(shard_2)
-    unpacked = unpack([shard_4, shard_2], 6)
-    assert unpacked.allclose(test_tensor)
-
-
-def test_pack_example_CPU():
-    test_tensor = torch.tensor(
-        [0x30, 0x29, 0x17, 0x5, 0x20, 0x16, 0x9, 0x22], dtype=torch.uint8
-    )
-    shard_4, shard_2 = pack(test_tensor, 6)
-    print(shard_4, shard_2)
-    assert torch.tensor([0, 105, 151, 37], dtype=torch.uint8).allclose(shard_4)
-    assert torch.tensor([39, 146], dtype=torch.uint8).allclose(shard_2)
-    unpacked = unpack([shard_4, shard_2], 6)
-    assert unpacked.allclose(test_tensor)
+class TestBitpacking(unittest.TestCase):
+    def setUp(self):
+        """Set up test fixtures before each test method."""
+        # Set random seeds for reproducibility
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(0)
+
+        # Initialize any common test data
+        self.test_shape = (32, 32, 32)
+
+        # Set default device
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Reset any cached state
+        torch._dynamo.reset()
+
+    def tearDown(self):
+        """Clean up after each test method."""
+        # Reset torch._dynamo cache
+        torch._dynamo.reset()
+
+        # Clear CUDA cache if available
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        # Reset any other state if needed
+        torch._dynamo.config.specialize_int = False
+
+    @common_utils.parametrize("bit_width", bit_widths)
+    @common_utils.parametrize("dim", dimensions)
+    def test_CPU(self, bit_width, dim):
+        test_tensor = torch.randint(
+            0, 2**bit_width, (32, 32, 32), dtype=torch.uint8, device="cpu"
+        )
+        packed = pack_cpu(test_tensor, bit_width, dim=dim)
+        unpacked = unpack_cpu(packed, bit_width, dim=dim)
+        self.assertTrue(unpacked.allclose(test_tensor))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    @common_utils.parametrize("bit_width", bit_widths)
+    @common_utils.parametrize("dim", dimensions)
+    def test_GPU(self, bit_width, dim):
+        test_tensor = torch.randint(
+            0, 2**bit_width, (32, 32, 32), dtype=torch.uint8
+        ).cuda()
+        packed = pack(test_tensor, bit_width, dim=dim)
+        unpacked = unpack(packed, bit_width, dim=dim)
+        self.assertTrue(unpacked.allclose(test_tensor))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    @unittest.skipIf(not has_triton(), "unsupported without triton")
+    @common_utils.parametrize("bit_width", bit_widths)
+    @common_utils.parametrize("dim", dimensions)
+    def test_compile(self, bit_width, dim):
+        torch._dynamo.config.specialize_int = True
+        torch.compile(pack, fullgraph=True)
+        torch.compile(unpack, fullgraph=True)
+        test_tensor = torch.randint(
+            0, 2**bit_width, (32, 32, 32), dtype=torch.uint8
+        ).cuda()
+        packed = pack(test_tensor, bit_width, dim=dim)
+        unpacked = unpack(packed, bit_width, dim=dim)
+        self.assertTrue(unpacked.allclose(test_tensor))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_pack_example(self):
+        test_tensor = torch.tensor(
+            [0x30, 0x29, 0x17, 0x5, 0x20, 0x16, 0x9, 0x22], dtype=torch.uint8
+        ).cuda()
+        shard_4, shard_2 = pack(test_tensor, 6)
+        print(shard_4, shard_2)
+        self.assertTrue(
+            torch.tensor([0, 105, 151, 37], dtype=torch.uint8).cuda().allclose(shard_4)
+        )
+        self.assertTrue(
+            torch.tensor([39, 146], dtype=torch.uint8).cuda().allclose(shard_2)
+        )
+        unpacked = unpack([shard_4, shard_2], 6)
+        self.assertTrue(unpacked.allclose(test_tensor))
+
+    def test_pack_example_CPU(self):
+        test_tensor = torch.tensor(
+            [0x30, 0x29, 0x17, 0x5, 0x20, 0x16, 0x9, 0x22], dtype=torch.uint8
+        )
+        shard_4, shard_2 = pack(test_tensor, 6)
+        print(shard_4, shard_2)
+        self.assertTrue(
+            torch.tensor([0, 105, 151, 37], dtype=torch.uint8).allclose(shard_4)
+        )
+        self.assertTrue(torch.tensor([39, 146], dtype=torch.uint8).allclose(shard_2))
+        unpacked = unpack([shard_4, shard_2], 6)
+        self.assertTrue(unpacked.allclose(test_tensor))
+
+
+common_utils.instantiate_parametrized_tests(TestBitpacking)
+
+if __name__ == "__main__":
+    unittest.main()