
Commit 091f891

Revert "Mixed Precision batchnorm fix (pytorch#77089)"
This reverts commit bf61b79. Reverted pytorch#77089 on behalf of https://github.com/suo
1 parent 9ba3b83 · commit 091f891

5 files changed: +4, -254 lines

5 files changed

+4
-254
lines changed

Diff for: test/distributed/fsdp/test_fsdp_mixed_precision.py (-106 lines)

@@ -8,7 +8,6 @@
 import torch
 import torch.cuda.nccl as nccl
 import torch.nn as nn
-import torch.nn.functional as F
 from torch import distributed as dist
 from torch.distributed.fsdp import (
     FullyShardedDataParallel as FSDP,
@@ -17,8 +16,6 @@
     BackwardPrefetch,
     ShardingStrategy,
 )
-from torch.distributed.fsdp.wrap import default_auto_wrap_policy
-from torch.nn.modules.batchnorm import _BatchNorm
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     FSDPTest,
@@ -29,18 +26,9 @@
     parametrize,
     run_tests,
     TEST_WITH_DEV_DBG_ASAN,
-    sandcastle_skip_if,
 )
 from torch.testing._internal.common_cuda import CUDA11OrLater

-try:
-    import torchvision
-    HAS_TORCHVISION = True
-except ImportError:
-    HAS_TORCHVISION = False
-
-skipIfNoTorchVision = sandcastle_skip_if(not HAS_TORCHVISION, "no torchvision")
-

 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
@@ -517,100 +505,6 @@ def test_mp_embedding_params_and_reduce_diff(self):
         )
         self._test_mixed_precision_embedding_table(mp_config=params_and_reduce_different)

-    @skip_if_lt_x_gpu(2)
-    @skipIfNoTorchVision
-    def test_mixed_precision_resnet(self):
-        """
-        End to end test to ensure mixed precision + auto_wrap works
-        for ResNet model.
-        """
-        resnet_model = torchvision.models.resnet50().cuda()
-        resnet_model = nn.SyncBatchNorm.convert_sync_batchnorm(
-            resnet_model,
-            process_group=dist.distributed_c10d._get_default_group()
-        )
-        n_bn = sum(1 if isinstance(x, _BatchNorm) else 0 for x in resnet_model.modules())
-        inp = torch.ones(1, 3, 1000, 1000, device='cuda')
-        mp_config = MixedPrecision(
-            param_dtype=torch.float16,
-            reduce_dtype=torch.float16,
-            buffer_dtype=torch.float16,
-        )
-        fsdp = FSDP(
-            resnet_model,
-            auto_wrap_policy=default_auto_wrap_policy,
-            mixed_precision=mp_config
-        )
-        # Batchnorm units should be wrapped individually. Validate this by
-        # ensuring there are equal no. of FSDP units that are BN as BN units
-        # in original resnet model.
-        fsdp_bn = 0
-        for module in fsdp.fsdp_modules(fsdp):
-            wrapped_module = module.module.module
-            if isinstance(wrapped_module, _BatchNorm):
-                fsdp_bn += 1
-
-        self.assertEqual(fsdp_bn, n_bn)
-        # Would throw type mismatch issue without mixed precision autowrapping.
-        loss = fsdp(inp).sum()
-        loss.backward()
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize("convert_sync_bn", [True, False])
-    def test_mp_batchnorm(self, convert_sync_bn):
-        class BatchNormNet(nn.Module):
-            def __init__(self, affine=True):
-                super(BatchNormNet, self).__init__()
-                self.fc1 = nn.Linear(2, 40, bias=False)
-                self.bn = nn.BatchNorm1d(4, affine=affine)
-                self.fc2 = nn.Linear(40, 4, bias=False)
-
-            def forward(self, x):
-                x = torch.reshape(self.fc1(x), (-1, 4, 10))
-                x = self.bn(x)
-                x = torch.reshape(x, (-1, 40))
-                x = self.fc2(x)
-                return F.softmax(x, dim=1)
-
-        def never_wrap_policy(*args, **kwargs):
-            return False
-
-        net = BatchNormNet().cuda()
-        if convert_sync_bn:
-            net = nn.SyncBatchNorm.convert_sync_batchnorm(net)
-        # FSDP detects that mixed precision + batchnorm will cause issues
-        # and thus wrap batchnorm in a distinct FSDP unit that does not
-        # use mixed precision.
-        mp_config = MixedPrecision(
-            param_dtype=torch.float16,
-            reduce_dtype=torch.float16,
-            buffer_dtype=torch.float16,
-        )
-        with self.assertWarnsRegex(
-            expected_warning=UserWarning,
-            expected_regex="BatchNorm units will be wrapped as a separate"
-        ):
-            model = FSDP(
-                net,
-                mixed_precision=mp_config,
-                auto_wrap_policy=never_wrap_policy,
-            )
-
-        bn = model.bn
-        self.assertTrue(isinstance(bn, FSDP))
-        # policy should not have wrapped any other submodules
-        self.assertFalse(isinstance(model.fc1, FSDP))
-        self.assertFalse(isinstance(model.fc2, FSDP))
-        self.assertEqual(None, bn.mixed_precision)
-        self.assertNotEqual(None, model.mixed_precision)
-
-        inp = torch.randn((1, 2), device='cuda')
-        # Without FSDP BN mixed precision fix, this would result in
-        # RuntimeError: Expected counts to have type Half but got Float
-        # for syncBN
-        model(inp).sum().backward()
-
-
 class TestFSDPMixedPrecisionUnsharded(TestFSDPMixedPrecision):
     """
     Smaller test suite for unshared param (i.e. world_size == 1) case.
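With the automatic handling reverted, the situation that the deleted test_mp_batchnorm covered is back in the user's hands: since several BatchNorm kernels (SyncBatchNorm in particular) do not implement reduced-precision support, a BatchNorm submodule should be kept out of the fp16 configuration by wrapping it in its own FSDP unit first. A minimal sketch of that manual workaround, assuming at least one GPU and an already-initialized default process group (e.g. launched via torchrun); TinyBatchNormNet is a hypothetical stand-in modeled on the deleted test fixture, not part of the codebase:

import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision


class TinyBatchNormNet(nn.Module):
    # Hypothetical model used only for illustration.
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 40, bias=False)
        self.bn = nn.BatchNorm1d(4)
        self.fc2 = nn.Linear(40, 4, bias=False)

    def forward(self, x):
        x = torch.reshape(self.fc1(x), (-1, 4, 10))
        x = torch.reshape(self.bn(x), (-1, 40))
        return self.fc2(x)


net = TinyBatchNormNet().cuda()
# Wrap the BatchNorm submodule on its own, leaving mixed_precision at its
# default of None so its kernels keep running in full precision.
net.bn = FSDP(net.bn)
# Wrap the rest of the model with fp16 mixed precision.
model = FSDP(
    net,
    mixed_precision=MixedPrecision(
        param_dtype=torch.float16,
        reduce_dtype=torch.float16,
        buffer_dtype=torch.float16,
    ),
)
loss = model(torch.randn((1, 2), device="cuda")).sum()
loss.backward()

This mirrors the guidance in the docstring paragraph removed further down ("If individually wrapping the model, users must take care to set mixed_precision=None for BatchNorm units"); it is a sketch of one workaround, not the library's prescribed recipe.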

Diff for: test/distributed/fsdp/test_wrap.py (-67 lines)

@@ -17,9 +17,7 @@
     always_wrap_policy,
     size_based_auto_wrap_policy,
     enable_wrap,
-    _or_policy,
     wrap,
-    _wrap_batchnorm_individually,
     transformer_auto_wrap_policy,
 )
 from torch.testing._internal.common_distributed import (
@@ -42,15 +40,6 @@
 )
 from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer

-class BatchNormNet(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.lin = nn.Linear(10, 10, bias=False)
-        self.bn1 = nn.BatchNorm1d(10)
-        self.bn2 = nn.BatchNorm2d(10)
-        self.bn3 = nn.BatchNorm3d(10)
-        self.sync_bn = nn.SyncBatchNorm(10)
-
 class WrapMethod(Enum):
     FSDP_CTOR = auto()
     # FSDP_CTOR is the supported way forward, but keep WRAP_API in case we miss
@@ -148,62 +137,6 @@ def test_error_already_wrapped(self, nested, fsdp_init_mode):
         with self.assertRaisesRegex(ValueError, "to NOT be FullyShardedDataParallel"):
             mod = FSDP(wrapped_fsdp, auto_wrap_policy=size_based_auto_wrap_policy)

-    @skip_if_lt_x_gpu(2)
-    @parametrize("use_or_policy", [True, False])
-    def test_wrap_batchnorm_individually(self, use_or_policy):
-        def never_wrap_policy(*args, **kwargs):
-            return False
-
-        policy = (
-            functools.partial(
-                _or_policy,
-                policies=[never_wrap_policy, _wrap_batchnorm_individually]
-            ) if use_or_policy else _wrap_batchnorm_individually
-        )
-        model = BatchNormNet()
-        fsdp = FSDP(model, auto_wrap_policy=policy)
-        # Batchnorms should be wrapped
-        for layer in [fsdp.bn1, fsdp.bn2, fsdp.bn3, fsdp.sync_bn]:
-            self.assertTrue(isinstance(layer, FSDP))
-
-        self.assertFalse(isinstance(fsdp.lin, FSDP))
-
-    @skip_if_lt_x_gpu(2)
-    def test_bn_always_wrapped_individually(self):
-        """
-        Ensures that by using _or_policy with _wrap_batchnorm_individually, even
-        if the other policy results in a module containing a BN unit being
-        wrapped, the contained BN unit will still be individually wrapped.
-        """
-        class MyModule(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.bn_container = BatchNormNet()
-
-        def wrap_bn_container(module, recurse, *args, **kwargs):
-            if recurse:
-                return True
-            return isinstance(module, BatchNormNet)
-
-        my_policy = functools.partial(
-            _or_policy,
-            policies=[wrap_bn_container, _wrap_batchnorm_individually]
-        )
-        mod = MyModule()
-        fsdp = FSDP(mod, auto_wrap_policy=my_policy)
-
-        # Wrapping should be FSDP(FSDP(BatchNormNet(FSDP(BN))))
-        # and not FSDP(FSDP(BatchNormNet(BN))) (in the latter the inner
-        # BN is not individually wrapped.)
-
-        for bn in [
-            fsdp.bn_container.bn1,
-            fsdp.bn_container.bn2,
-            fsdp.bn_container.bn3,
-            fsdp.bn_container.sync_bn
-        ]:
-            self.assertTrue(isinstance(bn, FSDP))
-
     @skip_if_lt_x_gpu(2)
     @parametrize(
         "cpu_offload",

Diff for: torch/distributed/fsdp/_utils.py (-11 lines)

@@ -2,21 +2,10 @@
 from typing import Any, Callable, Dict, List, Set, Tuple, Union

 import torch
-from torch.nn.modules.batchnorm import _BatchNorm
-
 from torch.nn.utils.rnn import PackedSequence

 """Useful functions to deal with tensor types with other python container types."""

-def _contains_batchnorm(module):
-    return any(
-        isinstance(mod, _BatchNorm) for mod in module.modules()
-    )
-
-def _override_batchnorm_mixed_precision(module):
-    for mod in module.modules():
-        if isinstance(mod, _BatchNorm):
-            mod._wrap_overrides = {"mixed_precision": None}  # type: ignore[assignment]

 def _apply_to_tensors(
     fn: Callable, container: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence]

Diff for: torch/distributed/fsdp/fully_sharded_data_parallel.py (+3 -30 lines)

@@ -50,17 +50,14 @@
     _process_pos_dim_tensor_state,
     _unflatten_optim_state,
 )
-from ._utils import (
-    _apply_to_modules, _apply_to_tensors, _replace_by_prefix,
-    _override_batchnorm_mixed_precision, _contains_batchnorm
-)
+from ._utils import _apply_to_modules, _apply_to_tensors, _replace_by_prefix
 from .flatten_params_wrapper import (
     FLAT_PARAM,
     FPW_MODULE,
     FlatParameter,
     FlattenParamsWrapper,
 )
-from .wrap import _recursive_wrap, _wrap_batchnorm_individually, _or_policy
+from .wrap import _recursive_wrap

 if TYPE_CHECKING:
     from collections import OrderedDict  # noqa: F401
@@ -499,14 +496,6 @@ class FullyShardedDataParallel(nn.Module):
             that only floating point data is cast to the reduced precision. This allows
             users potential memory saving and training speedup while trading off
             accuracy during model training. If ``None``, no mixed precision is applied.
-            Note that if ``mixed_precision`` is enabled for FSDP model that
-            contains ``BatchNorm`` with ``auto_wrap_policy``, FSDP will take
-            care to disable mixed precision for ``BatchNorm`` units by wrapping
-            them separately in their own FSDP unit with ``mixed_precision=None``.
-            This is done because several ``BatchNorm`` kernels do not implement
-            reduced type support at the moment. If individually wrapping the model,
-            users must take care to set ``mixed_precision=None`` for
-            ``BatchNorm`` units.
             (Default: ``None``)
         ignored_modules (Optional[Iterable[torch.nn.Module]]): Modules whose
             own parameters and child modules' parameters and buffers are
@@ -591,25 +580,9 @@ def __init__(
                 check_fn=lambda mod: not isinstance(mod, FullyShardedDataParallel),
                 err_fn=lambda mod: f"Expected {mod} to NOT be FullyShardedDataParallel if auto_wrap is enabled.",
             )
-            if mixed_precision is not None and _contains_batchnorm(module):
-                _override_batchnorm_mixed_precision(module)
-                policy_to_use = functools.partial(
-                    _or_policy,
-                    policies=[_wrap_batchnorm_individually, auto_wrap_policy]
-                )
-                warnings.warn(
-                    "Mixed precision was specified for FSDP module with"
-                    " batchnorm submodules wrapped via ``auto_wrap_policy``."
-                    " BatchNorm units will be wrapped as a separate FSDP unit,"
-                    " with mixed_precision disabled (i.e. set to ``None``)"
-                    " as several BatchNorm kernels would raise errors when"
-                    " operating on reduced precision inputs."
-                )
-            else:
-                policy_to_use = auto_wrap_policy
             _recursive_wrap(
                 module,
-                auto_wrap_policy=policy_to_use,
+                auto_wrap_policy=auto_wrap_policy,
                 wrapper_cls=FullyShardedDataParallel,
                 ignored_modules=ignored_modules,
                 ignored_params=ignored_params,
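With the constructor logic above reverted, the exclusion that the removed docstring paragraph described has to happen before the model reaches FSDP. A hedged sketch of one way to do that for an arbitrary model with many BatchNorm layers (the helper name is made up for illustration; assumes an initialized default process group). Because the surviving context lines show that auto_wrap rejects modules that already contain FSDP instances, this pre-wrapping approach fits manual wrapping rather than auto_wrap_policy:

import warnings

import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.nn.modules.batchnorm import _BatchNorm


def pre_wrap_batchnorm(module: nn.Module) -> None:
    # Replace every BatchNorm descendant with its own FSDP unit; leaving
    # mixed_precision at its default (None) keeps those kernels in full precision.
    for name, child in module.named_children():
        if isinstance(child, _BatchNorm):
            warnings.warn(f"Wrapping {name} as a separate FSDP unit without mixed precision")
            setattr(module, name, FSDP(child))
        else:
            pre_wrap_batchnorm(child)


# Usage sketch (manual wrapping, no auto_wrap_policy):
#   pre_wrap_batchnorm(net)
#   model = FSDP(net, mixed_precision=mp_config)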

Diff for: torch/distributed/fsdp/wrap.py (+1 -40 lines)

@@ -17,7 +17,6 @@
 )

 import torch.nn as nn
-from torch.nn.modules.batchnorm import _BatchNorm


 def always_wrap_policy(*args, **kwargs) -> bool:
@@ -29,6 +28,7 @@ def always_wrap_policy(*args, **kwargs) -> bool:
     """
     return True

+
 def transformer_auto_wrap_policy(
     module: nn.Module,
     recurse: bool,
@@ -72,37 +72,6 @@ def transformer_auto_wrap_policy(
     # if not recursing, decide whether we should wrap for the leaf node or reminder
     return isinstance(module, tuple(transformer_layer_cls))

-def _wrap_batchnorm_individually(
-    module: nn.Module,
-    recurse: bool,
-    *args,
-    **kwargs,
-) -> bool:
-    """
-    A policy that wraps ``BatchNorm`` instances in their own FSDP unit.
-    """
-    if recurse:
-        # always recurse
-        return True
-    else:
-        # if not recursing, decide whether we should wrap based on whether it is a
-        # BN layer or not.
-        return isinstance(module, _BatchNorm)
-
-def _or_policy(
-    module: nn.Module,
-    recurse: bool,
-    unwrapped_params: int,
-    policies,
-) -> bool:
-    """
-    A policy that wraps ``module`` if any policy in the passed in iterable of
-    ``policies`` returns ``True``.
-    """
-    return any(
-        policy(module, recurse, unwrapped_params) for policy in policies
-    )
-

 def size_based_auto_wrap_policy(
     module: nn.Module,
@@ -241,14 +210,6 @@ def wrap(module: nn.Module, **wrap_overrides: Any) -> nn.Module:

 def _wrap(module: nn.Module, wrapper_cls: Callable, **kwargs) -> nn.Module:
     assert wrapper_cls is not None
-    if hasattr(module, '_wrap_overrides'):
-        # If module has a _wrap_overrides attribute, we force overriding the
-        # FSDP config with these attributes for this module. Currently this
-        # is only used to disable mixed precision for BatchNorm when
-        # auto_wrapping.
-        overrides = {**kwargs, **module._wrap_overrides}  # type: ignore[arg-type]
-        return wrapper_cls(module, **overrides)
-
     return wrapper_cls(module, **kwargs)
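The deleted _wrap_batchnorm_individually and _or_policy helpers were private, so an equivalent can be kept in user code if the per-BatchNorm wrapping granularity is still wanted after this revert. A rough sketch, assuming the policy callback signature (module, recurse, unwrapped_params) that _recursive_wrap uses at this point in the codebase; the names below are illustrative, not library API. Note that, because the _wrap_overrides hook in _wrap is also reverted above, BatchNorm units wrapped this way still inherit the parent's mixed_precision settings rather than having mixed precision disabled:

import functools

import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.nn.modules.batchnorm import _BatchNorm


def wrap_batchnorm_individually(module: nn.Module, recurse: bool, unwrapped_params: int) -> bool:
    # Always recurse into children; at a leaf, wrap only BatchNorm layers.
    if recurse:
        return True
    return isinstance(module, _BatchNorm)


def or_policy(module: nn.Module, recurse: bool, unwrapped_params: int, policies) -> bool:
    # Wrap if any of the given policies votes to wrap this module.
    return any(policy(module, recurse, unwrapped_params) for policy in policies)


def never_wrap_policy(*args, **kwargs) -> bool:
    return False


combined_policy = functools.partial(
    or_policy,
    policies=[never_wrap_policy, wrap_batchnorm_individually],
)
# Usage sketch with a hypothetical model:
#   fsdp_model = FSDP(my_model, auto_wrap_policy=combined_policy)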
