
Commit 8819393

aporialiao authored and facebook-github-bot committed
Enable random weights for unit test + Doc (#2889)
Summary:
Pull Request resolved: #2889

Enable random weights for unit test. When testing the DMP interface for Dynamic Sharding, I'm noticing discrepancies in predictions. Still debugging that case, but random weights will be enabled by default for the initial dynamic sharding interface, with the debug values kept behind an optional flag.

Main changes:
1. Added a docstring to `copy_state_dict` in `test_sharding` to make it clear that the global state_dict is being copied into the local one.
2. Removed the redundant `copy_state_dict` calls in the dynamic sharding unit test setup, since `load_state_dict` is already used.
3. Added a `use_debug_state_dict` flag, defaulted to `False`; when turned on, it forces the test models to use dummy int values in the embedding weights.
4. With `use_debug_state_dict` turned off, the weights are randomly generated upon initialization of the EBCs.
   1. Note: `torch.manual_seed(0)` is needed to force the EBCs to be initialized with the same float values across ranks in the distributed environment.
   2. An alternative approach would be to initialize the global EBCs outside of the distributed test process, but since this is just a unit test, keeping it as is.

Reviewed By: TroyGarden

Differential Revision: D73077322

fbshipit-source-id: 093f23c10b73a90b61429c4109e484627270bd46
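For reference, a minimal standalone sketch of the seeding behavior described in point 4 above (the module type and table sizes here are illustrative, not the torchrec test code): re-seeding immediately before constructing each module makes the randomly initialized weights identical, which is what lets every rank build the same model without copying a state dict.

import torch
import torch.nn as nn


def build_embedding(seed: int = 0) -> nn.EmbeddingBag:
    # Re-seeding right before construction makes the random weight
    # initialization deterministic, mimicking what each rank does in
    # the test before creating its EmbeddingBagCollection.
    torch.manual_seed(seed)
    return nn.EmbeddingBag(num_embeddings=4, embedding_dim=16)


# Two independently constructed modules end up with identical weights,
# so no explicit state_dict copy between them is needed.
m1 = build_embedding()
m2 = build_embedding()
assert torch.equal(m1.weight, m2.weight)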
1 parent 8581ea1 commit 8819393

File tree

2 files changed (+37, -30 lines)

torchrec/distributed/test_utils/test_sharding.py

Lines changed: 3 additions & 0 deletions
@@ -254,6 +254,9 @@ def copy_state_dict(
     glob: Dict[str, torch.Tensor],
     exclude_predfix: Optional[str] = None,
 ) -> None:
+    """
+    Copies the contents of the global tensors in glob to the local tensors in loc.
+    """
     for name, tensor in loc.items():
         if exclude_predfix is not None and name.startswith(exclude_predfix):
             continue
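As a usage sketch of what the new docstring describes, the snippet below assumes the dense-tensor path of `copy_state_dict` copies the matching global values into the local tensors in place (the real helper also handles ShardedTensor entries and the `exclude_predfix` filter, not exercised here); the tensor name and shape are made up.

import torch
from torchrec.distributed.test_utils.test_sharding import copy_state_dict

# Local tensors start as zeros; the "global" tensors hold the reference values.
loc = {"embedding_bags.table_0.weight": torch.zeros(4, 16)}
glob = {"embedding_bags.table_0.weight": torch.rand(4, 16)}

copy_state_dict(loc=loc, glob=glob)

# After the call, the local tensor holds a copy of the global values.
assert torch.equal(
    loc["embedding_bags.table_0.weight"],
    glob["embedding_bags.table_0.weight"],
)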

torchrec/distributed/tests/test_dynamic_sharding.py

Lines changed: 34 additions & 30 deletions
@@ -213,10 +213,8 @@ def _test_ebc_resharding(
     trec_dist.comm_ops.set_gradient_division(False)
     with MultiProcessContext(rank, world_size, backend, local_size) as ctx:
         kjt_input_per_rank = [kjt.to(ctx.device) for kjt in kjt_input_per_rank]
-
-        initial_state_dict = {
-            fqn: tensor.to(ctx.device) for fqn, tensor in initial_state_dict.items()
-        }
+        # Set seed to be 0 to ensure models have the same initialization across ranks
+        torch.manual_seed(0)
         m1 = EmbeddingBagCollection(
             tables=tables,
             device=ctx.device,

@@ -226,19 +224,24 @@
             tables=tables,
             device=ctx.device,
         )
-
-        # Load initial State - making sure models are identical
-        m1.load_state_dict(initial_state_dict)
-        copy_state_dict(
-            loc=m1.state_dict(),
-            glob=copy.deepcopy(initial_state_dict),
-        )
-
-        m2.load_state_dict(initial_state_dict)
-        copy_state_dict(
-            loc=m2.state_dict(),
-            glob=copy.deepcopy(initial_state_dict),
-        )
+        if initial_state_dict is not None:
+            initial_state_dict = {
+                fqn: tensor.to(ctx.device) for fqn, tensor in initial_state_dict.items()
+            }
+
+            # Load initial State - making sure models are identical
+            m1.load_state_dict(initial_state_dict)
+
+            m2.load_state_dict(initial_state_dict)
+
+        else:
+            # Note this is the only correct behavior due to setting random seed to 0 above
+            # Otherwise the weights generated in EBC initialization will be different on
+            # Each rank, resulting in different behavior after resharding
+            copy_state_dict(
+                loc=m2.state_dict(),
+                glob=m1.state_dict(),
+            )

         sharder = get_module_to_default_sharders()[type(m1)]

@@ -278,8 +281,8 @@
             feature_keys.extend(table.feature_names)

         # For current test model and inputs, the prediction should be the exact same
-        rtol = 0
-        atol = 0
+        # rtol = 0
+        # atol = 0

         for _ in range(world_size):
             # sharded model

@@ -301,9 +304,7 @@
             # their model. output from sharded_pred is correctly on the correct device.

             # Compare predictions of sharded vs unsharded models.
-            torch.testing.assert_close(
-                sharded_m1_pred.cpu(), resharded_m2_pred.cpu(), rtol=rtol, atol=atol
-            )
+            torch.testing.assert_close(sharded_m1_pred.cpu(), resharded_m2_pred.cpu())

             sharded_m1_pred.sum().backward()
             resharded_m2_pred.sum().backward()

@@ -320,6 +321,7 @@ def _run_ebc_resharding_test(
         data_type: DataType,
         embedding_dim: int = 16,
         num_embeddings: int = 4,
+        use_debug_state_dict: bool = False,  # Turn on to use dummy values for initial state dict
     ) -> None:
         embedding_bag_config = generate_embedding_bag_config(
             data_type, num_tables, embedding_dim, num_embeddings

@@ -359,14 +361,16 @@
             for _ in range(world_size)
         ]

-        # initial_state_dict filled with deterministic dummy values
-        initial_state_dict = create_test_initial_state_dict(
-            ShardedEmbeddingBagCollection,  # pyre-ignore
-            num_tables,
-            data_type,
-            embedding_dim,
-            num_embeddings,
-        )
+        initial_state_dict = None
+        if use_debug_state_dict:
+            # initial_state_dict filled with deterministic dummy values
+            initial_state_dict = create_test_initial_state_dict(
+                ShardedEmbeddingBagCollection,  # pyre-ignore
+                num_tables,
+                data_type,
+                embedding_dim,
+                num_embeddings,
+            )

         self._run_multi_process_test(
             callable=_test_ebc_resharding,
