Commit b58e883

Authored by johnzielke, ericspod, and KumoLiu
selfattention block: Remove the fc linear layer if it is not used (#8325)
### Description

When `include_fc = False`, the `nn.Linear` layer is unused. This leads to errors and warnings when training with the PyTorch Distributed Data Parallel infrastructure, since the parameters of the unused `nn.Linear` layer will not have gradients attached.

### Types of changes

- [x] Non-breaking change (fix or new feature that would not break existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
- [ ] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.

---------

Signed-off-by: John Zielke <[email protected]>
Co-authored-by: Eric Kerfoot <[email protected]>
Co-authored-by: YunLiu <[email protected]>
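The failure mode described above can be reproduced outside MONAI. The following minimal sketch (the module name and sizes are made up for illustration) shows that a registered but never-called `nn.Linear` ends up with `grad=None` after backward, which is what DistributedDataParallel reports as parameters that did not receive gradients unless `find_unused_parameters=True` is enabled:

```python
import torch
import torch.nn as nn

# Illustration only (not MONAI code): a module that registers a Linear layer
# but never calls it in forward(), mirroring SABlock with include_fc=False
# before this fix.
class BlockWithUnusedFC(nn.Module):
    def __init__(self, dim: int = 8):
        super().__init__()
        self.out_proj = nn.Linear(dim, dim)    # registered but never used
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        return x * self.scale                  # out_proj is skipped entirely

block = BlockWithUnusedFC()
block(torch.randn(2, 8)).sum().backward()

# The unused layer's parameters receive no gradients; DDP raises an error for
# such parameters unless find_unused_parameters=True is passed.
print([n for n, p in block.named_parameters() if p.grad is None])
# -> ['out_proj.weight', 'out_proj.bias']
```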
1 parent f27517b commit b58e883

File tree: 3 files changed, +29 −4 lines changed

monai/networks/blocks/selfattention.py (+5 −1)
```diff
@@ -101,7 +101,11 @@ def __init__(
 
         self.num_heads = num_heads
         self.hidden_input_size = hidden_input_size if hidden_input_size else hidden_size
-        self.out_proj = nn.Linear(self.inner_dim, self.hidden_input_size)
+        self.out_proj: Union[nn.Linear, nn.Identity]
+        if include_fc:
+            self.out_proj = nn.Linear(self.inner_dim, self.hidden_input_size)
+        else:
+            self.out_proj = nn.Identity()
 
         self.qkv: Union[nn.Linear, nn.Identity]
         self.to_q: Union[nn.Linear, nn.Identity]
```

monai/networks/nets/diffusion_model_unet.py (+3 −3)
```diff
@@ -1847,9 +1847,9 @@ def load_old_state_dict(self, old_state_dict: dict, verbose=False) -> None:
             new_state_dict[f"{block}.attn.to_v.bias"] = old_state_dict.pop(f"{block}.to_v.bias")
 
             # projection
-            new_state_dict[f"{block}.attn.out_proj.weight"] = old_state_dict.pop(f"{block}.proj_attn.weight")
-            new_state_dict[f"{block}.attn.out_proj.bias"] = old_state_dict.pop(f"{block}.proj_attn.bias")
-
+            if f"{block}.attn.out_proj.weight" in new_state_dict and f"{block}.attn.out_proj.bias" in new_state_dict:
+                new_state_dict[f"{block}.attn.out_proj.weight"] = old_state_dict.pop(f"{block}.proj_attn.weight")
+                new_state_dict[f"{block}.attn.out_proj.bias"] = old_state_dict.pop(f"{block}.proj_attn.bias")
         # fix the cross attention blocks
         cross_attention_blocks = [
             k.replace(".out_proj.weight", "")
```
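The added guard matters because `load_state_dict` with the default `strict=True` rejects keys that the target module never registered, which is exactly what an `nn.Identity` out_proj would trigger if the old projection weights were copied over unconditionally. A generic PyTorch illustration (not MONAI code):

```python
import torch
import torch.nn as nn

# nn.Identity registers no parameters, so a state dict that still carries
# projection weights for it cannot be loaded strictly.
net = nn.Sequential(nn.Identity())
try:
    net.load_state_dict({"0.weight": torch.zeros(1, 1)})
except RuntimeError as err:
    print(err)  # Unexpected key(s) in state_dict: "0.weight"
```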

tests/networks/blocks/test_selfattention.py (+21 −0)
```diff
@@ -227,6 +227,27 @@ def test_flash_attention(self):
         out_2 = block_wo_flash_attention(test_data)
         assert_allclose(out_1, out_2, atol=1e-4)
 
+    @parameterized.expand([[True, False], [True, True], [False, True], [False, False]])
+    def test_no_extra_weights_if_no_fc(self, include_fc, use_combined_linear):
+        input_param = {
+            "hidden_size": 360,
+            "num_heads": 4,
+            "dropout_rate": 0.0,
+            "rel_pos_embedding": None,
+            "input_size": (16, 32),
+            "include_fc": include_fc,
+            "use_combined_linear": use_combined_linear,
+        }
+        net = SABlock(**input_param)
+        if not include_fc:
+            self.assertNotIn("out_proj.weight", net.state_dict())
+            self.assertNotIn("out_proj.bias", net.state_dict())
+            self.assertIsInstance(net.out_proj, torch.nn.Identity)
+        else:
+            self.assertIn("out_proj.weight", net.state_dict())
+            self.assertIn("out_proj.bias", net.state_dict())
+            self.assertIsInstance(net.out_proj, torch.nn.Linear)
+
 
 if __name__ == "__main__":
     unittest.main()
```
