Skip to content

Commit 67fbb31

Browse files
Revert "[FSDP2] support dataclass args/kwargs output (#173415)"
This reverts commit 63c5a68. Reverted #173415 on behalf of https://github.com/weifengpy due to a failing internal test; reverting first, root cause to be investigated later ([comment](#173415 (comment)))
1 parent d7f447c commit 67fbb31

File tree

3 files changed

+35
-127
lines changed

3 files changed

+35
-127
lines changed

test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py

Lines changed: 11 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -597,133 +597,28 @@ def assert_fn(output: torch.Tensor):
597597
loss.backward()
598598

599599
@skip_if_lt_x_gpu(1)
600-
def test_dataclass_input_output(self):
601-
from unittest.mock import patch
602-
603-
from torch.distributed._composable_state import _get_module_state
604-
600+
def test_dataclass_input(self):
605601
@dataclasses.dataclass
606602
class Input:
607603
x: torch.Tensor
608-
y: torch.Tensor
609-
610-
@dataclasses.dataclass
611-
class Output:
612-
x: torch.Tensor
613-
y: torch.Tensor
614-
615-
@dataclasses.dataclass
616-
class Scale:
617-
factor: torch.Tensor
618604

619605
class Model(nn.Module):
620606
def __init__(self, *args, **kwargs) -> None:
621607
super().__init__(*args, **kwargs)
622608
self._layer = nn.Linear(10, 10)
623609

624-
def forward(self, input: Input, *, scale: Scale | None = None):
625-
x = self._layer(input.x)
626-
y = self._layer(input.y)
627-
if scale is not None:
628-
x = x * scale.factor
629-
y = y * scale.factor
630-
return Output(x=x, y=y)
631-
632-
class TensorModel(nn.Module):
633-
def __init__(self, *args, **kwargs) -> None:
634-
super().__init__(*args, **kwargs)
635-
self._layer = nn.Linear(10, 10)
610+
def forward(self, input: Input):
611+
return self._layer(input.x)
636612

637-
def forward(self, x: torch.Tensor, *, scale: torch.Tensor | None = None):
638-
out = self._layer(x)
639-
if scale is not None:
640-
out = out * scale
641-
return out
642-
643-
# Test with different MixedPrecisionPolicy configurations
644-
mp_policies = [
645-
MixedPrecisionPolicy(
646-
param_dtype=torch.bfloat16,
647-
reduce_dtype=torch.bfloat16,
648-
),
649-
MixedPrecisionPolicy(
650-
param_dtype=torch.bfloat16,
651-
reduce_dtype=torch.float32,
652-
),
653-
]
654-
655-
for mp_policy in mp_policies:
656-
# Test with normal torch.Tensor as arg
657-
tensor_model = TensorModel()
658-
fully_shard(tensor_model, mp_policy=mp_policy)
659-
fsdp_state = _get_module_state(tensor_model)
660-
x = torch.randn(10, 10, device=device_type, requires_grad=True)
661-
with patch.object(
662-
fsdp_state, "_pre_backward", wraps=fsdp_state._pre_backward
663-
) as mock_pre_backward:
664-
loss = tensor_model(x).sum()
665-
loss.backward()
666-
mock_pre_backward.assert_called()
667-
668-
# Test with normal torch.Tensor as both arg and kwarg
669-
tensor_model.zero_grad()
670-
x = torch.randn(10, 10, device=device_type, requires_grad=True)
671-
scale = torch.randn(10, 10, device=device_type, requires_grad=True)
672-
with patch.object(
673-
fsdp_state, "_pre_backward", wraps=fsdp_state._pre_backward
674-
) as mock_pre_backward:
675-
loss = tensor_model(x, scale=scale).sum()
676-
loss.backward()
677-
mock_pre_backward.assert_called()
678-
679-
# Test with dataclass as positional arg only
680-
model = nn.Sequential(*[Model(), Model()])
681-
inp = Input(
682-
x=torch.randn(10, 10, device=device_type, requires_grad=True),
683-
y=torch.randn(10, 10, device=device_type, requires_grad=True),
684-
)
613+
mp_policy = MixedPrecisionPolicy(
614+
torch.bfloat16, torch.bfloat16, torch.bfloat16, True
615+
)
616+
model = Model()
617+
inp = Input(torch.randn(2, 10).to(device_type))
685618

686-
for layer in model:
687-
fully_shard(layer, mp_policy=mp_policy, reshard_after_forward=True)
688-
fully_shard(model, mp_policy=mp_policy, reshard_after_forward=True)
689-
690-
# Patch _pre_backward on all FSDP states
691-
layer0_state = _get_module_state(model[0])
692-
layer1_state = _get_module_state(model[1])
693-
root_state = _get_module_state(model)
694-
with (
695-
patch.object(
696-
layer0_state, "_pre_backward", wraps=layer0_state._pre_backward
697-
) as layer0_mock,
698-
patch.object(
699-
layer1_state, "_pre_backward", wraps=layer1_state._pre_backward
700-
) as layer1_mock,
701-
patch.object(
702-
root_state, "_pre_backward", wraps=root_state._pre_backward
703-
) as root_mock,
704-
):
705-
output = model(inp)
706-
loss = output.x.sum() + output.y.sum()
707-
loss.backward()
708-
layer0_mock.assert_called()
709-
layer1_mock.assert_called()
710-
root_mock.assert_called()
711-
712-
# Test with dataclass as both positional arg and kwarg
713-
inp = Input(
714-
x=torch.randn(10, 10, device=device_type, requires_grad=True),
715-
y=torch.randn(10, 10, device=device_type, requires_grad=True),
716-
)
717-
scale = Scale(
718-
factor=torch.randn(10, 10, device=device_type, requires_grad=True)
719-
)
720-
with patch.object(
721-
layer0_state, "_pre_backward", wraps=layer0_state._pre_backward
722-
) as layer0_mock:
723-
output = model[0](inp, scale=scale)
724-
loss = output.x.sum() + output.y.sum()
725-
loss.backward()
726-
layer0_mock.assert_called()
619+
fully_shard(model, mp_policy=mp_policy)
620+
loss = model(inp).sum()
621+
loss.backward()
727622

728623

729624
if __name__ == "__main__":

torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# mypy: allow-untyped-defs
22
import contextlib
3-
import functools
43
import logging
54
from collections.abc import Callable
65
from typing import Any, cast, NamedTuple
@@ -11,8 +10,8 @@
1110
from torch.distributed.device_mesh import _get_device_handle
1211
from torch.distributed.fsdp._common_utils import _named_parameters_with_duplicates
1312
from torch.distributed.tensor import Shard
14-
from torch.distributed.utils import _apply_to_tensors
1513
from torch.profiler import record_function
14+
from torch.utils._pytree import tree_flatten, tree_unflatten
1615
from torch.utils.hooks import RemovableHandle
1716

1817
from ._fsdp_api import CPUOffloadPolicy, MixedPrecisionPolicy, OffloadPolicy
@@ -715,11 +714,24 @@ def _register_post_backward_hook(
715714
return args, kwargs
716715
if not torch.is_grad_enabled():
717716
return args, kwargs
718-
register_post_backward_func = functools.partial(
719-
RegisterPostBackwardFunction.apply, self
720-
)
721-
args = _apply_to_tensors(lambda t: register_post_backward_func(t)[0], args)
722-
kwargs = _apply_to_tensors(lambda t: register_post_backward_func(t)[0], kwargs)
717+
args_list, args_spec = tree_flatten(args)
718+
kwargs_list, kwargs_spec = tree_flatten(kwargs)
719+
args_kwargs_list = list(args_list) + list(kwargs_list)
720+
inp_tensor_indices: list[int] = []
721+
inp_tensors: list[torch.Tensor] = []
722+
for i, obj in enumerate(args_kwargs_list):
723+
if torch.is_tensor(obj) and obj.requires_grad:
724+
inp_tensor_indices.append(i)
725+
inp_tensors.append(obj)
726+
if len(inp_tensors) == 0:
727+
return args, kwargs # no tensors that require gradients
728+
inp_tensors = RegisterPostBackwardFunction.apply(self, *inp_tensors)
729+
for inp_tensor_idx, inp_tensor in zip(inp_tensor_indices, inp_tensors):
730+
args_kwargs_list[inp_tensor_idx] = inp_tensor
731+
args_list = args_kwargs_list[: len(args_list)]
732+
kwargs_list = args_kwargs_list[len(args_list) :]
733+
args = tree_unflatten(args_list, args_spec)
734+
kwargs = tree_unflatten(kwargs_list, kwargs_spec)
723735
return args, kwargs
724736

725737
def _register_state_dict_hooks(self) -> None:

torch/distributed/fsdp/_fully_shard/_fsdp_state.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
)
1818
from torch.distributed.device_mesh import _get_device_handle
1919
from torch.distributed.utils import _apply_to_tensors, _to_kwargs
20+
from torch.utils._pytree import tree_flatten
2021

2122
from ._fsdp_api import MixedPrecisionPolicy
2223
from ._fsdp_common import (
@@ -349,10 +350,10 @@ def _finalize_backward(self) -> None:
349350
def _register_pre_backward_hook(self, output: Any) -> Any:
350351
if not torch.is_grad_enabled():
351352
return output
352-
_apply_to_tensors(
353-
lambda x: x.register_hook(self._pre_backward) if x.requires_grad else x,
354-
output,
355-
)
353+
flat_outputs, _ = tree_flatten(output)
354+
for t in flat_outputs:
355+
if torch.is_tensor(t) and t.requires_grad:
356+
t.register_hook(self._pre_backward)
356357
return output
357358

358359
def _register_root_post_backward_final_callback(self):

0 commit comments

Comments (0)