Commit 366bd1c

Update on "[dtensor][random] allow user to manual_seed different seed on device mesh; only sync RNG state in WORLD when manual_seed has not been called"
**Summary**

This PR proposes 3 changes to DTensor RNG management:

1. DTensor allows users to eagerly initialize the RNG tracker by calling `torch.distributed.tensor._random.manual_seed`.

2. DTensor `manual_seed` no longer checks the integrity of the `seed` argument. Users are responsible for setting the same seed on all ranks within an SPMD group, but if there are multiple separate SPMD groups (e.g. across pipeline stages), users should set a _different_ seed for each SPMD group. For cases like Pipeline Parallel, users can set a different initial seed for each pipeline stage by calling:

   ```
   world_mesh = init_device_mesh(
       device_type="cuda",
       mesh_shape=(2, 2, 2),
       mesh_dim_names=("pp", "dp", "tp"),
   )
   pp_mesh = world_mesh["pp"]
   pp_rank = pp_mesh.get_local_rank()
   # this flattening is only needed if you need to call collectives over this mesh
   spmd_mesh = world_mesh["dp", "tp"]._flatten("spmd")
   torch.distributed.tensor._random.manual_seed(123 + pp_rank, spmd_mesh)
   ```

   In other words, if users choose to call `torch.distributed.tensor._random.manual_seed`, they are responsible for passing in the right value and DTensor won't perform any checks on it. If the current rank is not a part of the mesh, `manual_seed` raises an error rather than leaving that rank's DTensor RNG state uninitialized.

3. `OffsetBasedRNGTracker` still performs RNG state synchronization by broadcasting the RNG state on rank 0 to `WORLD`. The exception is when `torch.distributed.tensor._random.manual_seed` has been called: in that case, no broadcast happens.

**Motivation**

tl;dr

1. Lazily initializing the DTensor RNG tracker causes hangs in non-SPMD code such as Pipeline Parallel.
2. Users may want to set a different seed on ranks within one device mesh.
3. We want to keep the old behavior for users who prefer not to curate the RNG state and want DTensor to take care of it.

See details in #140301.

**Test**

`pytest test/distributed/_tensor/test_random_ops.py`
`pytest test/distributed/tensor/parallel/test_tp_random_state.py`

cc wanchaol tianyu-l wz337 d4l3k H-Huang awgu kwen2501 fegin fduwjj wconstab c-p-i-o

[ghstack-poisoned]
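To make the new contract concrete, here is a minimal sketch of the two seeding paths described above, intended to be launched with torchrun on GPUs. The mesh shape, the seed value, and the `uniform_` call are illustrative assumptions, not taken from this PR.

```python
import os

import torch
import torch.distributed as dist
import torch.distributed.tensor._random as dtensor_random
from torch.distributed._tensor import distribute_tensor, init_device_mesh, Replicate

dist.init_process_group("nccl")
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
mesh = init_device_mesh("cuda", (dist.get_world_size(),))

# Eager path (new in this PR): DTensor uses exactly this seed for the SPMD
# group and skips the rank-0 -> WORLD RNG-state broadcast. Comment this line
# out to get the old default: the first DTensor random op lazily creates
# OffsetBasedRNGTracker and broadcasts rank 0's RNG state to WORLD.
dtensor_random.manual_seed(1234, mesh)

# A replicated DTensor random op that goes through the RNG tracker.
x = distribute_tensor(torch.empty(8, 8, device="cuda"), mesh, [Replicate()])
torch.nn.init.uniform_(x)

dist.destroy_process_group()
```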
2 parents 14be788 + 5754d68 commit 366bd1c

File tree: 2 files changed (+130, -7 lines)

2 files changed

+130
-7
lines changed

test/distributed/_tensor/test_random_ops.py (116 additions, 1 deletion)

```diff
@@ -6,13 +6,19 @@
 import torch
 import torch.distributed._functional_collectives as funcol
 import torch.distributed.tensor._random as random
+from torch.distributed._composable.fsdp import fully_shard
 from torch.distributed._tensor import DeviceMesh, DTensor, init_device_mesh
 from torch.distributed._tensor._utils import compute_local_shape_and_global_offset
 from torch.distributed._tensor.api import distribute_tensor
 from torch.distributed._tensor.placement_types import Replicate, Shard
 from torch.distributed.distributed_c10d import broadcast_object_list
-from torch.distributed.tensor._random import is_rng_supported_mesh, manual_seed
+from torch.distributed.tensor._random import (
+    is_rng_supported_mesh,
+    manual_seed,
+    OffsetBasedRNGTracker,
+)
 from torch.distributed.tensor.debug import CommDebugMode
+from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
@@ -118,6 +124,19 @@ def test_manual_seed(self):

         self.assertEqual(comm_mode.get_total_counts(), 0)

+    @with_comms
+    @skip_unless_torch_gpu
+    def test_manual_seed_submesh(self):
+        # the current rank is not a part of the mesh
+        single_rank_device_mesh = DeviceMesh(
+            self.device_type, [(self.rank + 1) % self.world_size]
+        )
+        with self.assertRaisesRegex(
+            RuntimeError,
+            "manual_seed requires the current rank to be a part of the device mesh",
+        ):
+            manual_seed(self.rank, single_rank_device_mesh)
+
     @with_comms
     @skip_unless_torch_gpu
     def test_pipeline_parallel_manual_seed(self):
@@ -159,6 +178,102 @@ def test_pipeline_parallel_manual_seed(self):
                 tensor_gather[2 * other_rank : 2 * (other_rank + 1), :],
             )

+    @with_comms
+    @skip_unless_torch_gpu
+    def test_tp_model_meta_init(self):
+        # initialize the 1-d device mesh for TP
+        tp_mesh = init_device_mesh(self.device_type, mesh_shape=(self.world_size,))
+
+        # model meta init
+        with torch.device("meta"):
+            model = torch.nn.Linear(self.world_size, self.world_size, bias=False)
+            self.assertEqual(model.weight.device, torch.device("meta"))
+            parallelize_module(model, tp_mesh, ColwiseParallel())
+            if random._rng_tracker is not None:
+                random._rng_tracker.distribute_region_enabled = True
+
+        self.assertEqual(model.weight.device, torch.device("meta"))
+
+        # actual initialization
+        device = torch.device("cuda", torch.cuda.current_device())
+        model.to_empty(device=device)
+        model.reset_parameters()
+        self.assertTrue(
+            random._rng_tracker is not None
+            and isinstance(random._rng_tracker, OffsetBasedRNGTracker)
+        )
+        self.assertEqual(model.weight.device, device)
+        assert isinstance(model.weight, DTensor)
+
+        # gather all the shards to compare initialization results
+        WORLD = torch.distributed.group.WORLD
+        assert WORLD is not None
+        weight_local = model.weight.to_local()
+        weight_gather = funcol.all_gather_tensor(
+            weight_local,
+            gather_dim=0,
+            group=WORLD,
+        )
+
+        # verify the weights are initialized differently on all ranks
+        for other_rank in range(self.world_size):
+            if self.rank != other_rank:
+                self.assertNotEqual(
+                    weight_local,
+                    weight_gather[other_rank : other_rank + 1, :],
+                )
+
+    @with_comms
+    @skip_unless_torch_gpu
+    def test_fsdp_tp_model_meta_init(self):
+        # initialize the 2-d device mesh
+        global_mesh = init_device_mesh(
+            self.device_type,
+            mesh_shape=(self.world_size // 2, 2),
+            mesh_dim_names=("dp", "tp"),
+        )
+        dp_mesh, tp_mesh = global_mesh["dp"], global_mesh["tp"]
+
+        # model meta init
+        with torch.device("meta"):
+            model = torch.nn.Linear(self.world_size, self.world_size, bias=False)
+            self.assertEqual(model.weight.device, torch.device("meta"))
+            parallelize_module(model, tp_mesh, ColwiseParallel())
+            if random._rng_tracker is not None:
+                random._rng_tracker.distribute_region_enabled = True
+
+            fully_shard(model, mesh=dp_mesh)
+        self.assertEqual(model.weight.device, torch.device("meta"))
+
+        # actual initialization
+        device = torch.device("cuda", torch.cuda.current_device())
+        model.to_empty(device=device)
+        model.reset_parameters()
+        self.assertTrue(
+            random._rng_tracker is not None
+            and isinstance(random._rng_tracker, OffsetBasedRNGTracker)
+        )
+        self.assertEqual(model.weight.device, device)
+        assert isinstance(model.weight, DTensor)
+
+        # gather all the shards to compare initialization results
+        WORLD = torch.distributed.group.WORLD
+        assert WORLD is not None
+        weight_local = model.weight.to_local()
+        weight_gather = funcol.all_gather_tensor(
+            weight_local,
+            gather_dim=0,
+            group=WORLD,
+        )
+
+        # verify the weights are initialized differently on all ranks
+        for other_rank in range(self.world_size):
+            if self.rank != other_rank:
+                self.assertNotEqual(
+                    weight_local,
+                    weight_gather[other_rank : other_rank + 1, :],
+                )
+
     @with_comms
     @skip_unless_torch_gpu
     def test_deterministic_dropout_1d(self):
```
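Outside the test harness, the meta-device initialization flow exercised by the two new tests looks roughly like the sketch below. It is a sketch under assumptions: a torchrun launch, a 1-D TP mesh, and a 16x16 `nn.Linear` whose output dimension is divisible by the world size; none of these specifics come from the diff.

```python
import os

import torch
from torch.distributed._tensor import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module

torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
tp_mesh = init_device_mesh("cuda", (int(os.environ["WORLD_SIZE"]),))

# Build the TP-sharded module on the meta device: no memory is allocated and
# no DTensor RNG tracker is created yet.
with torch.device("meta"):
    model = torch.nn.Linear(16, 16, bias=False)
    parallelize_module(model, tp_mesh, ColwiseParallel())

# Materialize and initialize for real. reset_parameters() runs DTensor random
# ops; without a prior manual_seed call, this lazily creates
# OffsetBasedRNGTracker and broadcasts rank 0's RNG state to WORLD.
model.to_empty(device=torch.device("cuda", torch.cuda.current_device()))
model.reset_parameters()
```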

torch/distributed/tensor/_random.py (14 additions, 6 deletions)

```diff
@@ -52,7 +52,10 @@ def manual_seed(seed: int, device_mesh: DeviceMesh) -> None:

     Args:
         seed (int): The desired seed.
-        device_mesh (:class:`DeviceMesh`): The device mesh to set the seed.
+        device_mesh (:class:`DeviceMesh`): The device mesh to set the seed. It is
+            required that the ``device_mesh`` include the calling rank. This is
+            to ensure that the SPMD region maintains a synchronous RNG state, which
+            means no ranks should be initialized with values other than ``seed``.

     Returns:
         None
@@ -62,7 +65,7 @@ def manual_seed(seed: int, device_mesh: DeviceMesh) -> None:
     ensure on their own that the value passed in is the desired ``seed`` for ranks
     within ``device_mesh``.
     If ``device_mesh`` is a sub-mesh and the calling rank is not a part of it,
-    ``manual_seed`` will not set its GPU device's generator seed.
+    ``manual_seed`` will throw an error.
     Current implementation only supports a GPU device mesh.
     """
     device_handle = _get_device_handle(device_mesh.device_type)
@@ -82,6 +85,12 @@ def manual_seed(seed: int, device_mesh: DeviceMesh) -> None:
     # the current rank is in mesh
     if device_mesh.get_coordinate() is not None:
         _rng_tracker._manual_seed(seed)
+    else:
+        raise RuntimeError(
+            "manual_seed requires the current rank to be a part of the device mesh "
+            "otherwise DTensor RNG state on the rank will not be initialized and "
+            "the behavior of DTensor random ops is undefined."
+        )


 class _RNGStateTracker:
@@ -130,8 +139,8 @@ def get_seed(self, name: str) -> int:
         return int(seed_tensor.item())

     def set_seed(self, name: str, seed: int) -> None:
-        seed_tensor = torch.tensor([seed]).view(torch.uint8)
-        offset_tensor = torch.tensor([0]).view(torch.uint8)
+        seed_tensor = torch.tensor([seed], device="cpu").view(torch.uint8)
+        offset_tensor = torch.tensor([0], device="cpu").view(torch.uint8)
         self.rng_states[name] = torch.cat([seed_tensor, offset_tensor])

     def _distribute_region(self, spec: DTensorSpec):
@@ -198,7 +207,7 @@ def set_offset(self, name: str, offset: int) -> None:
         )

         seed_tensor = (self.rng_states[name])[0:8]
-        offset_tensor = torch.tensor([offset]).view(torch.uint8)
+        offset_tensor = torch.tensor([offset], device="cpu").view(torch.uint8)
         self.rng_states[name] = torch.cat([seed_tensor, offset_tensor])

     def _set_pre_op_offset(self, spec: DTensorSpec) -> None:
@@ -277,7 +286,6 @@ def _set_pre_op_offset(self, spec: DTensorSpec) -> None:
             total_num_shards = 1
             # the tensor dim is sharded on more than 1 mesh dim
             if isinstance(mesh_dim, List):
-                assert isinstance(mesh_dim, List)
                 rank_coord = [mesh_coordinate[d] for d in mesh_dim]
                 num_shards = [mesh_size[d] for d in mesh_dim]
                 # compute the shard idx and total number of shards
```
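One note on the `device="cpu"` pins added to `set_seed` and `set_offset`: the tracker stores its per-mesh RNG state as CPU byte tensors, and presumably the explicit device keeps them on CPU even when a factory-function device context is active, such as the `torch.device("meta")` context used by the new meta-init tests. A standalone illustration of that behavior follows; it is not code from this PR.

```python
import torch

# Inside a device context, factory calls without an explicit device follow the
# context; pinning device="cpu" keeps the seed/offset bytes on a real device.
with torch.device("meta"):
    implicit = torch.tensor([42]).view(torch.uint8)
    pinned = torch.tensor([42], device="cpu").view(torch.uint8)

print(implicit.device)  # meta
print(pinned.device)    # cpu
```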
