Enable XPU distributed test for PT2.8 by daisyden · Pull Request #149916 · pytorch/pytorch

Enable XPU distributed test for PT2.8 #149916

Closed
daisyden wants to merge 53 commits
Changes from 1 commit
Commits (53)
d0d8271
make skipXPU work
daisyden May 11, 2024
c791db9
enabled torch-xpu ops in op_db
daisyden May 13, 2024
f5cbd50
clean up code
daisyden May 13, 2024
4d94417
Revert "clean up code"
daisyden May 13, 2024
6844101
Revert "enabled torch-xpu ops in op_db"
daisyden May 13, 2024
5051e3c
Revert "make skipXPU work"
daisyden May 13, 2024
e2aa92a
merge common code update from https://github.com/Chao1Han/pytorch/pul…
chunhuanMeng Mar 19, 2025
9e83095
Merge branch 'main' of https://github.com/daisyden/pytorch into distr…
chunhuanMeng Mar 20, 2025
06dd2aa
merge common code update from https://github.com/Chao1Han/pytorch/pul…
daisyden Mar 20, 2025
a4a732b
Add XPU support for distributed
daisyden Mar 20, 2025
6e3f6b8
Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch…
daisyden Mar 20, 2025
5f47367
Merge remote-tracking branch 'upstream/main' into distributed_2.8
daisyden Mar 21, 2025
345d7e6
ported fsdp and _composable/fsdp cases
daisyden Mar 21, 2025
4a5a522
Support XPU device for DDP test cases
PenghuiCheng Mar 24, 2025
20a4456
Support XPU device for pipeline cases
PenghuiCheng Mar 24, 2025
a90a603
ported fsdp tests
daisyden Mar 24, 2025
5b1aff7
Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch…
daisyden Mar 24, 2025
44d55b9
fixed backend mapping error for register_backend function
PenghuiCheng Mar 26, 2025
7dade1f
Update distributed UT cases
PenghuiCheng Apr 1, 2025
580aaee
remove fsdp_kwargs in test_fsdp_memory.py to align with cuda, added r…
daisyden Apr 1, 2025
c0f5713
Merge branch 'upstream_main4' into distributed_2.8
daisyden Apr 1, 2025
6dedbe3
Add test_dynamo_distributed cases
PenghuiCheng Apr 1, 2025
20d074c
Merge remote-tracking branch 'upstream/distributed_2.8' into distribu…
PenghuiCheng Apr 1, 2025
124ff16
update test_tp_random_state.py
PenghuiCheng Apr 2, 2025
0bea112
Merge from main branch
PenghuiCheng Apr 7, 2025
7409ade
support xccl in with_comms
daisyden Apr 8, 2025
636cbff
Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch…
daisyden Apr 8, 2025
0d5a86b
Merge branch 'upstream_main3' into distributed_2.8
daisyden Apr 8, 2025
3826e30
Enabled UT in test/distributed/tensor
PenghuiCheng Apr 9, 2025
cb711b7
Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch…
PenghuiCheng Apr 9, 2025
d6cd1b3
refine fsdp2 test case for xpu
daisyden Apr 9, 2025
624be3a
Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch…
daisyden Apr 9, 2025
8d8c5fe
fix some issues in test case, cuda specific code, world_size 8, etc.
daisyden Apr 11, 2025
1cf7887
merge from main branch
PenghuiCheng Apr 16, 2025
41475ac
Merge remote-tracking branch 'upstream/distributed_2.8' into distribu…
PenghuiCheng Apr 16, 2025
0628c76
Change world size in test_device_mesh.py
PenghuiCheng Apr 18, 2025
b0d935d
Merge remote-tracking branch 'origin/distributed_2.8' into distribute…
PenghuiCheng Apr 18, 2025
58eb87e
Merge remote-tracking branch 'upstream/main' into distributed_2.8
PenghuiCheng Apr 22, 2025
e558eaa
Enabled some UT cases of distributed
PenghuiCheng Apr 24, 2025
83ac56e
enable UT case in _shard and _tool folder
PenghuiCheng Apr 29, 2025
0e7a7b6
Fixed hard code error for world_size 8
PenghuiCheng May 5, 2025
a2b2fc6
merge from main branch
PenghuiCheng May 7, 2025
8de00b9
fix regex
daisyden May 8, 2025
91f5d10
Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch…
daisyden May 8, 2025
39e6c02
Fixed UT errors for cuda hard code
PenghuiCheng May 15, 2025
06d6c3e
Merge remote-tracking branch 'origin/distributed_2.8' into distribute…
PenghuiCheng May 15, 2025
a059005
Merge from upstream main branch
PenghuiCheng May 16, 2025
31ddfc0
Fixed XPU UT error for CUDA hard code
PenghuiCheng May 21, 2025
9a6df8a
Merge remote-tracking branch 'upstream0523/main' into distributed_2.8
daisyden May 23, 2025
50cb9e9
fix fsdp2 issue after rebase, fix #1618 dynamo issue
daisyden May 26, 2025
08559b9
remove duplicated device_type
daisyden May 27, 2025
f6a8c6a
fix rebase issue of test_fully_shard_overlap.py
daisyden May 27, 2025
dc0befa
merge from main branch
PenghuiCheng May 30, 2025
fix some issues in test case, cuda specific code, world_size 8, etc.
daisyden committed Apr 11, 2025
commit 8d8c5fe60570e2adf18c0fc1b1091b273e7d0c49
@@ -60,7 +60,7 @@ def _test_gradient_scaler(self, has_inf: bool, test_2d: bool):
         input = torch.randn((2,), device=device_type)

         loss = model(input).sum()
-        scaler = GradScaler(init_scale=2.0, enabled=True)
+        scaler = GradScaler(init_scale=2.0, enabled=True, device=device_type)
         opt = torch.optim.Adam(model.parameters(), lr=1e-2)
         scaler.scale(loss).backward()
         inv_scale = scaler._scale.double().reciprocal().float()
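Passing the device explicitly is what lets this test run on XPU as well as CUDA: torch.amp.GradScaler defaults to "cuda" when no device is given. Below is a rough, self-contained sketch of the device-agnostic pattern, assuming a PyTorch build (2.6+) that ships torch.accelerator; the tiny linear model and variable names are illustrative, not taken from the PR.

import torch
from torch.amp import GradScaler

# Resolve the active backend ("cuda", "xpu", ...), falling back to CPU.
acc = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else None
device_type = acc.type if acc is not None else "cpu"

model = torch.nn.Linear(2, 2).to(device_type)
opt = torch.optim.Adam(model.parameters(), lr=1e-2)

# Construct the scaler for whichever device the test actually uses,
# instead of relying on the "cuda" default.
scaler = GradScaler(device=device_type, init_scale=2.0, enabled=True)

loss = model(torch.randn((2,), device=device_type)).sum()
scaler.scale(loss).backward()  # backward on the scaled loss
scaler.step(opt)               # unscale gradients, then step the optimizer
scaler.update()                # adjust the scale factor for the next iteration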
test/distributed/_composable/fsdp/test_fully_shard_overlap.py (6 changes: 3 additions & 3 deletions)

@@ -62,10 +62,10 @@ def delay_collective():
             # Share a stream so that all-gather and reduce-scatter block each
             # other like in `ProcessGroupNCCL`
             comm_stream.wait_stream(torch.accelerator.current_stream())
-            with torch.cuda.stream(comm_stream):
-                if device_type == 'cuda':
+            if device_type == 'cuda':
+                with torch.cuda.stream(comm_stream):
                     torch.cuda._sleep(int(comm_sleep_ms * get_cycles_per_ms()))
-            torch.cuda.current_stream().wait_stream(comm_stream)
+            torch.accelerator.current_stream().wait_stream(comm_stream)

         def delayed_all_gather(*args, **kwargs):
             delay_collective()
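Moving the device check outside the stream context is the actual fix here: torch.cuda.stream() and torch.cuda._sleep are CUDA-only, so on XPU they must be skipped entirely, while the final synchronization goes through the backend-neutral torch.accelerator API. A simplified sketch of that guard pattern follows, assuming PyTorch 2.6+ and an accelerator being present; delay_then_sync and the cycle constant are illustrative (the real test derives cycles with get_cycles_per_ms()).

import torch

device_type = torch.accelerator.current_accelerator().type  # "cuda", "xpu", ...

def delay_then_sync(comm_sleep_ms: float) -> None:
    if device_type == "cuda":
        # The busy-wait helper exists only for CUDA; roughly 1e6 cycles ~ 1 ms here.
        torch.cuda._sleep(int(comm_sleep_ms * 1_000_000))
    # Backend-neutral replacement for torch.cuda.current_stream().
    torch.accelerator.current_stream().synchronize()

delay_then_sync(5.0)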
test/distributed/tensor/test_dtensor.py (4 changes: 2 additions & 2 deletions)

@@ -613,7 +613,7 @@ def test_shard_tensor_2d(self):
 class DTensorMeshTest(DTensorTestBase):
     @property
     def world_size(self):
-        return 8
+        return min(8, torch.accelerator.device_count())

     def sub_mesh_assert_equal(self, mesh, exp_in_mesh, exp_out_of_mesh, tensor):
         if self.rank in mesh:

@@ -930,7 +930,7 @@ def test_metadata_consistency_check(self):
 class TestDTensorPlacementTypes(DTensorTestBase):
     @property
     def world_size(self):
-        return 8
+        return min(8, torch.accelerator.device_count())

     def _create_tensor(self, size):
         # Keep everything deterministic.
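Replacing the hard-coded return 8 with a clamp is what makes these suites runnable on hosts with fewer than eight devices, which is common for XPU machines: the mesh is simply built over however many ranks exist, up to eight. A minimal sketch of the pattern, assuming PyTorch 2.6+ for torch.accelerator.device_count(); MeshSizedTest is an illustrative stand-in for the DTensorTestBase subclasses touched here.

import torch

class MeshSizedTest:  # illustrative stand-in for a DTensorTestBase subclass
    @property
    def world_size(self) -> int:
        # Never request more ranks than the machine exposes; device_count()
        # reports devices for the currently active accelerator backend.
        return min(8, torch.accelerator.device_count())

On a four-device host this yields a world size of 4, so any mesh shapes derived from world_size have to divide the clamped value as well.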
test/distributed/tensor/test_redistribute.py (2 changes: 1 addition & 1 deletion)

@@ -449,7 +449,7 @@ def test_shard_dim_alltoall(self):
 class MultiDimRedistributeTest(DTensorTestBase):
     @property
     def world_size(self) -> int:
-        return 8
+        return min(8, torch.accelerator.device_count())

     @with_comms
     def test_multi_dim_mesh(self):
test/distributed/test_device_mesh.py (4 changes: 2 additions & 2 deletions)

@@ -211,7 +211,7 @@ def test_device_mesh_init_backend(self):
         # we call init_backend we should make sure the default pg already created
         mesh.get_coordinate()

-    @unittest.skipif(not torch.accelerator.is_available(), "No accelerator available!")
+    @unittest.skipIf(not torch.accelerator.is_available(), "No accelerator available!")
     def test_fake_pg_device_mesh(self):
         fake_store = FakeStore()
         init_process_group("fake", store=fake_store, rank=0, world_size=self.world_size)

@@ -266,7 +266,7 @@ def test_from_group_with_invalid_mesh(self):
             groups, self.device_type, invalid_mesh, mesh_dim_names=("dim0", "dim1")
         )

-    @unittest.skipif(not torch.accelerator.is_available(), "No accelerator available!")
+    @unittest.skipIf(not torch.accelerator.is_available(), "No accelerator available!")
     def test_raises_invalid_device_type(self):
         with self.assertRaisesRegex(
             RuntimeError,
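The capitalization change is a real bug fix rather than a style tweak: the unittest decorator is skipIf, and referencing unittest.skipif raises AttributeError as soon as the class body is evaluated. A minimal illustration of the corrected usage; the test class and body below are placeholders, not code from the PR.

import unittest

import torch

class AcceleratorSmokeTest(unittest.TestCase):
    # Correct spelling is skipIf; unittest.skipif does not exist and would
    # fail with AttributeError at class-definition time.
    @unittest.skipIf(not torch.accelerator.is_available(), "No accelerator available!")
    def test_device_count(self):
        self.assertGreaterEqual(torch.accelerator.device_count(), 1)

if __name__ == "__main__":
    unittest.main()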