Enable XPU distributed test for PT2.8 by daisyden · Pull Request #149916 · pytorch/pytorch

Status: Draft. Wants to merge 53 commits into base: main.

Commits (53):
d0d8271 - make skipXPU work (daisyden, May 11, 2024)
c791db9 - enabled torch-xpu ops in op_db (daisyden, May 13, 2024)
f5cbd50 - clean up code (daisyden, May 13, 2024)
4d94417 - Revert "clean up code" (daisyden, May 13, 2024)
6844101 - Revert "enabled torch-xpu ops in op_db" (daisyden, May 13, 2024)
5051e3c - Revert "make skipXPU work" (daisyden, May 13, 2024)
e2aa92a - merge common code update from https://github.com/Chao1Han/pytorch/pul… (chunhuanMeng, Mar 19, 2025)
9e83095 - Merge branch 'main' of https://github.com/daisyden/pytorch into distr… (chunhuanMeng, Mar 20, 2025)
06dd2aa - merge common code update from https://github.com/Chao1Han/pytorch/pul… (daisyden, Mar 20, 2025)
a4a732b - Add XPU support for distributed (daisyden, Mar 20, 2025)
6e3f6b8 - Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch… (daisyden, Mar 20, 2025)
5f47367 - Merge remote-tracking branch 'upstream/main' into distributed_2.8 (daisyden, Mar 21, 2025)
345d7e6 - ported fsdp and _composable/fsdp cases (daisyden, Mar 21, 2025)
4a5a522 - Support XPU device for DDP test cases (PenghuiCheng, Mar 24, 2025)
20a4456 - Support XPU device for pipeline cases (PenghuiCheng, Mar 24, 2025)
a90a603 - ported fsdp tests (daisyden, Mar 24, 2025)
5b1aff7 - Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch… (daisyden, Mar 24, 2025)
44d55b9 - fixed backend mapping error for register_backend function (PenghuiCheng, Mar 26, 2025)
7dade1f - Update distributed UT cases (PenghuiCheng, Apr 1, 2025)
580aaee - remove fsdp_kwargs in test_fsdp_memory.py to align with cuda, added r… (daisyden, Apr 1, 2025)
c0f5713 - Merge branch 'upstream_main4' into distributed_2.8 (daisyden, Apr 1, 2025)
6dedbe3 - Add test_dynamo_distributed cases (PenghuiCheng, Apr 1, 2025)
20d074c - Merge remote-tracking branch 'upstream/distributed_2.8' into distribu… (PenghuiCheng, Apr 1, 2025)
124ff16 - update test_tp_random_state.py (PenghuiCheng, Apr 2, 2025)
0bea112 - Merge from main branch (PenghuiCheng, Apr 7, 2025)
7409ade - support xccl in with_comms (daisyden, Apr 8, 2025)
636cbff - Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch… (daisyden, Apr 8, 2025)
0d5a86b - Merge branch 'upstream_main3' into distributed_2.8 (daisyden, Apr 8, 2025)
3826e30 - Enabled UT in test/distributed/tensor (PenghuiCheng, Apr 9, 2025)
cb711b7 - Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch… (PenghuiCheng, Apr 9, 2025)
d6cd1b3 - refine fsdp2 test case for xpu (daisyden, Apr 9, 2025)
624be3a - Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch… (daisyden, Apr 9, 2025)
8d8c5fe - fix some issues in test case, cuda specific code, world_size 8, etc. (daisyden, Apr 11, 2025)
1cf7887 - merge from main branch (PenghuiCheng, Apr 16, 2025)
41475ac - Merge remote-tracking branch 'upstream/distributed_2.8' into distribu… (PenghuiCheng, Apr 16, 2025)
0628c76 - Change world size in test_device_mesh.py (PenghuiCheng, Apr 18, 2025)
b0d935d - Merge remote-tracking branch 'origin/distributed_2.8' into distribute… (PenghuiCheng, Apr 18, 2025)
58eb87e - Merge remote-tracking branch 'upstream/main' into distributed_2.8 (PenghuiCheng, Apr 22, 2025)
e558eaa - Enabled some UT cases of distributed (PenghuiCheng, Apr 24, 2025)
83ac56e - enable UT case in _shard and _tool folder (PenghuiCheng, Apr 29, 2025)
0e7a7b6 - Fixed hard code error for world_size 8 (PenghuiCheng, May 5, 2025)
a2b2fc6 - merge from main branch (PenghuiCheng, May 7, 2025)
8de00b9 - fix regex (daisyden, May 8, 2025)
91f5d10 - Merge branch 'distributed_2.8' of https://github.com/daisyden/pytorch… (daisyden, May 8, 2025)
39e6c02 - Fixed UT errors for cuda hard code (PenghuiCheng, May 15, 2025)
06d6c3e - Merge remote-tracking branch 'origin/distributed_2.8' into distribute… (PenghuiCheng, May 15, 2025)
a059005 - Merge from upstream main branch (PenghuiCheng, May 16, 2025)
31ddfc0 - Fixed XPU UT error for CUDA hard code (PenghuiCheng, May 21, 2025)
9a6df8a - Merge remote-tracking branch 'upstream0523/main' into distributed_2.8 (daisyden, May 23, 2025)
50cb9e9 - fix fsdp2 issue after rebase, fix #1618 dynamo issue (daisyden, May 26, 2025)
08559b9 - remove duplicated device_type (daisyden, May 27, 2025)
f6a8c6a - fix rebase issue of test_fully_shard_overlap.py (daisyden, May 27, 2025)
dc0befa - merge from main branch (PenghuiCheng, May 30, 2025)

Changes from all commits
(file header not captured)
@@ -20,12 +20,13 @@
get_devtype,
MLP,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU
from torch.testing._internal.distributed._tensor.common_dtensor import (
ModelArgs,
Transformer,
)

device_type = torch.accelerator.current_accelerator().type

device_type = torch.device(get_devtype())
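The two device_type assignments above reflect the device-agnostic pattern this PR applies across the FSDP tests: resolve the active accelerator at runtime instead of hard-coding CUDA. As a minimal standalone sketch (not part of the PR), assuming PyTorch 2.6+ where torch.accelerator is available, with a CPU fallback added only for illustration:

import torch

# Resolve whichever accelerator is present ("cuda", "xpu", "hpu", ...)
# instead of hard-coding "cuda"; fall back to CPU when none is available.
acc = torch.accelerator.current_accelerator()
device_type = acc.type if acc is not None else "cpu"

# Tests can then address the matching backend module generically.
device_module = torch.get_device_module(device_type)
print(device_type, device_module.device_count())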


3 changes: 2 additions & 1 deletion test/distributed/_composable/fsdp/test_fully_shard_comm.py
@@ -45,7 +45,7 @@
patch_reshard,
patch_unshard,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU
from torch.testing._internal.distributed._tensor.common_dtensor import (
ModelArgs,
Transformer,
@@ -221,6 +221,7 @@ def test_reduce_scatter_fp32(self):
reduce_scatter_dtype=torch.float32,
)


@skip_if_lt_x_gpu(1)
def test_reduce_scatter_fp16(self):
param_sizes = self._get_param_sizes()

3 changes: 3 additions & 0 deletions test/distributed/_composable/fsdp/test_fully_shard_compile.py
@@ -129,6 +129,9 @@ class TestFullyShardCompile(FSDPTest):
def skipTestForOldSm(self):
# Assumption: This test class is only run on GPU. See `HAS_GPU` check at
# the top of the class.
# XPU is not applicable in this function
if device_type == 'xpu':
return
device = torch.device(
device_type.type,
self.rank % torch.get_device_module(device_type).device_count(),

(file header not captured)
@@ -22,9 +22,10 @@
get_devtype,
MLP,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU
from torch.testing._internal.two_tensor import TwoTensor

device_type = torch.accelerator.current_accelerator().type

device_type = torch.device(get_devtype())

@@ -259,7 +260,7 @@ class TestFullyShardAllGatherExtensionsMultiThread(
):
@property
def world_size(self) -> int:
return 8
return min(8, torch.accelerator.device_count())

@property
def device(self) -> torch.device:
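Returning min(8, torch.accelerator.device_count()) instead of a fixed 8 is the recurring fix for the hard-coded world_size 8 mentioned in the commit list. A hedged sketch of the same guard as a standalone helper (the helper name is illustrative, not from the PR):

import torch

def effective_world_size(requested: int = 8) -> int:
    # Cap the requested rank count by the number of visible accelerator
    # devices so multi-rank tests do not oversubscribe smaller machines.
    available = torch.accelerator.device_count()
    return min(requested, available) if available > 0 else 1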

(file header not captured)
@@ -26,6 +26,7 @@
)
from torch.testing._internal.common_utils import run_tests

device_type = torch.accelerator.current_accelerator().type

device_type = torch.device(get_devtype())


(file header not captured)
@@ -15,6 +15,7 @@
from torch.testing._internal.common_fsdp import FSDPTest, get_devtype, MLP
from torch.testing._internal.common_utils import run_tests

device_type = torch.accelerator.current_accelerator().type

device_type = torch.device(get_devtype())


(file header not captured)
@@ -33,7 +33,6 @@
)
sys.exit(0)


class C(nn.Module):
def __init__(self, dim: int) -> None:
super().__init__()

2 changes: 1 addition & 1 deletion test/distributed/_composable/fsdp/test_fully_shard_init.py
@@ -45,7 +45,6 @@
TransformerBlock,
)


device_type = torch.device(get_devtype())


@@ -62,6 +61,7 @@ def test_move_states_to_device_tensor(self):
for tensor in itertools.chain(model.parameters(), model.buffers()):
self.assertEqual(tensor.device, torch.device("cpu"))
fully_shard(model)

accelerator_device = torch.device(
device_type.type, torch.get_device_module(device_type).current_device()
)
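The accelerator_device construction shown above is the device-agnostic way these tests pick the current accelerator index. A short sketch of the same resolution in isolation, assuming an accelerator is present (current_accelerator() returns None otherwise):

import torch

device_type = torch.device(torch.accelerator.current_accelerator().type)
accelerator_device = torch.device(
    device_type.type,
    torch.get_device_module(device_type).current_device(),
)
print(accelerator_device)  # e.g. cuda:0 on NVIDIA, xpu:0 on Intel GPUs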

6 changes: 4 additions & 2 deletions test/distributed/_composable/fsdp/test_fully_shard_logging.py
@@ -6,11 +6,13 @@
import torch.distributed as dist
from torch._dynamo.test_case import run_tests
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.inductor_utils import HAS_CUDA
from torch.testing._internal.inductor_utils import HAS_CUDA, HAS_XPU
from torch.testing._internal.logging_utils import LoggingTestCase
import torch

device_type = torch.accelerator.current_accelerator().type

requires_cuda = unittest.skipUnless(HAS_CUDA, "requires cuda")
requires_gpu = unittest.skipUnless(HAS_CUDA or HAS_XPU, "requires cuda or xpu")
requires_distributed = functools.partial(
unittest.skipIf, not dist.is_available(), "requires distributed"
)

7 changes: 4 additions & 3 deletions test/distributed/_composable/fsdp/test_fully_shard_memory.py
@@ -8,7 +8,7 @@
from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, OffloadPolicy
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import FSDPTest, get_devtype
from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU
from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU, TEST_XPU
from torch.testing._internal.distributed._tensor.common_dtensor import (
ModelArgs,
Transformer,
@@ -236,14 +236,15 @@ def test_fully_shard_del_memory(self):

def _get_peak_active_memory_mb(self) -> int:
mem_stats = torch.get_device_module(device_type).memory_stats()
if TEST_CUDA:

if TEST_CUDA or TEST_XPU:
return round(mem_stats["active_bytes.all.peak"] / 1e6)
if TEST_HPU:
return round(mem_stats["MaxInUse"] / 1e6)

def _get_curr_active_memory_mb(self) -> int:
mem_stats = torch.get_device_module(device_type).memory_stats()
if TEST_CUDA:
if TEST_CUDA or TEST_XPU:
return round(mem_stats["active_bytes.all.current"] / 1e6)
if TEST_HPU:
return round(mem_stats["InUse"] / 1e6)
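The branch above treats XPU like CUDA when reading allocator statistics, on the assumption that torch.xpu.memory_stats() exposes the same active_bytes counters, while HPU keeps its own keys. A minimal sketch of that branch as a free function (the function name is illustrative; the stat keys follow the test above):

import torch

def peak_active_memory_mb(device_type: str) -> int:
    # CUDA and XPU share the caching-allocator stat keys; HPU differs.
    stats = torch.get_device_module(device_type).memory_stats()
    if device_type in ("cuda", "xpu"):
        return round(stats["active_bytes.all.peak"] / 1e6)
    return round(stats["MaxInUse"] / 1e6)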

(file header not captured)
@@ -15,6 +15,7 @@
from torch.distributed.tensor import Shard
from torch.testing._internal.common_distributed import (
requires_nccl_version,
requires_nccl_version_or,
SaveForwardInputsModel,
skip_if_lt_x_gpu,
)
@@ -32,6 +33,7 @@

device_type = torch.device(get_devtype())

device_type = torch.accelerator.current_accelerator().type

class TestFullyShardMixedPrecisionTraining(FSDPTest):
@property
@@ -87,7 +89,7 @@ def _get_use_shard_placement_fn_vals_for_bf16_reduce(self):

@skipIfRocm # regressed in ROCm 6.4, but ROCm 6.5 fixes it
@skip_if_lt_x_gpu(2)
@requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
@requires_nccl_version_or((2, 10), "Need NCCL 2.10+ for bf16 collectives", backends=['xccl',])
def test_compute_dtype(self):
use_shard_placement_fn_vals = (
self._get_use_shard_placement_fn_vals_for_bf16_reduce()
@@ -167,7 +169,7 @@ def assert_fn(output: torch.Tensor):

@skipIfRocm # regressed in ROCm 6.4, but ROCm 6.5 fixes it
@skip_if_lt_x_gpu(2)
@requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
@requires_nccl_version_or((2, 10), "Need NCCL 2.10+ for bf16 collectives", backends=['xccl',])
def test_reduce_dtype(self):
self.run_subtests(
{
@@ -500,7 +502,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
)

@skip_if_lt_x_gpu(1)
@requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
@requires_nccl_version_or((2, 10), "Need NCCL 2.10+ for bf16 collectives", backends=['xccl',])
def test_norm_modules_bf16(self):
mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16)
self._test_norm_modules(mp_policy)
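requires_nccl_version_or is imported from common_distributed at the top of this file and is not defined in the hunks shown here, so the sketch below is only a hypothetical illustration of the behavior its call sites imply: keep the NCCL 2.10+ gate on CUDA, but let the test run when one of the listed alternative backends (such as xccl) is usable.

import unittest

import torch
import torch.cuda.nccl
import torch.distributed as dist

def requires_nccl_version_or(version, msg, backends=()):
    # Hypothetical sketch; the real helper lives in
    # torch/testing/_internal/common_distributed.py.
    def decorator(func):
        if "xccl" in backends and torch.xpu.is_available():
            return func  # NCCL version is irrelevant on the XCCL path
        if not dist.is_nccl_available():
            return unittest.skip("NCCL not available")(func)
        if tuple(torch.cuda.nccl.version()) < tuple(version):
            return unittest.skip(f"Requires NCCL {version}+: {msg}")(func)
        return func
    return decorator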

(file header not captured)
@@ -17,7 +17,8 @@
patch_all_gather,
patch_reduce_scatter,
)
from torch.testing._internal.common_utils import get_cycles_per_ms, run_tests, TEST_HPU

from torch.testing._internal.common_utils import get_cycles_per_ms, run_tests, TEST_HPU, TEST_XPU


device_type = torch.device(get_devtype())
@@ -45,6 +46,7 @@ def world_size(self) -> int:

@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep is not supported on HPU")
@unittest.skipIf(TEST_XPU, "Sleep is not supported on XPU")
def test_fully_shard_training_overlap(self):
torch.manual_seed(42)

@@ -66,6 +68,7 @@ def test_fully_shard_training_overlap(self):
def delay_collective():
# Share a stream so that all-gather and reduce-scatter block each
# other like in `ProcessGroupNCCL`

comm_stream.wait_stream(
torch.get_device_module(device_type).current_stream()
)
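The comm_stream.wait_stream call above is what forces the patched all-gather and reduce-scatter to serialize the way ProcessGroupNCCL would. A small sketch of the same trick in isolation, assuming a backend with stream support (CUDA or XPU); the names here are illustrative:

import torch

device_type = torch.accelerator.current_accelerator().type
mod = torch.get_device_module(device_type)
comm_stream = mod.Stream()

def delay_collective(launch_comm):
    # Wait for outstanding work on the default stream, then run the
    # communication work on the shared side stream so collectives serialize.
    comm_stream.wait_stream(mod.current_stream())
    with mod.stream(comm_stream):
        launch_comm()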
@@ -158,6 +161,7 @@ def fwd_bwd():

@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep is not supported on HPU")
@unittest.skipIf(TEST_XPU, "Sleep is not supported on XPU")
def test_fully_shard_post_optim_event_overlap(self):
torch.manual_seed(42)
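The decorator stack above is the pattern this PR uses for tests that depend on a device-side sleep kernel: skip per test on HPU and XPU rather than gating the whole file. A self-contained sketch of the same gating, assuming the internal TEST_HPU/TEST_XPU flags (the class and test names are illustrative):

import unittest

from torch.testing._internal.common_utils import run_tests, TEST_HPU, TEST_XPU

class SleepGateExample(unittest.TestCase):
    @unittest.skipIf(TEST_HPU, "Sleep is not supported on HPU")
    @unittest.skipIf(TEST_XPU, "Sleep is not supported on XPU")
    def test_needs_device_sleep(self):
        # Real tests launch a device sleep kernel here to create overlap.
        self.assertTrue(True)

if __name__ == "__main__":
    run_tests()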


(file header not captured)
@@ -29,6 +29,7 @@
TransformerBlock,
)

device_type = torch.accelerator.current_accelerator().type

device_type = torch.device(get_devtype())


(file header not captured)
@@ -41,6 +41,7 @@
from torch.testing._internal.common_utils import (
get_cycles_per_ms,
run_tests,
TEST_XPU,
TEST_HPU,
wrapSwapTensorsTest,
)
@@ -50,7 +51,6 @@
TransformerBlock,
)


c10d_ops = torch.ops.c10d
funcol = torch.ops.c10d_functional

@@ -315,6 +315,7 @@ def _shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:

@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep kernel not supported for HPU")
@unittest.skipIf(TEST_XPU, "sleep kernel not supported on XPU")
@compiled_fsdp_test(compile_compute_on_module=Transformer)
def test_train_parity_multi_group(self):
"""
@@ -338,6 +339,7 @@ def test_train_parity_multi_group(self):

@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "sleep kernel not supported on HPU")
@unittest.skipIf(TEST_XPU, "sleep kernel not supported on XPU")
def test_train_parity_multi_group_cpu_offload_eager(self):
"""
Tests train parity against DDP when using multiple parameter groups for
@@ -362,6 +364,7 @@ def test_train_parity_multi_group_cpu_offload_eager(self):

@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "sleep kernel not supported on HPU")
@unittest.skipIf(TEST_XPU, "sleep kernel not supported on XPU")
@compiled_fsdp_test(compile_compute_on_module=Transformer)
def test_train_parity_multi_group_unshard_async_op(self):
"""
@@ -616,6 +619,7 @@ def test_explicit_prefetching(self):

@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep is not supported on HPU")
@unittest.skipIf(TEST_XPU, "Sleep is not supported on XPU")
def test_post_optim_event(self):
torch.manual_seed(42)
model_args = ModelArgs(dropout_p=0.0)