[symm_mem] Fix nccl test for symm mem (#156752) · pytorch/pytorch@4585c33
Commit 4585c33

fduwjj authored and pytorchmergebot committed

[symm_mem] Fix nccl test for symm mem (#156752)

Try not to call set_device.

Fixes #156569
Pull Request resolved: #156752
Approved by: https://github.com/kwen2501

1 parent 7521cd9 commit 4585c33

File tree

4 files changed: +32, -38 lines


.ci/pytorch/test.sh

Lines changed: 2 additions & 1 deletion

@@ -329,7 +329,8 @@ test_h100_distributed() {
   time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
   # symmetric memory test
   time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-  time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time TORCH_SYMMMEM=NCCL python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
   assert_git_not_dirty
 }

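For context on the two new lines above: the CI step selects a symmetric-memory backend purely through the TORCH_SYMMMEM environment variable before invoking run_test.py. A minimal sketch, assuming one wanted an equivalent guard inside a test file itself (the helper and test class below are illustrative, not part of this PR):

# Illustrative only: skip a test unless the intended symmetric-memory backend was
# requested via TORCH_SYMMMEM, mirroring the env-var selection used in test.sh above.
import os
import unittest


def requires_symm_mem_backend(expected):  # hypothetical helper, not in the PR
    selected = os.environ.get("TORCH_SYMMMEM", "")
    return unittest.skipUnless(
        selected == expected,
        f"requires TORCH_SYMMMEM={expected}, got {selected!r}",
    )


class NCCLBackendOnlyTest(unittest.TestCase):  # hypothetical test class
    @requires_symm_mem_backend("NCCL")
    def test_runs_only_under_nccl_backend(self):
        self.assertEqual(os.environ["TORCH_SYMMMEM"], "NCCL")


if __name__ == "__main__":
    unittest.main()
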
test/distributed/test_nccl.py

Lines changed: 4 additions & 10 deletions

@@ -18,6 +18,7 @@
     IS_WINDOWS,
     load_tests,
     NoTest,
+    requires_cuda_p2p_access,
     run_tests,
     skip_but_pass_in_sandcastle_if,
     TEST_WITH_ROCM,
@@ -241,24 +242,17 @@ def test_reduce_scatter(self, device, dtype):
             self.assertEqual(outputs[i], expected[i])
 
 
-device_type = "cuda"
-device_module = torch.get_device_module(device_type)
-
-
+@requires_cuda_p2p_access()
 class NCCLSymmetricMemoryTest(MultiProcContinousTest):
-    def _init_device(self) -> None:
-        # TODO: relieve this (seems to hang if without)
-        device_module.set_device(self.device)
-
     @property
     def device(self) -> torch.device:
-        return torch.device(device_type, self.rank)
+        return torch.device("cuda", self.rank)
 
     # To run this test, one needs to TORCH_SYMMMEM=NCCL when running the test.
     @skip_but_pass_in_sandcastle_if(TEST_WITH_ROCM, "Skip NCCL tests for ROCm")
     @skip_but_pass_in_sandcastle_if(IS_WINDOWS, "NCCL doesn't support Windows")
     def test_nccl_symmem_alloc(self):
-        self._init_device()
+        torch.cuda.set_device(self.rank)
         c10d.all_reduce(torch.ones(1, device=self.device))
         group_name = c10d.group.WORLD.group_name
         symm_mem.enable_symm_mem_for_group(group_name)

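The change above drops the module-level device_module indirection and the _init_device helper in favor of a direct torch.cuda.set_device(self.rank) call inside the test body. A minimal standalone sketch of that per-rank setup order, assuming a torchrun-style launcher provides RANK and WORLD_SIZE and that symm_mem aliases torch.distributed._symmetric_memory (this script is illustrative, not the MultiProcContinousTest harness):

# Illustrative sketch: bind each rank to its CUDA device before any NCCL collective,
# then enable symmetric memory for the default group, mirroring test_nccl_symmem_alloc.
import os

import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem  # assumed symm_mem alias


def main():
    rank = int(os.environ["RANK"])              # set by the launcher (assumption)
    world_size = int(os.environ["WORLD_SIZE"])
    torch.cuda.set_device(rank)                 # replaces the old _init_device helper
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    # Warm-up collective so the NCCL communicator is created on the bound device.
    dist.all_reduce(torch.ones(1, device=torch.device("cuda", rank)))

    # Enable symmetric memory for the default group, as the test does.
    symm_mem.enable_symm_mem_for_group(dist.group.WORLD.group_name)

    dist.destroy_process_group()


if __name__ == "__main__":
    main()  # e.g. TORCH_SYMMMEM=NCCL torchrun --nproc-per-node=2 <this file>
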
test/distributed/test_symmetric_memory.py

Lines changed: 1 addition & 22 deletions

@@ -34,9 +34,9 @@
     MI300_ARCH,
     parametrize,
     requires_cuda,
+    requires_cuda_p2p_access,
     run_tests,
     runOnRocmArch,
-    skip_but_pass_in_sandcastle_if,
     skipIfRocm,
     TEST_WITH_ROCM,
     TestCase,
@@ -50,27 +50,6 @@
 device_module = torch.get_device_module(device_type)
 
 
-def requires_cuda_p2p_access():
-    cuda_p2p_access_available = (
-        torch.cuda.is_available()
-        and torch.cuda.get_device_capability() >= (8, 0)
-        and torch.cuda.device_count() >= 2
-    )
-    num_devices = torch.cuda.device_count()
-    for i in range(num_devices - 1):
-        for j in range(i + 1, num_devices):
-            if not torch.cuda.can_device_access_peer(i, j):
-                cuda_p2p_access_available = False
-                break
-        if not cuda_p2p_access_available:
-            break
-
-    return skip_but_pass_in_sandcastle_if(
-        not cuda_p2p_access_available,
-        "cuda p2p access is not available",
-    )
-
-
 @instantiate_parametrized_tests
 @requires_cuda_p2p_access()
 class SymmetricMemoryTest(MultiProcContinousTest):

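With the local helper removed here, requires_cuda_p2p_access is imported from torch.testing._internal.common_utils (see the final file below), so other distributed tests can share it. A minimal sketch of reusing the shared decorator (the test class is hypothetical):

# Illustrative only: reuse the decorator that now lives in common_utils instead of
# redefining it per test file, as test_symmetric_memory.py did before this change.
import torch
from torch.testing._internal.common_utils import (
    requires_cuda_p2p_access,
    run_tests,
    TestCase,
)


@requires_cuda_p2p_access()  # skips (passes in Sandcastle) without >=2 P2P-capable SM80+ GPUs
class PeerAccessSmokeTest(TestCase):  # hypothetical class, not part of the PR
    def test_peer_access_prerequisites(self):
        # Only runs when the decorator's checks passed at decoration time.
        self.assertTrue(torch.cuda.is_available())
        self.assertGreaterEqual(torch.cuda.device_count(), 2)


if __name__ == "__main__":
    run_tests()
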
torch/testing/_internal/common_utils.py

Lines changed: 25 additions & 5 deletions

@@ -2037,6 +2037,26 @@ def wrapper(*args, **kwargs):
         return dec_fn(func)
     return dec_fn
 
+def requires_cuda_p2p_access():
+    cuda_p2p_access_available = (
+        torch.cuda.is_available()
+        and torch.cuda.get_device_capability() >= (8, 0)
+        and torch.cuda.device_count() >= 2
+    )
+    num_devices = torch.cuda.device_count()
+    for i in range(num_devices - 1):
+        for j in range(i + 1, num_devices):
+            if not torch.cuda.can_device_access_peer(i, j):
+                cuda_p2p_access_available = False
+                break
+        if not cuda_p2p_access_available:
+            break
+
+    return skip_but_pass_in_sandcastle_if(
+        not cuda_p2p_access_available,
+        "cuda p2p access is not available",
+    )
+
 # Reverts the linalg backend back to default to make sure potential failures in one
 # test do not affect other tests
 def setLinalgBackendsToDefaultFinally(fn):
@@ -2551,18 +2571,18 @@ def __exit__(self, exc_type, exc_value, traceback):
                 msg = ("CUDA caching allocator reports a memory leak not "  # type: ignore[possibly-undefined]
                        f"verified by the driver API in {self.name}! "
                        f"Caching allocator allocated memory was {self.caching_allocator_befores[i]} "
-                       f"and is now reported as {caching_allocator_mem_allocated} "
+                       f"and is now reported as {caching_allocator_mem_allocated} "  # type: ignore[possibly-undefined]
                        f"on device {i}. "
-                       f"CUDA driver allocated memory was {self.driver_befores[i]} and is now {driver_mem_allocated}.")
+                       f"CUDA driver allocated memory was {self.driver_befores[i]} and is now {driver_mem_allocated}.")  # type: ignore[possibly-undefined]
                 warnings.warn(msg)
-            elif caching_allocator_discrepancy and driver_discrepancy:
+            elif caching_allocator_discrepancy and driver_discrepancy:  # type: ignore[possibly-undefined]
                 # A caching allocator discrepancy validated by the driver API is a
                 # failure (except on ROCm, see below)
                 msg = (f"CUDA driver API confirmed a leak in {self.name}! "  # type: ignore[possibly-undefined]
                        f"Caching allocator allocated memory was {self.caching_allocator_befores[i]} "
-                       f"and is now reported as {caching_allocator_mem_allocated} "
+                       f"and is now reported as {caching_allocator_mem_allocated} "  # type: ignore[possibly-undefined]
                        f"on device {i}. "
-                       f"CUDA driver allocated memory was {self.driver_befores[i]} and is now {driver_mem_allocated}.")
+                       f"CUDA driver allocated memory was {self.driver_befores[i]} and is now {driver_mem_allocated}.")  # type: ignore[possibly-undefined]
 
                 raise RuntimeError(msg)
 

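The second common_utils.py hunk only extends the existing # type: ignore[possibly-undefined] suppressions to the remaining lines that reference conditionally-bound names. A minimal illustration of why a checker flags each referencing line separately, assuming mypy with the possibly-undefined error code enabled (the names below are made up):

# Illustrative only: value is bound inside a conditional branch, so a checker run
# with the possibly-undefined error code enabled flags every physical line that
# reads it, which is why each continuation line in the diff needs its own ignore.
import random


def report() -> str:
    if random.random() < 0.5:
        value = 42
    # Calling this may also raise NameError at runtime, which is exactly what the
    # static warning is about.
    return (
        "prefix "
        f"uses {value} once "   # flagged on this line
        f"and {value} again"    # flagged separately on this line
    )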