Fix #150779 · pytorch/pytorch@178ff0e · GitHub
Commit 178ff0e

Fix #150779

ghstack-source-id: a7a0ede
Pull Request resolved: #151315

1 parent 032ef48 · commit 178ff0e
File tree: 4 files changed, +40 −19 lines

test/distributed/test_functional_api.py
22 additions & 10 deletions

@@ -3,13 +3,15 @@
 import sys
 import unittest
 from functools import partial, wraps
+from unittest.mock import patch

 import torch
 import torch.distributed as dist
 import torch.distributed._functional_collectives as ft_c
 import torch.distributed.distributed_c10d as c10d
 import torch.distributed.tensor as dt
 from functorch import make_fx
+from torch._dynamo.metrics_context import MetricsContext
 from torch._inductor.utils import run_and_get_code
 from torch.testing import FileCheck
 from torch.testing._internal.common_device_type import instantiate_device_type_tests

@@ -31,7 +33,6 @@
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
-    skipIfHpu,
     TEST_CUDA,
     TEST_HPU,
     TestCase,

@@ -90,7 +91,7 @@ def new_subgroups(group_size: int, pg_tag=None):
     return cur_subgroup, subgroups


-@skipIfHpu
+@unittest.skipIf(TEST_HPU, "Unsupported on HPU")
 class TestExpand(MultiThreadedTestCase):
     @property
     def world_size(self):

@@ -180,7 +181,7 @@ def test_expand_device_mesh_tuple(self):
         self.assertEqual(2, group_size)


-@skipIfHpu
+@unittest.skipIf(TEST_HPU, "Unsupported on HPU")
 class TestPgTag(MultiThreadedTestCase):
     @property
     def world_size(self):

@@ -257,7 +258,7 @@ def test_find_root_pg(self):


 @instantiate_parametrized_tests
-@skipIfHpu
+@unittest.skipIf(TEST_HPU, "Unsupported on HPU")
 class TestTraceableCollectives(MultiThreadedTestCase):
     @property
     def world_size(self):

@@ -403,7 +404,7 @@ def test_all_reduce(self):
         self.assertEqual(x.size(), out.size())


-@skipIfHpu
+@unittest.skipIf(TEST_HPU, "Unsupported on HPU")
 class TestGradCollectives(MultiThreadedTestCase):
     @property
     def world_size(self):

@@ -656,7 +657,7 @@ def test_permute_tensor_with_sub_group(self, device):


 @instantiate_parametrized_tests
-@skipIfHpu
+@unittest.skipIf(TEST_HPU, "Unsupported on HPU")
 class TestFunctionalAutograd(MultiThreadedTestCase):
     def setUp(self):
         super().setUp()

@@ -666,6 +667,13 @@ def setUp(self):
     def world_size(self):
         return 2

+    # `compilation_metric` attempts to update the `is_forward` field of `metrics_context`. Since
+    # `metrics_context` is a singleton, a runtime error will occur if multiple threads try to
+    # update it, because `MetricsContext` does not allow updating existing fields when
+    # `overwrite` is False. So we need to patch the `update` method of `MetricsContext`.
+    def _metrics_context_update(self, *args, **kwargs) -> None:
+        pass
+
     @parametrize("compile", [True, False])
     def test_all_to_all_single(self, compile: bool = True) -> None:
         group = dist.group.WORLD.group_name

@@ -691,7 +699,8 @@ def my_func(t: torch.Tensor, world_size: int) -> torch.Tensor:
         self.assertIsNotNone(out.grad_fn)
         self.assertTrue(out.requires_grad)
         loss = out.sum()
-        loss.backward()
+        with patch.object(MetricsContext, "update", self._metrics_context_update):
+            loss.backward()
         self.assertEqual(t.grad, torch.full_like(t, 2.0))

     def test_all_to_all_single_inductor(self) -> None:

@@ -711,7 +720,8 @@ def my_func(t: torch.Tensor, world_size: int) -> torch.Tensor:

         def run_with_backward():
             out = compiled(t, self.world_size)
-            out.backward()
+            with patch.object(MetricsContext, "update", self._metrics_context_update):
+                out.backward()

         _, codes = run_and_get_code(run_with_backward)
         for code in codes:

@@ -751,7 +761,8 @@ def my_func(t: torch.Tensor, dim: int) -> torch.Tensor:
         gathered_tensor = compiled(local_tensor, dim)
         self.assertEqual(gathered_tensor, torch.ones(output_size))

-        gathered_tensor.sum().backward()
+        with patch.object(MetricsContext, "update", self._metrics_context_update):
+            gathered_tensor.sum().backward()
         self.assertEqual(
             local_tensor.grad,
             torch.full((3, 3, 3), fill_value=float(self.world_size)),

@@ -786,7 +797,8 @@ def my_func(t: torch.Tensor, dim: int) -> torch.Tensor:
         rs_tensor = compiled(input_tensor, dim)
         res_num = 1 * group_size
         self.assertEqual(rs_tensor, torch.ones(input_size) * res_num)
-        rs_tensor.sum().backward()
+        with patch.object(MetricsContext, "update", self._metrics_context_update):
+            rs_tensor.sum().backward()
         self.assertEqual(input_tensor.grad, torch.full(output_size, fill_value=1.0))
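The `_metrics_context_update` stub above, together with `patch.object`, is what makes the multi-threaded backward passes safe: every thread that would otherwise write `is_forward` into the singleton `MetricsContext` hits the no-op instead. A minimal standalone sketch of the same pattern, assuming only `unittest.mock` and the `MetricsContext` import shown in the diff (the `run_backward` wrapper is illustrative, not part of the change):

from unittest.mock import patch

import torch
from torch._dynamo.metrics_context import MetricsContext


def _noop_update(self, *args, **kwargs) -> None:
    # Drop all metric updates so concurrent threads cannot trip the
    # no-overwrite check in the singleton MetricsContext.
    pass


def run_backward(loss: torch.Tensor) -> None:
    # Patch only for the duration of backward(); normal metrics
    # recording resumes once the with-block exits.
    with patch.object(MetricsContext, "update", _noop_update):
        loss.backward()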
test/inductor/test_autoheuristic.py
13 additions & 5 deletions

@@ -4,17 +4,22 @@

 import torch
 import torch._inductor.config as inductor_config
-from torch._dynamo.device_interface import get_interface_for_device
 from torch._inductor.autoheuristic.autoheuristic import AutoHeuristic, LocalFeedback
 from torch._inductor.autoheuristic.autoheuristic_utils import AHContext
 from torch._inductor.runtime.runtime_utils import cache_dir
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import get_gpu_shared_memory
-from torch.testing._internal.common_utils import skipIfXpu
-from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_A100, IS_H100
+from torch.testing._internal.common_utils import TEST_XPU
+from torch.testing._internal.inductor_utils import (
+    GPU_TYPE,
+    HAS_CUDA,
+    HAS_GPU,
+    IS_A100,
+    IS_H100,
+)


-@skipIfXpu(msg="AutoHeuristic doesn't currently work on the XPU stack")
+@unittest.skipIf(TEST_XPU, "AutoHeuristic doesn't currently work on the XPU stack")
 class AutoHeuristicTest(TestCase):
     def count_lines_in_file(self, file_path):
         with open(file_path) as file:

@@ -102,7 +107,9 @@ def feedback_fn(choice):
         self.assertEqual(num_lines, 5)

         shared_memory = get_gpu_shared_memory()
-        (fst, snd) = get_interface_for_device(GPU_TYPE).get_device_capability()
+
+        self.assertTrue(HAS_CUDA)
+        (fst, snd) = torch.cuda.get_device_capability()

         with open(path) as file:
             lines = file.readlines()

@@ -151,6 +158,7 @@ def fn(a, b):
         fx_graph_cache=False,
         fx_graph_remote_cache=False,
     )
+    @unittest.skipIf(not IS_A100, "heuristic only run on A100")
     def test_global_feedback(self):
         self.run_mixed_mm()
         path = self.get_path_to_autoheuristic_log("mixed_mm")
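For context, the capability query the diff switches to is the plain CUDA runtime call: `torch.cuda.get_device_capability()` returns the device's `(major, minor)` compute capability, which the A100/H100 checks key on. A hedged sketch of the guarded query, where the helper name `cuda_capability_or_skip` is invented for illustration:

import unittest

import torch


def cuda_capability_or_skip(test_case: unittest.TestCase) -> tuple[int, int]:
    # Mirror the diff's intent: require CUDA before asking the CUDA
    # runtime for the device's (major, minor) compute capability.
    if not torch.cuda.is_available():
        test_case.skipTest("CUDA required")
    return torch.cuda.get_device_capability()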
test/inductor/test_b2b_gemm.py
2 additions & 2 deletions

@@ -6,11 +6,11 @@
 from torch._inductor.runtime.benchmarking import benchmarker
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import run_and_get_code
-from torch.testing._internal.common_utils import skipIfXpu
+from torch.testing._internal.common_utils import TEST_XPU
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU


-@skipIfXpu(msg="Segmentation fault on CI machine")
+@unittest.skipIf(TEST_XPU, "Segmentation fault on CI machine")
 class B2BGEMMTest(TestCase):
     device = GPU_TYPE
test/inductor/test_layout_optim.py
3 additions & 2 deletions

@@ -2,14 +2,15 @@
 import copy
 import os
 import random
+import unittest

 import torch
 from torch import nn
 from torch._dynamo.utils import same
 from torch._inductor import config
 from torch._inductor.test_case import run_tests, TestCase
 from torch.testing._internal.common_cuda import tf32_off
-from torch.testing._internal.common_utils import skipIfXpu
+from torch.testing._internal.common_utils import TEST_XPU
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU

@@ -34,7 +35,7 @@ def get_example_inputs(self):
         return (torch.rand(2, 3, 16, 16),)


-@skipIfXpu(msg="ccl doesn't currently work on the XPU stack")
+@unittest.skipIf(TEST_XPU, "ccl doesn't currently work on the XPU stack")
 class TestLayoutOptim(TestCase):
     @classmethod
     def setUpClass(cls):
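All four files make the same substitution: the internal `skipIfHpu`/`skipIfXpu` helpers give way to the stdlib `unittest.skipIf` applied at class level, keyed on the `TEST_HPU`/`TEST_XPU` flags. A self-contained sketch of that pattern, with the flag stubbed by a plain boolean (in the real tests it comes from `torch.testing._internal.common_utils`):

import unittest

TEST_XPU = False  # stand-in for torch.testing._internal.common_utils.TEST_XPU


@unittest.skipIf(TEST_XPU, "feature doesn't currently work on the XPU stack")
class ExampleTest(unittest.TestCase):
    def test_something(self) -> None:
        # When TEST_XPU is True the whole class is reported as skipped
        # with the reason above; otherwise the test runs normally.
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()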