[Break XPU][Inductor] Generalize device-bias code and fix test_graph_partition for XPU · pytorch/pytorch@762724f

Commit 762724f

etaf authored and pytorchmergebot committed
[Break XPU][Inductor] Generalize device-bias code and fix test_graph_partition for XPU (#148178)
This PR generalizes the device-bias code introduced by #147038 and aligns the behavior between XPU and CUDA on the add + mm + pointwise pattern (for XPU, from addmm + pointwise to mm + fused_add_pointwise), which fixes the failing test case `test_graph_partition` on XPU.

Pull Request resolved: #148178
Approved by: https://github.com/benjaminglass1, https://github.com/jansel, https://github.com/EikanWang
ghstack dependencies: #148155
1 parent ab78bf5 · commit 762724f
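A minimal sketch of the add + mm + pointwise pattern the message describes, with hypothetical shapes and names (an illustration, not code from the PR): Inductor's post-grad pass can rewrite addmm(inp, a, b) into mm(a, b) plus a trailing add when the matmul result is only consumed by pointwise ops, so the add fuses into the pointwise kernel.

import torch

def f(inp, a, b):
    # Matched as addmm(inp, a, b) = inp + a @ b ...
    y = torch.addmm(inp, a, b)
    # ... followed by a pointwise consumer. When unfusing is preferred,
    # Inductor emits mm(a, b) and fuses `+ inp` with relu into one kernel.
    return torch.relu(y)

device = "cuda"  # or "xpu"; this commit aligns both backends on the pattern
inp = torch.randn(20, device=device)      # bias, broadcast over rows
a = torch.randn(10, 15, device=device)
b = torch.randn(15, 20, device=device)
out = torch.compile(f)(inp, a, b)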

File tree

test/inductor/test_kernel_benchmark.py
test/inductor/test_pattern_matcher.py
test/inductor/test_torchinductor.py
torch/_inductor/fx_passes/post_grad.py

4 files changed: +10 −17 lines changed

test/inductor/test_kernel_benchmark.py (−6 lines)

@@ -362,12 +362,6 @@ def f(a, b, c):
         # num_gb = (1000 * 1000 + 2 * 1000 * 1000 + 1000 * 1000) * 2/ 1e9
         #        = 0.008
         num_gb = "0.008"
-        if GPU_TYPE == "xpu":
-            # In XPU backend, mm + add + add will be fused as admm + add
-            # And CUDA prefer not fuse add + mm, please check in function
-            # `should_prefer_unfused_addmm` in torch/_inductor/fx_passes/post_grad.py
-            num_gb = "0.006"
-
         self.check_bandwidth(compiled_module, num_gb)

     def test_mm_slice_add_bandwidth_computation_2(self):
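As a sanity check on the `num_gb` comment in the hunk above: the counted traffic is 1,000,000 + 2,000,000 + 1,000,000 = 4,000,000 element accesses, and the trailing `* 2` is presumably 2 bytes per element (e.g. fp16), giving 8,000,000 bytes = 0.008 GB:

elems = 1000 * 1000 + 2 * 1000 * 1000 + 1000 * 1000  # 4,000,000 element accesses
num_gb = elems * 2 / 1e9                             # 2 bytes/element (assumed) -> 0.008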

test/inductor/test_pattern_matcher.py (−1 line)

@@ -1127,7 +1127,6 @@ def fn(a, b):
         self.assertIn("return (buf0, )", code[0])
         self.assertNotIn("async_compile.cpp", code[0])

-    @expectedFailureXPU
     def test_unfuse_bias_addmm(self):
         args = [
             torch.randn(20, device=GPU_TYPE),

test/inductor/test_torchinductor.py (+8 −8 lines)

@@ -14024,7 +14024,7 @@ def fn(x):
             return x.sin()

         fn_c = torch.compile(fn)
-        x = torch.rand(16, device="cuda")
+        x = torch.rand(16, device=GPU_TYPE)

         _, code = run_and_get_code(fn_c, x)

@@ -14039,7 +14039,7 @@ def f(x, y):
             y1 = y + 1
             y_cpu = y1.cpu() + 1
             z = x @ y
-            return x1 + y1 + z + y_cpu.cuda()
+            return x1 + y1 + z + y_cpu.to(GPU_TYPE)

         x, y = [torch.ones(2, 2, device=self.device) for _ in range(2)]
         x_cloned, y_cloned = [tmp.clone() for tmp in [x, y]]

@@ -14065,7 +14065,7 @@ def f(x, y):
             y1 = y + 1
             y_cpu = y1.cpu() + 1
             z = x @ y
-            return x1 + y1 + z + y_cpu.cuda()
+            return x1 + y1 + z + y_cpu.to(GPU_TYPE)

         def g(x):
             return x + 1

@@ -14106,7 +14106,7 @@ def f(x, y):
             y1 = y + 1
             y_cpu = y1.cpu() + 1
             z = x @ y
-            return x1 + y1 + z + y_cpu.cuda()
+            return x1 + y1 + z + y_cpu.to(GPU_TYPE)

         f_compiled = torch.compile(f)
         x, y = torch.ones(3, 3, device=self.device), torch.randn(

@@ -14128,7 +14128,7 @@ def f(x, y):
             y1 = y + 1
             y_cpu = y1.cpu() + 1
             z = x @ y
-            return x1 + y1 + z + y_cpu.cuda()
+            return x1 + y1 + z + y_cpu.to(GPU_TYPE)

         f_compiled = torch.compile(f)
         x, y = torch.ones(3, 3, device=self.device), torch.randn(

@@ -14149,11 +14149,11 @@ def f(x, y):
             y1 = y + 1
             y_cpu = y1.cpu() + 1
             z = x1 + y1 + x @ y
-            u = (y_cpu.cuda() + 2) @ y + 3
+            u = (y_cpu.to(GPU_TYPE) + 2) @ y + 3
             u_cpu = u.cpu() + 2
-            return z + u_cpu.cuda()
+            return z + u_cpu.to(GPU_TYPE)

-        x, y = [torch.ones(2, 2, device="cuda") for _ in range(2)]
+        x, y = [torch.ones(2, 2, device=GPU_TYPE) for _ in range(2)]
         x_cloned, y_cloned = [tmp.clone() for tmp in [x, y]]
         eager_out = f(x, y)
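The mechanical change in all of these hunks: `Tensor.cuda()` hard-codes CUDA, while `Tensor.to(device)` accepts any device string, so parameterizing on `GPU_TYPE` (the test suite's device-string constant, "cuda" or "xpu") lets the same test exercise either backend. A minimal sketch, using a simplified stand-in for the test helper (an assumption, not the suite's actual definition):

import torch

# Simplified stand-in for the test suite's GPU_TYPE constant (assumption).
GPU_TYPE = "xpu" if getattr(torch, "xpu", None) and torch.xpu.is_available() else "cuda"

t = torch.ones(2, 2)
t_gpu = t.to(GPU_TYPE)  # backend-agnostic; t.cuda() would hard-code CUDA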

torch/_inductor/fx_passes/post_grad.py (+2 −2 lines)

@@ -44,7 +44,7 @@
     register_graph_pattern,
     stable_topological_sort,
 )
-from ..utils import decode_device, get_gpu_type, is_pointwise_use
+from ..utils import decode_device, get_gpu_type, is_gpu, is_pointwise_use
 from ..virtualized import V
 from .b2b_gemm import B2B_GEMM_PASS
 from .ddp_fusion import fuse_ddp_communication

@@ -888,7 +888,7 @@ def view_to_reshape(gm):


 def should_prefer_unfused_addmm(match):
     inp = match.kwargs["inp"]
-    if not inp.meta["val"].is_cuda:
+    if not is_gpu(inp.meta["val"].device.type):
         return False

     output = match.output_node()