softmax: add device check for xpu with half_to_float by weishi-deng · Pull Request #150278 · pytorch/pytorch
Open

weishi-deng wants to merge 51 commits into pytorch:main from weishi-deng:xpu-softmax

Changes from all commits (51 commits)
58f8b21  softmax: add device check for xpu with half_to_float (weishi-deng, Mar 31, 2025)
54bd692  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
a7435e5  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
b3b8a32  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
9f9854c  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
88c89c4  Merge branch 'main' into xpu-softmax (weishi-deng, Apr 21, 2025)
745f7e6  softmax: add device check for xpu with half_to_float (weishi-deng, Mar 31, 2025)
269cac2  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
8afc462  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
1011f94  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
c78ce25  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
508df2c  Merge branch 'xpu-softmax' of https://github.com/weishi-deng/pytorch … (weishi-deng, May 7, 2025)
16479ba  add ut (weishi-deng, May 7, 2025)
8f74e86  Merge branch 'pytorch:main' into xpu-softmax (weishi-deng, May 9, 2025)
df16602  Update test/xpu/test_softmax.py (weishi-deng, May 9, 2025)
76f7464  update unit test (weishi-deng, May 9, 2025)
0e0b150  update ut (weishi-deng, May 9, 2025)
e001a8b  Update test/test_xpu.py (guangyey, May 9, 2025)
321fdba  Update test/test_xpu.py (guangyey, May 9, 2025)
5e844f2  Update test/test_xpu.py (guangyey, May 9, 2025)
2c1136d  Update test/test_xpu.py (guangyey, May 9, 2025)
698e641  Update test/test_xpu.py (guangyey, May 9, 2025)
0f19bb4  Update test/test_xpu.py (guangyey, May 9, 2025)
a8f156a  Update test_xpu.py (guangyey, May 9, 2025)
71b8d22  Update test/test_xpu.py (guangyey, May 9, 2025)
6ac21b2  Update test_xpu.py (guangyey, May 11, 2025)
653b161  Merge branch 'pytorch:main' into xpu-softmax (weishi-deng, May 12, 2025)
883fc0d  softmax: add device check for xpu with half_to_float (weishi-deng, Mar 31, 2025)
4d28339  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
3895903  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
ff7ea33  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
ded3cee  Update aten/src/ATen/native/SoftMax.cpp (weishi-deng, Apr 8, 2025)
af88dd9  add ut (weishi-deng, May 7, 2025)
53c8b95  Update test/xpu/test_softmax.py (weishi-deng, May 9, 2025)
a9b352b  update unit test (weishi-deng, May 9, 2025)
0cf9205  update ut (weishi-deng, May 9, 2025)
820b210  Update test/test_xpu.py (guangyey, May 9, 2025)
e208d9d  Update test/test_xpu.py (guangyey, May 9, 2025)
164a3b0  Update test/test_xpu.py (guangyey, May 9, 2025)
0f17fbd  Update test/test_xpu.py (guangyey, May 9, 2025)
0b40c23  Update test/test_xpu.py (guangyey, May 9, 2025)
dc6f6ca  Update test/test_xpu.py (guangyey, May 9, 2025)
7c322c6  Update test_xpu.py (guangyey, May 9, 2025)
5ba0f46  Update test/test_xpu.py (guangyey, May 9, 2025)
3bae7b7  Update test_xpu.py (guangyey, May 11, 2025)
400db32  Merge branch 'xpu-softmax' of https://github.com/weishi-deng/pytorch … (weishi-deng, May 12, 2025)
7d1b23d  update ut (weishi-deng, May 14, 2025)
e83021d  Merge branch 'pytorch:main' into xpu-softmax (weishi-deng, May 14, 2025)
6bc08b1  Merge branch 'pytorch:main' into xpu-softmax (weishi-deng, May 15, 2025)
0e5f1a3  Merge branch 'pytorch:main' into xpu-softmax (weishi-deng, May 19, 2025)
0de4614  Merge branch 'pytorch:main' into xpu-softmax (weishi-deng, Jun 26, 2025)
aten/src/ATen/native/SoftMax.cpp (8 changes: 4 additions & 4 deletions)

@@ -411,7 +411,7 @@ TORCH_IMPL_FUNC(log_softmax_backward_cpu_out) (
Tensor softmax(const Tensor& input_, const int64_t dim_, std::optional<ScalarType> dtype) {
auto result = [&]() {
NoNamesGuard guard;
- if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){
+ if ((input_.is_cuda() || input_.is_xpu()) && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float) {
Collaborator:

@weishi-deng, what's the impact? Could you elaborate on the motivation? Does this change lead to any performance improvement, or is it intended to fix failing cases?

Contributor Author (weishi-deng):

> @weishi-deng, what's the impact? Could you elaborate on the motivation? Does this change lead to any performance improvement, or is it intended to fix failing cases?

This PR lets the softmax op take half-precision input and produce float output without an explicit cast. That functionality has already been added in torch-xpu-ops, so we need to add the device check here to enable it. Otherwise the input has to be cast before the softmax runs, as lines 417-418 show.
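
For context, here is a minimal usage sketch of the call pattern this device check affects (an illustration only; it assumes a PyTorch build with XPU support, and the tensor shape is arbitrary):

```python
import torch
import torch.nn.functional as F

# Half-precision input on an XPU device (illustrative shape).
x = torch.randn(8192, 64, dtype=torch.half, device="xpu")

# With this change, the half -> float promotion happens inside the fused
# _softmax(..., half_to_float=True) path for XPU as well, instead of
# casting `x` to float in ATen before dispatching.
y = F.softmax(x, dim=-1, dtype=torch.float)
assert y.dtype == torch.float32
```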

Collaborator (EikanWang):

@weishi-deng, as we discussed, this should not be framed as a feature improvement. The change is a performance improvement, since torch-xpu-ops provides an optimized implementation that fuses the data-type cast with the softmax. Please share the performance-improvement data in the PR description.

Contributor Author (weishi-deng):

> @weishi-deng, as we discussed, this should not be framed as a feature improvement. The change is a performance improvement, since torch-xpu-ops provides an optimized implementation that fuses the data-type cast with the softmax. Please share the performance-improvement data in the PR description.

Hi @EikanWang, the performance data has been added to the PR description. Please review and comment.
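
One rough way to collect such timings is sketched below. This is only an illustration; the shapes, iteration counts, and timing method are assumptions, not the methodology behind the numbers in the PR description:

```python
import time

import torch
import torch.nn.functional as F


def bench_softmax(shape, iters=100, device="xpu"):
    # Half input, float output: exercises the fused half_to_float path on XPU.
    x = torch.randn(*shape, dtype=torch.half, device=device)
    for _ in range(10):  # warm-up so compilation/launch overhead is excluded
        F.softmax(x, dim=-1, dtype=torch.float)
    torch.xpu.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        F.softmax(x, dim=-1, dtype=torch.float)
    torch.xpu.synchronize()
    return (time.perf_counter() - start) / iters


print(f"avg softmax time: {bench_softmax((8192, 8192)) * 1e3:.3f} ms")
```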

return at::_softmax(input_, dim_, true);
} else {
Tensor converted = dtype.has_value() ? input_.toType(dtype.value()) : input_;
@@ -428,7 +428,7 @@ Tensor& softmax_out(
std::optional<ScalarType> dtype,
Tensor& output_) {
Tensor output_temp;
- if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half &&
+ if ((input_.is_cuda() || input_.is_xpu()) && input_.scalar_type() == ScalarType::Half &&
dtype == ScalarType::Float) {
if (!output_.is_contiguous()) {
auto options =
@@ -467,7 +467,7 @@ Tensor special_softmax(const Tensor& input_, const int64_t dim_, std::optional<S
Tensor log_softmax(const Tensor& input_, const int64_t dim_, std::optional<ScalarType> dtype) {
auto result = [&]() {
NoNamesGuard guard;
- if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){
+ if ((input_.is_cuda() || input_.is_xpu()) && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float) {
return at::_log_softmax(input_, dim_, true);
} else {
Tensor converted = dtype.has_value()? input_.toType(dtype.value()) : input_;
@@ -484,7 +484,7 @@ Tensor& log_softmax_out(
std::optional<ScalarType> dtype,
Tensor& output_) {
Tensor output_temp;
- if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half &&
+ if ((input_.is_cuda() || input_.is_xpu()) && input_.scalar_type() == ScalarType::Half &&
dtype == ScalarType::Float) {
if (!output_.is_contiguous()) {
auto options =
test/test_xpu.py (136 changes: 85 additions & 51 deletions)

@@ -12,8 +12,8 @@
from torch.testing import make_tensor
from torch.testing._internal.autocast_test_lists import AutocastTestLists, TestAutocast
from torch.testing._internal.common_device_type import (
+ dtypes,
instantiate_device_type_tests,
- onlyXPU,
OpDTypes,
ops,
skipXPUIf,
@@ -378,56 +378,6 @@ def test_generator(self):
torch.xpu.set_rng_state(g_state0)
self.assertEqual(2024, torch.xpu.initial_seed())

@onlyXPU
@suppress_warnings
@ops(_xpu_computation_ops, dtypes=any_common_cpu_xpu_one)
def test_compare_cpu(self, device, dtype, op):
def to_cpu(arg):
if isinstance(arg, torch.Tensor):
return arg.to(device="cpu")
return arg

samples = op.reference_inputs(device, dtype)

for sample in samples:
cpu_sample = sample.transform(to_cpu)
xpu_results = op(sample.input, *sample.args, **sample.kwargs)
cpu_results = op(cpu_sample.input, *cpu_sample.args, **cpu_sample.kwargs)

xpu_results = sample.output_process_fn_grad(xpu_results)
cpu_results = cpu_sample.output_process_fn_grad(cpu_results)

# Lower tolerance because we are running this as a `@slowTest`
# Don't want the periodic tests to fail frequently
self.assertEqual(xpu_results, cpu_results, atol=1e-4, rtol=1e-4)

@onlyXPU
@ops(_xpu_computation_ops, allowed_dtypes=(torch.bool,))
def test_non_standard_bool_values(self, device, dtype, op):
# Test boolean values other than 0x00 and 0x01 (gh-54789)
def convert_boolean_tensors(x):
if not isinstance(x, torch.Tensor) or x.dtype != torch.bool:
return x

# Map False -> 0 and True -> Random value in [2, 255]
true_vals = torch.randint(
2, 255, x.shape, dtype=torch.uint8, device=x.device
)
false_vals = torch.zeros((), dtype=torch.uint8, device=x.device)
x_int = torch.where(x, true_vals, false_vals)

ret = x_int.view(torch.bool)
self.assertEqual(ret, x)
return ret

for sample in op.sample_inputs(device, dtype):
expect = op(sample.input, *sample.args, **sample.kwargs)

transformed = sample.transform(convert_boolean_tensors)
actual = op(transformed.input, *transformed.args, **transformed.kwargs)

self.assertEqual(expect, actual)

def test_serialization_array_with_storage(self):
x = torch.randn(5, 5).xpu()
y = torch.zeros(2, 5, dtype=torch.int, device="xpu")
@@ -747,5 +697,89 @@ def test_torch_config_for_xpu(self):
self.assertTrue(value.group(1) in ["OFF", "0"])


@unittest.skipIf(not TEST_XPU, "XPU not available, skipping tests")
class TestXpuOps(TestCase):
@dtypes(torch.float16)
def test_softmax_half_to_float(self, device, dtype):
shape = [
[8],
[7, 8],
[8192, 64],
[8192, 8192],
[7, 8, 512],
[7, 8, 11],
[16, 7, 8, 512],
[16, 7, 8, 512, 35],
[117, 7, 9, 513, 35],
]
output_type = torch.float
for i in range(len(shape)):
for j in range(len(shape[i])):
dim = j - 1
x = torch.randn(shape[i], dtype=dtype)
grad = torch.randn(shape[i]).to(output_type)
x_cpu = x.clone().requires_grad_()
y_cpu = torch.nn.functional.softmax(x_cpu, dim, dtype=output_type)
y_cpu.backward(grad.clone())

x_xpu = x.clone().to(device).requires_grad_()
y_xpu = torch.nn.functional.softmax(x_xpu, dim, dtype=output_type)
self.assertEqual(y_xpu.dtype, torch.float32)
y_xpu.backward(grad.clone().to(device))
self.assertEqual(y_cpu, y_xpu.cpu())
self.assertEqual(x_cpu.grad, x_xpu.grad.cpu())

@suppress_warnings
@ops(_xpu_computation_ops, dtypes=any_common_cpu_xpu_one)
def test_compare_cpu(self, device, dtype, op):
def to_cpu(arg):
if isinstance(arg, torch.Tensor):
return arg.to(device="cpu")
return arg

samples = op.reference_inputs(device, dtype)

for sample in samples:
cpu_sample = sample.transform(to_cpu)
xpu_results = op(sample.input, *sample.args, **sample.kwargs)
cpu_results = op(cpu_sample.input, *cpu_sample.args, **cpu_sample.kwargs)

xpu_results = sample.output_process_fn_grad(xpu_results)
cpu_results = cpu_sample.output_process_fn_grad(cpu_results)

# Lower tolerance because we are running this as a `@slowTest`
# Don't want the periodic tests to fail frequently
self.assertEqual(xpu_results, cpu_results, atol=1e-4, rtol=1e-4)

@ops(_xpu_computation_ops, allowed_dtypes=(torch.bool,))
def test_non_standard_bool_values(self, device, dtype, op):
# Test boolean values other than 0x00 and 0x01 (gh-54789)
def convert_boolean_tensors(x):
if not isinstance(x, torch.Tensor) or x.dtype != torch.bool:
return x

# Map False -> 0 and True -> Random value in [2, 255]
true_vals = torch.randint(
2, 255, x.shape, dtype=torch.uint8, device=x.device
)
false_vals = torch.zeros((), dtype=torch.uint8, device=x.device)
x_int = torch.where(x, true_vals, false_vals)

ret = x_int.view(torch.bool)
self.assertEqual(ret, x)
return ret

for sample in op.sample_inputs(device, dtype):
expect = op(sample.input, *sample.args, **sample.kwargs)

transformed = sample.transform(convert_boolean_tensors)
actual = op(transformed.input, *transformed.args, **transformed.kwargs)

self.assertEqual(expect, actual)


instantiate_device_type_tests(TestXpuOps, globals(), only_for="xpu", allow_xpu=True)


if __name__ == "__main__":
run_tests()