[ROCm] Fix sort for non-standard bool (#147459) · pytorch/pytorch@703176e
pragupta authored and pytorchmergebot committed
[ROCm] Fix sort for non-standard bool (#147459)
When converting from uint8 to bool using the `view` op, we get a bool that stores 0 for false and a non-zero value for true. However, these non-standard bools have undefined behavior: only the last bit is read as 0 or 1 when deciding whether the value is false or true. In this fix, we convert bools to uint8, which maps false to 0 and any non-zero value to 1, essentially converting a non-standard bool into a standard bool and fixing the sort op for non-standard bools.

Fixes #139972

Pull Request resolved: #147459
Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony
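
As a rough illustration of the failure mode, here is a minimal sketch (device and values are assumptions, not the exact reproducer from issue #139972) that builds a non-standard bool tensor via `view` and sorts it:

    import torch

    # Reinterpreting uint8 storage as bool yields non-standard bools:
    # the "True" elements below are stored as 2 instead of 1.
    raw = torch.tensor([2, 0, 2, 0], dtype=torch.uint8, device="cuda")
    non_standard = raw.view(torch.bool)

    # A standard bool tensor with the same logical values, for comparison.
    standard = torch.tensor([True, False, True, False], device="cuda")

    # Both calls should place all False values before all True values;
    # before this fix, the ROCm sort path could get this wrong.
    print(torch.msort(non_standard))
    print(torch.msort(standard))
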
1 parent 690fc2c · commit 703176e

File tree: 2 files changed (+12, -7 lines)

aten/src/ATen/native/cuda/Sort.cpp (9 additions, 0 deletions)

@@ -65,6 +65,15 @@ void sort_cuda_kernel(
   const auto self_dtype = self.dtype();
   TORCH_CHECK(self_dtype != ScalarType::ComplexFloat && self_dtype != ScalarType::ComplexDouble,
     "Sort currently does not support complex dtypes on CUDA.");
+#if defined(USE_ROCM)
+  // ROCm has undefined behavior for non-standard bools. Here we are converting bool to uint8 which will
+  // convert false to 0 and true or any non-zero value to a 1. copy_ on const Tensors only changes the
+  // data in the tensor and not the metadata.
+  // That's why, tensor's dtype stays as bool. It just becomes a standard bool.
+  if (self_dtype == ScalarType::Bool) {
+    self.copy_(self.to(at::kByte));
+  }
+#endif
 
   // use inplace algorithm for smaller input sizes without stable=True
   if (should_use_small_sort(self, dim)) {
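
To see what the new normalization does from the Python side, here is a minimal sketch (assuming a ROCm/CUDA build; the commented storage values are illustrative): copying a bool tensor through uint8, as the self.copy_(self.to(at::kByte)) line above does, rewrites the underlying bytes to 0/1 without touching the tensor's metadata.

    import torch

    # A bool tensor whose storage holds non-standard values (2 and 3 for True).
    t = torch.tensor([2, 0, 3], dtype=torch.uint8, device="cuda").view(torch.bool)
    print(t.view(torch.uint8))  # underlying bytes: 2, 0, 3

    # In-place round trip through uint8, mirroring self.copy_(self.to(at::kByte)).
    t.copy_(t.to(torch.uint8))

    print(t.dtype)              # still torch.bool: copy_ leaves metadata unchanged
    print(t.view(torch.uint8))  # underlying bytes should now be only 0 and 1
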

torch/testing/_internal/common_methods_invocations.py (3 additions, 7 deletions)

@@ -18498,7 +18498,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
            supports_fwgrad_bwgrad=True,
            skips=(
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_non_standard_bool_values',
-                            dtypes=[torch.bool], device_type='cuda'),
+                            dtypes=[torch.bool], device_type='cuda', active_if=not TEST_WITH_ROCM),
            )),
     OpInfo('unique',
            dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16, torch.uint16, torch.uint32, torch.uint64),
@@ -19549,12 +19549,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
            check_batched_gradgrad=False,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
-           sample_inputs_func=sample_inputs_msort,
-           skips=(
-               # https://github.com/pytorch/pytorch/issues/139972
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_non_standard_bool_values',
-                            dtypes=[torch.bool], device_type='cuda', active_if=TEST_WITH_ROCM),
-           )),
+           sample_inputs_func=sample_inputs_msort),
     OpInfo('movedim',
            aliases=('moveaxis',),
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
@@ -21380,6 +21375,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
             "test_non_standard_bool_values",
             dtypes=[torch.bool],
             device_type='cuda',
+            active_if=not TEST_WITH_ROCM
         ),
     ),
 ),
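
The active_if=not TEST_WITH_ROCM entries above now mark the expected failure only on non-ROCm CUDA builds. For context, below is a rough, self-contained sketch of the kind of check test_non_standard_bool_values performs for sort-like ops; the helper name and details are illustrative, not the actual TestCommon implementation.

    import torch

    def check_non_standard_bool_sort(device="cuda"):
        # Mix zero and non-standard "True" bytes, then reinterpret as bool.
        raw = torch.randint(2, 100, (16,), dtype=torch.uint8, device=device)
        raw[::2] = 0
        non_standard = raw.view(torch.bool)   # True stored as values > 1
        standard = raw.ne(0)                  # same logical values, standard bools

        result = torch.msort(non_standard)
        expected = torch.msort(standard)
        # Compare logically: viewing the result as uint8 and testing ne(0)
        # avoids trusting possibly non-standard bytes in the output.
        assert torch.equal(result.view(torch.uint8).ne(0), expected)

With the normalization added in Sort.cpp, such a check should pass on ROCm as well, which is why the ROCm-specific expected failures above could be dropped.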
