Revert D34152115: [pytorch][PR] [ROCm] Enable sort operator BF16 support · pytorch/pytorch@80f2346 · GitHub

Commit 80f2346

malfet authored and pytorchmergebot committed
Revert D34152115: [pytorch][PR] [ROCm] Enable sort operator BF16 support
Test Plan: revert-hammer

Differential Revision: D34152115 (aa44480)

Original commit changeset: 53841c91976b
Original Phabricator Diff: D34152115 (aa44480)

fbshipit-source-id: c9b5cc06198032af73cd6390466de2c62576a1e1
(cherry picked from commit eb72533)
1 parent dc169d5 commit 80f2346

5 files changed: +22 −31 lines changed

aten/src/ATen/cuda/cub.cu

Lines changed: 3 additions & 0 deletions
@@ -57,7 +57,10 @@ AT_INSTANTIATE_SORT_PAIRS(int64_t, 4)
 
 AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8)
 
+// BFloat16 is not supported by ROCm's radix sort
+#if !AT_ROCM_ENABLED()
 AT_INSTANTIATE_SORT_PAIRS(c10::BFloat16, 8)
+#endif
 
 } // namespace detail
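For readers unfamiliar with the idiom, a minimal host-only sketch of the guarded explicit-instantiation pattern this hunk restores (all names here are stand-ins, not the real ATen macros): the translation unit instantiates the sort template once per supported key type, and a preprocessor guard simply omits the instantiation the ROCm backend cannot compile.

#include <algorithm>
#include <cstdio>
#include <vector>

// Stand-in for the templated cub sort wrapper declared in cub.cuh.
template <typename key_t>
void sort_keys(std::vector<key_t>& keys) {
  std::sort(keys.begin(), keys.end());
}

// Explicit instantiations, as cub.cu does via AT_INSTANTIATE_SORT_PAIRS.
template void sort_keys<int>(std::vector<int>&);
template void sort_keys<float>(std::vector<float>&);

// Hypothetical guard mirroring `#if !AT_ROCM_ENABLED()`: the instantiation
// simply does not exist in builds where the backend cannot provide it.
#if !defined(FAKE_ROCM)
template void sort_keys<double>(std::vector<double>&);
#endif

int main() {
  std::vector<int> v{3, 1, 2};
  sort_keys(v);
  std::printf("%d %d %d\n", v[0], v[1], v[2]);  // prints: 1 2 3
  return 0;
}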

aten/src/ATen/cuda/cub.cuh

Lines changed: 5 additions & 24 deletions
@@ -45,23 +45,17 @@
 
 #ifdef USE_ROCM
 #define NO_ROCM(x)
-#define ROCM_HIPCUB(x) ::hipcub
 #else
 #define NO_ROCM(x) x
-#define ROCM_HIPCUB(x) x
 #endif
 
-#if !CUB_SUPPORTS_NV_BFLOAT16() || \
-    (defined(USE_ROCM) && ROCM_VERSION >= 40500)
+#if !defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()
 
-#if !defined(USE_ROCM)
 namespace at_cuda_detail {
-#endif
-
 // backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16
 
 template <>
-struct ROCM_HIPCUB(cub)::FpLimits<c10::BFloat16>
+struct cub::FpLimits<c10::BFloat16>
 {
   static __host__ __device__ __forceinline__ c10::BFloat16 Max() {
     unsigned short max_word = 0x7F7F;
@@ -74,14 +68,8 @@ struct ROCM_HIPCUB(cub)::FpLimits<c10::BFloat16>
   }
 };
 
-template <>
-struct ROCM_HIPCUB(cub)::NumericTraits<c10::BFloat16>:
-  ROCM_HIPCUB(cub)::BaseTraits<ROCM_HIPCUB(cub)::FLOATING_POINT, true, false, unsigned short, c10::BFloat16> {};
-
-#if !defined(USE_ROCM)
-} // namespace at_cuda_detail
-#endif
-
+template <> struct cub::NumericTraits<c10::BFloat16>: cub::BaseTraits<cub::FLOATING_POINT, true, false, unsigned short, c10::BFloat16> {};
+}
 #endif
 
 #if !defined(USE_ROCM)
@@ -105,20 +93,13 @@ struct cuda_type<c10::Half> {
   using type = __half;
 };
 
-#if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16()
+#if CUB_SUPPORTS_NV_BFLOAT16()
 
 template<>
 struct cuda_type<c10::BFloat16> {
   using type = __nv_bfloat16;
 };
 
-#elif (defined(USE_ROCM) && ROCM_VERSION >= 40500)
-
-template<>
-struct cuda_type<c10::BFloat16> {
-  using type = hip_bfloat16;
-};
-
 #endif
 
 } // namespace detail
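As a side note on the backported FpLimits specialization above, a minimal host-only sketch (plain C++, no CUDA or cub headers; the helper name is made up) of why 0x7F7F is the right bit pattern for the largest finite bfloat16: bfloat16 is the top 16 bits of an IEEE-754 float32, so widening the word with 16 zero bits and bit-casting gives the represented value. The 0xFF7F counterpart for Lowest() is an assumption based on flipping the sign bit; that line is truncated out of this hunk.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Decode a raw bfloat16 bit pattern into the float it represents: bfloat16
// keeps float32's sign and 8 exponent bits but only the top 7 mantissa bits.
float bf16_bits_to_float(uint16_t bits) {
  uint32_t widened = static_cast<uint32_t>(bits) << 16;  // zero-fill dropped mantissa bits
  float out;
  std::memcpy(&out, &widened, sizeof(out));  // well-defined bit cast
  return out;
}

int main() {
  // 0x7F7F = sign 0, exponent 0xFE, mantissa 0x7F: the largest finite value,
  // matching FpLimits<c10::BFloat16>::Max() in the hunk above.
  std::printf("max    = %g\n", bf16_bits_to_float(0x7F7F));  // ~3.38953e+38
  // 0xFF7F flips only the sign bit (assumed counterpart for Lowest()).
  std::printf("lowest = %g\n", bf16_bits_to_float(0xFF7F));
  return 0;
}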

aten/src/ATen/native/cuda/Sort.cu

Lines changed: 5 additions & 5 deletions
@@ -325,14 +325,14 @@ void launch_stable_sort_kernel(
   TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort);
   int64_t *indices_ptr = indices.data_ptr<int64_t>();
 
-#if (defined(USE_ROCM) && ROCM_VERSION < 40500)
-  constexpr bool is_rocm_bf16_sort_unsupported = true;
+#if defined(USE_ROCM)
+  constexpr bool is_rocm = true;
 #else
-  constexpr bool is_rocm_bf16_sort_unsupported = false;
+  constexpr bool is_rocm = false;
 #endif
 
   AT_DISPATCH_ALL_TYPES_AND3(kBool, kHalf, kBFloat16, self.scalar_type(), "sort", [&]{
-    c10::guts::if_constexpr<!(is_rocm_bf16_sort_unsupported && std::is_same<scalar_t, c10::BFloat16>::value)>([&](auto _){
+    c10::guts::if_constexpr<!(is_rocm && std::is_same<scalar_t, c10::BFloat16>::value)>([&](auto _){
       const scalar_t *self_ptr = self.data_ptr<scalar_t>();
       scalar_t *values_ptr = values.data_ptr<scalar_t>();
       int64_t remaining = _(numel);
@@ -353,7 +353,7 @@ void launch_stable_sort_kernel(
       values_ptr += n;
      indices_ptr += n;
     }
-  }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm < 4.5"); });
+  }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm"); });
   });
 }
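The c10::guts::if_constexpr call above is a pre-C++17 stand-in for `if constexpr`: the rejected branch is never instantiated, which is what keeps the BFloat16 cub path out of ROCm builds entirely (a plain runtime `if` would still require the unsupported instantiation to compile). A minimal host-only sketch of the same dispatch, assuming C++17 and stand-in types rather than the real ATen machinery:

#include <cstdio>
#include <stdexcept>
#include <type_traits>

struct BFloat16 {};  // stand-in for c10::BFloat16

#if defined(USE_ROCM)
constexpr bool is_rocm = true;
#else
constexpr bool is_rocm = false;
#endif

template <typename scalar_t>
void launch_sort_for() {
  if constexpr (!(is_rocm && std::is_same<scalar_t, BFloat16>::value)) {
    // Supported (type, platform) pair: the real code launches the cub
    // radix-sort path here; the instantiation exists only in this branch.
    std::printf("launched sort\n");
  } else {
    // Only this branch is compiled for BFloat16 on ROCm, so the cub
    // instantiation that ROCm cannot provide is never referenced.
    throw std::runtime_error("BFloat16 is not supported on ROCm");
  }
}

int main() {
  launch_sort_for<float>();
  try {
    launch_sort_for<BFloat16>();  // throws only in -DUSE_ROCM builds
  } catch (const std::runtime_error& e) {
    std::printf("%s\n", e.what());
  }
  return 0;
}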

test/test_sort_and_select.py

Lines changed: 7 additions & 0 deletions
@@ -135,6 +135,8 @@ def test_sort(self, device):
     # FIXME: remove torch.bool from unsupported types once support is added for cub sort
     @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128})
     def test_stable_sort(self, device, dtype):
+        if TEST_WITH_ROCM and dtype == torch.bfloat16:
+            return
         sizes = (100, 1000, 10000)
         for ncopies in sizes:
             x = torch.tensor([0, 1] * ncopies, dtype=dtype, device=device)
@@ -228,6 +230,8 @@ def test_topk_1d_output_discontiguous(self, device, dtype):
     # FIXME: remove torch.bool from unsupported types once support is added for cub sort
     @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128})
     def test_stable_sort_against_numpy(self, device, dtype):
+        if TEST_WITH_ROCM and dtype == torch.bfloat16:
+            return
         if dtype in floating_types_and(torch.float16, torch.bfloat16):
             inf = float('inf')
             neg_inf = -float('inf')
@@ -291,6 +295,9 @@ def repeated_index_fill(t, dim, idxs, vals):
 
     @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_msort(self, device, dtype):
+        if TEST_WITH_ROCM and dtype == torch.bfloat16:
+            return
+
         def test(shape):
             tensor = make_tensor(shape, device, dtype, low=-9, high=9)
             if tensor.size() != torch.Size([]):

torch/testing/_internal/common_methods_invocations.py

Lines changed: 2 additions & 2 deletions
@@ -13285,7 +13285,7 @@ def ref_pairwise_distance(input1, input2):
     OpInfo('sort',
            dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16),
            dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16),
-           dtypesIfROCM=all_types_and(torch.float16, torch.bfloat16),
+           dtypesIfROCM=all_types_and(torch.float16),
            sample_inputs_func=sample_inputs_sort,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -13931,7 +13931,7 @@ def ref_pairwise_distance(input1, input2):
     OpInfo('msort',
            dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16),
            dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16),
-           dtypesIfROCM=all_types_and(torch.float16, torch.bfloat16),
+           dtypesIfROCM=all_types_and(torch.float16),
            check_batched_gradgrad=False,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,

0 commit comments