 #include <ATen/native/TopKImpl.h>
 #include <c10/core/WrapDimMinimal.h>
 #include <c10/util/irange.h>
+
 #ifdef USE_FBGEMM
 #include <fbgemm/Utils.h>
 #endif
 
+#if USE_X86_SIMD_SORT && (defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2))
+#define XSS_COMPILE_TIME_SUPPORTED
+#define XSS_USE_OPENMP
+#include <src/x86simdsort-static-incl.h>
+#endif
+
 namespace at::native {
 
 namespace {
@@ -117,6 +124,7 @@ static void parallel_sort1d_kernel(
     std::vector<int64_t> tmp_vals(elements);
     const scalar_t* sorted_keys = nullptr;
     const int64_t* sorted_vals = nullptr;
+
     std::tie(sorted_keys, sorted_vals) = fbgemm::radix_sort_parallel(
         keys,
         vals,
@@ -165,6 +173,107 @@ static inline void sort_kernel_impl(const value_accessor_t& value_accessor,
   }
 }
 
+#if defined(XSS_COMPILE_TIME_SUPPORTED)
+
+#define AT_DISPATCH_CASE_XSS_TYPES(...)                  \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)
+
+#define AT_DISPATCH_XSS_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_XSS_TYPES(__VA_ARGS__))
+
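+// Gate for the x86-simd-sort path: it is not a stable sort, and only the
+// dtypes covered by AT_DISPATCH_XSS_TYPES above are supported.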
+static bool can_use_xss_sort(const TensorBase& values, const TensorBase& indices, int64_t dim, const bool stable) {
+  // xss_sort is not a stable sort
+  if (stable) return false;
+
+  auto type = values.scalar_type();
+  if (!(type == ScalarType::Long || type == ScalarType::Int || type == ScalarType::Double || type == ScalarType::Float)) return false;
+
+  return true;
+}
+
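+// Sort `values` and `indices` along `dim` by running
+// x86simdsortStatic::keyvalue_qsort on each 1-D slice; a TensorIterator with
+// the sort dimension squashed walks over the slices.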
+static void xss_sort_kernel(
+    const TensorBase& values,
+    const TensorBase& indices,
+    int64_t dim,
+    bool descending) {
+  auto iter = TensorIteratorConfig()
+    .check_all_same_dtype(false)
+    .resize_outputs(false)
+    .declare_static_shape(values.sizes(), /*squash_dims=*/dim)
+    .add_output(values)
+    .add_output(indices)
+    .build();
+
+  using index_t = int64_t;
+
+  AT_DISPATCH_XSS_TYPES(values.scalar_type(), "xss_sort_kernel", [&] {
+
+    auto values_dim_stride = values.stride(dim);
+    auto indices_dim_stride = indices.stride(dim);
+    auto dim_size = values.size(dim);
+
+    auto loop = [&](char** data, const int64_t* strides, int64_t n) {
+      auto* values_data_bytes = data[0];
+      auto* indices_data_bytes = data[1];
+
+      if (values_data_bytes == nullptr || indices_data_bytes == nullptr) {
+        return;
+      }
+
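+      // Fast path: both outputs are contiguous along the sort dimension, so
+      // each slice can be sorted in place.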
+      if (values_dim_stride == 1 && indices_dim_stride == 1) {
+        for (const auto i C10_UNUSED : c10::irange(n)) {
+          x86simdsortStatic::keyvalue_qsort<scalar_t, index_t>(
+              reinterpret_cast<scalar_t*>(values_data_bytes),
+              reinterpret_cast<index_t*>(indices_data_bytes),
+              dim_size,
+              true,
+              descending);
+
+          values_data_bytes += strides[0];
+          indices_data_bytes += strides[1];
+        }
+      } else {
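+        // Strided path: gather each slice into contiguous temporaries, sort
+        // those, then scatter the results back through the accessors.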
+        std::vector<scalar_t> tmp_values(dim_size);
+        std::vector<index_t> tmp_indices(dim_size);
+
+        for (const auto i : c10::irange(n)) {
+          TensorAccessor<scalar_t, 1> mode_values_acc(
+              reinterpret_cast<scalar_t*>(data[0] + i * strides[0]),
+              &dim_size, &values_dim_stride);
+          TensorAccessor<index_t, 1> mode_indices_acc(
+              reinterpret_cast<index_t*>(data[1] + i * strides[1]),
+              &dim_size, &indices_dim_stride);
+
+          for (const auto j : c10::irange(dim_size)) {
+            tmp_values[j] = mode_values_acc[j];
+            tmp_indices[j] = j;
+          }
+
+          x86simdsortStatic::keyvalue_qsort<scalar_t, index_t>(
+              tmp_values.data(),
+              tmp_indices.data(),
+              dim_size,
+              true,
+              descending);
+
+          for (const auto j : c10::irange(dim_size)) {
+            mode_values_acc[j] = tmp_values[j];
+            mode_indices_acc[j] = tmp_indices[j];
+          }
+        }
+      }
+    };
+
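+    // Parallelize over the slices; the grain size is scaled down by the slice
+    // length so longer per-slice sorts get fewer slices per task.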
+    int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, dim_size);
+    iter.for_each(loop, /*grain_size=*/grain_size);
+
+  });
+}
+#endif
+
 static void sort_kernel(
     const TensorBase& self,
     const TensorBase& values,
@@ -179,6 +288,14 @@ static void sort_kernel(
     // https://github.com/pytorch/pytorch/issues/91420
     return;
   }
+
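+// Try the x86-simd-sort path first when it was compiled in and the request is
+// compatible; otherwise fall through to the existing paths below.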
+#if defined(XSS_COMPILE_TIME_SUPPORTED)
+  if (can_use_xss_sort(values, indices, dim, stable)) {
+    xss_sort_kernel(values, indices, dim, descending);
+    return;
+  }
+#endif
+
 #ifdef USE_FBGEMM
   if (can_use_radix_sort(values, descending)) {
     parallel_sort1d_kernel(values, indices);
@@ -230,6 +347,7 @@ static void topk_kernel(
     int64_t dim,
     bool largest,
     bool sorted) {
+
   auto sizes = self.sizes();
   auto iter = TensorIteratorConfig()
     .check_all_same_dtype(false)
@@ -264,7 +382,7 @@ static void topk_kernel(
 
 } // anonymous namespace
 
-REGISTER_DISPATCH(sort_stub, &sort_kernel)
-REGISTER_DISPATCH(topk_stub, &topk_kernel)
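+// ALSO_REGISTER_AVX512_DISPATCH (see DispatchStub.h) registers the AVX512
+// build of these kernels as well, rather than reusing the AVX2 kernel on
+// AVX512-capable CPUs.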
+ALSO_REGISTER_AVX512_DISPATCH(sort_stub, &sort_kernel)
+ALSO_REGISTER_AVX512_DISPATCH(topk_stub, &topk_kernel)
 
 } // at::native