add max_and_min function and cpu kernel to speed up observers (#41570) · pytorch/pytorch@302e566

Commit 302e566

vkuzo authored and facebook-github-bot committed
add max_and_min function and cpu kernel to speed up observers (#41570)
Summary:

Pull Request resolved: #41570

For min/max based quantization observers, calculating the min and max of a tensor takes most of the runtime. Since both reductions run over the same tensor, we can speed this up by reading the tensor only once and reducing with two outputs. One open question is whether this should go into the quantization namespace, since the use case is pretty specific.

This PR implements the easier CPU path to get an initial validation. Some additional work is needed in future PRs, which durumu will take a look at:
* CUDA kernel and tests
* making this work per channel
* benchmarking on observer
* benchmarking impact on QAT overhead

Test Plan:
```
python test/test_torch.py TestTorch.test_min_and_max
```

Quick bench (not representative of a real world use case): https://gist.github.com/vkuzo/7fce61c3456dbc488d432430cafd6eca
```
(pytorch) [vasiliy@devgpu108.ash6 ~/local/pytorch] OMP_NUM_THREADS=1 python ~/nfs/pytorch_scripts/observer_bench.py
tensor(5.0390)
tensor(-5.4485)
tensor([-5.4485, 5.0390])
min and max separate 11.90243935585022
min and max combined 6.353186368942261
% decrease 0.466228209277153

(pytorch) [vasiliy@devgpu108.ash6 ~/local/pytorch] OMP_NUM_THREADS=4 python ~/nfs/pytorch_scripts/observer_bench.py
tensor(5.5586)
tensor(-5.3983)
tensor([-5.3983, 5.5586])
min and max separate 3.468616485595703
min and max combined 1.8227086067199707
% decrease 0.4745142294372342

(pytorch) [vasiliy@devgpu108.ash6 ~/local/pytorch] OMP_NUM_THREADS=8 python ~/nfs/pytorch_scripts/observer_bench.py
tensor(5.2146)
tensor(-5.2858)
tensor([-5.2858, 5.2146])
min and max separate 1.5707778930664062
min and max combined 0.8645427227020264
% decrease 0.4496085496757899
```

Imported from OSS

Reviewed By: supriyar

Differential Revision: D22589349

fbshipit-source-id: c2e3f1b8b5c75a23372eb6e4c885f842904528ed
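As a rough sketch of what the fused op buys an observer (illustrative only: the wrapper function below is hypothetical, and it assumes the generated ATen binding `at::_min_max` produced from the `native_functions.yaml` entry in this commit), the point is that one pass over the tensor replaces two:

```cpp
#include <ATen/ATen.h>
#include <tuple>

// Hypothetical observer-style update: two separate reductions vs. the fused
// reduction added by this PR. Both variants produce 0-dim tensors for min/max.
void observer_min_max(const at::Tensor& x) {
  // Before: two full passes over x.
  at::Tensor mn_separate = x.min();
  at::Tensor mx_separate = x.max();

  // After: one pass over x, both results produced together.
  at::Tensor mn_fused, mx_fused;
  std::tie(mn_fused, mx_fused) = at::_min_max(x);
}
```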
1 parent 9e0c746 commit 302e566

File tree

6 files changed: +190 −16 lines

aten/src/ATen/cpu/vec256/functional.h

Lines changed: 29 additions & 0 deletions
@@ -44,6 +44,35 @@ inline scalar_t reduce_all(const Op& vec_fun, scalar_t* data, int64_t size) {
   return vec_reduce_all(vec_fun, acc_vec, Vec::size());
 }
 
+// similar to reduce_all, but reduces into two outputs
+template <typename scalar_t, typename Op1, typename Op2>
+inline std::pair<scalar_t, scalar_t> reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2,
+    scalar_t* data, int64_t size) {
+  using Vec = vec256::Vec256<scalar_t>;
+  if (size < Vec::size()) {
+    auto loaded_data = Vec::loadu(data, size);
+    return std::pair<scalar_t, scalar_t>(
+      vec_reduce_all(vec_fun1, loaded_data, size),
+      vec_reduce_all(vec_fun2, loaded_data, size));
+  }
+  int64_t d = Vec::size();
+  Vec acc_vec1 = Vec::loadu(data);
+  Vec acc_vec2 = Vec::loadu(data);
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec data_vec = Vec::loadu(data + d);
+    acc_vec1 = vec_fun1(acc_vec1, data_vec);
+    acc_vec2 = vec_fun2(acc_vec2, data_vec);
+  }
+  if (size - d > 0) {
+    Vec data_vec = Vec::loadu(data + d, size - d);
+    acc_vec1 = Vec::set(acc_vec1, vec_fun1(acc_vec1, data_vec), size - d);
+    acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d);
+  }
+  return std::pair<scalar_t, scalar_t>(
+    vec_reduce_all(vec_fun1, acc_vec1, Vec::size()),
+    vec_reduce_all(vec_fun2, acc_vec2, Vec::size()));
+}
+
 template <typename scalar_t, typename MapOp, typename ReduceOp>
 inline scalar_t map_reduce_all(
     const MapOp& map_fun,
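As a rough illustration of how the new helper is meant to be called (mirroring what the CPU kernel later in this commit does per chunk), a single pass over a float buffer can produce both reductions at once. This is a sketch only: the wrapper function name is hypothetical, and it assumes the vec256 headers and the `minimum`/`maximum` vector ops as they existed at the time of this commit.

```cpp
#include <ATen/cpu/vec256/vec256.h>
#include <ATen/cpu/vec256/functional.h>
#include <cstdint>
#include <utility>

// Hypothetical wrapper: reduce a contiguous float buffer to (min, max) in one pass.
std::pair<float, float> min_max_single_pass(float* data, int64_t size) {
  using Vec = at::vec256::Vec256<float>;
  return at::vec256::reduce2_all<float>(
      [](Vec x, Vec y) { return at::vec256::minimum(x, y); },  // first accumulator: running min
      [](Vec x, Vec y) { return at::vec256::maximum(x, y); },  // second accumulator: running max
      data,
      size);
}
```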

aten/src/ATen/native/ReduceAllOps.cpp

Lines changed: 10 additions & 0 deletions
@@ -8,6 +8,7 @@ namespace native {
 
 DEFINE_DISPATCH(min_all_stub);
 DEFINE_DISPATCH(max_all_stub);
+DEFINE_DISPATCH(_min_max_all_stub);
 
 Tensor min(const Tensor &self) {
   TORCH_CHECK(!self.is_complex(), "min is not yet implemented for complex tensors.");
@@ -25,4 +26,13 @@ Tensor max(const Tensor &self) {
   return result;
 }
 
+std::tuple<Tensor, Tensor> _min_max(const Tensor &self) {
+  TORCH_CHECK(!self.is_complex(), "max is not yet implemented for complex tensors.");
+  TORCH_CHECK(self.numel() > 0, "operation does not have an identity.");
+  Tensor min_result = at::empty({}, self.options());
+  Tensor max_result = at::empty({}, self.options());
+  _min_max_all_stub(self.device().type(), min_result, max_result, self.contiguous());
+  return std::tuple<Tensor&, Tensor&>(min_result, max_result);
+}
+
 }} // namesapce at::native

aten/src/ATen/native/ReduceAllOps.h

Lines changed: 2 additions & 0 deletions
@@ -6,7 +6,9 @@
 namespace at { namespace native {
 
 using reduce_all_fn = void (*)(Tensor & result, const Tensor & self);
+using reduce_min_max_fn = void (*)(Tensor & max_result, Tensor & min_result, const Tensor & self);
 DECLARE_DISPATCH(reduce_all_fn, min_all_stub);
 DECLARE_DISPATCH(reduce_all_fn, max_all_stub);
+DECLARE_DISPATCH(reduce_min_max_fn, _min_max_all_stub);
 
 }}
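For readability, here is how the new stub is wired together across the three C++ files touched by this commit: declared in the header, defined at the op layer, and registered by the CPU kernel. This only collects lines that appear in the hunks above and below; it is not additional code.

```cpp
// aten/src/ATen/native/ReduceAllOps.h
using reduce_min_max_fn = void (*)(Tensor & max_result, Tensor & min_result, const Tensor & self);
DECLARE_DISPATCH(reduce_min_max_fn, _min_max_all_stub);

// aten/src/ATen/native/ReduceAllOps.cpp
DEFINE_DISPATCH(_min_max_all_stub);
// ...and at::native::_min_max() invokes the stub with the device type:
// _min_max_all_stub(self.device().type(), min_result, max_result, self.contiguous());

// aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp
REGISTER_DISPATCH(_min_max_all_stub, &_min_max_all_kernel_impl);
```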

aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp

Lines changed: 107 additions & 2 deletions
@@ -27,7 +27,7 @@ inline void reduce_all_impl_vec(
   const int64_t input_numel = input.numel();
   auto input_data = input.data_ptr<scalar_t>();
   // NOTE: parallel_reduce not support bool type
-  scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
+  scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
     [&](int64_t start, int64_t end, const scalar_t ident) -> scalar_t {
       scalar_t partial_out = vec256::reduce_all<scalar_t>(
         [=](Vec x, Vec y) { return vop(x, y); },
@@ -47,7 +47,7 @@ inline void reduce_all_impl(
     func_t op) {
   const int64_t input_numel = input.numel();
   auto input_data = input.data_ptr<scalar_t>();
-  scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
+  scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
     [&](int64_t start, int64_t end, const scalar_t ident) -> scalar_t {
       scalar_t partial_out = ident;
       for (int64_t i = start; i < end; i++) {
@@ -108,9 +108,114 @@ static void max_all_kernel_impl(Tensor& result, const Tensor& input) {
   }
 }
 
+// For operation not support in avx/avx2
+template <typename scalar_t, typename func_t1, typename func_t2>
+inline void reduce_all_impl_two_outputs(
+    Tensor& output1,
+    Tensor& output2,
+    const Tensor& input,
+    const std::pair<scalar_t, scalar_t>& ident_v,
+    func_t1 reduce_chunk_func,
+    func_t2 reduce_acc_func) {
+  using scalar_t_pair = std::pair<scalar_t, scalar_t>;
+  const int64_t input_numel = input.numel();
+  auto input_data = input.data_ptr<scalar_t>();
+  scalar_t_pair result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
+    [&](int64_t start, int64_t end, const scalar_t_pair& ident) -> scalar_t_pair {
+      scalar_t_pair partial_out(ident);
+      for (int64_t i = start; i < end; i++) {
+        partial_out = reduce_chunk_func(partial_out, input_data[i]);
+      }
+      return partial_out;
+    },
+    reduce_acc_func
+  );
+  output1.fill_(result.first);
+  output2.fill_(result.second);
+}
+
+template <typename scalar_t, typename func_t, typename vec_func_t1, typename vec_func_t2>
+inline void reduce_all_impl_vec_two_outputs(
+    Tensor& output1,
+    Tensor& output2,
+    const Tensor& input,
+    const std::pair<scalar_t, scalar_t>& ident_v,
+    func_t reduce_acc_func,
+    vec_func_t1 reduce_chunk_func1,
+    vec_func_t2 reduce_chunk_func2) {
+  using Vec = Vec256<scalar_t>;
+  using scalar_t_pair = std::pair<scalar_t, scalar_t>;
+  const int64_t input_numel = input.numel();
+  auto input_data = input.data_ptr<scalar_t>();
+  // NOTE: parallel_reduce not support bool type
+  std::pair<scalar_t, scalar_t> result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
+    [&](int64_t start, int64_t end, const scalar_t_pair& /* ident */) -> scalar_t_pair {
+      scalar_t_pair partial_out = vec256::reduce2_all<scalar_t>(
+        [=](Vec x, Vec y) { return reduce_chunk_func1(x, y); },
+        [=](Vec x, Vec y) { return reduce_chunk_func2(x, y); },
+        input_data + start,
+        end - start);
+      return partial_out;
+    },
+    reduce_acc_func
+  );
+  output1.fill_(result.first);
+  output2.fill_(result.second);
+}
+
+static void _min_max_all_kernel_impl(Tensor& min_result, Tensor& max_result,
+    const Tensor& input) {
+  if (input.scalar_type() == ScalarType::Bool) {
+    TensorIterator iter = TensorIteratorConfig()
+      .add_input(input)
+      .build();
+    bool min_result_data = true;
+    bool max_result_data = false;
+    cpu_serial_kernel(iter, [&](const bool a) -> void {
+      min_result_data = min_result_data && a;
+      max_result_data = max_result_data || a;
+    });
+    min_result.fill_(min_result_data);
+    max_result.fill_(max_result_data);
+  } else if (input.scalar_type() == ScalarType::Long) {
+    // for int64_t, vectorized implementation have performance issue,
+    // just use scalar path
+    using int64_t_pair = std::pair<int64_t, int64_t>;
+    reduce_all_impl_two_outputs<int64_t>(min_result, max_result, input,
+      int64_t_pair(upper_bound<int64_t>(), lower_bound<int64_t>()),
+      // reduce over chunk
+      [=](int64_t_pair a, int64_t b) -> int64_t_pair {
+        return int64_t_pair(min_impl(a.first, b), max_impl(a.second, b));
+      },
+      // combine two inputs
+      [=](int64_t_pair a, int64_t_pair b) -> int64_t_pair {
+        return int64_t_pair(min_impl(a.first, b.first), max_impl(a.second, b.second));
+      }
+    );
+  } else {
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX(input.scalar_type(), "_min_max_all", [&] {
+      using Vec = vec256::Vec256<scalar_t>;
+      using scalar_t_pair = std::pair<scalar_t, scalar_t>;
+      reduce_all_impl_vec_two_outputs<scalar_t>(
+        min_result,
+        max_result,
+        input,
+        scalar_t_pair(upper_bound<scalar_t>(), lower_bound<scalar_t>()),
+        [=](scalar_t_pair a, scalar_t_pair b) -> scalar_t_pair {
+          return scalar_t_pair(
+            min_impl(a.first, b.first), max_impl(a.second, b.second));
+        },
+        [=](Vec a, Vec b) -> Vec { return minimum(a, b); },
+        [=](Vec a, Vec b) -> Vec { return maximum(a, b); }
+      );
+    });
+  }
+}
+
 } // namespace
 
 REGISTER_DISPATCH(min_all_stub, &min_all_kernel_impl);
 REGISTER_DISPATCH(max_all_stub, &max_all_kernel_impl);
+REGISTER_DISPATCH(_min_max_all_stub, &_min_max_all_kernel_impl);
 
 }}
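As a companion to the two helpers above, here is a minimal, self-contained sketch of the `at::parallel_reduce` pattern they both follow, written as a plain scalar loop with a hypothetical wrapper name: each worker reduces its `[start, end)` chunk to a partial (min, max) pair, and partial pairs from different chunks are merged by the combine function. The real kernel uses the vectorized `reduce2_all` path for floating-point types and the scalar path only for int64_t and bool.

```cpp
#include <ATen/Parallel.h>
#include <algorithm>
#include <cstdint>
#include <limits>
#include <utility>

// Hypothetical scalar-only analogue of reduce_all_impl_two_outputs:
// one fused pass produces (min, max) instead of two separate passes.
std::pair<float, float> min_max_scalar(const float* data, int64_t n) {
  using pair_t = std::pair<float, float>;
  const pair_t ident(std::numeric_limits<float>::max(),      // identity for min
                     std::numeric_limits<float>::lowest());  // identity for max
  return at::parallel_reduce(
      0, n, at::internal::GRAIN_SIZE, ident,
      // Reduce one [start, end) chunk to a partial (min, max) pair.
      [&](int64_t start, int64_t end, const pair_t& acc0) -> pair_t {
        pair_t acc = acc0;
        for (int64_t i = start; i < end; ++i) {
          acc.first = std::min(acc.first, data[i]);
          acc.second = std::max(acc.second, data[i]);
        }
        return acc;
      },
      // Combine partial results from two chunks.
      [](const pair_t& a, const pair_t& b) -> pair_t {
        return pair_t(std::min(a.first, b.first), std::max(a.second, b.second));
      });
}
```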

aten/src/ATen/native/native_functions.yaml

Lines changed: 7 additions & 0 deletions
@@ -5076,6 +5076,13 @@
     CPU, CUDA: max
     QuantizedCPU: max_quant
 
+# Return: (Tensor min, Tensor max)
+- func: _min_max(Tensor self) -> (Tensor, Tensor)
+  use_c10_dispatcher: full
+  variants: function
+  dispatch:
+    CPU: _min_max
+
 - func: median(Tensor self) -> Tensor
   use_c10_dispatcher: full
   variants: method, function

test/test_torch.py

Lines changed: 35 additions & 14 deletions
@@ -275,7 +275,7 @@ def test_has_storage(self):
         self.assertIsNotNone(torch.Tensor([0, 0, 0]).nonzero().storage())
         self.assertIsNotNone(torch.Tensor().new().storage())
 
-    def _testSelection(self, torchfn, mathfn):
+    def _testSelection(self, torchfn, mathfn, skip_indices=False):
         # contiguous
         m1 = torch.randn(100, 100)
         res1 = torchfn(m1)
@@ -294,20 +294,21 @@ def _testSelection(self, torchfn, mathfn):
         self.assertEqual(res1, res2)
 
         # with indices
-        m1 = torch.randn(100, 100)
-        res1val, res1ind = torchfn(m1, 1, False)
-        res2val = m1[:, 0:1].clone().squeeze()
-        res2ind = res1ind.clone().fill_(0)
-        for i, j in iter_indices(m1):
-            if mathfn(res2val[i], m1[i, j]) != res2val[i]:
-                res2val[i] = m1[i, j]
-                res2ind[i] = j
+        if not skip_indices:
+            m1 = torch.randn(100, 100)
+            res1val, res1ind = torchfn(m1, 1, False)
+            res2val = m1[:, 0:1].clone().squeeze()
+            res2ind = res1ind.clone().fill_(0)
+            for i, j in iter_indices(m1):
+                if mathfn(res2val[i], m1[i, j]) != res2val[i]:
+                    res2val[i] = m1[i, j]
+                    res2ind[i] = j
 
-        maxerr = 0
-        for i in range(res1val.size(0)):
-            maxerr = max(maxerr, abs(res1val[i] - res2val[i]))
-            self.assertEqual(res1ind[i], res2ind[i])
-        self.assertLessEqual(abs(maxerr), 1e-5)
+            maxerr = 0
+            for i in range(res1val.size(0)):
+                maxerr = max(maxerr, abs(res1val[i] - res2val[i]))
+                self.assertEqual(res1ind[i], res2ind[i])
+            self.assertLessEqual(abs(maxerr), 1e-5)
 
         # NaNs
         for index in (0, 4, 99):
@@ -327,12 +328,32 @@ def _testSelection(self, torchfn, mathfn):
             res2 = mathfn(res2, m1[i])
         self.assertEqual(res1, res2)
 
+        # Long
+        m1 = torch.LongTensor(100).random_(-1000, 1000)
+        res1 = torchfn(m1)
+        res2 = m1[0]
+        for i in iter_indices(m1):
+            res2 = mathfn(res2, m1[i])
+        self.assertEqual(res1, res2)
+
+
     def test_max(self):
         self._testSelection(torch.max, max)
 
     def test_min(self):
         self._testSelection(torch.min, min)
 
+    def test_min_max(self):
+        # TODO: implement indices, in a future PR
+        # min correctness
+        self._testSelection(lambda x: torch._min_max(x)[0],
+                            lambda x, y: min(x, y),
+                            skip_indices=True)
+        # max correctness
+        self._testSelection(lambda x: torch._min_max(x)[1],
+                            lambda x, y: max(x, y),
+                            skip_indices=True)
+
     def test_dim_reduction_uint8_overflow(self):
         example = [[-1, 2, 1], [5, 3, 6]]
         x = torch.tensor(example, dtype=torch.uint8)
0 commit comments