pytorch
diff --git a/‎aten/src/ATen/cpu/vec/vec256/vec256_float.h
Lines changed: 26 additions & 0 deletions b/‎aten/src/ATen/cpu/vec/vec256/vec256_float.h
Lines changed: 26 additions & 0 deletions
diff --git a/‎aten/src/ATen/cpu/vec/vec256/vec256_int.h
Lines changed: 28 additions & 0 deletions b/‎aten/src/ATen/cpu/vec/vec256/vec256_int.h
Lines changed: 28 additions & 0 deletions
diff --git a/‎aten/src/ATen/cpu/vec/vec512/vec512_float.h
Lines changed: 6 additions & 0 deletions b/‎aten/src/ATen/cpu/vec/vec512/vec512_float.h
Lines changed: 6 additions & 0 deletions
diff --git a/‎aten/src/ATen/cpu/vec/vec512/vec512_int.h
Lines changed: 6 additions & 0 deletions b/‎aten/src/ATen/cpu/vec/vec512/vec512_int.h
Lines changed: 6 additions & 0 deletions
diff --git a/‎aten/src/ATen/cpu/vec/vec_base.h
Lines changed: 29 additions & 0 deletions b/‎aten/src/ATen/cpu/vec/vec_base.h
Lines changed: 29 additions & 0 deletions
@@ -380,6 +380,32 @@ template <> class Vectorized<float> {
   Vectorized<float> pow(const Vectorized<float> &b) const {
     return Vectorized<float>(Sleef_powf8_u10(values, b));
   }
+  // Horizontal sum of the eight float lanes, returned as a scalar.
+  // Each step adds the register to a permuted copy of itself, so the number
+  // of distinct partial sums halves every time (8 -> 4 -> 2 -> 1).
+  float reduce_add() const {
+    // Swap the two 128-bit halves and accumulate.
+    const auto swapped_halves = _mm256_permute2f128_ps(values, values, 0x1);
+    const auto sum4 = _mm256_add_ps(values, swapped_halves);
+    // Swap the 64-bit pairs inside each 128-bit half and accumulate.
+    const auto swapped_pairs = _mm256_shuffle_ps(sum4, sum4, 0x4E);
+    const auto sum2 = _mm256_add_ps(sum4, swapped_pairs);
+    // Swap neighbouring 32-bit elements and accumulate.
+    const auto swapped_elems = _mm256_shuffle_ps(sum2, sum2, 0xB1);
+    const auto sum1 = _mm256_add_ps(sum2, swapped_elems);
+    // Every lane now carries the full sum; read it from lane 0.
+    return _mm256_cvtss_f32(sum1);
+  }
+  // Horizontal maximum of the eight float lanes, returned as a scalar.
+  // Same folding scheme as a horizontal sum, with max in place of add.
+  // NOTE(review): _mm256_max_ps is not symmetric when an operand is NaN, so
+  // whether a NaN input reaches the result appears to depend on which lane
+  // holds it -- confirm callers never rely on NaN handling here.
+  float reduce_max() const {
+    // Swap the two 128-bit halves and keep the lane-wise maximum.
+    const auto swapped_halves = _mm256_permute2f128_ps(values, values, 0x1);
+    const auto max4 = _mm256_max_ps(values, swapped_halves);
+    // Swap the 64-bit pairs inside each 128-bit half and keep the maximum.
+    const auto swapped_pairs = _mm256_shuffle_ps(max4, max4, 0x4E);
+    const auto max2 = _mm256_max_ps(max4, swapped_pairs);
+    // Swap neighbouring 32-bit elements and keep the maximum.
+    const auto swapped_elems = _mm256_shuffle_ps(max2, max2, 0xB1);
+    const auto max1 = _mm256_max_ps(max2, swapped_elems);
+    // Lane 0 holds the reduced value.
+    return _mm256_cvtss_f32(max1);
+  }
   // Comparison using the _CMP_**_OQ predicate.
   //   `O`: get false if an operand is NaN
   //   `Q`: do not raise if an operand is NaN
 
@@ -251,6 +251,34 @@ class Vectorized<int32_t> : public Vectorizedi {
     return *this;
   }
   Vectorized<int32_t> neg() const;
+  // Horizontal sum of the eight int32 lanes, returned as a scalar.
+  // Each step adds the register to a permuted copy of itself, so the number
+  // of distinct partial sums halves every time (8 -> 4 -> 2 -> 1).
+  int32_t reduce_add() const {
+    // Swap the two 128-bit halves and accumulate.
+    const auto swapped_halves = _mm256_permute2f128_si256(values, values, 0x1);
+    const auto sum4 = _mm256_add_epi32(values, swapped_halves);
+    // Swap the 64-bit pairs inside each 128-bit half and accumulate.
+    const auto swapped_pairs = _mm256_shuffle_epi32(sum4, 0x4E);
+    const auto sum2 = _mm256_add_epi32(sum4, swapped_pairs);
+    // Swap neighbouring 32-bit elements and accumulate.
+    const auto swapped_elems = _mm256_shuffle_epi32(sum2, 0xB1);
+    const auto sum1 = _mm256_add_epi32(sum2, swapped_elems);
+    // Every lane now carries the full sum; extract the lowest 32 bits.
+    return _mm_cvtsi128_si32(_mm256_castsi256_si128(sum1));
+  }
+  // Horizontal (signed) maximum of the eight int32 lanes, returned as a scalar.
+  // Same folding scheme as a horizontal sum, with max in place of add.
+  int32_t reduce_max() const {
+    // Swap the two 128-bit halves and keep the lane-wise maximum.
+    const auto swapped_halves = _mm256_permute2f128_si256(values, values, 0x1);
+    const auto max4 = _mm256_max_epi32(values, swapped_halves);
+    // Swap the 64-bit pairs inside each 128-bit half and keep the maximum.
+    const auto swapped_pairs = _mm256_shuffle_epi32(max4, 0x4E);
+    const auto max2 = _mm256_max_epi32(max4, swapped_pairs);
+    // Swap neighbouring 32-bit elements and keep the maximum.
+    const auto swapped_elems = _mm256_shuffle_epi32(max2, 0xB1);
+    const auto max1 = _mm256_max_epi32(max2, swapped_elems);
+    // Every lane now carries the maximum; extract the lowest 32 bits.
+    return _mm_cvtsi128_si32(_mm256_castsi256_si128(max1));
+  }
   Vectorized<int32_t> operator==(const Vectorized<int32_t>& other) const {
     return _mm256_cmpeq_epi32(values, other.values);
   }
 
@@ -403,6 +403,12 @@ template <> class Vectorized<float> {
   Vectorized<float> pow(const Vectorized<float> &b) const {
     return Vectorized<float>(Sleef_powf16_u10(values, b));
   }
+  // Horizontal sum of all float lanes of the 512-bit register, delegated to
+  // the AVX-512 reduction helper.
+  float reduce_add() const {
+    const float total = _mm512_reduce_add_ps(values);
+    return total;
+  }
+  // Horizontal maximum of all float lanes of the 512-bit register, delegated
+  // to the AVX-512 reduction helper.
+  float reduce_max() const {
+    const float largest = _mm512_reduce_max_ps(values);
+    return largest;
+  }
   // Comparison using the _CMP_**_OQ predicate.
   //   `O`: get false if an operand is NaN
   //   `Q`: do not raise if an operand is NaN
 
@@ -277,6 +277,12 @@ class Vectorized<int32_t> : public Vectorizedi {
     return *this;
   }
   Vectorized<int32_t> neg() const;
+  // Horizontal sum of all int32 lanes of the 512-bit register, delegated to
+  // the AVX-512 reduction helper.
+  int32_t reduce_add() const {
+    const int32_t total = _mm512_reduce_add_epi32(values);
+    return total;
+  }
+  // Horizontal (signed) maximum of all int32 lanes of the 512-bit register,
+  // delegated to the AVX-512 reduction helper.
+  int32_t reduce_max() const {
+    const int32_t largest = _mm512_reduce_max_epi32(values);
+    return largest;
+  }
   Vectorized<int32_t> operator==(const Vectorized<int32_t>& other) const {
     auto mask = _mm512_cmpeq_epi32_mask(values, other.values);
     return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
 
@@ -294,6 +294,15 @@ struct Vectorized {
     }
     return ret;
   }
+  // Folds all elements into one scalar by applying the binary function `f`
+  // left to right: f(...f(f(values[0], values[1]), values[2])...).
+  // `f` receives the running result first and the next element second.
+  //
+  // The parameter is a two-argument function pointer because the body calls
+  // `f` with two arguments; a one-argument pointer could never be
+  // instantiated. The fold is seeded with values[0] rather than a literal 0
+  // so it is also correct for operations whose identity is not zero
+  // (max, min, product). Assumes size() >= 1.
+  //
+  // NOTE(review): the loop applies `f` up to twice per iteration; that shape
+  // is kept from the original -- presumably a compiler workaround tied to
+  // the enclosing #if. Confirm before simplifying.
+  T reduce(T (*const f)(T, T)) const {
+    T ret = values[0];
+    for (int64_t i = 1; i < size(); i++) {
+      ret = f(ret, values[i]);
+      if (++i < size())
+        ret = f(ret, values[i]);
+    }
+    return ret;
+  }
 #else
   Vectorized<T> map(T (*const f)(T)) const {
     Vectorized<T> ret;
@@ -302,6 +311,13 @@ struct Vectorized {
     }
     return ret;
   }
+  // Folds all elements into one scalar by applying the binary function `f`
+  // left to right: f(...f(f(values[0], values[1]), values[2])...).
+  // `f` receives the running result first and the next element second.
+  //
+  // The parameter is a two-argument function pointer because the body calls
+  // `f` with two arguments; a one-argument pointer could never be
+  // instantiated. The fold is seeded with values[0] rather than a literal 0
+  // so it is also correct for operations whose identity is not zero
+  // (max, min, product). Assumes size() >= 1.
+  T reduce(T (*const f)(T, T)) const {
+    T ret = values[0];
+    for (int64_t i = 1; i != size(); i++) {
+      ret = f(ret, values[i]);
+    }
+    return ret;
+  }
 #endif
   Vectorized<T> map(T (*const f)(const T &)) const {
     Vectorized<T> ret;
@@ -310,6 +326,13 @@ struct Vectorized {
     }
     return ret;
   }
+  // Overload of the fold for binary functions that take their operands by
+  // const reference. Applies `f` left to right:
+  // f(...f(f(values[0], values[1]), values[2])...), with the running result
+  // as the first argument and the next element as the second.
+  //
+  // The parameter is a two-argument function pointer because the body calls
+  // `f` with two arguments; a one-argument pointer could never be
+  // instantiated. The fold is seeded with values[0] rather than a literal 0
+  // so it is also correct for operations whose identity is not zero
+  // (max, min, product). Assumes size() >= 1.
+  T reduce(T (*const f)(const T &, const T &)) const {
+    T ret = values[0];
+    for (int64_t i = 1; i != size(); i++) {
+      ret = f(ret, values[i]);
+    }
+    return ret;
+  }
   template <typename other_t_abs = T,
             typename std::enable_if_t<!is_floating_point_v<other_t_abs> && !c10::is_complex<other_t_abs>::value, int> = 0>
   Vectorized<T> abs() const {
@@ -585,6 +608,12 @@ struct Vectorized {
     }
     return ret;
   }
+   // Sum of all elements, accumulated left to right starting from values[0].
+   // Implemented as a direct loop rather than through reduce(), so no
+   // lambda-to-function-pointer conversion is required for it to compile.
+   // Assumes size() >= 1.
+   T reduce_add() const {
+    T ret = values[0];
+    for (int64_t i = 1; i != size(); i++) {
+      // The cast keeps the result in T when `+` promotes (e.g. small integers).
+      ret = static_cast<T>(ret + values[i]);
+    }
+    return ret;
+  }
+  // Largest element, found by a left-to-right scan with std::max.
+  // Seeded with values[0] (not 0) so vectors whose elements are all negative
+  // yield their true maximum. Implemented as a direct loop because the bare
+  // name `std::max` is an overloaded function template and cannot be passed
+  // as a function-pointer argument. Assumes size() >= 1.
+  T reduce_max() const {
+    T ret = values[0];
+    for (int64_t i = 1; i != size(); i++) {
+      ret = std::max(ret, values[i]);
+    }
+    return ret;
+  }
 private:
   template <typename Op>
   inline Vectorized<T> binary_pred(const Vectorized<T>& other, Op op) const {