Rebase torch.compile SVE logic on torch/main · pytorch/pytorch@4f7aafa · GitHub
[go: up one dir, main page]

Skip to content

Commit 4f7aafa

Browse files
committed
Rebase torch.compile SVE logic on torch/main
Change-Id: I549769e756603280e6866224563de2b256266f9a
1 parent b85f21f commit 4f7aafa

File tree

3 files changed

+60
-4
lines changed

3 files changed

+60
-4
lines changed

aten/src/ATen/cpu/vec/functional_base.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,30 @@ struct VecReduceAllSIMD<float, Op> {
107107
};
108108
#endif // defined(__aarch64__)
109109

110+
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && defined(CPU_CAPABILITY_SVE256)
111+
template <typename Op>
112+
struct VecReduceAllSIMD<float, Op> {
113+
static inline float apply(const Op& vec_fun, const Vectorized<float>& acc_vec) {
114+
using Vec = Vectorized<float>;
115+
Vec v = acc_vec;
116+
// 128-bit shuffle
117+
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
118+
Vec v1 = svtbl_f32(v, ind);
119+
v = vec_fun(v, v1);
120+
// 64-bit shuffle
121+
ind = svdupq_n_u32(2, 3, 0, 1);
122+
v1 = svtbl_f32(v, ind);
123+
v = vec_fun(v, v1);
124+
// 32-bit shuffle
125+
ind = svdupq_n_u32(1, 0, 2, 3);
126+
v1 = svtbl_f32(v, ind);
127+
v = vec_fun(v, v1);
128+
return svlasta(svpfalse(), v);
129+
}
130+
};
131+
#endif // defined(__aarch64__)
132+
133+
110134
template <typename scalar_t, typename Op>
111135
inline scalar_t vec_reduce_all(const Op& vec_fun, const Vectorized<scalar_t>& acc_vec) {
112136
return VecReduceAllSIMD<scalar_t, Op>::apply(vec_fun, acc_vec);

torch/_inductor/codegen/cpp_prefix.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
#include <c10/util/TypeCast.h>
2929
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
3030

31-
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON) || defined(CPU_CAPABILITY_VSX)
31+
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_SVE256)
3232
#define INDUCTOR_USE_VECTOR_TYPES() 1
3333
#else
3434
#define INDUCTOR_USE_VECTOR_TYPES() 0

torch/_inductor/cpu_vec_isa.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ class VecISA:
5252
# In fbcode however, we are using the same compiler for pytorch and for inductor codegen,
5353
# making the runtime check unnecessary.
5454
_avx_code = """
55-
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON) || defined(CPU_CAPABILITY_VSX)
55+
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_SVE)
5656
#include <ATen/cpu/vec/functional.h>
5757
#include <ATen/cpu/vec/vec.h>
5858
#endif
@@ -160,6 +160,20 @@ def __str__(self) -> str:
160160

161161
__hash__: Callable[[VecISA], Any] = VecISA.__hash__
162162

163+
@dataclasses.dataclass
164+
class VecSVE(VecISA):
165+
# this function can be repurposed for SVE with variable vec length
166+
_bit_width = 256
167+
_macro = ["CPU_CAPABILITY_SVE", "CPU_CAPABILITY_SVE256"]
168+
169+
_arch_flags = "-march=armv8-a+sve -msve-vector-bits=256"
170+
_dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16}
171+
172+
def __str__(self) -> str:
173+
return "asimd"
174+
175+
__hash__: Callable[[VecISA], Any] = VecISA.__hash__
176+
163177

164178
@dataclasses.dataclass
165179
class VecAVX512(VecISA):
@@ -304,9 +318,24 @@ def _check_and_append_supported_isa(
304318

305319
return supported_isa
306320

321+
@functools.lru_cache(maxsize=None)
322+
def _is_arm_neoverse_v1() -> bool:
323+
# reference: https://github.com/ARM-software/ComputeLibrary/blob/main/src/common/cpuinfo/CpuModel.cpp
324+
try:
325+
with open("/proc/cpuinfo") as _cpuinfo:
326+
line = _cpuinfo.readline()
327+
while line:
328+
is_v1 = re.match(r"^CPU\spart(.*):\s0xd40(\n)$", line)
329+
if is_v1:
330+
return True
331+
line = _cpuinfo.readline()
332+
return False
333+
except:
334+
return False
335+
307336

308337
invalid_vec_isa = InvalidVecISA()
309-
supported_vec_isa_list = [VecAMX(), VecAVX512(), VecAVX2(), VecNEON()]
338+
supported_vec_isa_list = [VecAMX(), VecAVX512(), VecAVX2(), VecNEON(), VecSVE()]
310339

311340

312341
# Cache the cpuinfo to avoid I/O overhead. Meanwhile, the cpuinfo content
@@ -338,7 +367,10 @@ def valid_vec_isa_list() -> List[VecISA]:
338367
elif arch == "ppc64le":
339368
isa_list.append(VecVSX())
340369
elif arch == "aarch64":
341-
isa_list.append(VecNEON())
370+
if _is_arm_neoverse_v1():
371+
isa_list.append(VecSVE())
372+
else:
373+
isa_list.append(VecNEON())
342374
elif arch in ["x86_64", "AMD64"]:
343375
"""
344376
arch value is x86_64 on Linux, and the value is AMD64 on Windows.

0 commit comments

Comments
 (0)
0