From 7227b49e769d7f5cbf6cf5f91879b435ffc61404 Mon Sep 17 00:00:00 2001
From: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>
Date: Wed, 22 Sep 2021 16:07:46 -0700
Subject: [PATCH 1/3] Resolve divide by zero in reciprocal #18555

clang has an optimization bug where a vector that is only partially
loaded / stored will get narrowed to only the lanes used, which can be
fine in some cases. However, in numpy's `reciprocal` function a partial
load is explicitly extended to the full width of the register (filled
with '1's) to avoid divide-by-zero. clang's optimization ignores the
explicit filling with '1's.

The changes here insert a dummy `volatile` variable, which convinces
clang not to narrow the load and drop the explicit filling with '1's.

`volatile` can be expensive since it forces loads / stores to refresh
contents whenever the variable is used. numpy has its own template /
macro system that will expand the loop function below for sqrt,
absolute, square, and reciprocal. Additionally, the loop can be called
on a full array if there's overlap between src and dst. Consequently,
we try to limit the scope in which `volatile` is applied: it should
only be needed when compiling with clang, targeting Apple arm64, and
only for the `reciprocal` function. Moreover, `volatile` is only
needed when a vector is partially loaded.

Testing: beyond fixing the cases mentioned in the GitHub issue, the
changes here also resolve several failures in numpy's test suite.

Before:
```
FAILED numpy/core/tests/test_scalarmath.py::TestBaseMath::test_blocked - RuntimeWarning: divide by zero encountered in reciprocal
FAILED numpy/core/tests/test_ufunc.py::TestUfuncGenericLoops::test_unary_PyUFunc_O_O_method_full[reciprocal] - AssertionError: FloatingPointError not raised
FAILED numpy/core/tests/test_umath.py::TestPower::test_power_float - RuntimeWarning: divide by zero encountered in reciprocal
FAILED numpy/core/tests/test_umath.py::TestSpecialFloats::test_tan - AssertionError: FloatingPointError not raised by tan
FAILED numpy/core/tests/test_umath.py::TestAVXUfuncs::test_avx_based_ufunc - RuntimeWarning: divide by zero encountered in reciprocal
FAILED numpy/linalg/tests/test_linalg.py::TestNormDouble::test_axis - RuntimeWarning: divide by zero encountered in reciprocal
FAILED numpy/linalg/tests/test_linalg.py::TestNormSingle::test_axis - RuntimeWarning: divide by zero encountered in reciprocal
FAILED numpy/linalg/tests/test_linalg.py::TestNormInt64::test_axis - RuntimeWarning: divide by zero encountered in reciprocal
8 failed, 14759 passed, 204 skipped, 1268 deselected, 34 xfailed in 69.90s (0:01:09)
```

After:
```
FAILED numpy/core/tests/test_umath.py::TestSpecialFloats::test_tan - AssertionError: FloatingPointError not raised by tan
1 failed, 14766 passed, 204 skipped, 1268 deselected, 34 xfailed in 70.37s (0:01:10)
```
---
 .../src/umath/loops_unary_fp.dispatch.c.src | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 3a1ea82f9460..2ba0c7b05314 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -77,6 +77,25 @@ NPY_FINLINE double c_square_f64(double a)
  */
 #define CONTIG 0
 #define NCONTIG 1
+
+/*
+ * clang has a bug on at least v13 and prior. The bug is present at -O1 or greater.
+ * When partially loading a NEON register for a reciprocal operation,
+ * the remaining elements are set to 1 to avoid divide-by-zero. The partial
+ * load is paired with a partial store after the reciprocal operation.
+ * clang notices that the entire NEON register is not needed for the store and
+ * optimizes out the fill of 1 to the remaining elements. This causes a
+ * divide-by-zero error that we were trying to avoid by filling.
+ *
+ * Using a dummy variable marked 'volatile' convinces clang not to ignore
+ * the explicit fill of remaining elements.
+ */
+#if defined(__clang__) && defined(__APPLE__) && defined(__arm64__)
+#define WORKAROUND_CLANG_ARM64_RECIPROCAL_BUG 1
+#else
+#define WORKAROUND_CLANG_ARM64_RECIPROCAL_BUG 0
+#endif
+
 /**begin repeat
  * #TYPE = FLOAT, DOUBLE#
  * #sfx = f32, f64#
@@ -87,6 +106,7 @@ NPY_FINLINE double c_square_f64(double a)
  * #kind = sqrt, absolute, square, reciprocal#
  * #intr = sqrt, abs, square, recip#
  * #repl_0w1 = 0, 0, 0, 1#
+ * #RECIP_WORKAROUND = 0, 0, 0, WORKAROUND_CLANG_ARM64_RECIPROCAL_BUG#
  */
 /**begin repeat2
  * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
@@ -126,6 +146,7 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
         #endif
     /**end repeat3**/
     }
+
     for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
     #if @STYPE@ == CONTIG
     #if @repl_0w1@
@@ -140,6 +161,17 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
         npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
     #endif
     #endif
+    #if @RECIP_WORKAROUND@
+        /*
+         * Workaround clang bug. We use a dummy variable marked 'volatile'
+         * to convince clang that the entire vector is needed. We only
+         * want to do this for the last iteration / partial load-store of
+         * the loop since 'volatile' forces a refresh of the contents.
+         */
+        if(len < vstep){
+            volatile npyv_@sfx@ unused_but_workaround_bug = v_src0;
+        }
+    #endif // @RECIP_WORKAROUND@
         npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
     #if @DTYPE@ == CONTIG
         npyv_store_till_@sfx@(dst, len, v_unary0);
@@ -154,6 +186,8 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
 #endif // @VCHK@
 /**end repeat**/
 
+#undef WORKAROUND_CLANG_ARM64_RECIPROCAL_BUG
+
 /********************************************************************************
  ** Defining ufunc inner functions
  ********************************************************************************/

From a3d1a003cd4cf0106165c490452e772a0d10398e Mon Sep 17 00:00:00 2001
From: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>
Date: Wed, 22 Sep 2021 16:08:40 -0700
Subject: [PATCH 2/3] Enhancement on top of workaround for clang bug in reciprocal

Enhancement on top of workaround for clang bug in reciprocal
(numpy/numpy#18555)

Numpy's FP unary loops use a partial load / store on every iteration.
The partial load / store helpers each insert a switch statement to
determine how many elements to handle, which puts a lot of unnecessary
branches into the loops. The partial load / store is only needed on
the final iteration of the loop, and only if it isn't a full vector.

The changes here break out the final iteration to use partial
load / stores, while the main loop now uses full load / stores.
Additionally, this means the `volatile` workaround no longer needs to
be conditional inside the loop.
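For illustration, the resulting loop shape (including the volatile
workaround it hosts) boils down to the following minimal, self-contained
C sketch. Plain scalar code stands in for numpy's npyv_ universal
intrinsics here; the fixed VSTEP and the vec_f32 / load_full / load_till /
store_till / recip helpers are invented stand-ins for this sketch, not
numpy's actual API:

```
#include <stdio.h>

#define VSTEP 4  /* stand-in for npyv_nlanes_f32; the real width is per-target */

typedef struct { float lane[VSTEP]; } vec_f32;

/* full-width load / store: no branching on the element count */
static vec_f32 load_full(const float *src)
{
    vec_f32 v;
    for (int i = 0; i < VSTEP; i++) {
        v.lane[i] = src[i];
    }
    return v;
}

static void store_full(float *dst, vec_f32 v)
{
    for (int i = 0; i < VSTEP; i++) {
        dst[i] = v.lane[i];
    }
}

/* partial load: lanes at or beyond `len` are filled with `fill`
 * (1.0f below) so the reciprocal never divides by zero */
static vec_f32 load_till(const float *src, int len, float fill)
{
    vec_f32 v;
    for (int i = 0; i < VSTEP; i++) {
        v.lane[i] = (i < len) ? src[i] : fill;
    }
    return v;
}

/* partial store: only the first `len` lanes are written back */
static void store_till(float *dst, int len, vec_f32 v)
{
    for (int i = 0; i < VSTEP && i < len; i++) {
        dst[i] = v.lane[i];
    }
}

static vec_f32 recip(vec_f32 v)
{
    for (int i = 0; i < VSTEP; i++) {
        v.lane[i] = 1.0f / v.lane[i];
    }
    return v;
}

static void reciprocal_loop(float *dst, const float *src, int len)
{
    /* vector-sized iterations: full load / store only */
    for (; len >= VSTEP; len -= VSTEP, src += VSTEP, dst += VSTEP) {
        store_full(dst, recip(load_full(src)));
    }
    /* last partial iteration, if needed */
    if (len > 0) {
        vec_f32 v = load_till(src, len, 1.0f);
        /* dummy volatile copy discourages the compiler from narrowing
         * the load and discarding the fill of 1.0f in the unused lanes */
        volatile vec_f32 unused_but_workaround_bug = v;
        (void)unused_but_workaround_bug;
        store_till(dst, len, recip(v));
    }
}

int main(void)
{
    float src[6] = {1, 2, 4, 8, 16, 32};
    float dst[6];
    reciprocal_loop(dst, src, 6);
    for (int i = 0; i < 6; i++) {
        printf("%g ", dst[i]);  /* 1 0.5 0.25 0.125 0.0625 0.03125 */
    }
    printf("\n");
    return 0;
}
```

The switch-heavy partial helpers now run at most once per call, and the
volatile copy can be unconditional since the final block is only reached
for a partial vector.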
---
 .../src/umath/loops_unary_fp.dispatch.c.src | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 2ba0c7b05314..fe5dd7539e69 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -121,6 +121,8 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
 
     const int vstep = npyv_nlanes_@sfx@;
     const int wstep = vstep * @unroll@;
+
+    // unrolled iterations
     for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
         /**begin repeat3
          * #N = 0, 1, 2, 3#
@@ -147,7 +149,23 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
     /**end repeat3**/
     }
 
-    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if @STYPE@ == CONTIG
+        npyv_@sfx@ v_src0 = npyv_load_@sfx@(src);
+    #else
+        npyv_@sfx@ v_src0 = npyv_loadn_@sfx@(src, ssrc);
+    #endif
+        npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
+    #if @DTYPE@ == CONTIG
+        npyv_store_@sfx@(dst, v_unary0);
+    #else
+        npyv_storen_@sfx@(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed
+    if(len > 0){
     #if @STYPE@ == CONTIG
     #if @repl_0w1@
         npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
@@ -168,9 +186,7 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
          * want to do this for the last iteration / partial load-store of
          * the loop since 'volatile' forces a refresh of the contents.
          */
-        if(len < vstep){
-            volatile npyv_@sfx@ unused_but_workaround_bug = v_src0;
-        }
+        volatile npyv_@sfx@ unused_but_workaround_bug = v_src0;
     #endif // @RECIP_WORKAROUND@
         npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
     #if @DTYPE@ == CONTIG
@@ -179,6 +195,7 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
         npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
     #endif
     }
+    npyv_cleanup();
 }
 /**end repeat2**/
 

From 5f93ba4fbe1a3ebb13cf125809552cfa3200a43e Mon Sep 17 00:00:00 2001
From: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>
Date: Fri, 24 Sep 2021 10:28:44 -0700
Subject: [PATCH 3/3] Address Azure CI failures with older versions of clang

- -ftrapping-math is enabled by default for Numpy, but clang's support
  for it is mainly limited to x86_64
- Apple Clang and upstream Clang use different, but overlapping,
  version numbers
- The non-Apple Clang version cutoff comes from looking at when
  upstream clang started supporting -ftrapping-math for x86_64

Testing was done against Apple Clang versions:
- v11 / x86_64  - failed previously, now passes (Azure failure)
- v12+ / x86_64 - passes before and after
- v13 / arm64   - failed before initial patch, passes after
---
 .../src/umath/loops_unary_fp.dispatch.c.src | 57 ++++++++++++++-----
 1 file changed, 44 insertions(+), 13 deletions(-)

diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index fe5dd7539e69..2d5917282c26 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -79,21 +79,52 @@ NPY_FINLINE double c_square_f64(double a)
 #define NCONTIG 1
 
 /*
- * clang has a bug on at least v13 and prior. The bug is present at -O1 or greater.
- * When partially loading a NEON register for a reciprocal operation,
- * the remaining elements are set to 1 to avoid divide-by-zero. The partial
- * load is paired with a partial store after the reciprocal operation.
- * clang notices that the entire NEON register is not needed for the store and
- * optimizes out the fill of 1 to the remaining elements. This causes a
- * divide-by-zero error that we were trying to avoid by filling.
+ * clang has a bug that's present at -O1 or greater. When partially loading a
+ * vector register for a reciprocal operation, the remaining elements are set
+ * to 1 to avoid divide-by-zero. The partial load is paired with a partial
+ * store after the reciprocal operation. clang notices that the entire register
+ * is not needed for the store and optimizes out the fill of 1 to the remaining
+ * elements. This causes either a divide-by-zero or 0/0 with invalid exception
+ * that we were trying to avoid by filling.
  *
  * Using a dummy variable marked 'volatile' convinces clang not to ignore
- * the explicit fill of remaining elements.
+ * the explicit fill of remaining elements. If `-ftrapping-math` is
+ * supported, then it'll also avoid the bug. `-ftrapping-math` is supported
+ * on Apple clang v12+ for x86_64. It is not currently supported for arm64.
+ * `-ftrapping-math` is set by default for Numpy builds in
+ * numpy/distutils/ccompiler.py.
+ *
+ * Note: Apple clang and clang upstream have different versions that overlap
  */
-#if defined(__clang__) && defined(__APPLE__) && defined(__arm64__)
-#define WORKAROUND_CLANG_ARM64_RECIPROCAL_BUG 1
+#if defined(__clang__)
+    #if defined(__apple_build_version__)
+        // Apple Clang
+        #if __apple_build_version__ < 12000000
+            // Apple Clang before v12
+            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
+        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+            // Apple Clang v12+, targeting i386 or x86_64
+            #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
+        #else
+            // Apple Clang v12+, not targeting i386 or x86_64
+            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
+        #endif
+    #else
+        // Clang, not Apple Clang
+        #if __clang_major__ < 10
+            // Clang before v10
+            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
+        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+            // Clang v10+, targeting i386 or x86_64
+            #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
+        #else
+            // Clang v10+, not targeting i386 or x86_64
+            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
+        #endif
+    #endif
 #else
-#define WORKAROUND_CLANG_ARM64_RECIPROCAL_BUG 0
+// Not a Clang compiler
+#define WORKAROUND_CLANG_RECIPROCAL_BUG 0
 #endif
 
 /**begin repeat
@@ -106,7 +137,7 @@ NPY_FINLINE double c_square_f64(double a)
  * #kind = sqrt, absolute, square, reciprocal#
  * #intr = sqrt, abs, square, recip#
  * #repl_0w1 = 0, 0, 0, 1#
- * #RECIP_WORKAROUND = 0, 0, 0, WORKAROUND_CLANG_ARM64_RECIPROCAL_BUG#
+ * #RECIP_WORKAROUND = 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
  */
 /**begin repeat2
  * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
@@ -203,7 +234,7 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
 #endif // @VCHK@
 /**end repeat**/
 
-#undef WORKAROUND_CLANG_ARM64_RECIPROCAL_BUG
+#undef WORKAROUND_CLANG_RECIPROCAL_BUG
 
 /********************************************************************************
  ** Defining ufunc inner functions