ENH: use gcc intrinsic for overflow checked multiplication

juliantaylor · juliantaylor · commit 668668600d00 · 2014-11-21T00:00:33.000+01:00
More efficient as it uses processor overflow flags if available.
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
@@ -118,6 +118,7 @@ def check_api_version(apiversion, codegen_dir):
                        ("__builtin_bswap32", '5u'),
                        ("__builtin_bswap64", '5u'),
                        ("__builtin_expect", '5, 0'),
+                       ("__builtin_mul_overflow", '5, 5, (int*)5'),
                        ("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE
                        ("_mm_load_pd", '(double*)0', "emmintrin.h"), # SSE2
                        ]
diff --git a/numpy/core/src/private/templ_common.h.src b/numpy/core/src/private/templ_common.h.src
@@ -21,20 +21,22 @@
 static NPY_INLINE int
 npy_mul_with_overflow_@name@(@type@ * r, @type@ a, @type@ b)
 {
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
     const @type@ half_sz = (((@type@)1 << (sizeof(a) * 8 / 2)) - 1);
 
     *r = a * b;
     /*
      * avoid expensive division on common no overflow case
-     * could be improved via compiler intrinsics e.g. via clang
-     * __builtin_mul_with_overflow, gcc __int128 or cpu overflow flags
      */
     if (NPY_UNLIKELY((a | b) >= half_sz) &&
         a != 0 && b > @MAX@ / a) {
         return 1;
     }
 
     return 0;
+#endif
 }
 /**end repeat**/
 

Original file line number	Diff line number	Diff line change
`@@ -118,6 +118,7 @@ def check_api_version(apiversion, codegen_dir):`
`118`	`118`	`("__builtin_bswap32", '5u'),`
`119`	`119`	`("__builtin_bswap64", '5u'),`
`120`	`120`	`("__builtin_expect", '5, 0'),`
	`121`	`+ ("__builtin_mul_overflow", '5, 5, (int*)5'),`
`121`	`122`	`("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE`
`122`	`123`	`("_mm_load_pd", '(double*)0', "emmintrin.h"), # SSE2`
`123`	`124`	`]`