diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index a4d2050122c6..cd46a20b0246 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -1117,6 +1117,7 @@ src_multiarray_umath_common = [ ] if have_blas src_multiarray_umath_common += [ + 'src/common/blas_utils.c', 'src/common/cblasfuncs.c', 'src/common/python_xerbla.c', ] diff --git a/numpy/_core/src/common/blas_utils.c b/numpy/_core/src/common/blas_utils.c new file mode 100644 index 000000000000..40bbc773a92b --- /dev/null +++ b/numpy/_core/src/common/blas_utils.c @@ -0,0 +1,134 @@ +#include +#include +#include + +#ifdef __APPLE__ +#include +#endif + +#include "numpy/numpyconfig.h" // NPY_VISIBILITY_HIDDEN +#include "numpy/npy_math.h" // npy_get_floatstatus_barrier +#include "blas_utils.h" + +#if NPY_BLAS_CHECK_FPE_SUPPORT + +/* Return whether we're running on macOS 15.4 or later + */ +static inline bool +is_macOS_version_15_4_or_later(void){ +#if !defined(__APPLE__) + return false; +#else + char *osProductVersion = NULL; + size_t size = 0; + bool ret = false; + + // Query how large OS version string should be + if(-1 == sysctlbyname("kern.osproductversion", NULL, &size, NULL, 0)){ + goto cleanup; + } + + osProductVersion = malloc(size + 1); + + // Get the OS version string + if(-1 == sysctlbyname("kern.osproductversion", osProductVersion, &size, NULL, 0)){ + goto cleanup; + } + + osProductVersion[size] = '\0'; + + // Parse the version string + int major = 0, minor = 0; + if(2 > sscanf(osProductVersion, "%d.%d", &major, &minor)) { + goto cleanup; + } + + if(major > 15 || (major == 15 && minor >= 4)) { + ret = true; + } + +cleanup: + if(osProductVersion){ + free(osProductVersion); + } + + return ret; +#endif +} + +/* ARM Scalable Matrix Extension (SME) raises all floating-point error flags + * when it's used regardless of values or operations. As a consequence, + * when SME is used, all FPE state is lost and special handling is needed. + * + * For NumPy, SME is not currently used directly, but can be used via + * BLAS / LAPACK libraries. This function does a runtime check for whether + * BLAS / LAPACK can use SME and special handling around FPE is required. + */ +static inline bool +BLAS_can_use_ARM_SME(void) +{ +#if defined(__APPLE__) && defined(__aarch64__) && defined(ACCELERATE_NEW_LAPACK) + // ARM SME can be used by Apple's Accelerate framework for BLAS / LAPACK + // - macOS 15.4+ + // - Apple silicon M4+ + + // Does OS / Accelerate support ARM SME? + if(!is_macOS_version_15_4_or_later()){ + return false; + } + + // Does hardware support SME? + int has_SME = 0; + size_t size = sizeof(has_SME); + if(-1 == sysctlbyname("hw.optional.arm.FEAT_SME", &has_SME, &size, NULL, 0)){ + return false; + } + + if(has_SME){ + return true; + } +#endif + + // default assume SME is not used + return false; +} + +/* Static variable to cache runtime check of BLAS FPE support. + */ +static bool blas_supports_fpe = true; + +#endif // NPY_BLAS_CHECK_FPE_SUPPORT + + +NPY_VISIBILITY_HIDDEN bool +npy_blas_supports_fpe(void) +{ +#if NPY_BLAS_CHECK_FPE_SUPPORT + return blas_supports_fpe; +#else + return true; +#endif +} + +NPY_VISIBILITY_HIDDEN void +npy_blas_init(void) +{ +#if NPY_BLAS_CHECK_FPE_SUPPORT + blas_supports_fpe = !BLAS_can_use_ARM_SME(); +#endif +} + +NPY_VISIBILITY_HIDDEN int +npy_get_floatstatus_after_blas(void) +{ +#if NPY_BLAS_CHECK_FPE_SUPPORT + if(!blas_supports_fpe){ + // BLAS does not support FPE and we need to return FPE state. + // Instead of clearing and then grabbing state, just return + // that no flags are set. + return 0; + } +#endif + char *param = NULL; + return npy_get_floatstatus_barrier(param); +} diff --git a/numpy/_core/src/common/blas_utils.h b/numpy/_core/src/common/blas_utils.h new file mode 100644 index 000000000000..8c1437f88899 --- /dev/null +++ b/numpy/_core/src/common/blas_utils.h @@ -0,0 +1,30 @@ +#include + +#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN + +/* NPY_BLAS_CHECK_FPE_SUPPORT controls whether we need a runtime check + * for floating-point error (FPE) support in BLAS. + */ +#if defined(__APPLE__) && defined(__aarch64__) && defined(ACCELERATE_NEW_LAPACK) +#define NPY_BLAS_CHECK_FPE_SUPPORT 1 +#else +#define NPY_BLAS_CHECK_FPE_SUPPORT 0 +#endif + +/* Initialize BLAS environment, if needed + */ +NPY_VISIBILITY_HIDDEN void +npy_blas_init(void); + +/* Runtime check if BLAS supports floating-point errors. + * true - BLAS supports FPE and one can rely on them to indicate errors + * false - BLAS does not support FPE. Special handling needed for FPE state + */ +NPY_VISIBILITY_HIDDEN bool +npy_blas_supports_fpe(void); + +/* If BLAS supports FPE, exactly the same as npy_get_floatstatus_barrier(). + * Otherwise, we can't rely on FPE state and need special handling. + */ +NPY_VISIBILITY_HIDDEN int +npy_get_floatstatus_after_blas(void); diff --git a/numpy/_core/src/common/cblasfuncs.c b/numpy/_core/src/common/cblasfuncs.c index f9d683d812d4..66a215dfeb64 100644 --- a/numpy/_core/src/common/cblasfuncs.c +++ b/numpy/_core/src/common/cblasfuncs.c @@ -12,6 +12,7 @@ #include "numpy/arrayobject.h" #include "numpy/npy_math.h" #include "numpy/ufuncobject.h" +#include "blas_utils.h" #include "npy_cblas.h" #include "arraytypes.h" #include "common.h" @@ -693,7 +694,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2, NPY_END_ALLOW_THREADS; } - int fpes = npy_get_floatstatus_barrier((char *) result); + int fpes = npy_get_floatstatus_after_blas(); if (fpes && PyUFunc_GiveFloatingpointErrors("dot", fpes) < 0) { goto fail; } diff --git a/numpy/_core/src/multiarray/multiarraymodule.c b/numpy/_core/src/multiarray/multiarraymodule.c index 8ba38b555edb..1a3b5fb1748e 100644 --- a/numpy/_core/src/multiarray/multiarraymodule.c +++ b/numpy/_core/src/multiarray/multiarraymodule.c @@ -43,6 +43,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "arraytypes.h" #include "arrayobject.h" #include "array_converter.h" +#include "blas_utils.h" #include "hashdescr.h" #include "descriptor.h" #include "dragon4.h" @@ -4805,6 +4806,10 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { goto err; } +#if NPY_BLAS_CHECK_FPE_SUPPORT + npy_blas_init(); +#endif + #if defined(MS_WIN64) && defined(__GNUC__) PyErr_WarnEx(PyExc_Warning, "Numpy built with MINGW-W64 on Windows 64 bits is experimental, " \ diff --git a/numpy/_core/src/umath/matmul.c.src b/numpy/_core/src/umath/matmul.c.src index 02c4fde56bf2..39427a9d46ee 100644 --- a/numpy/_core/src/umath/matmul.c.src +++ b/numpy/_core/src/umath/matmul.c.src @@ -16,6 +16,7 @@ +#include "blas_utils.h" #include "npy_cblas.h" #include "arraytypes.h" /* For TYPE_dot functions */ @@ -120,7 +121,7 @@ static inline void } } -NPY_NO_EXPORT void +static void @name@_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n, void *ip2, npy_intp is2_n, void *op, npy_intp op_m, @@ -156,7 +157,7 @@ NPY_NO_EXPORT void is2_n / sizeof(@typ@), @step0@, op, op_m / sizeof(@typ@)); } -NPY_NO_EXPORT void +static void @name@_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n, void *ip2, npy_intp is2_n, npy_intp is2_p, void *op, npy_intp os_m, npy_intp os_p, @@ -260,7 +261,7 @@ NPY_NO_EXPORT void * #IS_HALF = 0, 0, 0, 1, 0*13# */ -NPY_NO_EXPORT void +static void @TYPE@_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n, void *_ip2, npy_intp is2_n, npy_intp is2_p, void *_op, npy_intp os_m, npy_intp os_p, @@ -318,7 +319,7 @@ NPY_NO_EXPORT void } /**end repeat**/ -NPY_NO_EXPORT void +static void BOOL_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n, void *_ip2, npy_intp is2_n, npy_intp is2_p, void *_op, npy_intp os_m, npy_intp os_p, @@ -357,7 +358,7 @@ BOOL_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n, } } -NPY_NO_EXPORT void +static void OBJECT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n, void *_ip2, npy_intp is2_n, npy_intp is2_p, void *_op, npy_intp os_m, npy_intp os_p, @@ -629,6 +630,11 @@ NPY_NO_EXPORT void #endif } #if @USEBLAS@ && defined(HAVE_CBLAS) +#if NPY_BLAS_CHECK_FPE_SUPPORT + if (!npy_blas_supports_fpe()) { + npy_clear_floatstatus_barrier((char*)args); + } +#endif if (allocate_buffer) free(tmp_ip12op); #endif } @@ -653,7 +659,7 @@ NPY_NO_EXPORT void * #prefix = c, z, 0# * #USE_BLAS = 1, 1, 0# */ -NPY_NO_EXPORT void +static void @name@_dotc(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n, void *NPY_UNUSED(ignore)) { @@ -749,6 +755,7 @@ OBJECT_dotc(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp * CFLOAT, CDOUBLE, CLONGDOUBLE, OBJECT# * #DOT = dot*15, dotc*4# * #CHECK_PYERR = 0*18, 1# + * #CHECK_BLAS = 1*2, 0*13, 1*2, 0*2# */ NPY_NO_EXPORT void @TYPE@_vecdot(char **args, npy_intp const *dimensions, npy_intp const *steps, @@ -772,6 +779,11 @@ NPY_NO_EXPORT void } #endif } +#if @CHECK_BLAS@ && NPY_BLAS_CHECK_FPE_SUPPORT + if (!npy_blas_supports_fpe()) { + npy_clear_floatstatus_barrier((char*)args); + } +#endif } /**end repeat**/ @@ -787,7 +799,7 @@ NPY_NO_EXPORT void * #step1 = &oneF, &oneD# * #step0 = &zeroF, &zeroD# */ -NPY_NO_EXPORT void +static void @name@_vecmat_via_gemm(void *ip1, npy_intp is1_n, void *ip2, npy_intp is2_n, npy_intp is2_m, void *op, npy_intp os_m, @@ -878,6 +890,11 @@ NPY_NO_EXPORT void #endif } } +#if @USEBLAS@ && NPY_BLAS_CHECK_FPE_SUPPORT + if (!npy_blas_supports_fpe()) { + npy_clear_floatstatus_barrier((char*)args); + } +#endif } /**end repeat**/ @@ -943,5 +960,10 @@ NPY_NO_EXPORT void #endif } } +#if @USEBLAS@ && NPY_BLAS_CHECK_FPE_SUPPORT + if (!npy_blas_supports_fpe()) { + npy_clear_floatstatus_barrier((char*)args); + } +#endif } /**end repeat**/ diff --git a/numpy/_core/tests/test_multiarray.py b/numpy/_core/tests/test_multiarray.py index acf053b41490..12c547935ee7 100644 --- a/numpy/_core/tests/test_multiarray.py +++ b/numpy/_core/tests/test_multiarray.py @@ -31,6 +31,7 @@ from numpy.exceptions import AxisError, ComplexWarning from numpy.lib.recfunctions import repack_fields from numpy.testing import ( + BLAS_SUPPORTS_FPE, HAS_REFCOUNT, IS_64BIT, IS_PYPY, @@ -3363,6 +3364,11 @@ def test_dot(self): @pytest.mark.parametrize("dtype", [np.half, np.double, np.longdouble]) @pytest.mark.skipif(IS_WASM, reason="no wasm fp exception support") def test_dot_errstate(self, dtype): + # Some dtypes use BLAS for 'dot' operation and + # not all BLAS support floating-point errors. + if not BLAS_SUPPORTS_FPE and dtype == np.double: + pytest.skip("BLAS does not support FPE") + a = np.array([1, 1], dtype=dtype) b = np.array([-np.inf, np.inf], dtype=dtype) diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py index d7ceaeab72cc..65f4059f98fd 100644 --- a/numpy/testing/_private/utils.py +++ b/numpy/testing/_private/utils.py @@ -42,6 +42,7 @@ 'assert_no_gc_cycles', 'break_cycles', 'HAS_LAPACK64', 'IS_PYSTON', 'IS_MUSL', 'check_support_sve', 'NOGIL_BUILD', 'IS_EDITABLE', 'IS_INSTALLED', 'NUMPY_ROOT', 'run_threaded', 'IS_64BIT', + 'BLAS_SUPPORTS_FPE', ] @@ -89,6 +90,15 @@ class KnownFailureException(Exception): IS_PYPY = sys.implementation.name == 'pypy' IS_PYSTON = hasattr(sys, "pyston_version_info") HAS_REFCOUNT = getattr(sys, 'getrefcount', None) is not None and not IS_PYSTON +BLAS_SUPPORTS_FPE = True +if platform.system() == 'Darwin' or platform.machine() == 'arm64': + try: + blas = np.__config__.CONFIG['Build Dependencies']['blas'] + if blas['name'] == 'accelerate': + BLAS_SUPPORTS_FPE = False + except KeyError: + pass + HAS_LAPACK64 = numpy.linalg._umath_linalg._ilp64 IS_MUSL = False