diff --git a/numpy/build_utils/apple_accelerate.py b/numpy/build_utils/apple_accelerate.py
new file mode 100644
index 000000000000..d7351f4c52d4
--- /dev/null
+++ b/numpy/build_utils/apple_accelerate.py
@@ -0,0 +1,21 @@
+import os
+import sys
+import re
+
+__all__ = ['uses_accelerate_framework', 'get_sgemv_fix']
+
+def uses_accelerate_framework(info):
+    """ Returns True if Accelerate framework is used for BLAS/LAPACK """
+    if sys.platform != "darwin":
+        return False
+    r_accelerate = re.compile("Accelerate")
+    extra_link_args = info.get('extra_link_args', '')
+    for arg in extra_link_args:
+        if r_accelerate.search(arg):
+            return True
+    return False
+
+def get_sgemv_fix():
+    """ Returns source file needed to correct SGEMV """
+    path = os.path.abspath(os.path.dirname(__file__))
+    return [os.path.join(path, 'src', 'apple_sgemv_fix.c')]
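A quick illustration (not part of the patch) of how these two helpers are meant to be used together. The `info` dict below is a hand-written stand-in for what `numpy.distutils.system_info.get_info('blas_opt')` returns on an OS X box linked against Accelerate, and the import assumes the patched source tree is on the path:

```python
from numpy.build_utils.apple_accelerate import (
    uses_accelerate_framework, get_sgemv_fix)

# Hand-written stand-in for a blas_opt_info dict; real ones come from
# numpy.distutils.system_info.get_info('blas_opt').
info = {'extra_link_args': ['-Wl,-framework', '-Wl,Accelerate']}

if uses_accelerate_framework(info):  # True only when sys.platform == 'darwin'
    sources = get_sgemv_fix()        # ['<numpy>/build_utils/src/apple_sgemv_fix.c']
```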
diff --git a/numpy/build_utils/src/apple_sgemv_fix.c b/numpy/build_utils/src/apple_sgemv_fix.c
new file mode 100644
index 000000000000..a6f929251f16
--- /dev/null
+++ b/numpy/build_utils/src/apple_sgemv_fix.c
@@ -0,0 +1,230 @@
+/* This is a collection of ugly hacks to circumvent a bug in
+ * Apple Accelerate framework's SGEMV subroutine.
+ *
+ * See: https://github.com/numpy/numpy/issues/4007
+ *
+ * SGEMV in Accelerate framework will segfault on MacOS X version 10.9
+ * (aka Mavericks) if arrays are not aligned to 32 byte boundaries
+ * and the CPU supports AVX instructions. This can produce segfaults
+ * in np.dot.
+ *
+ * This patch overshadows the symbols cblas_sgemv, sgemv_ and sgemv
+ * exported by Accelerate to produce the correct behavior. The MacOS X
+ * version and CPU specs are checked on module import. If Mavericks and
+ * AVX are detected the call to SGEMV is emulated with a call to SGEMM
+ * if the arrays are not 32 byte aligned. If the exported symbols cannot
+ * be overshadowed on module import, a fatal error is produced and the
+ * process aborts. All the fixes are in a self-contained C file
+ * and do not alter the multiarray C code. The patch is not applied
+ * unless NumPy is configured to link with Apple's Accelerate
+ * framework.
+ *
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+#include <string.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* ----------------------------------------------------------------- */
+/* Original cblas_sgemv */
+
+#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"
+
+enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
+enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
+extern void cblas_xerbla(int info, const char *rout, const char *form, ...);
+
+typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
+                const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                const float alpha, const float *A, const int lda,
+                const float *X, const int incX,
+                const float beta, float *Y, const int incY);
+
+typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
+                const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
+                const int M, const int N, const int K,
+                const float alpha, const float *A, const int lda,
+                const float *B, const int ldb,
+                const float beta, float *C, const int incC);
+
+typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
+                const float* alpha, const float* A, const int* ldA,
+                const float* X, const int* incX,
+                const float* beta, float* Y, const int* incY );
+
+static void *veclib = NULL;
+static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
+static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
+static fortran_sgemv_t *accelerate_sgemv = NULL;
+static int AVX_and_10_9 = 0;
+
+/* Dynamic check for AVX support
+ * __builtin_cpu_supports("avx") is available in gcc 4.8,
+ * but clang and icc do not currently support it. */
+#define cpu_supports_avx()\
+(system("sysctl -n machdep.cpu.features | grep -q AVX") == 0)
+
+/* Check if we are using MacOS X version 10.9 */
+#define using_mavericks()\
+(system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0)
+
+__attribute__((destructor))
+static void unloadlib(void)
+{
+    if (veclib) dlclose(veclib);
+}
+
+__attribute__((constructor))
+static void loadlib()
+/* automatically executed on module import */
+{
+    char errormsg[1024];
+    int AVX, MAVERICKS;
+    memset((void*)errormsg, 0, sizeof(errormsg));
+    /* check if the CPU supports AVX */
+    AVX = cpu_supports_avx();
+    /* check if the OS is MacOS X Mavericks */
+    MAVERICKS = using_mavericks();
+    /* we need the workaround when the CPU supports
+     * AVX and the OS version is Mavericks */
+    AVX_and_10_9 = AVX && MAVERICKS;
+    /* load vecLib */
+    veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
+    if (!veclib) {
+        veclib = NULL;
+        sprintf(errormsg,"Failed to open vecLib from location '%s'.", VECLIB_FILE);
+        Py_FatalError(errormsg); /* calls abort() and dumps core */
+    }
+    /* resolve Fortran SGEMV from Accelerate */
+    accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
+    if (!accelerate_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'sgemv_'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemv from Accelerate */
+    accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
+    if (!accelerate_cblas_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemv'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemm from Accelerate */
+    accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
+    if (!accelerate_cblas_sgemm) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemm'.");
+        Py_FatalError(errormsg);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Fortran SGEMV override */
+
+void sgemv_( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY )
+{
+    /* It is safe to use the original SGEMV if we are not using AVX on Mavericks
+     * or the input arrays A, X and Y are all aligned on 32 byte boundaries. */
+    #define BADARRAY(x) (((npy_intp)(void*)x) % 32)
+    const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));
+    if (!use_sgemm) {
+        accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY);
+        return;
+    }
+
+    /* Arrays are misaligned, the CPU supports AVX, and we are running
+     * Mavericks.
+     *
+     * Emulation of SGEMV with SGEMM:
+     *
+     * SGEMV allows vectors to be strided. SGEMM requires all arrays to be
+     * contiguous along the leading dimension. To emulate striding in SGEMV
+     * with the leading dimension arguments in SGEMM we compute
+     *
+     *    Y = alpha * op(A) @ X + beta * Y
+     *
+     * as
+     *
+     *    Y.T = alpha * X.T @ op(A).T + beta * Y.T
+     *
+     * Because Fortran uses column major order and X.T and Y.T are row vectors,
+     * the leading dimensions of X.T and Y.T in SGEMM become equal to the
+     * strides of the column vectors X and Y in SGEMV. */
+
+    switch (*trans) {
+        case 'T':
+        case 't':
+        case 'C':
+        case 'c':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        case 'N':
+        case 'n':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
+                1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        default:
+            cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Override for an alias symbol for sgemv_ in Accelerate */
+
+void sgemv (char *trans,
+            const int *m, const int *n,
+            const float *alpha,
+            const float *A, const int *lda,
+            const float *B, const int *incB,
+            const float *beta,
+            float *C, const int *incC)
+{
+    sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC);
+}
+
+/* ----------------------------------------------------------------- */
+/* cblas_sgemv override, based on Netlib CBLAS code */
+
+void cblas_sgemv(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float *A, const int lda,
+                 const float *X, const int incX, const float beta,
+                 float *Y, const int incY)
+{
+    char TA;
+    if (order == CblasColMajor)
+    {
+        if (TransA == CblasNoTrans) TA = 'N';
+        else if (TransA == CblasTrans) TA = 'T';
+        else if (TransA == CblasConjTrans) TA = 'C';
+        else
+        {
+            cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
+            return;
+        }
+        sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+    }
+    else if (order == CblasRowMajor)
+    {
+        if (TransA == CblasNoTrans) TA = 'T';
+        else if (TransA == CblasTrans) TA = 'N';
+        else if (TransA == CblasConjTrans) TA = 'N';
+        else
+        {
+            cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
+            return;
+        }
+        sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+    }
+    else
+        cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
+}
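A minimal NumPy check (illustration only; it uses `np.dot` on small arrays rather than the actual SGEMM call) of the transpose identity the emulation above relies on — treating the vectors X and Y as 1-row matrices turns the matrix-vector product into a (1,n) x (n,m) matrix-matrix product:

```python
import numpy as np

m, n, alpha, beta = 5, 4, 2.0, 3.0
A = np.random.rand(m, n).astype(np.float32)
X = np.random.rand(n).astype(np.float32)
Y = np.random.rand(m).astype(np.float32)

# SGEMV computes Y <- alpha * A @ X + beta * Y ...
ref = alpha * A.dot(X) + beta * Y

# ... which the patch rewrites as the 1-row GEMM
#     Y.T <- alpha * X.T @ A.T + beta * Y.T
emu = (alpha * X[None, :].dot(A.T) + beta * Y[None, :])[0]

assert np.allclose(ref, emu, rtol=1e-5)
```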
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 007a381ae3bf..2556a31a5498 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -12,6 +12,7 @@
 from numpy.distutils import log
 from distutils.dep_util import newer
 from distutils.sysconfig import get_config_var
+from numpy.build_utils.apple_accelerate import uses_accelerate_framework, get_sgemv_fix
 
 from setup_common import *
@@ -839,6 +840,8 @@ def generate_multiarray_templated_sources(ext, build_dir):
     if blas_info and ('HAVE_CBLAS', None) in blas_info.get('define_macros', []):
         extra_info = blas_info
         multiarray_src.append(join('src', 'multiarray', 'cblasfuncs.c'))
+        if uses_accelerate_framework(blas_info):
+            multiarray_src += get_sgemv_fix()
     else:
         extra_info = {}
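To confirm after the fact whether a finished build actually took this branch, the linked BLAS shows up in the recorded build configuration; this is just a diagnostic hint, not part of the patch:

```python
import numpy as np

# Prints blas_opt_info and friends; on an Accelerate-linked OS X build the
# extra_link_args mention Accelerate, which is exactly what
# uses_accelerate_framework() keys on at build time.
np.__config__.show()
```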
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index d6f98e0b722b..c008b3349575 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -3611,6 +3611,73 @@ def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
         assert_equal(c.dot(a), "A")
         assert_raises(TypeError, np.dot, b, c)
         assert_raises(TypeError, c.dot, b)
+
+    def test_accelerate_framework_sgemv_fix(self):
+        from itertools import product
+        if sys.platform != 'darwin':
+            return
+
+        def aligned_array(shape, align, dtype, order='C'):
+            d = dtype()
+            N = np.prod(shape)
+            tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
+            address = tmp.__array_interface__["data"][0]
+            for offset in range(align):
+                if (address + offset) % align == 0:
+                    break
+            tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
+            return tmp.reshape(shape, order=order)
+
+        def as_aligned(arr, align, dtype, order='C'):
+            aligned = aligned_array(arr.shape, align, dtype, order)
+            aligned[:] = arr[:]
+            return aligned
+
+        def assert_dot_close(A, X, desired):
+            assert_allclose(np.dot(A, X), desired, rtol=1e-5, atol=1e-7)
+
+        m = aligned_array(100, 15, np.float32)
+        s = aligned_array((100, 100), 15, np.float32)
+        np.dot(s, m)  # this will always segfault if the bug is present
+
+        testdata = product((15, 32), (10000,), (200, 89), ('C', 'F'))
+        for align, m, n, a_order in testdata:
+            # Calculation in double precision
+            A_d = np.random.rand(m, n)
+            X_d = np.random.rand(n)
+            desired = np.dot(A_d, X_d)
+            # Calculation with aligned single precision
+            A_f = as_aligned(A_d, align, np.float32, order=a_order)
+            X_f = as_aligned(X_d, align, np.float32)
+            assert_dot_close(A_f, X_f, desired)
+            # Strided A rows
+            A_d_2 = A_d[::2]
+            desired = np.dot(A_d_2, X_d)
+            A_f_2 = A_f[::2]
+            assert_dot_close(A_f_2, X_f, desired)
+            # Strided A columns, strided X vector
+            A_d_22 = A_d_2[:, ::2]
+            X_d_2 = X_d[::2]
+            desired = np.dot(A_d_22, X_d_2)
+            A_f_22 = A_f_2[:, ::2]
+            X_f_2 = X_f[::2]
+            assert_dot_close(A_f_22, X_f_2, desired)
+            # Check the strides are as expected
+            if a_order == 'F':
+                assert_equal(A_f_22.strides, (8, 8 * m))
+            else:
+                assert_equal(A_f_22.strides, (8 * n, 8))
+            assert_equal(X_f_2.strides, (8,))
+            # Strides in A rows + cols only
+            X_f_2c = as_aligned(X_f_2, align, np.float32)
+            assert_dot_close(A_f_22, X_f_2c, desired)
+            # Strides just in A cols
+            A_d_12 = A_d[:, ::2]
+            desired = np.dot(A_d_12, X_d_2)
+            A_f_12 = A_f[:, ::2]
+            assert_dot_close(A_f_12, X_f_2c, desired)
+            # Strides in A cols and X
+            assert_dot_close(A_f_12, X_f_2, desired)
 
 
 class TestInner(TestCase):
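The `aligned_array` helper in the test above is the core trick for provoking the bug: over-allocate a byte buffer, then slice it so the data pointer lands on (or deliberately off) an alignment boundary. A condensed standalone sketch of the same idea, using `-address % align` in place of the search loop:

```python
import numpy as np

align, n = 32, 100
# Over-allocate a uint8 buffer, then slice from the first offset at which
# the underlying data pointer is divisible by `align`.
tmp = np.zeros(n * 4 + align, dtype=np.uint8)
address = tmp.__array_interface__['data'][0]
offset = -address % align
buf = tmp[offset:offset + n * 4].view(dtype=np.float32)
assert buf.__array_interface__['data'][0] % align == 0
```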