diff --git a/numpy/core/blasdot/apple_sgemv_patch.c b/numpy/core/blasdot/apple_sgemv_patch.c
new file mode 100644
index 000000000000..c950e0b4ef15
--- /dev/null
+++ b/numpy/core/blasdot/apple_sgemv_patch.c
@@ -0,0 +1,224 @@
+#ifdef APPLE_ACCELERATE_SGEMV_PATCH
+
+/* This is an ugly hack to circumvent a bug in Accelerate's cblas_sgemv.
+ *
+ * See: https://github.com/numpy/numpy/issues/4007
+ *
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+#include <string.h>   /* memset */
+#include <dlfcn.h>    /* dlopen, dlsym, dlclose */
+#include <stdlib.h>   /* system */
+#include <stdio.h>    /* sprintf */
+
+/* ----------------------------------------------------------------- */
+/* Original cblas_sgemv */
+
+#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"
+
+enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
+enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
+extern void cblas_xerbla(int info, const char *rout, const char *form, ...);
+
+typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float *A, const int lda,
+                 const float *X, const int incX,
+                 const float beta, float *Y, const int incY);
+
+typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
+                 const int M, const int N, const int K,
+                 const float alpha, const float *A, const int lda,
+                 const float *B, const int ldb,
+                 const float beta, float *C, const int incC);
+
+typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY );
+
+static void *veclib = NULL;
+static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
+static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
+static fortran_sgemv_t *accelerate_sgemv = NULL;
+static int AVX_and_10_9 = 0;
+
+/* Dynamic check for AVX support
+ * __builtin_cpu_supports("avx") is available in gcc 4.8,
+ * but clang and icc do not currently support it.
+ */
+#define cpu_supports_avx()\
+(system("sysctl -n machdep.cpu.features | grep -q AVX") == 0)
+
+/* Check if we are using Mac OS X version 10.9 */
+#define using_mavericks()\
+(system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0)
+
+__attribute__((destructor))
+static void unloadlib(void)
+{
+    if (veclib) dlclose(veclib);
+}
+
+__attribute__((constructor))
+static void loadlib()
+/* automatically executed on module import */
+{
+    char errormsg[1024];
+    int AVX, MAVERICKS;
+    memset((void*)errormsg, 0, sizeof(errormsg));
+    /* check if the CPU supports AVX */
+    AVX = cpu_supports_avx();
+    if (AVX < 0) {
+        /* Could not determine if the CPU supports AVX,
+         * assume for safety that it does */
+        AVX = 1;
+    }
+    /* check if the OS is Mac OS X Mavericks */
+    MAVERICKS = using_mavericks();
+    if (MAVERICKS < 0) {
+        /* Could not determine if the OS is Mavericks,
+         * assume for safety that it is */
+        MAVERICKS = 1;
+    }
+    /* we need the workaround when the CPU supports
+     * AVX and the OS version is Mavericks */
+    AVX_and_10_9 = AVX && MAVERICKS;
+    /* load vecLib */
+    veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
+    if (!veclib) {
+        veclib = NULL;
+        sprintf(errormsg, "Failed to open vecLib from location '%s'.", VECLIB_FILE);
+        Py_FatalError(errormsg); /* calls abort() and dumps core */
+    }
+    /* resolve Fortran SGEMV from Accelerate */
+    accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
+    if (!accelerate_sgemv) {
+        unloadlib();
+        sprintf(errormsg, "Failed to resolve symbol 'sgemv_'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemv from Accelerate */
+    accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
+    if (!accelerate_cblas_sgemv) {
+        unloadlib();
+        sprintf(errormsg, "Failed to resolve symbol 'cblas_sgemv'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemm from Accelerate */
+    accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
+    if (!accelerate_cblas_sgemm) {
+        unloadlib();
+        sprintf(errormsg, "Failed to resolve symbol 'cblas_sgemm'.");
+        Py_FatalError(errormsg);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Fortran SGEMV override */
+
+#define BADARRAY(x) (((npy_intp)(void*)x) % 32)
+
+void sgemv_( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY )
+{
+    const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));
+
+    if (!use_sgemm) {
+        /* Safe to use the original SGEMV */
+        accelerate_sgemv(trans, m, n, alpha, A, ldA, X, incX, beta, Y, incY);
+        return;
+    }
+
+    /* Emulate SGEMV with SGEMM: the arrays are misaligned, the CPU supports
+     * AVX, and we are running Mavericks.
+     *
+     * SGEMV allows vectors to be strided. SGEMM requires all arrays to be
+     * contiguous along the leading dimension. To emulate striding with the
+     * leading dimension argument we compute
+     *
+     *    Y = alpha * op(A) @ X + beta * Y
+     *
+     * as
+     *
+     *    Y.T = alpha * X.T @ op(A).T + beta * Y.T
+     *
+     * Since X.T and Y.T are row vectors, their leading dimension in
+     * SGEMM becomes equal to their stride in SGEMV.
+     */
+
+    switch (*trans) {
+        case 'T':
+        case 't':
+        case 'C':
+        case 'c':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        case 'N':
+        case 'n':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
+                1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        default:
+            cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Override for an alias symbol for sgemv_ in Accelerate */
+
+void sgemv (char *trans,
+            const int *m, const int *n,
+            const float *alpha,
+            const float *A, const int *lda,
+            const float *B, const int *incB,
+            const float *beta,
+            float *C, const int *incC)
+{
+    sgemv_(trans, m, n, alpha, A, lda, B, incB, beta, C, incC);
+}
+
+/* ----------------------------------------------------------------- */
+/* cblas_sgemv override, based on Netlib CBLAS code */
+
+void cblas_sgemv(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float *A, const int lda,
+                 const float *X, const int incX, const float beta,
+                 float *Y, const int incY)
+{
+    char TA;
+    if (order == CblasColMajor)
+    {
+        if (TransA == CblasNoTrans) TA = 'N';
+        else if (TransA == CblasTrans) TA = 'T';
+        else if (TransA == CblasConjTrans) TA = 'C';
+        else
+        {
+            cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
+            return;
+        }
+        sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+    }
+    else if (order == CblasRowMajor)
+    {
+        if (TransA == CblasNoTrans) TA = 'T';
+        else if (TransA == CblasTrans) TA = 'N';
+        else if (TransA == CblasConjTrans) TA = 'N';
+        else
+        {
+            cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
+            return;
+        }
+        sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+    }
+    else
+        cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
+}
+
+#endif
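
Note on the SGEMM emulation above: the transpose identity is easy to sanity-check from Python. The following is a minimal NumPy sketch (illustration only, not part of the patch); it mirrors the 'N' branch of the switch, where SGEMM is called with M = 1 so that X and Y act as one-row matrices:

    import numpy as np

    m, n = 5, 3
    alpha, beta = 2.0, 0.5
    A = np.asarray(np.random.rand(m, n), dtype=np.float32)
    X = np.asarray(np.random.rand(n), dtype=np.float32)
    Y = np.asarray(np.random.rand(m), dtype=np.float32)

    # What SGEMV computes for trans='N': Y <- alpha * A @ X + beta * Y
    sgemv_result = alpha * np.dot(A, X) + beta * Y

    # The row-vector GEMM formulation used by the patch:
    # Y.T <- alpha * X.T @ A.T + beta * Y.T,
    # with X.T of shape (1, n) and Y.T of shape (1, m)
    sgemm_result = alpha * np.dot(X.reshape(1, n), A.T) + beta * Y.reshape(1, m)

    assert np.allclose(sgemv_result, sgemm_result.ravel())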
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 5da04241317e..1cd1ee1c2db7
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -954,13 +954,14 @@ def get_dotblas_sources(ext, build_dir):
         if blas_info:
             if ('NO_ATLAS_INFO', 1) in blas_info.get('define_macros', []):
                 return None # dotblas needs ATLAS, Fortran compiled blas will not be sufficient.
-            return ext.depends[:1]
+            return ext.depends[:2]
         return None # no extension module will be built
 
     config.add_extension('_dotblas',
                          sources = [get_dotblas_sources],
                          depends = [join('blasdot', '_dotblas.c'),
+                                    join('blasdot', 'apple_sgemv_patch.c'),
                                     join('blasdot', 'cblas.h'),
                                     ],
                          include_dirs = ['blasdot'],
                          extra_info = blas_info
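
The setup.py hunk leans on a numpy.distutils convention worth spelling out: an entry in `sources` may be a callable that receives `(extension, build_dir)` and returns a list of source files to compile, or None to skip building the extension. Returning `ext.depends[:2]` therefore compiles the first two `depends` entries, _dotblas.c plus the new apple_sgemv_patch.c. A minimal sketch of that pattern, with hypothetical package and file names:

    from numpy.distutils.misc_util import Configuration

    def select_sources(ext, build_dir):
        # Compile every C file listed in `depends`; returning None
        # instead would skip building the extension entirely.
        return [f for f in ext.depends if f.endswith('.c')]

    config = Configuration('example', '', None)
    config.add_extension('_example',
                         sources=[select_sources],
                         depends=['example.c', 'example_patch.c', 'example.h'])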
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index 264663835644..6b5afef14f86
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -1,7 +1,9 @@
 from __future__ import division, absolute_import, print_function
 
-import numpy as np
 import sys
+from itertools import product
+
+import numpy as np
 from numpy.core import zeros, float64
 from numpy.testing import dec, TestCase, assert_almost_equal, assert_, \
     assert_raises, assert_array_equal, assert_allclose, assert_equal
@@ -170,3 +172,78 @@ def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
     assert_equal(c.dot(a), "A")
     assert_raises(TypeError, np.dot, b, c)
     assert_raises(TypeError, c.dot, b)
+
+
+def test_npdot_segfault():
+    if sys.platform != 'darwin': return
+    # Test for float32 np.dot segfault
+    # https://github.com/numpy/numpy/issues/4007
+
+    def aligned_array(shape, align, dtype, order='C'):
+        # Make an array of shape `shape` whose data is aligned at `align` bytes
+        d = dtype()
+        # Make array of correct size with `align` extra bytes
+        N = np.prod(shape)
+        tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
+        address = tmp.__array_interface__["data"][0]
+        # Find offset into array giving desired alignment
+        for offset in range(align):
+            if (address + offset) % align == 0: break
+        tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
+        return tmp.reshape(shape, order=order)
+
+    def as_aligned(arr, align, dtype, order='C'):
+        # Copy `arr` into an aligned array with the same shape
+        aligned = aligned_array(arr.shape, align, dtype, order)
+        aligned[:] = arr[:]
+        return aligned
+
+    def assert_dot_close(A, X, desired):
+        assert_allclose(np.dot(A, X), desired, rtol=1e-5, atol=1e-7)
+
+    m = aligned_array(100, 15, np.float32)
+    s = aligned_array((100, 100), 15, np.float32)
+    # This always segfaults when the sgemv alignment bug is present
+    np.dot(s, m)
+    # Test the sanity of np.dot after applying the patch
+    for align, m, n, a_order in product(
+        (15, 32),
+        (10000,),
+        (200, 89),
+        ('C', 'F')):
+        # Calculation in double precision
+        A_d = np.random.rand(m, n)
+        X_d = np.random.rand(n)
+        desired = np.dot(A_d, X_d)
+        # Calculation with aligned single precision
+        A_f = as_aligned(A_d, align, np.float32, order=a_order)
+        X_f = as_aligned(X_d, align, np.float32)
+        assert_dot_close(A_f, X_f, desired)
+        # Strided A rows
+        A_d_2 = A_d[::2]
+        desired = np.dot(A_d_2, X_d)
+        A_f_2 = A_f[::2]
+        assert_dot_close(A_f_2, X_f, desired)
+        # Strided A columns, strided X vector
+        A_d_22 = A_d_2[:, ::2]
+        X_d_2 = X_d[::2]
+        desired = np.dot(A_d_22, X_d_2)
+        A_f_22 = A_f_2[:, ::2]
+        X_f_2 = X_f[::2]
+        assert_dot_close(A_f_22, X_f_2, desired)
+        # Check the strides are as expected
+        if a_order == 'F':
+            assert_equal(A_f_22.strides, (8, 8 * m))
+        else:
+            assert_equal(A_f_22.strides, (8 * n, 8))
+        assert_equal(X_f_2.strides, (8,))
+        # Strides in A rows + cols only
+        X_f_2c = as_aligned(X_f_2, align, np.float32)
+        assert_dot_close(A_f_22, X_f_2c, desired)
+        # Strides just in A cols
+        A_d_12 = A_d[:, ::2]
+        desired = np.dot(A_d_12, X_d_2)
+        A_f_12 = A_f[:, ::2]
+        assert_dot_close(A_f_12, X_f_2c, desired)
+        # Strides in A cols and X
+        assert_dot_close(A_f_12, X_f_2, desired)
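
For context on why the test manufactures offsets of 15 and 32 bytes: the Accelerate bug only bites when an array's data pointer is not 32-byte aligned (the BADARRAY macro in the patch), so the test must cover both aligned and misaligned buffers. A small illustrative check, reusing the same __array_interface__ trick as aligned_array above (illustration only, not part of the test suite):

    import numpy as np

    def is_32byte_aligned(arr):
        # Python-level analogue of the patch's BADARRAY(x) == 0
        address = arr.__array_interface__['data'][0]
        return address % 32 == 0

    a = np.zeros(64, dtype=np.float32)
    # A view offset by one element shifts the data pointer by 4 bytes,
    # so at most one of these two arrays can be 32-byte aligned.
    print(is_32byte_aligned(a), is_32byte_aligned(a[1:]))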
diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py
index 48c92c548224..31cb13b86330
--- a/numpy/distutils/system_info.py
+++ b/numpy/distutils/system_info.py
@@ -1400,7 +1400,7 @@ def calc_info(self):
         if os.path.exists('/System/Library/Frameworks'
                           '/Accelerate.framework/'):
             if intel:
-                args.extend(['-msse3'])
+                args.extend(['-msse3', '-DAPPLE_ACCELERATE_SGEMV_PATCH'])
             else:
                 args.extend(['-faltivec'])
             link_args.extend(['-Wl,-framework', '-Wl,Accelerate'])
@@ -1497,7 +1497,7 @@ def calc_info(self):
         if os.path.exists('/System/Library/Frameworks'
                           '/Accelerate.framework/'):
             if intel:
-                args.extend(['-msse3'])
+                args.extend(['-msse3', '-DAPPLE_ACCELERATE_SGEMV_PATCH'])
             else:
                 args.extend(['-faltivec'])
             args.extend([
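
With these system_info.py changes, builds that link against Accelerate/vecLib on Intel Macs pass -DAPPLE_ACCELERATE_SGEMV_PATCH, which is what activates the #ifdef block in apple_sgemv_patch.c; on every other platform the file compiles to nothing. One hedged way to inspect the flags a given build would use (the exact keys and output vary by platform and configuration):

    from numpy.distutils.system_info import get_info

    info = get_info('blas_opt')
    args = info.get('extra_compile_args', [])
    print('-DAPPLE_ACCELERATE_SGEMV_PATCH' in args)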