numpy · sturlamolden · Oct 18, 2014 · juliantaylor · Oct 21, 2014 · sturlamolden
diff --git a/numpy/core/blasdot/apple_sgemv_patch.c b/numpy/core/blasdot/apple_sgemv_patch.c
@@ -0,0 +1,224 @@
+#ifdef APPLE_ACCELERATE_SGEMV_PATCH  
+
+/* This is an ugly hack to circumvent a bug in Accelerate's cblas_sgemv.
+ *
+ * See: https://github.com/numpy/numpy/issues/4007
+ *
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+#include <string.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* ----------------------------------------------------------------- */
+/* Original cblas_sgemv */
+
+#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"
+
+enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
+enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
+extern void cblas_xerbla(int info, const char *rout, const char *form, ...);
+
+typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float  *A, const int lda,
+                 const float  *X, const int incX, 
+                 const float beta, float  *Y, const int incY);
+
+typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
+                 const int M, const int N, const int K,
+                 const float alpha, const float  *A, const int lda,
+                 const float  *B, const int ldb, 
+                 const float beta, float  *C, const int incC); 
+
+typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY );
+
+static void *veclib = NULL;
+static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
+static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
+static fortran_sgemv_t *accelerate_sgemv = NULL;
+static int AVX_and_10_9 = 0;
+
+/* Dynamic check for AVX support
+ * __builtin_cpu_supports("avx") is available in gcc 4.8,
+ * but clang and icc do not currently support it. */
+#define cpu_supports_avx()\
+(system("sysctl -n machdep.cpu.features | grep -q AVX") == 0)
+
+/* Check if we are using MacOS X version 10.9 */
+#define using_mavericks()\
+(system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0) 
+
+__attribute__((destructor))
+static void unloadlib(void)
+{
+   if (veclib) dlclose(veclib);
+}
+
+__attribute__((constructor))
+static void loadlib()
+/* automatically executed on module import */
+{
+    char errormsg[1024];
+    int AVX, MAVERICKS;
+    memset((void*)errormsg, 0, sizeof(errormsg));
+    /* check if the CPU supports AVX */
+    AVX = cpu_supports_avx();
+    if (AVX < 0) {
+        /* Could not determine if CPU supports AVX,
+         * assume for safety that it does */
+        AVX = 1;
+    }
+    /* check if the OS is MacOS X Mavericks */
+    MAVERICKS = using_mavericks();
+    if (MAVERICKS < 0) {
+        /* Could not determine if the OS is Mavericks,
+         * assume for safety that it is */
+        MAVERICKS = 1;
+    }
+    /* we need the workaround when the CPU supports
+     * AVX and the OS version is Mavericks */
+    AVX_and_10_9 = AVX && MAVERICKS;
+    /* load vecLib */
+    veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
+    if (!veclib) {
+        veclib = NULL;
+        sprintf(errormsg,"Failed to open vecLib from location '%s'.", VECLIB_FILE);
+        Py_FatalError(errormsg); /* calls abort() and dumps core */
+    }
+    /* resolve Fortran SGEMV from Accelerate */
+    accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
+    if (!accelerate_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'sgemv_'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemv from Accelerate */
+        accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
+    if (!accelerate_cblas_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemv'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemm from Accelerate */
+    accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
+    if (!accelerate_cblas_sgemm) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemm'.");
+        Py_FatalError(errormsg);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Fortran SGEMV override */
+
+#define BADARRAY(x) (((npy_intp)(void*)x) % 32)
+
+void sgemv_( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY )
+{
+    const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));
+
+    if (!use_sgemm) {
+        /* Safe to use the original SGEMV */
+        accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY);
+        return;
+    }
+
+    /* Emulate SGEMV with SGEMM: Arrays are misaligned, the CPU supports AVX,
+     * and we are running Mavericks.
+     *
+     * SGEMV allows vectors to be strided. SGEMM requires all arrays to be
+     * contiguous along the leading dimension. To emulate striding with leading
+     * dimension argument we compute
+     *
+     *    Y = alpha * op(A) @ X + beta * Y
+     *
+     * as
+     *
+     *    Y.T = alpha * X.T @ op(A).T + beta * Y.T
+     *
+     * Since X.T and Y.T are row vectors, their leading dimension in
+     * SGEMM becomes equal to their stride in SGEMV. */
+
+    switch (*trans) {
+        case 'T':
+        case 't':
+        case 'C':
+        case 'c':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        case 'N':
+        case 'n':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
+                1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        default:
+            cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Override for an alias symbol for sgemv_ in Accelerate */
+
+void sgemv (char *trans,
+            const int *m, const int *n,
+            const float *alpha,
+            const float *A, const int *lda,
+            const float *B, const int *incB,
+            const float *beta,
+            float *C, const int *incC)
+{
+    sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC);
+}
+
+/* ----------------------------------------------------------------- */
+/* cblas_sgemv override, based on Netlib CBLAS code */
+
+void cblas_sgemv(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float  *A, const int lda,
+                 const float  *X, const int incX, const float beta,
+                 float  *Y, const int incY)
+{
+   char TA;
+   if (order == CblasColMajor)
+   {
+      if (TransA == CblasNoTrans) TA = 'N';
+      else if (TransA == CblasTrans) TA = 'T';
+      else if (TransA == CblasConjTrans) TA = 'C';
+      else 
+      {
+         cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA);
+      }
+      sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+   }
+   else if (order == CblasRowMajor)
+   {
+      if (TransA == CblasNoTrans) TA = 'T';
+      else if (TransA == CblasTrans) TA = 'N';
+      else if (TransA == CblasConjTrans) TA = 'N';
+      else
+      {
+         cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
+         return;
+      }
+      sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+   }
+   else
+      cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
+}
+
+#endif
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
@@ -954,13 +954,14 @@ def get_dotblas_sources(ext, build_dir):
         if blas_info:
             if ('NO_ATLAS_INFO', 1) in blas_info.get('define_macros', []):
                 return None # dotblas needs ATLAS, Fortran compiled blas will not be sufficient.
-            return ext.depends[:1]
+            return ext.depends[:2]
         return None # no extension module will be built
 
     config.add_extension('_dotblas',
                          sources = [get_dotblas_sources],
                          depends = [join('blasdot', '_dotblas.c'),
-                                  join('blasdot', 'cblas.h'),
+                                    join('blasdot', 'apple_sgemv_patch.c'),
+                                    join('blasdot', 'cblas.h'),
                                   ],
                          include_dirs = ['blasdot'],
                          extra_info = blas_info

diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
@@ -1,7 +1,9 @@
 from __future__ import division, absolute_import, print_function
 
-import numpy as np
 import sys
+from itertools import product
+
+import numpy as np
 from numpy.core import zeros, float64
 from numpy.testing import dec, TestCase, assert_almost_equal, assert_, \
      assert_raises, assert_array_equal, assert_allclose, assert_equal
@@ -170,3 +172,78 @@ def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
     assert_equal(c.dot(a), "A")
     assert_raises(TypeError, np.dot, b, c)
     assert_raises(TypeError, c.dot, b)
+
+
+def test_npdot_segfault():
+    if sys.platform != 'darwin': return
+    # Test for float32 np.dot segfault
+    # https://github.com/numpy/numpy/issues/4007
+
+    def aligned_array(shape, align, dtype, order='C'):
+        # Make array shape `shape` with aligned at `align` bytes
+        d = dtype()
+        # Make array of correct size with `align` extra bytes
+        N = np.prod(shape)
+        tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
+        address = tmp.__array_interface__["data"][0]
+        # Find offset into array giving desired alignment
+        for offset in range(align):
+            if (address + offset) % align == 0: break
+        tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
+        return tmp.reshape(shape, order=order)
+
+    def as_aligned(arr, align, dtype, order='C'):
+        # Copy `arr` into an aligned array with same shape
+        aligned = aligned_array(arr.shape, align, dtype, order)
+        aligned[:] = arr[:]
+        return aligned
+
+    def assert_dot_close(A, X, desired):
+        assert_allclose(np.dot(A, X), desired, rtol=1e-5, atol=1e-7)
+
+    m = aligned_array(100, 15, np.float32)
+    s = aligned_array((100, 100), 15, np.float32)
+    # This always segfaults when the sgemv alignment bug is present
+    np.dot(s, m)
+    # test the sanity of np.dot after applying patch
+    for align, m, n, a_order in product(
+        (15, 32),
+        (10000,),
+        (200, 89),
+        ('C', 'F')):
+        # Calculation in double precision
+        A_d = np.random.rand(m, n)
+        X_d = np.random.rand(n)
+        desired = np.dot(A_d, X_d)
+        # Calculation with aligned single precision
+        A_f = as_aligned(A_d, align, np.float32, order=a_order)
+        X_f = as_aligned(X_d, align, np.float32)
+        assert_dot_close(A_f, X_f, desired)
+        # Strided A rows
+        A_d_2 = A_d[::2]
+        desired = np.dot(A_d_2, X_d)
+        A_f_2 = A_f[::2]
+        assert_dot_close(A_f_2, X_f, desired)
+        # Strided A columns, strided X vector
+        A_d_22 = A_d_2[:, ::2]
+        X_d_2 = X_d[::2]
+        desired = np.dot(A_d_22, X_d_2)
+        A_f_22 = A_f_2[:, ::2]
+        X_f_2 = X_f[::2]
+        assert_dot_close(A_f_22, X_f_2, desired)
+        # Check the strides are as expected
+        if a_order == 'F':
+            assert_equal(A_f_22.strides, (8, 8 * m))
+        else:
+            assert_equal(A_f_22.strides, (8 * n, 8))
+        assert_equal(X_f_2.strides, (8,))
+        # Strides in A rows + cols only
+        X_f_2c = as_aligned(X_f_2, align, np.float32)
+        assert_dot_close(A_f_22, X_f_2c, desired)
+        # Strides just in A cols
+        A_d_12 = A_d[:, ::2]
+        desired = np.dot(A_d_12, X_d_2)
+        A_f_12 = A_f[:, ::2]
+        assert_dot_close(A_f_12, X_f_2c, desired)
+        # Strides in A cols and X
+        assert_dot_close(A_f_12, X_f_2, desired)
diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py
@@ -1400,7 +1400,7 @@ def calc_info(self):
             if os.path.exists('/System/Library/Frameworks'
                               '/Accelerate.framework/'):
                 if intel:
-                    args.extend(['-msse3'])
+                    args.extend(['-msse3', '-DAPPLE_ACCELERATE_SGEMV_PATCH'])
                 else:
                     args.extend(['-faltivec'])
                 link_args.extend(['-Wl,-framework', '-Wl,Accelerate'])
@@ -1497,7 +1497,7 @@ def calc_info(self):
             if os.path.exists('/System/Library/Frameworks'
                               '/Accelerate.framework/'):
                 if intel:
-                    args.extend(['-msse3'])
+                    args.extend(['-msse3', '-DAPPLE_ACCELERATE_SGEMV_PATCH'])
                 else:
                     args.extend(['-faltivec'])
                 args.extend([