BUG: Workaround segfault in Apple Accelerate framework SGEMV function by sturlamolden · Pull Request #5237 · numpy/numpy · GitHub

Closed · wants to merge 1 commit

21 changes: 21 additions & 0 deletions numpy/build_utils/apple_accelerate.py
@@ -0,0 +1,21 @@
import os
import sys
import re

__all__ = ['uses_accelerate_framework', 'get_sgemv_fix']

def uses_accelerate_framework(info):
    """ Returns True if Accelerate framework is used for BLAS/LAPACK """
    if sys.platform != "darwin":
        return False
    r_accelerate = re.compile("Accelerate")
    extra_link_args = info.get('extra_link_args', '')
    for arg in extra_link_args:
        if r_accelerate.search(arg):
            return True
    return False

def get_sgemv_fix():
    """ Returns source file needed to correct SGEMV """
    path = os.path.abspath(os.path.dirname(__file__))
    return [os.path.join(path, 'src', 'apple_sgemv_fix.c')]
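
A usage sketch, not part of the diff: how setup.py is expected to drive these two helpers. The `info` dict below is a hypothetical example of the BLAS options numpy.distutils can report when linking against Accelerate; real contents vary by system.

    # Hypothetical driver for the build helpers above.
    from numpy.build_utils.apple_accelerate import (
        uses_accelerate_framework, get_sgemv_fix)

    # Assumed example of a numpy.distutils BLAS info dict on OS X.
    info = {'extra_link_args': ['-Wl,-framework', '-Wl,Accelerate']}

    if uses_accelerate_framework(info):   # True only when sys.platform == 'darwin'
        sources = get_sgemv_fix()         # [<path>/src/apple_sgemv_fix.c]
        print("Adding SGEMV workaround source:", sources)
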
230 changes: 230 additions & 0 deletions numpy/build_utils/src/apple_sgemv_fix.c
@@ -0,0 +1,230 @@
/* This is a collection of ugly hacks to circumvent a bug in
 * Apple Accelerate framework's SGEMV subroutine.
 *
 * See: https://github.com/numpy/numpy/issues/4007
 *
 * SGEMV in Accelerate framework will segfault on MacOS X version 10.9
 * (aka Mavericks) if arrays are not aligned to 32 byte boundaries
 * and the CPU supports AVX instructions. This can produce segfaults
 * in np.dot.
 *
 * This patch overshadows the symbols cblas_sgemv, sgemv_ and sgemv
 * exported by Accelerate to produce the correct behavior. The MacOS X
 * version and CPU specs are checked on module import. If Mavericks and
 * AVX are detected, the call to SGEMV is emulated with a call to SGEMM
 * if the arrays are not 32 byte aligned. If the exported symbols cannot
 * be overshadowed on module import, a fatal error is produced and the
 * process aborts. All the fixes are in a self-contained C file
 * and do not alter the multiarray C code. The patch is not applied
 * unless NumPy is configured to link with Apple's Accelerate
 * framework.
 *
 */

#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#include "Python.h"
#include "numpy/arrayobject.h"

#include <string.h>
#include <dlfcn.h>
#include <stdlib.h>
#include <stdio.h>

/* ----------------------------------------------------------------- */
/* Original cblas_sgemv */

#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"

enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
extern void cblas_xerbla(int info, const char *rout, const char *form, ...);

typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
                const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                const float alpha, const float *A, const int lda,
                const float *X, const int incX,
                const float beta, float *Y, const int incY);

typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
                const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
                const int M, const int N, const int K,
                const float alpha, const float *A, const int lda,
                const float *B, const int ldb,
                const float beta, float *C, const int incC);

typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
                const float* alpha, const float* A, const int* ldA,
                const float* X, const int* incX,
                const float* beta, float* Y, const int* incY );

static void *veclib = NULL;
static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
static fortran_sgemv_t *accelerate_sgemv = NULL;
static int AVX_and_10_9 = 0;

/* Dynamic check for AVX support
 * __builtin_cpu_supports("avx") is available in gcc 4.8,
 * but clang and icc do not currently support it. */
#define cpu_supports_avx()\
(system("sysctl -n machdep.cpu.features | grep -q AVX") == 0)

/* Check if we are using MacOS X version 10.9 */
#define using_mavericks()\
(system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0)

__attribute__((destructor))
static void unloadlib(void)
{
    if (veclib) dlclose(veclib);
}

__attribute__((constructor))
static void loadlib()
/* automatically executed on module import */
{
    char errormsg[1024];
    int AVX, MAVERICKS;
    memset((void*)errormsg, 0, sizeof(errormsg));
    /* check if the CPU supports AVX */
    AVX = cpu_supports_avx();
    /* check if the OS is MacOS X Mavericks */
    MAVERICKS = using_mavericks();
    /* we need the workaround when the CPU supports
     * AVX and the OS version is Mavericks */
    AVX_and_10_9 = AVX && MAVERICKS;
    /* load vecLib */
    veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
    if (!veclib) {
        veclib = NULL;
        sprintf(errormsg,"Failed to open vecLib from location '%s'.", VECLIB_FILE);
        Py_FatalError(errormsg); /* calls abort() and dumps core */
    }
    /* resolve Fortran SGEMV from Accelerate */
    accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
    if (!accelerate_sgemv) {
        unloadlib();
        sprintf(errormsg,"Failed to resolve symbol 'sgemv_'.");
        Py_FatalError(errormsg);
    }
    /* resolve cblas_sgemv from Accelerate */
    accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
    if (!accelerate_cblas_sgemv) {
        unloadlib();
        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemv'.");
        Py_FatalError(errormsg);
    }
    /* resolve cblas_sgemm from Accelerate */
    accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
    if (!accelerate_cblas_sgemm) {
        unloadlib();
        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemm'.");
        Py_FatalError(errormsg);
    }
}

/* ----------------------------------------------------------------- */
/* Fortran SGEMV override */

void sgemv_( const char* trans, const int* m, const int* n,
             const float* alpha, const float* A, const int* ldA,
             const float* X, const int* incX,
             const float* beta, float* Y, const int* incY )
{
    /* It is safe to use the original SGEMV if we are not using AVX on Mavericks
     * or the input arrays A, X and Y are all aligned on 32 byte boundaries. */
#define BADARRAY(x) (((npy_intp)(void*)x) % 32)
    const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));
    if (!use_sgemm) {
        accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY);
        return;
    }

    /* Arrays are misaligned, the CPU supports AVX, and we are running
     * Mavericks.
     *
     * Emulation of SGEMV with SGEMM:
     *
     * SGEMV allows vectors to be strided. SGEMM requires all arrays to be
     * contiguous along the leading dimension. To emulate striding in SGEMV
     * with the leading dimension arguments in SGEMM we compute
     *
     *    Y = alpha * op(A) @ X + beta * Y
     *
     * as
     *
     *    Y.T = alpha * X.T @ op(A).T + beta * Y.T
     *
     * Because Fortran uses column major order and X.T and Y.T are row vectors,
     * the leading dimensions of X.T and Y.T in SGEMM become equal to the
     * strides of the column vectors X and Y in SGEMV. */

    switch (*trans) {
        case 'T':
        case 't':
        case 'C':
        case 'c':
            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                    1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
            break;
        case 'N':
        case 'n':
            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
                    1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
            break;
        default:
            cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
    }
}

/* ----------------------------------------------------------------- */
/* Override for an alias symbol for sgemv_ in Accelerate */

void sgemv (char *trans,
            const int *m, const int *n,
            const float *alpha,
            const float *A, const int *lda,
            const float *B, const int *incB,
            const float *beta,
            float *C, const int *incC)
{
    sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC);
}

/* ----------------------------------------------------------------- */
/* cblas_sgemv override, based on Netlib CBLAS code */

void cblas_sgemv(const enum CBLAS_ORDER order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const float alpha, const float *A, const int lda,
                 const float *X, const int incX, const float beta,
                 float *Y, const int incY)
{
    char TA;
    if (order == CblasColMajor)
    {
        if (TransA == CblasNoTrans) TA = 'N';
        else if (TransA == CblasTrans) TA = 'T';
        else if (TransA == CblasConjTrans) TA = 'C';
        else
        {
            cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA);
            return;
        }
        sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
    }
    else if (order == CblasRowMajor)
    {
        /* row major is handled by transposing; for real single precision
         * the conjugate transpose is the same as the transpose */
        if (TransA == CblasNoTrans) TA = 'T';
        else if (TransA == CblasTrans) TA = 'N';
        else if (TransA == CblasConjTrans) TA = 'N';
        else
        {
            cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
            return;
        }
        sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
    }
    else
        cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
}
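
To see why the SGEMM calls in sgemv_ above reproduce SGEMV, here is a NumPy verification sketch, not part of the patch; sizes, seed and tolerances are illustrative.

    # Check: Y = alpha*op(A)@X + beta*Y equals the row-vector GEMM
    # Y.T = alpha*X.T@op(A).T + beta*Y.T used by the workaround.
    import numpy as np

    rng = np.random.RandomState(0)
    m, n = 5, 7
    A = rng.rand(m, n).astype(np.float32)
    alpha, beta = np.float32(2.0), np.float32(0.5)

    # trans='N': y(m) <- alpha*A@x + beta*y, emulated as x.T @ A.T
    x = rng.rand(n).astype(np.float32)
    y = rng.rand(m).astype(np.float32)
    ref = alpha * A.dot(x) + beta * y
    emu = (alpha * x[None, :].dot(A.T) + beta * y[None, :]).ravel()
    np.testing.assert_allclose(emu, ref, rtol=1e-5)

    # trans='T' (or 'C', real data): y(n) <- alpha*A.T@x + beta*y, emulated as x.T @ A
    xt = rng.rand(m).astype(np.float32)
    yt = rng.rand(n).astype(np.float32)
    ref_t = alpha * A.T.dot(xt) + beta * yt
    emu_t = (alpha * xt[None, :].dot(A) + beta * yt[None, :]).ravel()
    np.testing.assert_allclose(emu_t, ref_t, rtol=1e-5)
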

3 changes: 3 additions & 0 deletions numpy/core/setup.py
@@ -12,6 +12,7 @@
from numpy.distutils import log
from distutils.dep_util import newer
from distutils.sysconfig import get_config_var
from numpy.build_utils.apple_accelerate import uses_accelerate_framework, get_sgemv_fix

from setup_common import *

@@ -839,6 +840,8 @@ def generate_multiarray_templated_sources(ext, build_dir):
    if blas_info and ('HAVE_CBLAS', None) in blas_info.get('define_macros', []):
        extra_info = blas_info
        multiarray_src.append(join('src', 'multiarray', 'cblasfuncs.c'))
        if uses_accelerate_framework(blas_info):
            multiarray_src += get_sgemv_fix()
    else:
        extra_info = {}

67 changes: 67 additions & 0 deletions numpy/core/tests/test_multiarray.py
@@ -3611,6 +3611,73 @@ def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
        assert_equal(c.dot(a), "A")
        assert_raises(TypeError, np.dot, b, c)
        assert_raises(TypeError, c.dot, b)

    def test_accelerate_framework_sgemv_fix(self):
        from itertools import product
        if sys.platform != 'darwin':
            return

        def aligned_array(shape, align, dtype, order='C'):
            d = dtype()
            N = np.prod(shape)
            tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
            address = tmp.__array_interface__["data"][0]
            for offset in range(align):
                if (address + offset) % align == 0:
                    break
            tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
            return tmp.reshape(shape, order=order)

        def as_aligned(arr, align, dtype, order='C'):
            aligned = aligned_array(arr.shape, align, dtype, order)
            aligned[:] = arr[:]
            return aligned

        def assert_dot_close(A, X, desired):
            assert_allclose(np.dot(A, X), desired, rtol=1e-5, atol=1e-7)

        m = aligned_array(100, 15, np.float32)
        s = aligned_array((100, 100), 15, np.float32)
        np.dot(s, m)  # this will always segfault if the bug is present

        testdata = product((15, 32), (10000,), (200, 89), ('C', 'F'))
        for align, m, n, a_order in testdata:
            # Calculation in double precision
            A_d = np.random.rand(m, n)
            X_d = np.random.rand(n)
            desired = np.dot(A_d, X_d)
            # Calculation with aligned single precision
            A_f = as_aligned(A_d, align, np.float32, order=a_order)
            X_f = as_aligned(X_d, align, np.float32)
            assert_dot_close(A_f, X_f, desired)
            # Strided A rows
            A_d_2 = A_d[::2]
            desired = np.dot(A_d_2, X_d)
            A_f_2 = A_f[::2]
            assert_dot_close(A_f_2, X_f, desired)
            # Strided A columns, strided X vector
            A_d_22 = A_d_2[:, ::2]
            X_d_2 = X_d[::2]
            desired = np.dot(A_d_22, X_d_2)
            A_f_22 = A_f_2[:, ::2]
            X_f_2 = X_f[::2]
            assert_dot_close(A_f_22, X_f_2, desired)
            # Check the strides are as expected
            if a_order == 'F':
                assert_equal(A_f_22.strides, (8, 8 * m))
            else:
                assert_equal(A_f_22.strides, (8 * n, 8))
            assert_equal(X_f_2.strides, (8,))
            # Strides in A rows + cols only
            X_f_2c = as_aligned(X_f_2, align, np.float32)
            assert_dot_close(A_f_22, X_f_2c, desired)
            # Strides just in A cols
            A_d_12 = A_d[:, ::2]
            desired = np.dot(A_d_12, X_d_2)
            A_f_12 = A_f[:, ::2]
            assert_dot_close(A_f_12, X_f_2c, desired)
            # Strides in A cols and X
            assert_dot_close(A_f_12, X_f_2, desired)

class TestInner(TestCase):
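
For illustration, not part of the diff: the BADARRAY macro in apple_sgemv_fix.c flags operands whose base address is not 32-byte aligned, and the test's aligned_array helper manufactures such addresses on purpose. The same condition can be probed from Python; is_32_byte_aligned is a hypothetical helper.

    # Hypothetical helper mirroring the BADARRAY check from the C source.
    import numpy as np

    def is_32_byte_aligned(arr):
        address = arr.__array_interface__['data'][0]
        return address % 32 == 0

    a = np.zeros(100, dtype=np.float32)
    print(is_32_byte_aligned(a))      # depends on the allocator
    print(is_32_byte_aligned(a[4:]))  # view offset 16 bytes from a's base
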