8000 Circumventing cblas_sgemv segfault in Accelerate by sturlamolden · Pull Request #5205 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions numpy/core/blasdot/apple_sgemv_patch.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
#ifdef APPLE_ACCELERATE_SGEMV_PATCH

/* This is an ugly hack to circumvent a bug in Accelerate's cblas_sgemv.
*
* See: https://github.com/numpy/numpy/issues/4007
*
*/

#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#include "Python.h"
#include "numpy/arrayobject.h"

#include <string.h>
#include <dlfcn.h>
#include <stdlib.h>
#include <stdio.h>

/* ----------------------------------------------------------------- */
/* Original cblas_sgemv */

#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"

enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
extern void cblas_xerbla(int info, const char *rout, const char *form, ...);

typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
const float alpha, const float *A, const int lda,
const float *X, const int incX,
const float beta, float *Y, const int incY);

typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
const int M, const int N, const int K,
const float alpha, const float *A, const int lda,
const float *B, const int ldb,
const float beta, float *C, const int incC);

typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
const float* alpha, const float* A, const int* ldA,
const float* X, const int* incX,
const float* beta, float* Y, const int* incY );

static void *veclib = NULL;
static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
static fortran_sgemv_t *accelerate_sgemv = NULL;
static int AVX_and_10_9 = 0;

/* Dynamic check for AVX support
* __builtin_cpu_supports("avx") is available in gcc 4.8,
* but clang and icc do not currently support it. */
#define cpu_supports_avx()\
(system("sysctl -n machdep.cpu.features | grep -q AVX") == 0)

/* Check if we are using MacOS X version 10.9 */
#define using_mavericks()\
(system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0)

__attribute__((destructor))
static void unloadlib(void)
{
if (veclib) dlclose(veclib);
}

__attribute__((constructor))
static void loadlib()
/* automatically executed on module import */
{
char errormsg[1024];
int AVX, MAVERICKS;
memset((void*)errormsg, 0, sizeof(errormsg));
/* check if the CPU supports AVX */
AVX = cpu_supports_avx();
if (AVX < 0) {
/* Could not determine if CPU supports AVX,
* assume for safety that it does */
AVX = 1;
}
/* check if the OS is MacOS X Mavericks */
MAVERICKS = using_mavericks();
if (MAVERICKS < 0) {
/* Could not determine if the OS is Mavericks,
* assume for safety that it is */
MAVERICKS = 1;
}
/* we need the workaround when the CPU supports
* AVX and the OS version is Mavericks */
AVX_and_10_9 = AVX && MAVERICKS;
/* load vecLib */
veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
if (!veclib) {
veclib = NULL;
sprintf(errormsg,"Failed to open vecLib from location '%s'.", VECLIB_FILE);
Py_FatalError(errormsg); /* calls abort() and dumps core */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of aborting it would be better to just live with the broken sgemv call

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If dlsym fails the broken sgemv cannot be found because I have overshadowed the symbols sgemv_ and cblas_sgemv. The only way to get the original one is through dlsym. That is also how libVecFort solves a similar issue. I am aborting on error because I would otherwise have to change _dotblas.c to raise an import error. But the idea was to avoid changes to that file.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then we better live with the shadowed sgemv_ instead of aborting, an extra copy is better than not being able to start numpy

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh right you need the originals because you actually call back into them, nevermind then

}
/* resolve Fortran SGEMV from Accelerate */
accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
if (!accelerate_sgemv) {
unloadlib();
sprintf(errormsg,"Failed to resolve symbol 'sgemv_'.");
Py_FatalError(errormsg);
}
/* resolve cblas_sgemv from Accelerate */
accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
if (!accelerate_cblas_sgemv) {
unloadlib();
sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemv'.");
Py_FatalError(errormsg);
}
/* resolve cblas_sgemm from Accelerate */
accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
if (!accelerate_cblas_sgemm) {
unloadlib();
sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemm'.");
Py_FatalError(errormsg);
}
}

/* ----------------------------------------------------------------- */
/* Fortran SGEMV override */

#define BADARRAY(x) (((npy_intp)(void*)x) % 32)

void sgemv_( const char* trans, const int* m, const int* n,
const float* alpha, const float* A, const int* ldA,
const float* X, const int* incX,
const float* beta, float* Y, const int* incY )
{
const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));

if (!use_sgemm) {
/* Safe to use the original SGEMV */
accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY);
return;
}

/* Emulate SGEMV with SGEMM: Arrays are misaligned, the CPU supports AVX,
* and we are running Mavericks.
*
* SGEMV allows vectors to be strided. SGEMM requires all arrays to be
* contiguous along the leading dimension. To emulate striding with leading
* dimension argument we compute
*
* Y = alpha * op(A) @ X + beta * Y
*
* as
*
* Y.T = alpha * X.T @ op(A).T + beta * Y.T
*
* Since X.T and Y.T are row vectors, their leading dimension in
* SGEMM becomes equal to their stride in SGEMV. */

switch (*trans) {
case 'T':
case 't':
case 'C':
case 'c':
accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
break;
case 'N':
case 'n':
accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
break;
default:
cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
}
}

/* ----------------------------------------------------------------- */
/* Override for an alias symbol for sgemv_ in Accelerate */

void sgemv (char *trans,
const int *m, const int *n,
const float *alpha,
const float *A, const int *lda,
const float *B, const int *incB,
const float *beta,
float *C, const int *incC)
{
sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC);
}

/* ----------------------------------------------------------------- */
/* cblas_sgemv override, based on Netlib CBLAS code */

void cblas_sgemv(const enum CBLAS_ORDER order,
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
const float alpha, const float *A, const int lda,
const float *X, const int incX, const float beta,
float *Y, const int incY)
{
char TA;
if (order == CblasColMajor)
{
if (TransA == CblasNoTrans) TA = 'N';
else if (TransA == CblasTrans) TA = 'T';
else if (TransA == CblasConjTrans) TA = 'C';
else
{
cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA);
}
sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
}
else if (order == CblasRowMajor)
{
if (TransA == CblasNoTrans) TA = 'T';
else if (TransA == CblasTrans) TA = 'N';
else if (TransA == CblasConjTrans) TA = 'N';
else
{
cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
return;
}
sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
}
else
cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
}

#endif
5 changes: 3 additions & 2 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,13 +954,14 @@ def get_dotblas_sources(ext, build_dir):
if blas_info:
if ('NO_ATLAS_INFO', 1) in blas_info.get('define_macros', []):
return None # dotblas needs ATLAS, Fortran compiled blas will not be sufficient.
return ext.depends[:1]
return ext.depends[:2]
return None # no extension module will be built

config.add_extension('_dotblas',
sources = [get_dotblas_sources],
depends = [join('blasdot', '_dotblas.c'),
join('blasdot', 'cblas.h'),
join('blasdot', 'apple_sgemv_patch.c'),
join('blasdot', 'cblas.h'),
],
include_dirs = ['blasdot'],
extra_info = blas_info
Expand Down
79 changes: 78 additions & 1 deletion numpy/core/tests/test_blasdot.py
A3DB
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import division, absolute_import, print_function

import numpy as np
import sys
from itertools import product

import numpy as np
from numpy.core import zeros, float64
from numpy.testing import dec, TestCase, assert_almost_equal, assert_, \
assert_raises, assert_array_equal, assert_allclose, assert_equal
Expand Down Expand Up @@ -170,3 +172,78 @@ def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
assert_equal(c.dot(a), "A")
assert_raises(TypeError, np.dot, b, c)
assert_raises(TypeError, c.dot, b)


def test_npdot_segfault():
if sys.platform != 'darwin': return
# Test for float32 np.dot segfault
# https://github.com/numpy/numpy/issues/4007

def aligned_array(shape, align, dtype, order='C'):
# Make array shape `shape` with aligned at `align` bytes
d = dtype()
# Make array of correct size with `align` extra bytes
N = np.prod(shape)
tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
address = tmp.__array_interface__["data"][0]
# Find offset into array giving desired alignment
for offset in range(align):
if (address + offset) % align == 0: break
tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
return tmp.reshape(shape, order=order)

def as_aligned(arr, align, dtype, order='C'):
# Copy `arr` into an aligned array with same shape
aligned = aligned_array(arr.shape, align, dtype, order)
aligned[:] = arr[:]
return aligned

def assert_dot_close(A, X, desired):
assert_allclose(np.dot(A, X), desired, rtol=1e-5, atol=1e-7)

m = aligned_array(100, 15, np.float32)
s = aligned_array((100, 100), 15, np.float32)
# This always segfaults when the sgemv alignment bug is present
np.dot(s, m)
# test the sanity of np.dot after applying patch
for align, m, n, a_order in product(
(15, 32),
(10000,),
(200, 89),
('C', 'F')):
# Calculation in double precision
A_d = np.random.rand(m, n)
X_d = np.random.rand(n)
desired = np.dot(A_d, X_d)
# Calculation with aligned single precision
A_f = as_aligned(A_d, align, np.float32, order=a_order)
X_f = as_aligned(X_d, align, np.float32)
assert_dot_close(A_f, X_f, desired)
# Strided A rows
A_d_2 = A_d[::2]
desired = np.dot(A_d_2, X_d)
A_f_2 = A_f[::2]
assert_dot_close(A_f_2, X_f, desired)
# Strided A columns, strided X vector
A_d_22 = A_d_2[:, ::2]
X_d_2 = X_d[::2]
desired = np.dot(A_d_22, X_d_2)
A_f_22 = A_f_2[:, ::2]
X_f_2 = X_f[::2]
assert_dot_close(A_f_22, X_f_2, desired)
# Check the strides are as expected
if a_order == 'F':
assert_equal(A_f_22.strides, (8, 8 * m))
else:
assert_equal(A_f_22.strides, (8 * n, 8))
assert_equal(X_f_2.strides, (8,))
# Strides in A rows + cols only
X_f_2c = as_aligned(X_f_2, align, np.float32)
assert_dot_close(A_f_22, X_f_2c, desired)
# Strides just in A cols
A_d_12 = A_d[:, ::2]
desired = np.dot(A_d_12, X_d_2)
A_f_12 = A_f[:, ::2]
assert_dot_close(A_f_12, X_f_2c, desired)
# Strides in A cols and X
assert_dot_close(A_f_12, X_f_2, desired)
4 changes: 2 additions & 2 deletions numpy/distutils/system_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -1400,7 +1400,7 @@ def calc_info(self):
if os.path.exists('/System/Library/Frameworks'
'/Accelerate.framework/'):
if intel:
args.extend(['-msse3'])
args.extend(['-msse3', '-DAPPLE_ACCELERATE_SGEMV_PATCH'])
else:
args.extend(['-faltivec'])
link_args.extend(['-Wl,-framework', '-Wl,Accelerate'])
Expand Down Expand Up @@ -1497,7 +1497,7 @@ def calc_info(self):
if os.path.exists('/System/Library/Frameworks'
'/Accelerate.framework/'):
if intel:
args.extend(['-msse3'])
args.extend(['-msse3', '-DAPPLE_ACCELERATE_SGEMV_PATCH'])
else:
args.extend(['-faltivec'])
args.extend([
Expand Down
0