-
-
Notifications
You must be signed in to change notification settings - Fork 11.7k
Circumventing cblas_sgemv segfault in Accelerate #5205
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
sturlamolden
wants to merge
1
commit into
numpy:maintenance/1.9.x
from
sturlamolden:maintenance/1.9.x
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,224 @@ | ||
| #ifdef APPLE_ACCELERATE_SGEMV_PATCH | ||
|
|
||
| /* This is an ugly hack to circumvent a bug in Accelerate's cblas_sgemv. | ||
| * | ||
| * See: https://github.com/numpy/numpy/issues/4007 | ||
| * | ||
| */ | ||
|
|
||
| #define NPY_NO_DEPRECATED_API NPY_API_VERSION | ||
| #include "Python.h" | ||
| #include "numpy/arrayobject.h" | ||
|
|
||
| #include <string.h> | ||
| #include <dlfcn.h> | ||
| #include <stdlib.h> | ||
| #include <stdio.h> | ||
|
|
||
| /* ----------------------------------------------------------------- */ | ||
| /* Original cblas_sgemv */ | ||
|
|
||
| #define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib" | ||
|
|
||
| enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; | ||
| enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; | ||
| extern void cblas_xerbla(int info, const char *rout, const char *form, ...); | ||
|
|
||
| typedef void cblas_sgemv_t(const enum CBLAS_ORDER order, | ||
| const enum CBLAS_TRANSPOSE TransA, const int M, const int N, | ||
| const float alpha, const float *A, const int lda, | ||
| const float *X, const int incX, | ||
| const float beta, float *Y, const int incY); | ||
|
|
||
| typedef void cblas_sgemm_t(const enum CBLAS_ORDER order, | ||
| const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, | ||
| const int M, const int N, const int K, | ||
| const float alpha, const float *A, const int lda, | ||
| const float *B, const int ldb, | ||
| const float beta, float *C, const int incC); | ||
|
|
||
| typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n, | ||
| const float* alpha, const float* A, const int* ldA, | ||
| const float* X, const int* incX, | ||
| const float* beta, float* Y, const int* incY ); | ||
|
|
||
| static void *veclib = NULL; | ||
| static cblas_sgemv_t *accelerate_cblas_sgemv = NULL; | ||
| static cblas_sgemm_t *accelerate_cblas_sgemm = NULL; | ||
| static fortran_sgemv_t *accelerate_sgemv = NULL; | ||
| static int AVX_and_10_9 = 0; | ||
|
|
||
| /* Dynamic check for AVX support | ||
| * __builtin_cpu_supports("avx") is available in gcc 4.8, | ||
| * but clang and icc do not currently support it. */ | ||
| #define cpu_supports_avx()\ | ||
| (system("sysctl -n machdep.cpu.features | grep -q AVX") == 0) | ||
|
|
||
| /* Check if we are using MacOS X version 10.9 */ | ||
| #define using_mavericks()\ | ||
| (system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0) | ||
|
|
||
| __attribute__((destructor)) | ||
| static void unloadlib(void) | ||
| { | ||
| if (veclib) dlclose(veclib); | ||
| } | ||
|
|
||
| __attribute__((constructor)) | ||
| static void loadlib() | ||
| /* automatically executed on module import */ | ||
| { | ||
| char errormsg[1024]; | ||
| int AVX, MAVERICKS; | ||
| memset((void*)errormsg, 0, sizeof(errormsg)); | ||
| /* check if the CPU supports AVX */ | ||
| AVX = cpu_supports_avx(); | ||
| if (AVX < 0) { | ||
| /* Could not determine if CPU supports AVX, | ||
| * assume for safety that it does */ | ||
| AVX = 1; | ||
| } | ||
| /* check if the OS is MacOS X Mavericks */ | ||
| MAVERICKS = using_mavericks(); | ||
| if (MAVERICKS < 0) { | ||
| /* Could not determine if the OS is Mavericks, | ||
| * assume for safety that it is */ | ||
| MAVERICKS = 1; | ||
| } | ||
| /* we need the workaround when the CPU supports | ||
| * AVX and the OS version is Mavericks */ | ||
| AVX_and_10_9 = AVX && MAVERICKS; | ||
| /* load vecLib */ | ||
| veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST); | ||
| if (!veclib) { | ||
| veclib = NULL; | ||
| sprintf(errormsg,"Failed to open vecLib from location '%s'.", VECLIB_FILE); | ||
| Py_FatalError(errormsg); /* calls abort() and dumps core */ | ||
| } | ||
| /* resolve Fortran SGEMV from Accelerate */ | ||
| accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_"); | ||
| if (!accelerate_sgemv) { | ||
| unloadlib(); | ||
| sprintf(errormsg,"Failed to resolve symbol 'sgemv_'."); | ||
| Py_FatalError(errormsg); | ||
| } | ||
| /* resolve cblas_sgemv from Accelerate */ | ||
| accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv"); | ||
| if (!accelerate_cblas_sgemv) { | ||
| unloadlib(); | ||
| sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemv'."); | ||
| Py_FatalError(errormsg); | ||
| } | ||
| /* resolve cblas_sgemm from Accelerate */ | ||
| accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm"); | ||
| if (!accelerate_cblas_sgemm) { | ||
| unloadlib(); | ||
| sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemm'."); | ||
| Py_FatalError(errormsg); | ||
| } | ||
| } | ||
|
|
||
| /* ----------------------------------------------------------------- */ | ||
| /* Fortran SGEMV override */ | ||
|
|
||
| #define BADARRAY(x) (((npy_intp)(void*)x) % 32) | ||
|
|
||
| void sgemv_( const char* trans, const int* m, const int* n, | ||
| const float* alpha, const float* A, const int* ldA, | ||
| const float* X, const int* incX, | ||
| const float* beta, float* Y, const int* incY ) | ||
| { | ||
| const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y)); | ||
|
|
||
| if (!use_sgemm) { | ||
| /* Safe to use the original SGEMV */ | ||
| accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY); | ||
| return; | ||
| } | ||
|
|
||
| /* Emulate SGEMV with SGEMM: Arrays are misaligned, the CPU supports AVX, | ||
| * and we are running Mavericks. | ||
| * | ||
| * SGEMV allows vectors to be strided. SGEMM requires all arrays to be | ||
| * contiguous along the leading dimension. To emulate striding with leading | ||
| * dimension argument we compute | ||
| * | ||
| * Y = alpha * op(A) @ X + beta * Y | ||
| * | ||
| * as | ||
| * | ||
| * Y.T = alpha * X.T @ op(A).T + beta * Y.T | ||
| * | ||
| * Since X.T and Y.T are row vectors, their leading dimension in | ||
| * SGEMM becomes equal to their stride in SGEMV. */ | ||
|
|
||
| switch (*trans) { | ||
| case 'T': | ||
| case 't': | ||
| case 'C': | ||
| case 'c': | ||
| accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, | ||
| 1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY ); | ||
| break; | ||
| case 'N': | ||
| case 'n': | ||
| accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, | ||
| 1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY ); | ||
| break; | ||
| default: | ||
| cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans); | ||
| } | ||
| } | ||
|
|
||
| /* ----------------------------------------------------------------- */ | ||
| /* Override for an alias symbol for sgemv_ in Accelerate */ | ||
|
|
||
| void sgemv (char *trans, | ||
| const int *m, const int *n, | ||
| const float *alpha, | ||
| const float *A, const int *lda, | ||
| const float *B, const int *incB, | ||
| const float *beta, | ||
| float *C, const int *incC) | ||
| { | ||
| sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC); | ||
| } | ||
|
|
||
| /* ----------------------------------------------------------------- */ | ||
| /* cblas_sgemv override, based on Netlib CBLAS code */ | ||
|
|
||
| void cblas_sgemv(const enum CBLAS_ORDER order, | ||
| const enum CBLAS_TRANSPOSE TransA, const int M, const int N, | ||
| const float alpha, const float *A, const int lda, | ||
| const float *X, const int incX, const float beta, | ||
| float *Y, const int incY) | ||
| { | ||
| char TA; | ||
| if (order == CblasColMajor) | ||
| { | ||
| if (TransA == CblasNoTrans) TA = 'N'; | ||
| else if (TransA == CblasTrans) TA = 'T'; | ||
| else if (TransA == CblasConjTrans) TA = 'C'; | ||
| else | ||
| { | ||
| cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA); | ||
| } | ||
| sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY); | ||
| } | ||
| else if (order == CblasRowMajor) | ||
| { | ||
| if (TransA == CblasNoTrans) TA = 'T'; | ||
| else if (TransA == CblasTrans) TA = 'N'; | ||
| else if (TransA == CblasConjTrans) TA = 'N'; | ||
| else | ||
| { | ||
| cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA); | ||
| return; | ||
| } | ||
| sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY); | ||
| } | ||
| else | ||
| cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order); | ||
| } | ||
|
|
||
| #endif | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
instead of aborting it would be better to just live with the broken sgemv call
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If dlsym fails the broken sgemv cannot be found because I have overshadowed the symbols sgemv_ and cblas_sgemv. The only way to get the original one is through dlsym. That is also how libVecFort solves a similar issue. I am aborting on error because I would otherwise have to change _dotblas.c to raise an import error. But the idea was to avoid changes to that file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
then we better live with the shadowed sgemv_ instead of aborting, an extra copy is better than not being able to start numpy
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh right you need the originals because you actually call back into them, nevermind then