diff --git a/doc/release/1.10.0-notes.rst b/doc/release/1.10.0-notes.rst
index 3f52bd5af4d2..386edac45a95 100644
--- a/doc/release/1.10.0-notes.rst
+++ b/doc/release/1.10.0-notes.rst
@@ -11,6 +11,7 @@ Highlights
 Dropped Support
 ===============
 * The polytemplate.py file has been removed.
+* The _dotblas module is no longer available.
 
 
 Future Changes
@@ -33,6 +34,17 @@ Improvements
 Changes
 =======
 
+dotblas functionality moved to multiarray
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The cblas versions of dot, inner, and vdot have been integrated into
+the multiarray module. In particular, vdot is now a multiarray function,
+which it was not before.
+
 
 Deprecations
 ============
+
+alterdot, restoredot Deprecated
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The alterdot and restoredot functions no longer do anything and are
+deprecated.
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 09311a5364d4..6f3a10de4b0c 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -2174,43 +2174,6 @@ def luf(lamdaexpr, *args, **kwargs):
 
     """)
 
-add_newdoc('numpy.core', 'alterdot',
-    """
-    Change `dot`, `vdot`, and `inner` to use accelerated BLAS functions.
-
-    Typically, as a user of Numpy, you do not explicitly call this function. If
-    Numpy is built with an accelerated BLAS, this function is automatically
-    called when Numpy is imported.
-
-    When Numpy is built with an accelerated BLAS like ATLAS, these functions
-    are replaced to make use of the faster implementations.  The faster
-    implementations only affect float32, float64, complex64, and complex128
-    arrays. Furthermore, the BLAS API only includes matrix-matrix,
-    matrix-vector, and vector-vector products. Products of arrays with larger
-    dimensionalities use the built in functions and are not accelerated.
-
-    See Also
-    --------
-    restoredot : `restoredot` undoes the effects of `alterdot`.
-
-    """)
-
-add_newdoc('numpy.core', 'restoredot',
-    """
-    Restore `dot`, `vdot`, and `innerproduct` to the default non-BLAS
-    implementations.
-
-    Typically, the user will only need to call this when troubleshooting and
-    installation problem, reproducing the conditions of a build without an
-    accelerated BLAS, or when being very careful about benchmarking linear
-    algebra operations.
-
-    See Also
-    --------
-    alterdot : `restoredot` undoes the effects of `alterdot`.
-
-    """)
-
 add_newdoc('numpy.core', 'vdot',
     """
     vdot(a, b)
diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index 0b8d5bb17786..371d34b588ee 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -3,7 +3,20 @@
 from .info import __doc__
 from numpy.version import version as __version__
 
+# Disable the OpenBLAS/GotoBLAS main-thread affinity setting, which would
+# otherwise limit Python threads or processes to one core.
+import os
+envbak = os.environ.copy()
+if 'OPENBLAS_MAIN_FREE' not in os.environ:
+    os.environ['OPENBLAS_MAIN_FREE'] = '1'
+if 'GOTOBLAS_MAIN_FREE' not in os.environ:
+    os.environ['GOTOBLAS_MAIN_FREE'] = '1'
 from . import multiarray
+os.environ.clear()
+os.environ.update(envbak)
+del envbak
+del os
+
 from . import umath
 from . import _internal # for freeze programs
 from . import numerictypes as nt
diff --git a/numpy/core/bento.info b/numpy/core/bento.info
index 299bd8ca516c..0a22dc710e3a 100644
--- a/numpy/core/bento.info
+++ b/numpy/core/bento.info
@@ -33,9 +33,6 @@ Library:
         Sources:
             src/private/scalarmathmodule.h.src,
             src/scalarmathmodule.c.src
-    Extension: _dotblas
-        Sources:
-            blasdot/_dotblas.c
     Extension: test_rational
         Sources:
             src/umath/test_rational.c.src
diff --git a/numpy/core/blasdot/cblas.h b/numpy/core/blasdot/cblas.h
deleted file mode 100644
index 25de09edfe8c..000000000000
--- a/numpy/core/blasdot/cblas.h
+++ /dev/null
@@ -1,578 +0,0 @@
-#ifndef CBLAS_H
-#define CBLAS_H
-#include <stddef.h>
-
-/* Allow the use in C++ code.  */
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-/*
- * Enumerated and derived types
- */
-#define CBLAS_INDEX size_t  /* this may vary between platforms */
-
-enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
-enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
-enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
-enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
-enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
-
-/*
- * ===========================================================================
- * Prototypes for level 1 BLAS functions (complex are recast as routines)
- * ===========================================================================
- */
-float  cblas_sdsdot(const int N, const float alpha, const float *X,
-                    const int incX, const float *Y, const int incY);
-double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
-                   const int incY);
-float  cblas_sdot(const int N, const float  *X, const int incX,
-                  const float  *Y, const int incY);
-double cblas_ddot(const int N, const double *X, const int incX,
-                  const double *Y, const int incY);
-
-/*
- * Functions having prefixes Z and C only
- */
-void   cblas_cdotu_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotu);
-void   cblas_cdotc_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotc);
-
-void   cblas_zdotu_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotu);
-void   cblas_zdotc_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotc);
-
-
-/*
- * Functions having prefixes S D SC DZ
- */
-float  cblas_snrm2(const int N, const float *X, const int incX);
-float  cblas_sasum(const int N, const float *X, const int incX);
-
-double cblas_dnrm2(const int N, const double *X, const int incX);
-double cblas_dasum(const int N, const double *X, const int incX);
-
-float  cblas_scnrm2(const int N, const void *X, const int incX);
-float  cblas_scasum(const int N, const void *X, const int incX);
-
-double cblas_dznrm2(const int N, const void *X, const int incX);
-double cblas_dzasum(const int N, const void *X, const int incX);
-
-
-/*
- * Functions having standard 4 prefixes (S D C Z)
- */
-CBLAS_INDEX cblas_isamax(const int N, const float  *X, const int incX);
-CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
-CBLAS_INDEX cblas_icamax(const int N, const void   *X, const int incX);
-CBLAS_INDEX cblas_izamax(const int N, const void   *X, const int incX);
-
-/*
- * ===========================================================================
- * Prototypes for level 1 BLAS routines
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (s, d, c, z)
- */
-void cblas_sswap(const int N, float *X, const int incX,
-                 float *Y, const int incY);
-void cblas_scopy(const int N, const float *X, const int incX,
-                 float *Y, const int incY);
-void cblas_saxpy(const int N, const float alpha, const float *X,
-                 const int incX, float *Y, const int incY);
-
-void cblas_dswap(const int N, double *X, const int incX,
-                 double *Y, const int incY);
-void cblas_dcopy(const int N, const double *X, const int incX,
-                 double *Y, const int incY);
-void cblas_daxpy(const int N, const double alpha, const double *X,
-                 const int incX, double *Y, const int incY);
-
-void cblas_cswap(const int N, void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_ccopy(const int N, const void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_caxpy(const int N, const void *alpha, const void *X,
-                 const int incX, void *Y, const int incY);
-
-void cblas_zswap(const int N, void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_zcopy(const int N, const void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_zaxpy(const int N, const void *alpha, const void *X,
-                 const int incX, void *Y, const int incY);
-
-
-/*
- * Routines with S and D prefix only
- */
-void cblas_srotg(float *a, float *b, float *c, float *s);
-void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
-void cblas_srot(const int N, float *X, const int incX,
-                float *Y, const int incY, const float c, const float s);
-void cblas_srotm(const int N, float *X, const int incX,
-                float *Y, const int incY, const float *P);
-
-void cblas_drotg(double *a, double *b, double *c, double *s);
-void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
-void cblas_drot(const int N, double *X, const int incX,
-                double *Y, const int incY, const double c, const double  s);
-void cblas_drotm(const int N, double *X, const int incX,
-                double *Y, const int incY, const double *P);
-
-
-/*
- * Routines with S D C Z CS and ZD prefixes
- */
-void cblas_sscal(const int N, const float alpha, float *X, const int incX);
-void cblas_dscal(const int N, const double alpha, double *X, const int incX);
-void cblas_cscal(const int N, const void *alpha, void *X, const int incX);
-void cblas_zscal(const int N, const void *alpha, void *X, const int incX);
-void cblas_csscal(const int N, const float alpha, void *X, const int incX);
-void cblas_zdscal(const int N, const double alpha, void *X, const int incX);
-
-/*
- * ===========================================================================
- * Prototypes for level 2 BLAS
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (S, D, C, Z)
- */
-void cblas_sgemv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 const float *X, const int incX, const float beta,
-                 float *Y, const int incY);
-void cblas_sgbmv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const float alpha,
-                 const float *A, const int lda, const float *X,
-                 const int incX, const float beta, float *Y, const int incY);
-void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *A, const int lda,
-                 float *X, const int incX);
-void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const float *A, const int lda,
-                 float *X, const int incX);
-void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *Ap, float *X, const int incX);
-void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *A, const int lda, float *X,
-                 const int incX);
-void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const float *A, const int lda,
-                 float *X, const int incX);
-void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *Ap, float *X, const int incX);
-
-void cblas_dgemv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 const double *X, const int incX, const double beta,
-                 double *Y, const int incY);
-void cblas_dgbmv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const double alpha,
-                 const double *A, const int lda, const double *X,
-                 const int incX, const double beta, double *Y, const int incY);
-void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *A, const int lda,
-                 double *X, const int incX);
-void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const double *A, const int lda,
-                 double *X, const int incX);
-void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *Ap, double *X, const int incX);
-void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *A, const int lda, double *X,
-                 const int incX);
-void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const double *A, const int lda,
-                 double *X, const int incX);
-void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *Ap, double *X, const int incX);
-
-void cblas_cgemv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *X, const int incX, const void *beta,
-                 void *Y, const int incY);
-void cblas_cgbmv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const void *alpha,
-                 const void *A, const int lda, const void *X,
-                 const int incX, const void *beta, void *Y, const int incY);
-void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda, void *X,
-                 const int incX);
-void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-
-void cblas_zgemv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *X, const int incX, const void *beta,
-                 void *Y, const int incY);
-void cblas_zgbmv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const void *alpha,
-                 const void *A, const int lda, const void *X,
-                 const int incX, const void *beta, void *Y, const int incY);
-void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda, void *X,
-                 const int incX);
-void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-
-
-/*
- * Routines with S and D prefixes only
- */
-void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const float alpha, const float *A,
-                 const int lda, const float *X, const int incX,
-                 const float beta, float *Y, const int incY);
-void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const float alpha, const float *A,
-                 const int lda, const float *X, const int incX,
-                 const float beta, float *Y, const int incY);
-void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const float alpha, const float *Ap,
-                 const float *X, const int incX,
-                 const float beta, float *Y, const int incY);
-void cblas_sger(const enum CBLAS_ORDER order, const int M, const int N,
-                const float alpha, const float *X, const int incX,
-                const float *Y, const int incY, float *A, const int lda);
-void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, float *A, const int lda);
-void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, float *Ap);
-void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, const float *Y, const int incY, float *A,
-                const int lda);
-void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, const float *Y, const int incY, float *A);
-
-void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const double alpha, const double *A,
-                 const int lda, const double *X, const int incX,
-                 const double beta, double *Y, const int incY);
-void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const double alpha, const double *A,
-                 const int lda, const double *X, const int incX,
-                 const double beta, double *Y, const int incY);
-void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const double alpha, const double *Ap,
-                 const double *X, const int incX,
-                 const double beta, double *Y, const int incY);
-void cblas_dger(const enum CBLAS_ORDER order, const int M, const int N,
-                const double alpha, const double *X, const int incX,
-                const double *Y, const int incY, double *A, const int lda);
-void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, double *A, const int lda);
-void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, double *Ap);
-void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, const double *Y, const int incY, double *A,
-                const int lda);
-void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, const double *Y, const int incY, double *A);
-
-
-/*
- * Routines with C and Z prefixes only
- */
-void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *Ap,
-                 const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_cgeru(const enum CBLAS_ORDER order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_cgerc(const enum CBLAS_ORDER order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const void *X, const int incX,
-                void *A, const int lda);
-void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const void *X,
-                const int incX, void *A);
-void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *A, const int lda);
-void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *Ap);
-
-void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *Ap,
-                 const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_zgeru(const enum CBLAS_ORDER order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_zgerc(const enum CBLAS_ORDER order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const void *X, const int incX,
-                void *A, const int lda);
-void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const void *X,
-                const int incX, void *A);
-void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *A, const int lda);
-void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *Ap);
-
-/*
- * ===========================================================================
- * Prototypes for level 3 BLAS
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (S, D, C, Z)
- */
-void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const float alpha, const float *A,
-                 const int lda, const float *B, const int ldb,
-                 const float beta, float *C, const int ldc);
-void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 const float *B, const int ldb, const float beta,
-                 float *C, const int ldc);
-void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const float alpha, const float *A, const int lda,
-                 const float beta, float *C, const int ldc);
-void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const float alpha, const float *A, const int lda,
-                  const float *B, const int ldb, const float beta,
-                  float *C, const int ldc);
-void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb);
-void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb);
-
-void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const double alpha, const double *A,
-                 const int lda, const double *B, const int ldb,
-                 const double beta, double *C, const int ldc);
-void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 const double *B, const int ldb, const double beta,
-                 double *C, const int ldc);
-void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const double alpha, const double *A, const int lda,
-                 const double beta, double *C, const int ldc);
-void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const double alpha, const double *A, const int lda,
-                  const double *B, const int ldb, const double beta,
-                  double *C, const int ldc);
-void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb);
-void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb);
-
-void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const void *alpha, const void *A,
-                 const int lda, const void *B, const int ldb,
-                 const void *beta, void *C, const int ldc);
-void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const void *alpha, const void *A, const int lda,
-                 const void *beta, void *C, const int ldc);
-void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const void *beta,
-                  void *C, const int ldc);
-void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-
-void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const void *alpha, const void *A,
-                 const int lda, const void *B, const int ldb,
-                 const void *beta, void *C, const int ldc);
-void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const void *alpha, const void *A, const int lda,
-                 const void *beta, void *C, const int ldc);
-void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const void *beta,
-                  void *C, const int ldc);
-void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-
-
-/*
- * Routines with prefixes C and Z only
- */
-void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const float alpha, const void *A, const int lda,
-                 const float beta, void *C, const int ldc);
-void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const float beta,
-                  void *C, const int ldc);
-
-void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const double alpha, const void *A, const int lda,
-                 const double beta, void *C, const int ldc);
-void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const double beta,
-                  void *C, const int ldc);
-
-void cblas_xerbla(int p, const char *rout, const char *form, ...);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 416e16524bb2..da9ff2799e92 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -443,7 +443,8 @@ def pre_build(context):
                 "src/multiarray/einsum.c.src"]
         bld(target="multiarray_templates", source=multiarray_templates)
         if ENABLE_SEPARATE_COMPILATION:
-            sources = [pjoin('src', 'multiarray', 'arrayobject.c'),
+            sources = [
+                pjoin('src', 'multiarray', 'arrayobject.c'),
                 pjoin('src', 'multiarray', 'alloc.c'),
                 pjoin('src', 'multiarray', 'arraytypes.c.src'),
                 pjoin('src', 'multiarray', 'array_assign.c'),
@@ -485,17 +486,30 @@ def pre_build(context):
                 pjoin('src', 'multiarray', 'sequence.c'),
                 pjoin('src', 'multiarray', 'shape.c'),
                 pjoin('src', 'multiarray', 'ucsnarrow.c'),
-                pjoin('src', 'multiarray', 'usertypes.c')]
+                pjoin('src', 'multiarray', 'usertypes.c'),
+                pjoin('src', 'multiarray', 'vdot.c'),
+                ]
+
+            if bld.env.HAS_CBLAS:
+                sources.append(pjoin('src', 'multiarray', 'cblasfuncs.c'))
         else:
             sources = extension.sources
+
+        use = 'npysort npymath'
+        defines = ['_FILE_OFFSET_BITS=64',
+                   '_LARGEFILE_SOURCE=1',
+                   '_LARGEFILE64_SOURCE=1']
+
+        if bld.env.HAS_CBLAS:
+            use += ' CBLAS'
+            defines.append('HAVE_CBLAS')
+
         includes = ["src/multiarray", "src/private"]
         return context.default_builder(extension,
                                        includes=includes,
                                        source=sources,
-                                       use="npysort npymath",
-                                       defines=['_FILE_OFFSET_BITS=64',
-                                                '_LARGEFILE_SOURCE=1',
-                                                '_LARGEFILE64_SOURCE=1']
+                                       use=use,
+                                       defines=defines
                                        )
     context.register_builder("multiarray", builder_multiarray)
 
@@ -537,9 +551,3 @@ def pre_build(context):
     context.tweak_extension("scalarmath", use="npymath", includes=["src/private"])
     context.tweak_extension("multiarray_tests", use="npymath", includes=["src/private"])
     context.tweak_extension("umath_tests", use="npymath", includes=["src/private"])
-
-    def build_dotblas(extension):
-        if bld.env.HAS_CBLAS:
-            return context.default_builder(extension, use="CBLAS",
-                                           includes=["src/multiarray", "src/private"])
-    context.register_builder("_dotblas", build_dotblas)
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index efd8af45dbd2..ac51f5d011b9 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -4,7 +4,7 @@
 import sys
 import warnings
 import collections
-from . import multiarray
+from numpy.core import multiarray
 from . import umath
 from .umath import (invert, sin, UFUNC_BUFSIZE_DEFAULT, ERR_IGNORE,
                     ERR_WARN, ERR_RAISE, ERR_CALL, ERR_PRINT, ERR_LOG,
@@ -1077,31 +1077,61 @@ def outer(a, b, out=None):
 
 # try to import blas optimized dot if available
 envbak = os.environ.copy()
-try:
-    # importing this changes the dot function for basic 4 types
-    # to blas-optimized versions.
-
-    # disables openblas affinity setting of the main thread that limits
-    # python threads or processes to one core
-    if 'OPENBLAS_MAIN_FREE' not in os.environ:
-        os.environ['OPENBLAS_MAIN_FREE'] = '1'
-    if 'GOTOBLAS_MAIN_FREE' not in os.environ:
-        os.environ['GOTOBLAS_MAIN_FREE'] = '1'
-    from ._dotblas import dot, vdot, inner, alterdot, restoredot
-except ImportError:
-    # docstrings are in add_newdocs.py
-    inner = multiarray.inner
-    dot = multiarray.dot
-    def vdot(a, b):
-        return dot(asarray(a).ravel().conj(), asarray(b).ravel())
-    def alterdot():
-        pass
-    def restoredot():
-        pass
-finally:
-    os.environ.clear()
-    os.environ.update(envbak)
-    del envbak
+dot = multiarray.dot
+inner = multiarray.inner
+vdot = multiarray.vdot
+
+def alterdot():
+    """
+    Change `dot`, `vdot`, and `inner` to use accelerated BLAS functions.
+
+    Typically, as a user of Numpy, you do not explicitly call this
+    function. If Numpy is built with an accelerated BLAS, this function is
+    automatically called when Numpy is imported.
+
+    When Numpy is built with an accelerated BLAS like ATLAS, these
+    functions are replaced to make use of the faster implementations.  The
+    faster implementations only affect float32, float64, complex64, and
+    complex128 arrays. Furthermore, the BLAS API only includes
+    matrix-matrix, matrix-vector, and vector-vector products. Products of
+    arrays with larger dimensionalities use the built in functions and are
+    not accelerated.
+
+    .. note:: Deprecated in Numpy 1.10
+              The cblas functions have been integrated into the multiarray
+              module and alterdot no longer does anything. It will be
+              removed in Numpy 1.11.0.
+
+    See Also
+    --------
+    restoredot : `restoredot` undoes the effects of `alterdot`.
+
+    """
+    warnings.warn("alterdot no longer does anything.", DeprecationWarning)
+
+
+def restoredot():
+    """
+    Restore `dot`, `vdot`, and `innerproduct` to the default non-BLAS
+    implementations.
+
+    Typically, the user will only need to call this when troubleshooting
+    an installation problem, reproducing the conditions of a build without
+    an accelerated BLAS, or when being very careful about benchmarking
+    linear algebra operations.
+
+    .. note:: Deprecated in Numpy 1.10
+              The cblas functions have been integrated into the multiarray
+              module and restoredot no longer does anything. It will be
+              removed in Numpy 1.11.0.
+
+    See Also
+    --------
+    alterdot : `restoredot` undoes the effects of `alterdot`.
+
+    """
+    warnings.warn("restoredot no longer does anything.", DeprecationWarning)
+
 
 def tensordot(a, b, axes=2):
     """
@@ -2153,6 +2183,41 @@ def identity(n, dtype=None):
     from numpy import eye
     return eye(n, dtype=dtype)
 
+def _allclose_points(a, b, rtol=1.e-5, atol=1.e-8):
+    """
+    Point-wise inner calculation of 'allclose' (subtly different from
+    'isclose'), provided as a comparison routine for testing.assert_allclose.
+    Returns the element-wise ``abs(a - b) <= atol + rtol * abs(b)`` result, or
+    plain False when infs in `a` and `b` differ in position or sign.
+
+    """
+    x = array(a, copy=False, ndmin=1)
+    y = array(b, copy=False, ndmin=1)
+
+    # make sure y is an inexact type to avoid abs(MIN_INT); will cause
+    # casting of x later.
+    dtype = multiarray.result_type(y, 1.)
+    y = array(y, dtype=dtype, copy=False)
+
+    xinf = isinf(x)
+    yinf = isinf(y)
+    if any(xinf) or any(yinf):
+        # Check that x and y have inf's only in the same positions
+        if not all(xinf == yinf):
+            return False
+        # Check that sign of inf's in x and y is the same
+        if not all(x[xinf] == y[xinf]):
+            return False
+
+        x = x[~xinf]
+        y = y[~xinf]
+
+    # ignore invalid fpe's
+    with errstate(invalid='ignore'):
+        r = less_equal(abs(x - y), atol + rtol * abs(y))
+
+    return r
+
 def allclose(a, b, rtol=1.e-5, atol=1.e-8):
     """
     Returns True if two arrays are element-wise equal within a tolerance.
@@ -2208,32 +2273,7 @@ def allclose(a, b, rtol=1.e-5, atol=1.e-8):
     False
 
     """
-    x = array(a, copy=False, ndmin=1)
-    y = array(b, copy=False, ndmin=1)
-
-    # make sure y is an inexact type to avoid abs(MIN_INT); will cause
-    # casting of x later.
-    dtype = multiarray.result_type(y, 1.)
-    y = array(y, dtype=dtype, copy=False)
-
-    xinf = isinf(x)
-    yinf = isinf(y)
-    if any(xinf) or any(yinf):
-        # Check that x and y have inf's only in the same positions
-        if not all(xinf == yinf):
-            return False
-        # Check that sign of inf's in x and y is the same
-        if not all(x[xinf] == y[xinf]):
-            return False
-
-        x = x[~xinf]
-        y = y[~xinf]
-
-    # ignore invalid fpe's
-    with errstate(invalid='ignore'):
-        r = all(less_equal(abs(x - y), atol + rtol * abs(y)))
-
-    return r
+    return all(_allclose_points(a, b, rtol=rtol, atol=atol))
 
 def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
     """
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 15f66fa6cb35..2da2e837a9f7 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -772,6 +772,7 @@ def generate_multiarray_templated_sources(ext, build_dir):
             join('src', 'multiarray', 'shape.h'),
             join('src', 'multiarray', 'ucsnarrow.h'),
             join('src', 'multiarray', 'usertypes.h'),
+            join('src', 'multiarray', 'vdot.h'),
             join('src', 'private', 'lowlevel_strided_loops.h'),
             join('include', 'numpy', 'arrayobject.h'),
             join('include', 'numpy', '_neighborhood_iterator_imp.h'),
@@ -838,23 +839,33 @@ def generate_multiarray_templated_sources(ext, build_dir):
             join('src', 'multiarray', 'scalarapi.c'),
             join('src', 'multiarray', 'scalartypes.c.src'),
             join('src', 'multiarray', 'usertypes.c'),
-            join('src', 'multiarray', 'ucsnarrow.c')]
+            join('src', 'multiarray', 'ucsnarrow.c'),
+            join('src', 'multiarray', 'vdot.c'),
+            ]
 
+    blas_info = get_info('blas_opt', 0)
+    if blas_info and  ('HAVE_CBLAS', None) in blas_info.get('define_macros', []):
+        extra_info = blas_info
+        multiarray_src.append(join('src', 'multiarray', 'cblasfuncs.c'))
+    else:
+        extra_info = {}
 
     if not ENABLE_SEPARATE_COMPILATION:
         multiarray_deps.extend(multiarray_src)
         multiarray_src = [join('src', 'multiarray', 'multiarraymodule_onefile.c')]
         multiarray_src.append(generate_multiarray_templated_sources)
 
+
     config.add_extension('multiarray',
-                         sources = multiarray_src +
+                         sources=multiarray_src +
                                  [generate_config_h,
-                                 generate_numpyconfig_h,
-                                 generate_numpy_api,
-                                 join(codegen_dir, 'generate_numpy_api.py'),
-                                 join('*.py')],
-                         depends = deps + multiarray_deps,
-                         libraries = ['npymath', 'npysort'])
+                                  generate_numpyconfig_h,
+                                  generate_numpy_api,
+                                  join(codegen_dir, 'generate_numpy_api.py'),
+                                  join('*.py')],
+                         depends=deps + multiarray_deps,
+                         libraries=['npymath', 'npysort'],
+                         extra_info=extra_info)
 
     #######################################################################
     #                           umath module                              #
@@ -942,28 +953,6 @@ def generate_umath_c(ext, build_dir):
                          libraries = ['npymath'],
                          )
 
-    #######################################################################
-    #                          _dotblas module                            #
-    #######################################################################
-
-    # Configure blasdot
-    blas_info = get_info('blas_opt', 0)
-    #blas_info = {}
-    def get_dotblas_sources(ext, build_dir):
-        if blas_info:
-            if ('NO_ATLAS_INFO', 1) in blas_info.get('define_macros', []):
-                return None # dotblas needs ATLAS, Fortran compiled blas will not be sufficient.
-            return ext.depends[:1]
-        return None # no extension module will be built
-
-    config.add_extension('_dotblas',
-                         sources = [get_dotblas_sources],
-                         depends = [join('blasdot', '_dotblas.c'),
-                                  join('blasdot', 'cblas.h'),
-                                  ],
-                         include_dirs = ['blasdot'],
-                         extra_info = blas_info
-                         )
 
     #######################################################################
     #                        umath_tests module                           #
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 595ecfcdd11d..9273e003c820 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3,9 +3,10 @@
 #include "Python.h"
 #include "structmember.h"
 
+
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
-#include "numpy/npy_common.h"
+
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
 #include "npy_pycompat.h"
@@ -27,6 +28,12 @@
 #include "numpyos.h"
 #include <string.h>
 
+#if defined(HAVE_CBLAS)
+#include "cblasfuncs.h"
+#include <cblas.h>
+#include <limits.h>
+#endif
+
 
 /*
  *****************************************************************************
@@ -2632,12 +2639,14 @@ STRING_compare(char *ip1, char *ip2, PyArrayObject *ap)
     const unsigned char *c1 = (unsigned char *)ip1;
     const unsigned char *c2 = (unsigned char *)ip2;
     const size_t len = PyArray_DESCR(ap)->elsize;
-    size_t i;
+    int i;
 
-    for(i = 0; i < len; ++i) {
-        if (c1[i] != c2[i]) {
-            return (c1[i] > c2[i]) ? 1 : -1;
-        }
+    i = memcmp(c1, c2, len);
+    if (i > 0) {
+        return 1;
+    }
+    else if (i < 0) {
+        return -1;
     }
     return 0;
 }
@@ -3092,6 +3101,115 @@ static int
  * dot means inner product
  */
 
+/************************** MAYBE USE CBLAS *********************************/
+
+
+/**begin repeat
+ *
+ * #name = FLOAT, DOUBLE#
+ * #type = npy_float, npy_double#
+ * #prefix = s, d#
+ */
+NPY_NO_EXPORT void
+@name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
+           npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    int is1b = blas_stride(is1, sizeof(@type@));
+    int is2b = blas_stride(is2, sizeof(@type@));
+
+    if (is1b && is2b)
+    {
+        double sum = 0.;  /* accumulate the chunk results in double */
+
+        while (n > 0) {
+            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+
+            sum += cblas_@prefix@dot(chunk,
+                                     (@type@ *) ip1, is1b,
+                                     (@type@ *) ip2, is2b);
+            /* ip1/ip2 are char *: advance with byte strides, not is1b/is2b */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        *((@type@ *)op) = (@type@)sum;
+    }
+    else
+#endif
+    {
+        @type@ sum = (@type@)0;  /* could make this double */
+        npy_intp i;
+
+        for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+            const @type@ ip1r = *((@type@ *)ip1);
+            const @type@ ip2r = *((@type@ *)ip2);
+
+            sum += ip1r * ip2r;
+        }
+        *((@type@ *)op) = sum;
+    }
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #name = CFLOAT, CDOUBLE#
+ * #ctype = npy_cfloat, npy_cdouble#
+ * #type = npy_float, npy_double#
+ * #prefix = c, z#
+ */
+NPY_NO_EXPORT void
+@name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+           char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    int is1b = blas_stride(is1, sizeof(@ctype@));
+    int is2b = blas_stride(is2, sizeof(@ctype@));
+
+    if (is1b && is2b) {
+        double sum[2] = {0., 0.};  /* (re, im) accumulated in double */
+
+        while (n > 0) {
+            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            @type@ tmp[2];
+            /* pass chunk, not n: each call must cover only this chunk */
+            cblas_@prefix@dotu_sub(chunk, ip1, is1b, ip2, is2b, tmp);
+            sum[0] += (double)tmp[0];
+            sum[1] += (double)tmp[1];
+            /* ip1/ip2 are char *: advance with byte strides, not is1b/is2b */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        ((@type@ *)op)[0] = (@type@)sum[0];
+        ((@type@ *)op)[1] = (@type@)sum[1];
+    }
+    else
+#endif
+    {
+        @type@ sumr = (@type@)0.0;
+        @type@ sumi = (@type@)0.0;
+        npy_intp i;
+
+        for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+            const @type@ ip1r = ((@type@ *)ip1)[0];
+            const @type@ ip1i = ((@type@ *)ip1)[1];
+            const @type@ ip2r = ((@type@ *)ip2)[0];
+            const @type@ ip2i = ((@type@ *)ip2)[1];
+
+            sumr += ip1r * ip2r - ip1i * ip2i;
+            sumi += ip1r * ip2i + ip1i * ip2r;
+        }
+        ((@type@ *)op)[0] = sumr;
+        ((@type@ *)op)[1] = sumi;
+    }
+}
+
+/**end repeat**/
+
+/**************************** NO CBLAS VERSIONS *****************************/
+
 static void
 BOOL_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
          void *NPY_UNUSED(ignore))
@@ -3112,16 +3230,13 @@ BOOL_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
  *
  * #name = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
  *         LONG, ULONG, LONGLONG, ULONGLONG,
- *         FLOAT, DOUBLE, LONGDOUBLE,
- *         DATETIME, TIMEDELTA#
+ *         LONGDOUBLE, DATETIME, TIMEDELTA#
  * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *         npy_float, npy_double, npy_longdouble,
- *         npy_datetime, npy_timedelta#
+ *         npy_longdouble, npy_datetime, npy_timedelta#
  * #out = npy_long, npy_ulong, npy_long, npy_ulong, npy_long, npy_ulong,
  *        npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *        npy_float, npy_double, npy_longdouble,
- *        npy_datetime, npy_timedelta#
+ *        npy_longdouble, npy_datetime, npy_timedelta#
  */
 static void
 @name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
@@ -3139,8 +3254,8 @@ static void
 /**end repeat**/
 
 static void
-HALF_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
-           void *NPY_UNUSED(ignore))
+HALF_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
+         npy_intp n, void *NPY_UNUSED(ignore))
 {
     float tmp = 0.0f;
     npy_intp i;
@@ -3152,28 +3267,27 @@ HALF_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
     *((npy_half *)op) = npy_float_to_half(tmp);
 }
 
-/**begin repeat
- *
- * #name = CFLOAT, CDOUBLE, CLONGDOUBLE#
- * #type = npy_float, npy_double, npy_longdouble#
- */
-static void @name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
-                       char *op, npy_intp n, void *NPY_UNUSED(ignore))
+static void
+CLONGDOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+                            char *op, npy_intp n, void *NPY_UNUSED(ignore))
 {
-    @type@ tmpr = (@type@)0.0, tmpi=(@type@)0.0;
+    npy_longdouble tmpr = 0.0L;
+    npy_longdouble tmpi = 0.0L;
     npy_intp i;
 
     for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
-        tmpr += ((@type@ *)ip1)[0] * ((@type@ *)ip2)[0]
-                - ((@type@ *)ip1)[1] * ((@type@ *)ip2)[1];
-        tmpi += ((@type@ *)ip1)[1] * ((@type@ *)ip2)[0]
-                + ((@type@ *)ip1)[0] * ((@type@ *)ip2)[1];
+        const npy_longdouble ip1r = ((npy_longdouble *)ip1)[0];
+        const npy_longdouble ip1i = ((npy_longdouble *)ip1)[1];
+        const npy_longdouble ip2r = ((npy_longdouble *)ip2)[0];
+        const npy_longdouble ip2i = ((npy_longdouble *)ip2)[1];
+
+        tmpr += ip1r * ip2r - ip1i * ip2i;
+        tmpi += ip1r * ip2i + ip1i * ip2r;
     }
-    ((@type@ *)op)[0] = tmpr; ((@type@ *)op)[1] = tmpi;
+    ((npy_longdouble *)op)[0] = tmpr;
+    ((npy_longdouble *)op)[1] = tmpi;
 }
 
-/**end repeat**/
-
 static void
 OBJECT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
            void *NPY_UNUSED(ignore))
diff --git a/numpy/core/src/multiarray/arraytypes.h b/numpy/core/src/multiarray/arraytypes.h
index ff7d4ae408b6..9eed3e92b462 100644
--- a/numpy/core/src/multiarray/arraytypes.h
+++ b/numpy/core/src/multiarray/arraytypes.h
@@ -1,6 +1,8 @@
 #ifndef _NPY_ARRAYTYPES_H_
 #define _NPY_ARRAYTYPES_H_
 
+#include "common.h"
+
 #ifdef NPY_ENABLE_SEPARATE_COMPILATION
 extern NPY_NO_EXPORT PyArray_Descr LONGLONG_Descr;
 extern NPY_NO_EXPORT PyArray_Descr LONG_Descr;
@@ -10,4 +12,17 @@ extern NPY_NO_EXPORT PyArray_Descr INT_Descr;
 NPY_NO_EXPORT int
 set_typeinfo(PyObject *dict);
 
+/* needed for cblasfuncs.c */
+NPY_NO_EXPORT void
+FLOAT_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CFLOAT_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+DOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CDOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
 #endif
diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/src/multiarray/cblasfuncs.c
similarity index 54%
rename from numpy/core/blasdot/_dotblas.c
rename to numpy/core/src/multiarray/cblasfuncs.c
index 48aa39ff87df..bc09ed6e8013 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/src/multiarray/cblasfuncs.c
@@ -4,141 +4,15 @@
  */
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
-#include "Python.h"
-
-#include "numpy/arrayobject.h"
-#include "npy_config.h"
-#include "npy_pycompat.h"
-#include "ufunc_override.h"
-#ifndef CBLAS_HEADER
-#define CBLAS_HEADER "cblas.h"
-#endif
-#include CBLAS_HEADER
+#define _MULTIARRAYMODULE
 
+#include <Python.h>
 #include <assert.h>
-#include <limits.h>
-#include <stdio.h>
-
-static char module_doc[] =
-"This module provides a BLAS optimized\nmatrix multiply, inner product and dot for numpy arrays";
-
-static PyArray_DotFunc *oldFunctions[NPY_NTYPES];
-
-#define MIN(a, b)   ((a) < (b) ? (a) : (b))
-
-/*
- * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
- * (BLAS won't handle negative or zero strides the way we want).
- */
-static NPY_INLINE int
-blas_stride(npy_intp stride, unsigned itemsize)
-{
-    if (stride <= 0 || stride % itemsize != 0) {
-        return 0;
-    }
-    stride /= itemsize;
-
-    if (stride > INT_MAX) {
-        return 0;
-    }
-    return stride;
-}
-
-/*
- * The following functions do a "chunked" dot product using BLAS when
- * sizeof(npy_intp) > sizeof(int), because BLAS libraries can typically not
- * handle more than INT_MAX elements per call.
- *
- * The chunksize is the greatest power of two less than INT_MAX.
- */
-#if NPY_MAX_INTP > INT_MAX
-# define CHUNKSIZE  (INT_MAX / 2 + 1)
-#else
-# define CHUNKSIZE  NPY_MAX_INTP
-#endif
-
-static void
-FLOAT_dot(void *a, npy_intp stridea, void *b, npy_intp strideb, void *res,
-          npy_intp n, void *tmp)
-{
-    int na = blas_stride(stridea, sizeof(float));
-    int nb = blas_stride(strideb, sizeof(float));
-
-    if (na && nb) {
-        double r = 0.;          /* double for stability */
-        float *fa = a, *fb = b;
+#include <cblas.h>
+#include <numpy/arrayobject.h>
+#include "arraytypes.h"
+#include "common.h"
 
-        while (n > 0) {
-            int chunk = MIN(n, CHUNKSIZE);
-
-            r += cblas_sdot(chunk, fa, na, fb, nb);
-            fa += chunk * na;
-            fb += chunk * nb;
-            n -= chunk;
-        }
-        *((float *)res) = r;
-    }
-    else {
-        oldFunctions[NPY_FLOAT](a, stridea, b, strideb, res, n, tmp);
-    }
-}
-
-static void
-DOUBLE_dot(void *a, npy_intp stridea, void *b, npy_intp strideb, void *res,
-           npy_intp n, void *tmp)
-{
-    int na = blas_stride(stridea, sizeof(double));
-    int nb = blas_stride(strideb, sizeof(double));
-
-    if (na && nb) {
-        double r = 0.;
-        double *da = a, *db = b;
-
-        while (n > 0) {
-            int chunk = MIN(n, CHUNKSIZE);
-
-            r += cblas_ddot(chunk, da, na, db, nb);
-            da += chunk * na;
-            db += chunk * nb;
-            n -= chunk;
-        }
-        *((double *)res) = r;
-    }
-    else {
-        oldFunctions[NPY_DOUBLE](a, stridea, b, strideb, res, n, tmp);
-    }
-}
-
-static void
-CFLOAT_dot(void *a, npy_intp stridea, void *b, npy_intp strideb, void *res,
-           npy_intp n, void *tmp)
-{
-    int na = blas_stride(stridea, sizeof(npy_cfloat));
-    int nb = blas_stride(strideb, sizeof(npy_cfloat));
-
-    if (na && nb) {
-        cblas_cdotu_sub((int)n, (float *)a, na, (float *)b, nb, (float *)res);
-    }
-    else {
-        oldFunctions[NPY_CFLOAT](a, stridea, b, strideb, res, n, tmp);
-    }
-}
-
-static void
-CDOUBLE_dot(void *a, npy_intp stridea, void *b, npy_intp strideb, void *res,
-            npy_intp n, void *tmp)
-{
-    int na = blas_stride(stridea, sizeof(npy_cdouble));
-    int nb = blas_stride(strideb, sizeof(npy_cdouble));
-
-    if (na && nb) {
-        cblas_zdotu_sub((int)n, (double *)a, na, (double *)b, nb,
-                        (double *)res);
-    }
-    else {
-        oldFunctions[NPY_CDOUBLE](a, stridea, b, strideb, res, n, tmp);
-    }
-}
 
 /*
  * Helper: call appropriate BLAS dot function for typenum.
@@ -148,29 +22,27 @@ static void
 blas_dot(int typenum, npy_intp n,
          void *a, npy_intp stridea, void *b, npy_intp strideb, void *res)
 {
-    PyArray_DotFunc *dot = NULL;
     switch (typenum) {
         case NPY_DOUBLE:
-            dot = DOUBLE_dot;
+            DOUBLE_dot(a, stridea, b, strideb, res, n, NULL);
             break;
         case NPY_FLOAT:
-            dot = FLOAT_dot;
+            FLOAT_dot(a, stridea, b, strideb, res, n, NULL);
             break;
         case NPY_CDOUBLE:
-            dot = CDOUBLE_dot;
+            CDOUBLE_dot(a, stridea, b, strideb, res, n, NULL);
             break;
         case NPY_CFLOAT:
-            dot = CFLOAT_dot;
+            CFLOAT_dot(a, stridea, b, strideb, res, n, NULL);
             break;
     }
-    assert(dot != NULL);
-    dot(a, stridea, b, strideb, res, n, NULL);
 }
 
 
 static const double oneD[2] = {1.0, 0.0}, zeroD[2] = {0.0, 0.0};
 static const float oneF[2] = {1.0, 0.0}, zeroF[2] = {0.0, 0.0};
 
+
 /*
  * Helper: dispatch to appropriate cblas_?gemm for typenum.
  */
@@ -182,7 +54,6 @@ gemm(int typenum, enum CBLAS_ORDER order,
 {
     const void *Adata = PyArray_DATA(A), *Bdata = PyArray_DATA(B);
     void *Rdata = PyArray_DATA(R);
-
     int ldc = PyArray_DIM(R, 1) > 1 ? PyArray_DIM(R, 1) : 1;
 
     switch (typenum) {
@@ -240,124 +111,52 @@ gemv(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
 }
 
 
-static npy_bool altered=NPY_FALSE;
-
-/*
- * alterdot() changes all dot functions to use blas.
- */
-static PyObject *
-dotblas_alterdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
-{
-    PyArray_Descr *descr;
-
-    if (!PyArg_ParseTuple(args, "")) return NULL;
-
-    /* Replace the dot functions to the ones using blas */
-
-    if (!altered) {
-        descr = PyArray_DescrFromType(NPY_FLOAT);
-        oldFunctions[NPY_FLOAT] = descr->f->dotfunc;
-        descr->f->dotfunc = (PyArray_DotFunc *)FLOAT_dot;
-
-        descr = PyArray_DescrFromType(NPY_DOUBLE);
-        oldFunctions[NPY_DOUBLE] = descr->f->dotfunc;
-        descr->f->dotfunc = (PyArray_DotFunc *)DOUBLE_dot;
-
-        descr = PyArray_DescrFromType(NPY_CFLOAT);
-        oldFunctions[NPY_CFLOAT] = descr->f->dotfunc;
-        descr->f->dotfunc = (PyArray_DotFunc *)CFLOAT_dot;
-
-        descr = PyArray_DescrFromType(NPY_CDOUBLE);
-        oldFunctions[NPY_CDOUBLE] = descr->f->dotfunc;
-        descr->f->dotfunc = (PyArray_DotFunc *)CDOUBLE_dot;
-
-        altered = NPY_TRUE;
-    }
-
-    Py_INCREF(Py_None);
-    return Py_None;
-}
-
-/*
- * restoredot() restores dots to defaults.
- */
-static PyObject *
-dotblas_restoredot(PyObject *NPY_UNUSED(dummy), PyObject *args)
-{
-    PyArray_Descr *descr;
-
-    if (!PyArg_ParseTuple(args, "")) return NULL;
-
-    if (altered) {
-        descr = PyArray_DescrFromType(NPY_FLOAT);
-        descr->f->dotfunc = oldFunctions[NPY_FLOAT];
-        oldFunctions[NPY_FLOAT] = NULL;
-        Py_XDECREF(descr);
-
-        descr = PyArray_DescrFromType(NPY_DOUBLE);
-        descr->f->dotfunc = oldFunctions[NPY_DOUBLE];
-        oldFunctions[NPY_DOUBLE] = NULL;
-        Py_XDECREF(descr);
-
-        descr = PyArray_DescrFromType(NPY_CFLOAT);
-        descr->f->dotfunc = oldFunctions[NPY_CFLOAT];
-        oldFunctions[NPY_CFLOAT] = NULL;
-        Py_XDECREF(descr);
-
-        descr = PyArray_DescrFromType(NPY_CDOUBLE);
-        descr->f->dotfunc = oldFunctions[NPY_CDOUBLE];
-        oldFunctions[NPY_CDOUBLE] = NULL;
-        Py_XDECREF(descr);
-
-        altered = NPY_FALSE;
-    }
-
-    Py_INCREF(Py_None);
-    return Py_None;
-}
-
 typedef enum {_scalar, _column, _row, _matrix} MatrixShape;
 
+
 static MatrixShape
 _select_matrix_shape(PyArrayObject *array)
 {
     switch (PyArray_NDIM(array)) {
-    case 0:
-        return _scalar;
-    case 1:
-        if (PyArray_DIM(array, 0) > 1)
-            return _column;
-        return _scalar;
-    case 2:
-        if (PyArray_DIM(array, 0) > 1) {
-            if (PyArray_DIM(array, 1) == 1)
+        case 0:
+            return _scalar;
+        case 1:
+            if (PyArray_DIM(array, 0) > 1)
                 return _column;
-            else
-                return _matrix;
-        }
-        if (PyArray_DIM(array, 1) == 1)
             return _scalar;
-        return _row;
+        case 2:
+            if (PyArray_DIM(array, 0) > 1) {
+                if (PyArray_DIM(array, 1) == 1)
+                    return _column;
+                else
+                    return _matrix;
+            }
+            if (PyArray_DIM(array, 1) == 1)
+                return _scalar;
+            return _row;
     }
     return _matrix;
 }
 
 
-/* This also makes sure that the data segment is aligned with
-   an itemsize address as well by returning one if not true.
-*/
+/*
+ * This also makes sure that the data segment is aligned with
+ * an itemsize address as well by returning one if not true.
+ */
 static int
 _bad_strides(PyArrayObject *ap)
 {
-    register int itemsize = PyArray_ITEMSIZE(ap);
-    register int i, N=PyArray_NDIM(ap);
-    register npy_intp *strides = PyArray_STRIDES(ap);
+    int itemsize = PyArray_ITEMSIZE(ap);
+    int i, N=PyArray_NDIM(ap);
+    npy_intp *strides = PyArray_STRIDES(ap);
 
-    if (((npy_intp)(PyArray_DATA(ap)) % itemsize) != 0)
+    if (((npy_intp)(PyArray_DATA(ap)) % itemsize) != 0) {
         return 1;
-    for (i=0; i<N; i++) {
-        if ((strides[i] < 0) || (strides[i] % itemsize) != 0)
+    }
+    for (i = 0; i < N; i++) {
+        if ((strides[i] < 0) || (strides[i] % itemsize) != 0) {
             return 1;
+        }
     }
 
     return 0;
@@ -369,110 +168,30 @@ _bad_strides(PyArrayObject *ap)
  * Like the generic numpy equivalent the product sum is over
  * the last dimension of a and the second-to-last dimension of b.
  * NB: The first argument is not conjugated.;
+ *
+ * This is for use by PyArray_MatrixProduct2. It is assumed on entry that
+ * the arrays ap1 and ap2 have a common data type given by typenum that is
+ * float, double, cfloat, or cdouble and have dimension <= 2. Any
+ * __numpy_ufunc__ override is also assumed to have been handled already.
  */
-static PyObject *
-dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwargs)
+NPY_NO_EXPORT PyObject *
+cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
+                    PyArrayObject *out)
 {
-    static PyObject *cached_npy_dot = NULL;
-    PyObject *override = NULL;
-    PyObject *module;
-    PyObject *op1, *op2;
-    PyArrayObject *ap1 = NULL, *ap2 = NULL, *out = NULL, *ret = NULL;
-    int errval;
+    PyArrayObject *ret = NULL;
     int j, lda, ldb;
     npy_intp l;
-    int typenum, nd;
+    int nd;
     npy_intp ap1stride = 0;
     npy_intp dimensions[NPY_MAXDIMS];
     npy_intp numbytes;
     double prior1, prior2;
     PyTypeObject *subtype;
-    PyArray_Descr *dtype;
     MatrixShape ap1shape, ap2shape;
-    char* kwords[] = {"a", "b", "out", NULL };
-
-    if (cached_npy_dot == NULL) {
-        module = PyImport_ImportModule("numpy.core._dotblas");
-        cached_npy_dot = PyDict_GetItemString(PyModule_GetDict(module), "dot");
-
-        Py_INCREF(cached_npy_dot);
-        Py_DECREF(module);
-    }
-
-    errval = PyUFunc_CheckOverride(cached_npy_dot, "__call__", args, kwargs, 
-                                   &override, 2);
-    if (errval) {
-        return NULL;
-    }
-    else if (override) {
-        return override;
-    }
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O", kwords,
-                                    &op1, &op2, &out)) {
-        return NULL;
-    }
-    if ((PyObject *)out == Py_None) {
-        out = NULL;
-    }
-
-    /*
-     * "Matrix product" using the BLAS.
-     * Only works for float double and complex types.
-     */
-
-    typenum = PyArray_ObjectType(op1, 0);
-    typenum = PyArray_ObjectType(op2, typenum);
-
-    /* This function doesn't handle other types */
-    if ((typenum != NPY_DOUBLE && typenum != NPY_CDOUBLE &&
-         typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
-        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct2(
-                                                    (PyObject *)op1,
-                                                    (PyObject *)op2,
-                                                    out));
-    }
-
-    dtype = PyArray_DescrFromType(typenum);
-    if (dtype == NULL) {
-        return NULL;
-    }
-    Py_INCREF(dtype);
-    ap1 = (PyArrayObject *)PyArray_FromAny(op1, dtype, 0, 0, NPY_ARRAY_ALIGNED, NULL);
-    if (ap1 == NULL) {
-        Py_DECREF(dtype);
-        return NULL;
-    }
-    ap2 = (PyArrayObject *)PyArray_FromAny(op2, dtype, 0, 0, NPY_ARRAY_ALIGNED, NULL);
-    if (ap2 == NULL) {
-        Py_DECREF(ap1);
-        return NULL;
-    }
-
-    if ((PyArray_NDIM(ap1) > 2) || (PyArray_NDIM(ap2) > 2)) {
-        /*
-         * This function doesn't handle dimensions greater than 2
-         * (or negative striding)  -- other
-         * than to ensure the dot function is altered
-         */
-        if (!altered) {
-            /* need to alter dot product */
-            PyObject *tmp1, *tmp2;
-            tmp1 = PyTuple_New(0);
-            tmp2 = dotblas_alterdot(NULL, tmp1);
-            Py_DECREF(tmp1);
-            Py_DECREF(tmp2);
-        }
-        ret = (PyArrayObject *)PyArray_MatrixProduct2((PyObject *)ap1,
-                                                      (PyObject *)ap2,
-                                                      out);
-        Py_DECREF(ap1);
-        Py_DECREF(ap2);
-        return PyArray_Return(ret);
-    }
 
     if (_bad_strides(ap1)) {
-            op1 = PyArray_NewCopy(ap1, NPY_ANYORDER);
+            PyObject *op1 = PyArray_NewCopy(ap1, NPY_ANYORDER);
+
             Py_DECREF(ap1);
             ap1 = (PyArrayObject *)op1;
             if (ap1 == NULL) {
@@ -480,7 +199,8 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
             }
     }
     if (_bad_strides(ap2)) {
-            op2 = PyArray_NewCopy(ap2, NPY_ANYORDER);
+            PyObject *op2 = PyArray_NewCopy(ap2, NPY_ANYORDER);
+
             Py_DECREF(ap2);
             ap2 = (PyArrayObject *)op2;
             if (ap2 == NULL) {
@@ -494,7 +214,8 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         PyArrayObject *oap1, *oap2;
         oap1 = ap1; oap2 = ap2;
         /* One of ap1 or ap2 is a scalar */
-        if (ap1shape == _scalar) {              /* Make ap2 the scalar */
+        if (ap1shape == _scalar) {
+            /* Make ap2 the scalar */
             PyArrayObject *t = ap1;
             ap1 = ap2;
             ap2 = t;
@@ -529,7 +250,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
             l = PyArray_DIM(oap1, PyArray_NDIM(oap1) - 1);
 
             if (PyArray_DIM(oap2, 0) != l) {
-                PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+                dot_alignment_error(oap1, PyArray_NDIM(oap1) - 1, oap2, 0);
                 goto fail;
             }
             nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
@@ -541,7 +262,8 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
                  * Either PyArray_NDIM(ap1) is 1 dim or PyArray_NDIM(ap2) is 1 dim
                  * and the other is 2-dim
                  */
-                dimensions[0] = (PyArray_NDIM(oap1) == 2) ? PyArray_DIM(oap1, 0) : PyArray_DIM(oap2, 1);
+                dimensions[0] = (PyArray_NDIM(oap1) == 2) ?
+                                PyArray_DIM(oap1, 0) : PyArray_DIM(oap2, 1);
                 l = dimensions[0];
                 /*
                  * Fix it so that dot(shape=(N,1), shape=(1,))
@@ -579,13 +301,15 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         l = PyArray_DIM(ap1, PyArray_NDIM(ap1) - 1);
 
         if (PyArray_DIM(ap2, 0) != l) {
-            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+            dot_alignment_error(ap1, PyArray_NDIM(ap1) - 1, ap2, 0);
             goto fail;
         }
         nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
 
-        if (nd == 1)
-            dimensions[0] = (PyArray_NDIM(ap1) == 2) ? PyArray_DIM(ap1, 0) : PyArray_DIM(ap2, 1);
+        if (nd == 1) {
+            dimensions[0] = (PyArray_NDIM(ap1) == 2) ?
+                            PyArray_DIM(ap1, 0) : PyArray_DIM(ap2, 1);
+        }
         else if (nd == 2) {
             dimensions[0] = PyArray_DIM(ap1, 0);
             dimensions[1] = PyArray_DIM(ap2, 1);
@@ -603,8 +327,9 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         subtype = Py_TYPE(ap1);
     }
 
-    if (out) {
+    if (out != NULL) {
         int d;
+
         /* verify that out is usable */
         if (Py_TYPE(out) != subtype ||
             PyArray_NDIM(out) != nd ||
@@ -625,11 +350,12 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         }
         Py_INCREF(out);
         ret = out;
-    } else {
+    }
+    else {
+        PyObject *tmp = (PyObject *)(prior2 > prior1 ? ap2 : ap1);
+
         ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
-                                           typenum, NULL, NULL, 0, 0,
-                                           (PyObject *)
-                                           (prior2 > prior1 ? ap2 : ap1));
+                                           typenum, NULL, NULL, 0, 0, tmp);
     }
 
     if (ret == NULL) {
@@ -637,7 +363,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
     }
     numbytes = PyArray_NBYTES(ret);
     memset(PyArray_DATA(ret), 0, numbytes);
-    if (numbytes==0 || l == 0) {
+    if (numbytes == 0 || l == 0) {
             Py_DECREF(ap1);
             Py_DECREF(ap2);
             return PyArray_Return(ret);
@@ -654,11 +380,14 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         if (typenum == NPY_DOUBLE) {
             if (l == 1) {
                 *((double *)PyArray_DATA(ret)) = *((double *)PyArray_DATA(ap2)) *
-                    *((double *)PyArray_DATA(ap1));
+                                                 *((double *)PyArray_DATA(ap1));
             }
             else if (ap1shape != _matrix) {
-                cblas_daxpy(l, *((double *)PyArray_DATA(ap2)), (double *)PyArray_DATA(ap1),
-                            ap1stride/sizeof(double), (double *)PyArray_DATA(ret), 1);
+                cblas_daxpy(l,
+                            *((double *)PyArray_DATA(ap2)),
+                            (double *)PyArray_DATA(ap1),
+                            ap1stride/sizeof(double),
+                            (double *)PyArray_DATA(ret), 1);
             }
             else {
                 int maxind, oind, i, a1s, rets;
@@ -666,7 +395,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
                 double val;
 
                 maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
-                oind = 1-maxind;
+                oind = 1 - maxind;
                 ptr = PyArray_DATA(ap1);
                 rptr = PyArray_DATA(ret);
                 l = PyArray_DIM(ap1, maxind);
@@ -692,8 +421,11 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
                 res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
             }
             else if (ap1shape != _matrix) {
-                cblas_zaxpy(l, (double *)PyArray_DATA(ap2), (double *)PyArray_DATA(ap1),
-                            ap1stride/sizeof(npy_cdouble), (double *)PyArray_DATA(ret), 1);
+                cblas_zaxpy(l,
+                            (double *)PyArray_DATA(ap2),
+                            (double *)PyArray_DATA(ap1),
+                            ap1stride/sizeof(npy_cdouble),
+                            (double *)PyArray_DATA(ret), 1);
             }
             else {
                 int maxind, oind, i, a1s, rets;
@@ -701,7 +433,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
                 double *pval;
 
                 maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
-                oind = 1-maxind;
+                oind = 1 - maxind;
                 ptr = PyArray_DATA(ap1);
                 rptr = PyArray_DATA(ret);
                 l = PyArray_DIM(ap1, maxind);
@@ -722,8 +454,11 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
                     *((float *)PyArray_DATA(ap1));
             }
             else if (ap1shape != _matrix) {
-                cblas_saxpy(l, *((float *)PyArray_DATA(ap2)), (float *)PyArray_DATA(ap1),
-                            ap1stride/sizeof(float), (float *)PyArray_DATA(ret), 1);
+                cblas_saxpy(l,
+                            *((float *)PyArray_DATA(ap2)),
+                            (float *)PyArray_DATA(ap1),
+                            ap1stride/sizeof(float),
+                            (float *)PyArray_DATA(ret), 1);
             }
             else {
                 int maxind, oind, i, a1s, rets;
@@ -731,7 +466,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
                 float val;
 
                 maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
-                oind = 1-maxind;
+                oind = 1 - maxind;
                 ptr = PyArray_DATA(ap1);
                 rptr = PyArray_DATA(ret);
                 l = PyArray_DIM(ap1, maxind);
@@ -757,8 +492,11 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
                 res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
             }
             else if (ap1shape != _matrix) {
-                cblas_caxpy(l, (float *)PyArray_DATA(ap2), (float *)PyArray_DATA(ap1),
-                            ap1stride/sizeof(npy_cfloat), (float *)PyArray_DATA(ret), 1);
+                cblas_caxpy(l,
+                            (float *)PyArray_DATA(ap2),
+                            (float *)PyArray_DATA(ap1),
+                            ap1stride/sizeof(npy_cfloat),
+                            (float *)PyArray_DATA(ret), 1);
             }
             else {
                 int maxind, oind, i, a1s, rets;
@@ -766,7 +504,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
                 float *pval;
 
                 maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
-                oind = 1-maxind;
+                oind = 1 - maxind;
                 ptr = PyArray_DATA(ap1);
                 rptr = PyArray_DATA(ret);
                 l = PyArray_DIM(ap1, maxind);
@@ -918,7 +656,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
     Py_DECREF(ap2);
     return PyArray_Return(ret);
 
- fail:
+fail:
     Py_XDECREF(ap1);
     Py_XDECREF(ap2);
     Py_XDECREF(ret);
@@ -933,65 +671,28 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
  * floating point types. Like the generic NumPy equivalent the product
  * sum is over the last dimension of a and b.
  * NB: The first argument is not conjugated.
+ *
+ * This is for use by PyArray_InnerProduct. It is assumed on entry that the
+ * arrays ap1 and ap2 have a common data type given by typenum that is
+ * float, double, cfloat, or cdouble and have dimension <= 2, and have the
+ * contiguous flag set. Any __numpy_ufunc__ override is also assumed to
+ * have been handled already.
  */
 
-static PyObject *
-dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
+NPY_NO_EXPORT PyObject *
+cblas_innerproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2)
 {
-    PyObject *op1, *op2;
-    PyArrayObject *ap1, *ap2, *ret;
     int j, l, lda, ldb;
-    int typenum, nd;
+    int nd;
+    double prior1, prior2;
+    PyArrayObject *ret;
     npy_intp dimensions[NPY_MAXDIMS];
     PyTypeObject *subtype;
-    double prior1, prior2;
-
-    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL;
-
-    /*
-     * Inner product using the BLAS.  The product sum is taken along the last
-     * dimensions of the two arrays.
-     * Only speeds things up for float double and complex types.
-     */
-
-
-    typenum = PyArray_ObjectType(op1, 0);
-    typenum = PyArray_ObjectType(op2, typenum);
-
-    /* This function doesn't handle other types */
-    if ((typenum != NPY_DOUBLE && typenum != NPY_CDOUBLE &&
-         typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
-            return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(op1, op2));
-    }
-
-    ret = NULL;
-    ap1 = (PyArrayObject *)PyArray_ContiguousFromObject(op1, typenum, 0, 0);
-    if (ap1 == NULL) return NULL;
-    ap2 = (PyArrayObject *)PyArray_ContiguousFromObject(op2, typenum, 0, 0);
-    if (ap2 == NULL) goto fail;
-
-    if ((PyArray_NDIM(ap1) > 2) || (PyArray_NDIM(ap2) > 2)) {
-        /* This function doesn't handle dimensions greater than 2 -- other
-           than to ensure the dot function is altered
-        */
-        if (!altered) {
-            /* need to alter dot product */
-            PyObject *tmp1, *tmp2;
-            tmp1 = PyTuple_New(0);
-            tmp2 = dotblas_alterdot(NULL, tmp1);
-            Py_DECREF(tmp1);
-            Py_DECREF(tmp2);
-        }
-        ret = (PyArrayObject *)PyArray_InnerProduct((PyObject *)ap1,
-                                                    (PyObject *)ap2);
-        Py_DECREF(ap1);
-        Py_DECREF(ap2);
-        return PyArray_Return(ret);
-    }
 
     if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
         /* One of ap1 or ap2 is a scalar */
-        if (PyArray_NDIM(ap1) == 0) {             /* Make ap2 the scalar */
+        if (PyArray_NDIM(ap1) == 0) {
+            /* Make ap2 the scalar */
             PyArrayObject *t = ap1;
             ap1 = ap2;
             ap2 = t;
@@ -1002,18 +703,23 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
         }
         nd = PyArray_NDIM(ap1);
     }
-    else { /* (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2) */
-        /*  Both ap1 and ap2 are vectors or matrices */
-        l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
+    else {
+        /*
+         * (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2)
+         *  Both ap1 and ap2 are vectors or matrices
+         */
+        l = PyArray_DIM(ap1, PyArray_NDIM(ap1) - 1);
 
-        if (PyArray_DIM(ap2, PyArray_NDIM(ap2)-1) != l) {
-            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+        if (PyArray_DIM(ap2, PyArray_NDIM(ap2) - 1) != l) {
+            dot_alignment_error(ap1, PyArray_NDIM(ap1) - 1,
+                                ap2, PyArray_NDIM(ap2) - 1);
             goto fail;
         }
-        nd = PyArray_NDIM(ap1)+PyArray_NDIM(ap2)-2;
+        nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
 
         if (nd == 1)
-            dimensions[0] = (PyArray_NDIM(ap1) == 2) ? PyArray_DIM(ap1, 0) : PyArray_DIM(ap2, 0);
+            dimensions[0] = (PyArray_NDIM(ap1) == 2) ?
+                            PyArray_DIM(ap1, 0) : PyArray_DIM(ap2, 0);
         else if (nd == 2) {
             dimensions[0] = PyArray_DIM(ap1, 0);
             dimensions[1] = PyArray_DIM(ap2, 0);
@@ -1027,36 +733,49 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 
     ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
                                        typenum, NULL, NULL, 0, 0,
-                                       (PyObject *)\
-                                       (prior2 > prior1 ? ap2 : ap1));
+                                       (PyObject *)
+                                           (prior2 > prior1 ? ap2 : ap1));
+
+    if (ret == NULL) {
+        goto fail;
+    }
 
-    if (ret == NULL) goto fail;
-    NPY_BEGIN_ALLOW_THREADS
+    NPY_BEGIN_ALLOW_THREADS;
     memset(PyArray_DATA(ret), 0, PyArray_NBYTES(ret));
 
     if (PyArray_NDIM(ap2) == 0) {
         /* Multiplication by a scalar -- Level 1 BLAS */
         if (typenum == NPY_DOUBLE) {
-            cblas_daxpy(l, *((double *)PyArray_DATA(ap2)), (double *)PyArray_DATA(ap1), 1,
+            cblas_daxpy(l,
+                        *((double *)PyArray_DATA(ap2)),
+                        (double *)PyArray_DATA(ap1), 1,
                         (double *)PyArray_DATA(ret), 1);
         }
         else if (typenum == NPY_CDOUBLE) {
-            cblas_zaxpy(l, (double *)PyArray_DATA(ap2), (double *)PyArray_DATA(ap1), 1,
+            cblas_zaxpy(l,
+                        (double *)PyArray_DATA(ap2),
+                        (double *)PyArray_DATA(ap1), 1,
                         (double *)PyArray_DATA(ret), 1);
         }
         else if (typenum == NPY_FLOAT) {
-            cblas_saxpy(l, *((float *)PyArray_DATA(ap2)), (float *)PyArray_DATA(ap1), 1,
+            cblas_saxpy(l,
+                        *((float *)PyArray_DATA(ap2)),
+                        (float *)PyArray_DATA(ap1), 1,
                         (float *)PyArray_DATA(ret), 1);
         }
         else if (typenum == NPY_CFLOAT) {
-            cblas_caxpy(l, (float *)PyArray_DATA(ap2), (float *)PyArray_DATA(ap1), 1,
+            cblas_caxpy(l,
+                        (float *)PyArray_DATA(ap2),
+                        (float *)PyArray_DATA(ap1), 1,
                         (float *)PyArray_DATA(ret), 1);
         }
     }
     else if (PyArray_NDIM(ap1) == 1 && PyArray_NDIM(ap2) == 1) {
         /* Dot product between two vectors -- Level 1 BLAS */
-        blas_dot(typenum, l, PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
-                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2), PyArray_DATA(ret));
+        blas_dot(typenum, l,
+                 PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
+                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2),
+                 PyArray_DATA(ret));
     }
     else if (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 1) {
         /* Matrix-vector multiplication -- Level 2 BLAS */
@@ -1068,115 +787,17 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
         lda = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
         gemv(typenum, CblasRowMajor, CblasNoTrans, ap2, lda, ap1, 1, ret);
     }
-    else { /* (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 2) */
-        /* Matrix matrix multiplication -- Level 3 BLAS */
+    else {
+        /*
+         * (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 2)
+         * Matrix matrix multiplication -- Level 3 BLAS
+         */
         lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
         ldb = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
         gemm(typenum, CblasRowMajor, CblasNoTrans, CblasTrans,
              PyArray_DIM(ap1, 0), PyArray_DIM(ap2, 0), PyArray_DIM(ap1, 1),
              ap1, lda, ap2, ldb, ret);
     }
-    NPY_END_ALLOW_THREADS
-    Py_DECREF(ap1);
-    Py_DECREF(ap2);
-    return PyArray_Return(ret);
-
- fail:
-    Py_XDECREF(ap1);
-    Py_XDECREF(ap2);
-    Py_XDECREF(ret);
-    return NULL;
-}
-
-
-/*
- * vdot(a,b)
- *
- * Returns the dot product of a and b for scalars and vectors of
- * floating point and complex types.  The first argument, a, is conjugated.
- */
-static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
-    PyObject *op1, *op2;
-    PyArrayObject *ap1=NULL, *ap2=NULL, *ret=NULL;
-    int l;
-    int typenum;
-    npy_intp dimensions[NPY_MAXDIMS];
-    PyArray_Descr *type;
-
-    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL;
-
-    /*
-     * Conjugating dot product using the BLAS for vectors.
-     * Multiplies op1 and op2, each of which must be vector.
-     */
-
-    typenum = PyArray_ObjectType(op1, 0);
-    typenum = PyArray_ObjectType(op2, typenum);
-
-    type = PyArray_DescrFromType(typenum);
-    Py_INCREF(type);
-    ap1 = (PyArrayObject *)PyArray_FromAny(op1, type, 0, 0, 0, NULL);
-    if (ap1==NULL) {Py_DECREF(type); goto fail;}
-    op1 = PyArray_Flatten(ap1, 0);
-    if (op1==NULL) {Py_DECREF(type); goto fail;}
-    Py_DECREF(ap1);
-    ap1 = (PyArrayObject *)op1;
-
-    ap2 = (PyArrayObject *)PyArray_FromAny(op2, type, 0, 0, 0, NULL);
-    if (ap2==NULL) goto fail;
-    op2 = PyArray_Flatten(ap2, 0);
-    if (op2 == NULL) goto fail;
-    Py_DECREF(ap2);
-    ap2 = (PyArrayObject *)op2;
-
-    if (typenum != NPY_FLOAT && typenum != NPY_DOUBLE &&
-        typenum != NPY_CFLOAT && typenum != NPY_CDOUBLE) {
-        if (!altered) {
-            /* need to alter dot product */
-            PyObject *tmp1, *tmp2;
-            tmp1 = PyTuple_New(0);
-            tmp2 = dotblas_alterdot(NULL, tmp1);
-            Py_DECREF(tmp1);
-            Py_DECREF(tmp2);
-        }
-        if (PyTypeNum_ISCOMPLEX(typenum)) {
-            op1 = PyArray_Conjugate(ap1, NULL);
-            if (op1==NULL) goto fail;
-            Py_DECREF(ap1);
-            ap1 = (PyArrayObject *)op1;
-        }
-        ret = (PyArrayObject *)PyArray_InnerProduct((PyObject *)ap1,
-                                                    (PyObject *)ap2);
-        Py_DECREF(ap1);
-        Py_DECREF(ap2);
-        return PyArray_Return(ret);
-    }
-
-    if (PyArray_DIM(ap2, 0) != PyArray_DIM(ap1, PyArray_NDIM(ap1)-1)) {
-        PyErr_SetString(PyExc_ValueError, "vectors have different lengths");
-        goto fail;
-    }
-    l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
-
-    ret = (PyArrayObject *)PyArray_SimpleNew(0, dimensions, typenum);
-    if (ret == NULL) goto fail;
-
-    NPY_BEGIN_ALLOW_THREADS;
-
-    /* Dot product between two vectors -- Level 1 BLAS */
-    if (typenum == NPY_DOUBLE || typenum == NPY_FLOAT) {
-        blas_dot(typenum, l, PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
-                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2), PyArray_DATA(ret));
-    }
-    else if (typenum == NPY_CDOUBLE) {
-        cblas_zdotc_sub(l, (double *)PyArray_DATA(ap1), 1,
-                        (double *)PyArray_DATA(ap2), 1, (double *)PyArray_DATA(ret));
-    }
-    else if (typenum == NPY_CFLOAT) {
-        cblas_cdotc_sub(l, (float *)PyArray_DATA(ap1), 1,
-                        (float *)PyArray_DATA(ap2), 1, (float *)PyArray_DATA(ret));
-    }
-
     NPY_END_ALLOW_THREADS;
 
     Py_DECREF(ap1);
@@ -1189,65 +810,3 @@ static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
     Py_XDECREF(ret);
     return NULL;
 }
-
-static struct PyMethodDef dotblas_module_methods[] = {
-    {"dot",  (PyCFunction)dotblas_matrixproduct, METH_VARARGS|METH_KEYWORDS, NULL},
-    {"inner",   (PyCFunction)dotblas_innerproduct,  1, NULL},
-    {"vdot", (PyCFunction)dotblas_vdot, 1, NULL},
-    {"alterdot", (PyCFunction)dotblas_alterdot, 1, NULL},
-    {"restoredot", (PyCFunction)dotblas_restoredot, 1, NULL},
-    {NULL, NULL, 0, NULL}               /* sentinel */
-};
-
-#if defined(NPY_PY3K)
-static struct PyModuleDef moduledef = {
-        PyModuleDef_HEAD_INIT,
-        "_dotblas",
-        NULL,
-        -1,
-        dotblas_module_methods,
-        NULL,
-        NULL,
-        NULL,
-        NULL
-};
-#endif
-
-/* Initialization function for the module */
-#if defined(NPY_PY3K)
-#define RETVAL m
-PyMODINIT_FUNC PyInit__dotblas(void)
-#else
-#define RETVAL
-PyMODINIT_FUNC init_dotblas(void)
-#endif
-{
-#if defined(NPY_PY3K)
-    int i;
-
-    PyObject *d, *s, *m;
-    m = PyModule_Create(&moduledef);
-#else
-    int i;
-
-    PyObject *d, *s;
-    Py_InitModule3("_dotblas", dotblas_module_methods, module_doc);
-#endif
-
-    /* add the functions */
-
-    /* Import the array object */
-    import_array();
-
-    /* Initialise the array of dot functions */
-    for (i = 0; i < NPY_NTYPES; i++)
-        oldFunctions[i] = NULL;
-
-    /* alterdot at load */
-    d = PyTuple_New(0);
-    s = dotblas_alterdot(NULL, d);
-    Py_DECREF(d);
-    Py_DECREF(s);
-
-    return RETVAL;
-}
diff --git a/numpy/core/src/multiarray/cblasfuncs.h b/numpy/core/src/multiarray/cblasfuncs.h
new file mode 100644
index 000000000000..d3ec08db608b
--- /dev/null
+++ b/numpy/core/src/multiarray/cblasfuncs.h
@@ -0,0 +1,10 @@
+#ifndef _NPY_CBLASFUNCS_H_
+#define _NPY_CBLASFUNCS_H_
+
+NPY_NO_EXPORT PyObject *
+cblas_matrixproduct(int, PyArrayObject *, PyArrayObject *, PyArrayObject *);
+
+NPY_NO_EXPORT PyObject *
+cblas_innerproduct(int, PyArrayObject *, PyArrayObject *);
+
+#endif
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 35b705aff4e6..2d18af06e73a 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -787,7 +787,7 @@ offset_bounds_from_strides(const int itemsize, const int nd,
  * @param Dimensionality of the shape
  * @param npy_intp pointer to shape array
  * @param String to append after the shape `(1, 2)%s`.
- * 
+ *
  * @return Python unicode string
  */
 NPY_NO_EXPORT PyObject *
@@ -836,7 +836,59 @@ convert_shape_to_string(npy_intp n, npy_intp *vals, char *ending)
     }
     else {
         tmp = PyUString_FromFormat(")%s", ending);
-        }
+    }
     PyUString_ConcatAndDel(&ret, tmp);
     return ret;
 }
+
+
+NPY_NO_EXPORT void
+dot_alignment_error(PyArrayObject *a, int i, PyArrayObject *b, int j)
+{
+    PyObject *errmsg = NULL, *format = NULL, *fmt_args = NULL,
+             *i_obj = NULL, *j_obj = NULL,
+             *shape1 = NULL, *shape2 = NULL,
+             *shape1_i = NULL, *shape2_j = NULL;
+    /* Build and set a ValueError: a.shape[i] does not match b.shape[j]. */
+    format = PyUString_FromString("shapes %s and %s not aligned:"
+                                  " %d (dim %d) != %d (dim %d)");
+
+    shape1 = convert_shape_to_string(PyArray_NDIM(a), PyArray_DIMS(a), "");
+    shape2 = convert_shape_to_string(PyArray_NDIM(b), PyArray_DIMS(b), "");
+
+    i_obj = PyLong_FromLong(i);  /* axis numbers */
+    j_obj = PyLong_FromLong(j);
+
+    shape1_i = PyLong_FromSsize_t(PyArray_DIM(a, i));  /* axis lengths */
+    shape2_j = PyLong_FromSsize_t(PyArray_DIM(b, j));
+    /* If any piece is missing, only clean up; no ValueError is set here. */
+    if (!format || !shape1 || !shape2 || !i_obj || !j_obj ||
+            !shape1_i || !shape2_j) {
+        goto end;
+    }
+
+    fmt_args = PyTuple_Pack(6, shape1, shape2,
+                            shape1_i, i_obj, shape2_j, j_obj);
+    if (fmt_args == NULL) {
+        goto end;
+    }
+
+    errmsg = PyUString_Format(format, fmt_args);
+    if (errmsg != NULL) {
+        PyErr_SetObject(PyExc_ValueError, errmsg);
+    }
+    else {  /* formatting failed: fall back to a generic message */
+        PyErr_SetString(PyExc_ValueError, "shapes are not aligned");
+    }
+
+end:  /* XDECREF throughout: any of these may still be NULL */
+    Py_XDECREF(errmsg);
+    Py_XDECREF(fmt_args);
+    Py_XDECREF(format);
+    Py_XDECREF(i_obj);
+    Py_XDECREF(j_obj);
+    Py_XDECREF(shape1);
+    Py_XDECREF(shape2);
+    Py_XDECREF(shape1_i);
+    Py_XDECREF(shape2_j);
+}
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 6b49d6b4cf5c..9cf2e27bfb50 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -3,6 +3,7 @@
 #include <numpy/npy_common.h>
 #include <numpy/npy_cpu.h>
 #include <numpy/ndarraytypes.h>
+#include <limits.h>
 
 #define error_converting(x)  (((x) == -1) && PyErr_Occurred())
 
@@ -72,6 +73,13 @@ offset_bounds_from_strides(const int itemsize, const int nd,
 NPY_NO_EXPORT PyObject *
 convert_shape_to_string(npy_intp n, npy_intp *vals, char *ending);
 
+/*
+ * Sets ValueError with a "shapes ... not aligned" message for np.dot and
+ * friends when a.shape[i] should match b.shape[j], but doesn't.
+ */
+NPY_NO_EXPORT void
+dot_alignment_error(PyArrayObject *a, int i, PyArrayObject *b, int j);
+
 
 /*
  * Returns -1 and sets an exception if *index is an invalid index for
@@ -208,6 +216,35 @@ _is_basic_python_type(PyObject * obj)
     return 0;
 }
 
+/*
+ * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
+ * (BLAS won't handle negative or zero strides the way we want).
+ */
+static NPY_INLINE int
+blas_stride(npy_intp stride, unsigned itemsize)
+{
+    /*
+     * Should probably check pointer alignment also, but this may cause
+     * problems if we require complex to be 16 byte aligned.
+     */
+    if (stride > 0 && npy_is_aligned((void *)stride, itemsize)) {
+        stride /= itemsize;  /* now counted in items rather than bytes */
+        if (stride <= INT_MAX) {  /* must fit the int that is returned */
+            return stride;
+        }
+    }
+    return 0;  /* stride cannot be handed to BLAS */
+}
+
+/*
+ * Chunk size for CBLAS calls. CBLAS counts in ints; npy_intp may be wider.
+ */
+#if NPY_MAX_INTP > INT_MAX
+# define NPY_CBLAS_CHUNK  (INT_MAX / 2 + 1)
+#else
+# define NPY_CBLAS_CHUNK  NPY_MAX_INTP
+#endif  /* NPY_MAX_INTP > INT_MAX */
+
 #include "ucsnarrow.h"
 
 #endif
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index bfd3bc3c1593..17b95b654242 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -9,9 +9,8 @@
 #include "numpy/arrayscalars.h"
 
 #include "npy_config.h"
-
 #include "npy_pycompat.h"
-
+#include "ufunc_override.h"
 #include "common.h"
 #include "ctors.h"
 #include "calculation.h"
@@ -2011,31 +2010,52 @@ array_cumprod(PyArrayObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 array_dot(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    PyObject *fname, *ret, *b, *out = NULL;
-    static PyObject *numpycore = NULL;
-    char * kwords[] = {"b", "out", NULL };
+    static PyUFuncObject *cached_npy_dot = NULL;
+    int errval;
+    PyObject *override = NULL;
+    PyObject *a = (PyObject *)self, *b, *o = Py_None;
+    PyObject *newargs;
+    PyArrayObject *ret;
+    char* kwlist[] = {"b", "out", NULL };
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwords, &b, &out)) {
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist, &b, &o)) {
         return NULL;
     }
 
-    /* Since blas-dot is exposed only on the Python side, we need to grab it
-     * from there */
-    if (numpycore == NULL) {
-        numpycore = PyImport_ImportModule("numpy.core");
-        if (numpycore == NULL) {
-            return NULL;
-        }
+    if (cached_npy_dot == NULL) {
+        PyObject *module = PyImport_ImportModule("numpy.core.multiarray");
+        cached_npy_dot = (PyUFuncObject*)PyDict_GetItemString(
+                                              PyModule_GetDict(module), "dot");
+
+        Py_INCREF(cached_npy_dot);
+        Py_DECREF(module);
     }
-    fname = PyUString_FromString("dot");
-    if (out == NULL) {
-        ret = PyObject_CallMethodObjArgs(numpycore, fname, self, b, NULL);
+
+    if ((newargs = PyTuple_Pack(3, a, b, o)) == NULL) {
+        return NULL;
     }
-    else {
-        ret = PyObject_CallMethodObjArgs(numpycore, fname, self, b, out, NULL);
+    errval = PyUFunc_CheckOverride(cached_npy_dot, "__call__",
+                                   newargs, NULL, &override, 2);
+    Py_DECREF(newargs);
+
+    if (errval) {
+        return NULL;
     }
-    Py_DECREF(fname);
-    return ret;
+    else if (override) {
+        return override;
+    }
+
+    if (o == Py_None) {
+        o = NULL;
+    }
+    if (o != NULL && !PyArray_Check(o)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "'out' must be an array");
+        return NULL;
+    }
+    ret = (PyArrayObject *)PyArray_MatrixProduct2(a, b, (PyArrayObject *)o);
+    return PyArray_Return(ret);
 }
 
 
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 8e9b656cf336..6d5bf887ea3f 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -57,6 +57,8 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "ufunc_override.h"
 #include "scalarmathmodule.h" /* for npy_mul_with_overflow_intp */
 #include "multiarraymodule.h"
+#include "cblasfuncs.h"
+#include "vdot.h"
 
 /* Only here for API compatibility */
 NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
@@ -835,8 +837,18 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
     ap2 = (PyArrayObject *)PyArray_FromAny(op2, typec, 0, 0,
                                         NPY_ARRAY_ALIGNED, NULL);
     if (ap2 == NULL) {
-        goto fail;
+        Py_DECREF(ap1);
+        return NULL;
     }
+
+#if defined(HAVE_CBLAS)
+    if (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2 &&
+            (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
+             NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
+        return cblas_innerproduct(typenum, ap1, ap2);
+    }
+#endif
+
     if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
         ret = (PyArray_NDIM(ap1) == 0 ? ap1 : ap2);
         ret = (PyArrayObject *)Py_TYPE(ret)->tp_as_number->nb_multiply(
@@ -848,7 +860,8 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
 
     l = PyArray_DIMS(ap1)[PyArray_NDIM(ap1) - 1];
     if (PyArray_DIMS(ap2)[PyArray_NDIM(ap2) - 1] != l) {
-        PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+        dot_alignment_error(ap1, PyArray_NDIM(ap1) - 1,
+                            ap2, PyArray_NDIM(ap2) - 1);
         goto fail;
     }
 
@@ -882,7 +895,8 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
     }
     is1 = PyArray_STRIDES(ap1)[PyArray_NDIM(ap1) - 1];
     is2 = PyArray_STRIDES(ap2)[PyArray_NDIM(ap2) - 1];
-    op = PyArray_DATA(ret); os = PyArray_DESCR(ret)->elsize;
+    op = PyArray_DATA(ret);
+    os = PyArray_DESCR(ret)->elsize;
     axis = PyArray_NDIM(ap1) - 1;
     it1 = (PyArrayIterObject *) PyArray_IterAllButAxis((PyObject *)ap1, &axis);
     axis = PyArray_NDIM(ap2) - 1;
@@ -915,7 +929,17 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
 }
 
 /*NUMPY_API
- * Numeric.matrixproduct(a,v,out)
+ * Numeric.matrixproduct(a,v)
+ * just like inner product but does the swapaxes stuff on the fly
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
+{
+    return PyArray_MatrixProduct2(op1, op2, NULL);
+}
+
+/*NUMPY_API
+ * Numeric.matrixproduct2(a,v,out)
  * just like inner product but does the swapaxes stuff on the fly
  */
 NPY_NO_EXPORT PyObject *
@@ -950,8 +974,18 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     ap2 = (PyArrayObject *)PyArray_FromAny(op2, typec, 0, 0,
                                         NPY_ARRAY_ALIGNED, NULL);
     if (ap2 == NULL) {
-        goto fail;
+        Py_DECREF(ap1);
+        return NULL;
+    }
+
+#if defined(HAVE_CBLAS)
+    if (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2 &&
+            (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
+             NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
+        return cblas_matrixproduct(typenum, ap1, ap2, out);
     }
+#endif
+
     if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
         ret = (PyArray_NDIM(ap1) == 0 ? ap1 : ap2);
         ret = (PyArrayObject *)Py_TYPE(ret)->tp_as_number->nb_multiply(
@@ -968,7 +1002,7 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
         matchDim = 0;
     }
     if (PyArray_DIMS(ap2)[matchDim] != l) {
-        PyErr_SetString(PyExc_ValueError, "objects are not aligned");
+        dot_alignment_error(ap1, PyArray_NDIM(ap1) - 1, ap2, matchDim);
         goto fail;
     }
     nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
@@ -986,14 +1020,9 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     if(PyArray_NDIM(ap2) > 1) {
         dimensions[j++] = PyArray_DIMS(ap2)[PyArray_NDIM(ap2)-1];
     }
-    /*
-      fprintf(stderr, "nd=%d dimensions=", nd);
-      for(i=0; i<j; i++)
-      fprintf(stderr, "%d ", dimensions[i]);
-      fprintf(stderr, "\n");
-    */
 
-    is1 = PyArray_STRIDES(ap1)[PyArray_NDIM(ap1)-1]; is2 = PyArray_STRIDES(ap2)[matchDim];
+    is1 = PyArray_STRIDES(ap1)[PyArray_NDIM(ap1)-1];
+    is2 = PyArray_STRIDES(ap2)[matchDim];
     /* Choose which subtype to return */
     ret = new_array_for_sum(ap1, ap2, out, nd, dimensions, typenum);
     if (ret == NULL) {
@@ -1052,15 +1081,6 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     return NULL;
 }
 
-/*NUMPY_API
- *Numeric.matrixproduct(a,v)
- * just like inner product but does the swapaxes stuff on the fly
- */
-NPY_NO_EXPORT PyObject *
-PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
-{
-    return PyArray_MatrixProduct2(op1, op2, NULL);
-}
 
 /*NUMPY_API
  * Copy and Transpose
@@ -1953,8 +1973,9 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
                 if (tmpobj == NULL) {
                     /* More informative error message */
                     PyErr_SetString(PyExc_ValueError,
-                                    ("Failed to encode Numpy scalar data string to latin1. "
-                                     "pickle.load(a, encoding='latin1') is assumed if unpickling."));
+                            "Failed to encode Numpy scalar data string to "
+                            "latin1,\npickle.load(a, encoding='latin1') is "
+                            "assumed if unpickling.");
                     return NULL;
                 }
             }
@@ -2199,15 +2220,15 @@ array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 static PyObject *
 array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
 {
-    int errval;
     static PyUFuncObject *cached_npy_dot = NULL;
+    int errval;
     PyObject *override = NULL;
     PyObject *v, *a, *o = NULL;
+    PyArrayObject *ret;
     char* kwlist[] = {"a", "b", "out", NULL };
-    PyObject *module;
 
     if (cached_npy_dot == NULL) {
-        module = PyImport_ImportModule("numpy.core.multiarray");
+        PyObject *module = PyImport_ImportModule("numpy.core.multiarray");
         cached_npy_dot = (PyUFuncObject*)PyDict_GetItemString(
                                               PyModule_GetDict(module), "dot");
 
@@ -2235,9 +2256,120 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
                         "'out' must be an array");
         return NULL;
     }
-    return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct2(a, v, (PyArrayObject *)o));
+    ret = (PyArrayObject *)PyArray_MatrixProduct2(a, v, (PyArrayObject *)o);
+    return PyArray_Return(ret);
+}
+
+
+static PyObject *
+array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+    int typenum;
+    char *ip1, *ip2, *op;
+    npy_intp n, stride;
+    PyObject *op1, *op2;
+    PyArrayObject *ap1 = NULL, *ap2  = NULL, *ret = NULL;
+    PyArray_Descr *type;
+    PyArray_DotFunc *vdot;
+    NPY_BEGIN_THREADS_DEF;
+
+    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
+        return NULL;
+    }
+
+    /*
+     * Conjugating dot product: both inputs are converted to a common type,
+     * flattened in C order, and must then have the same length.
+     */
+    typenum = PyArray_ObjectType(op1, 0);
+    typenum = PyArray_ObjectType(op2, typenum);
+
+    type = PyArray_DescrFromType(typenum);
+    Py_INCREF(type);  /* type is handed to PyArray_FromAny twice below */
+    ap1 = (PyArrayObject *)PyArray_FromAny(op1, type, 0, 0, 0, NULL);
+    if (ap1 == NULL) {
+        Py_DECREF(type);  /* release the reference kept for op2 */
+        goto fail;
+    }
+    op1 = PyArray_Ravel(ap1, NPY_CORDER);
+    if (op1 == NULL) {
+        Py_DECREF(type);  /* release the reference kept for op2 */
+        goto fail;
+    }
+    Py_DECREF(ap1);  /* keep only the flattened array */
+    ap1 = (PyArrayObject *)op1;
+
+    ap2 = (PyArrayObject *)PyArray_FromAny(op2, type, 0, 0, 0, NULL);
+    if (ap2 == NULL) {
+        goto fail;
+    }
+    op2 = PyArray_Ravel(ap2, NPY_CORDER);
+    if (op2 == NULL) {
+        goto fail;
+    }
+    Py_DECREF(ap2);  /* keep only the flattened array */
+    ap2 = (PyArrayObject *)op2;
+
+    if (PyArray_DIM(ap2, 0) != PyArray_DIM(ap1, 0)) {
+        PyErr_SetString(PyExc_ValueError,
+                "vectors have different lengths");
+        goto fail;
+    }
+
+    /* array scalar output */
+    ret = new_array_for_sum(ap1, ap2, NULL, 0, (npy_intp *)NULL, typenum);
+    if (ret == NULL) {
+        goto fail;
+    }
+
+    n = PyArray_DIM(ap1, 0);
+    stride = type->elsize;  /* assumes the raveled data is contiguous */
+    ip1 = PyArray_DATA(ap1);
+    ip2 = PyArray_DATA(ap2);
+    op = PyArray_DATA(ret);
+
+    switch (typenum) {  /* complex and object types need conjugation */
+        case NPY_CFLOAT:
+            vdot = (PyArray_DotFunc *)CFLOAT_vdot;
+            break;
+        case NPY_CDOUBLE:
+            vdot = (PyArray_DotFunc *)CDOUBLE_vdot;
+            break;
+        case NPY_CLONGDOUBLE:
+            vdot = (PyArray_DotFunc *)CLONGDOUBLE_vdot;
+            break;
+        case NPY_OBJECT:
+            vdot = (PyArray_DotFunc *)OBJECT_vdot;
+            break;
+        default:
+            vdot = type->f->dotfunc;  /* other types: the dtype's own dot */
+            if (vdot == NULL) {
+                PyErr_SetString(PyExc_ValueError,
+                        "function not available for this data type");
+                goto fail;
+            }
+    }
+
+    if (n < 500) {  /* small n: skip the threading macros */
+        vdot(ip1, stride, ip2, stride, op, n, NULL);
+    }
+    else {
+        NPY_BEGIN_THREADS_DESCR(type);
+        vdot(ip1, stride, ip2, stride, op, n, NULL);
+        NPY_END_THREADS_DESCR(type);
+    }
+
+    Py_XDECREF(ap1);
+    Py_XDECREF(ap2);
+    return PyArray_Return(ret);
+fail:
+    Py_XDECREF(ap1);
+    Py_XDECREF(ap2);
+    Py_XDECREF(ret);
+    return NULL;
+}
 
+
 static int
 einsum_sub_op_from_str(PyObject *args, PyObject **str_obj, char **subscripts,
                        PyArrayObject **op)
@@ -3805,6 +3937,9 @@ static struct PyMethodDef array_module_methods[] = {
     {"dot",
         (PyCFunction)array_matrixproduct,
         METH_VARARGS | METH_KEYWORDS, NULL},
+    {"vdot",
+        (PyCFunction)array_vdot,
+        METH_VARARGS, NULL},  /* array_vdot(self, args) has no kwds parameter */
     {"einsum",
         (PyCFunction)array_einsum,
         METH_VARARGS|METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/multiarray/multiarraymodule_onefile.c b/numpy/core/src/multiarray/multiarraymodule_onefile.c
index 2d05c20ef007..04fef61ce0f3 100644
--- a/numpy/core/src/multiarray/multiarraymodule_onefile.c
+++ b/numpy/core/src/multiarray/multiarraymodule_onefile.c
@@ -2,7 +2,7 @@
  * This file includes all the .c files needed for a complete multiarray module.
  * This is used in the case where separate compilation is not enabled
  *
- * Note that the order of the includs matters
+ * Note that the order of the includes matters
  */
 
 #include "common.c"
@@ -15,6 +15,7 @@
 #include "datetime_busday.c"
 #include "datetime_busdaycal.c"
 #include "arraytypes.c"
+#include "vdot.c"
 
 #include "hashdescr.c"
 #include "numpyos.c"
@@ -50,9 +51,10 @@
 #include "array_assign_scalar.c"
 #include "array_assign_array.c"
 #include "ucsnarrow.c"
-
 #include "arrayobject.c"
-
 #include "numpymemoryview.c"
-
 #include "multiarraymodule.c"
+
+#if defined(HAVE_CBLAS)
+#include "cblasfuncs.c"
+#endif
diff --git a/numpy/core/src/multiarray/vdot.c b/numpy/core/src/multiarray/vdot.c
new file mode 100644
index 000000000000..0b80b6ccaa7c
--- /dev/null
+++ b/numpy/core/src/multiarray/vdot.c
@@ -0,0 +1,183 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include <Python.h>
+#include "common.h"
+#include "vdot.h"
+
+#if defined(HAVE_CBLAS)
+#include <cblas.h>
+#endif
+
+
+/* Conjugating dot product: op = sum(conj(ip1[i]) * ip2[i]), i < n. */
+/* is1 and is2 are byte strides of the npy_cfloat inputs. */
+/* All data is assumed aligned. */
+NPY_NO_EXPORT void
+CFLOAT_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+            char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    int is1b = blas_stride(is1, sizeof(npy_cfloat));
+    int is2b = blas_stride(is2, sizeof(npy_cfloat));
+
+    if (is1b && is2b) {  /* both strides are usable by BLAS */
+        double sum[2] = {0., 0.};  /* double for stability */
+
+        while (n > 0) {
+            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            float tmp[2];
+            /* pass chunk, not n: the pointers advance by chunk below */
+            cblas_cdotc_sub(chunk, ip1, is1b, ip2, is2b, tmp);
+            sum[0] += (double)tmp[0];
+            sum[1] += (double)tmp[1];
+            /* use char strides here */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        ((float *)op)[0] = (float)sum[0];
+        ((float *)op)[1] = (float)sum[1];
+    }
+    else
+#endif
+    {
+        float sumr = (float)0.0;
+        float sumi = (float)0.0;
+        npy_intp i;
+
+        for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+            const float ip1r = ((float *)ip1)[0];
+            const float ip1i = ((float *)ip1)[1];
+            const float ip2r = ((float *)ip2)[0];
+            const float ip2i = ((float *)ip2)[1];
+
+            sumr += ip1r * ip2r + ip1i * ip2i;
+            sumi += ip1r * ip2i - ip1i * ip2r;
+        }
+        ((float *)op)[0] = sumr;
+        ((float *)op)[1] = sumi;
+    }
+}
+
+
+/* Conjugating dot product: op = sum(conj(ip1[i]) * ip2[i]), i < n. */
+/* is1 and is2 are byte strides of the npy_cdouble inputs. */
+/* All data is assumed aligned. */
+NPY_NO_EXPORT void
+CDOUBLE_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+             char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    int is1b = blas_stride(is1, sizeof(npy_cdouble));
+    int is2b = blas_stride(is2, sizeof(npy_cdouble));
+
+    if (is1b && is2b) {  /* both strides are usable by BLAS */
+        double sum[2] = {0., 0.};  /* double for stability */
+
+        while (n > 0) {
+            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            double tmp[2];
+            /* pass chunk, not n: the pointers advance by chunk below */
+            cblas_zdotc_sub(chunk, ip1, is1b, ip2, is2b, tmp);
+            sum[0] += (double)tmp[0];
+            sum[1] += (double)tmp[1];
+            /* use char strides here */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        ((double *)op)[0] = (double)sum[0];
+        ((double *)op)[1] = (double)sum[1];
+    }
+    else
+#endif
+    {
+        double sumr = (double)0.0;
+        double sumi = (double)0.0;
+        npy_intp i;
+
+        for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+            const double ip1r = ((double *)ip1)[0];
+            const double ip1i = ((double *)ip1)[1];
+            const double ip2r = ((double *)ip2)[0];
+            const double ip2i = ((double *)ip2)[1];
+
+            sumr += ip1r * ip2r + ip1i * ip2i;
+            sumi += ip1r * ip2i - ip1i * ip2r;
+        }
+        ((double *)op)[0] = sumr;
+        ((double *)op)[1] = sumi;
+    }
+}
+
+
+/* Conjugating dot product: op = sum(conj(ip1[i]) * ip2[i]), i < n. */
+/* Plain loop over npy_longdouble (real, imag) pairs; no BLAS path here. */
+/* All data is assumed aligned. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+                 char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+    npy_longdouble tmpr = 0.0L;
+    npy_longdouble tmpi = 0.0L;
+    npy_intp i;
+
+    for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+        const npy_longdouble ip1r = ((npy_longdouble *)ip1)[0];
+        const npy_longdouble ip1i = ((npy_longdouble *)ip1)[1];
+        const npy_longdouble ip2r = ((npy_longdouble *)ip2)[0];
+        const npy_longdouble ip2i = ((npy_longdouble *)ip2)[1];
+
+        tmpr += ip1r * ip2r + ip1i * ip2i;  /* real part of conj(a) * b */
+        tmpi += ip1r * ip2i - ip1i * ip2r;  /* imaginary part of conj(a) * b */
+    }
+    ((npy_longdouble *)op)[0] = tmpr;
+    ((npy_longdouble *)op)[1] = tmpi;
+}
+
+/* Object dot product: op = sum(ip1[i].conjugate() * ip2[i]), i < n. */
+/* A NULL element on either side makes that term Py_False. */
+/* Returns early with *op untouched if a call fails; data assumed aligned. */
+NPY_NO_EXPORT void
+OBJECT_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+            void *NPY_UNUSED(ignore))
+{
+    npy_intp i;
+    PyObject *tmp0, *tmp1, *tmp2, *tmp = NULL;
+    PyObject **tmp3;
+    for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+        if ((*((PyObject **)ip1) == NULL) || (*((PyObject **)ip2) == NULL)) {
+            tmp1 = Py_False;
+            Py_INCREF(Py_False);
+        }
+        else {
+            tmp0 = PyObject_CallMethod(*((PyObject **)ip1), "conjugate", NULL);
+            if (tmp0 == NULL) {
+                Py_XDECREF(tmp);  /* drop the partial sum */
+                return;
+            }
+            tmp1 = PyNumber_Multiply(tmp0, *((PyObject **)ip2));
+            Py_DECREF(tmp0);
+            if (tmp1 == NULL) {
+                Py_XDECREF(tmp);  /* drop the partial sum */
+                return;
+            }
+        }
+        if (i == 0) {
+            tmp = tmp1;  /* first term starts the running sum */
+        }
+        else {
+            tmp2 = PyNumber_Add(tmp, tmp1);
+            Py_XDECREF(tmp);
+            Py_XDECREF(tmp1);
+            if (tmp2 == NULL) {
+                return;
+            }
+            tmp = tmp2;
+        }
+    }
+    tmp3 = (PyObject**) op;
+    tmp2 = *tmp3;  /* remember the object currently stored in op */
+    *((PyObject **)op) = tmp;
+    Py_XDECREF(tmp2);  /* release the replaced object, if any */
+}
diff --git a/numpy/core/src/multiarray/vdot.h b/numpy/core/src/multiarray/vdot.h
new file mode 100644
index 000000000000..0f60ca6d19a5
--- /dev/null
+++ b/numpy/core/src/multiarray/vdot.h
@@ -0,0 +1,18 @@
+#ifndef _NPY_VDOT_H_
+#define _NPY_VDOT_H_
+
+#include "common.h"
+/* Conjugating dot kernels; args: ip1, is1, ip2, is2, op, n, ignore. */
+NPY_NO_EXPORT void
+CFLOAT_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CDOUBLE_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+OBJECT_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+#endif  /* _NPY_VDOT_H_ */
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
deleted file mode 100644
index caa576abcf7e..000000000000
--- a/numpy/core/tests/test_blasdot.py
+++ /dev/null
@@ -1,171 +0,0 @@
-from __future__ import division, absolute_import, print_function
-
-import numpy as np
-import sys
-from numpy.core import zeros, float64
-from numpy.testing import dec, TestCase, assert_almost_equal, assert_, \
-     assert_raises, assert_array_equal, assert_allclose, assert_equal
-from numpy.core.multiarray import inner as inner_
-
-DECPREC = 14
-
-class TestInner(TestCase):
-    def test_vecself(self):
-        """Ticket 844."""
-        # Inner product of a vector with itself segfaults or give meaningless
-        # result
-        a = zeros(shape = (1, 80), dtype = float64)
-        p = inner_(a, a)
-        assert_almost_equal(p, 0, decimal = DECPREC)
-
-try:
-    import numpy.core._dotblas as _dotblas
-except ImportError:
-    _dotblas = None
-
-@dec.skipif(_dotblas is None, "Numpy is not compiled with _dotblas")
-def test_blasdot_used():
-    from numpy.core import dot, vdot, inner, alterdot, restoredot
-    assert_(dot is _dotblas.dot)
-    assert_(vdot is _dotblas.vdot)
-    assert_(inner is _dotblas.inner)
-    assert_(alterdot is _dotblas.alterdot)
-    assert_(restoredot is _dotblas.restoredot)
-
-
-def test_dot_2args():
-    from numpy.core import dot
-
-    a = np.array([[1, 2], [3, 4]], dtype=float)
-    b = np.array([[1, 0], [1, 1]], dtype=float)
-    c = np.array([[3, 2], [7, 4]], dtype=float)
-
-    d = dot(a, b)
-    assert_allclose(c, d)
-
-def test_dot_3args():
-    np.random.seed(22)
-    f = np.random.random_sample((1024, 16))
-    v = np.random.random_sample((16, 32))
-
-    r = np.empty((1024, 32))
-    for i in range(12):
-        np.dot(f, v, r)
-    assert_equal(sys.getrefcount(r), 2)
-    r2 = np.dot(f, v, out=None)
-    assert_array_equal(r2, r)
-    assert_(r is np.dot(f, v, out=r))
-
-    v = v[:, 0].copy() # v.shape == (16,)
-    r = r[:, 0].copy() # r.shape == (1024,)
-    r2 = np.dot(f, v)
-    assert_(r is np.dot(f, v, r))
-    assert_array_equal(r2, r)
-
-def test_dot_3args_errors():
-    np.random.seed(22)
-    f = np.random.random_sample((1024, 16))
-    v = np.random.random_sample((16, 32))
-
-    r = np.empty((1024, 31))
-    assert_raises(ValueError, np.dot, f, v, r)
-
-    r = np.empty((1024,))
-    assert_raises(ValueError, np.dot, f, v, r)
-
-    r = np.empty((32,))
-    assert_raises(ValueError, np.dot, f, v, r)
-
-    r = np.empty((32, 1024))
-    assert_raises(ValueError, np.dot, f, v, r)
-    assert_raises(ValueError, np.dot, f, v, r.T)
-
-    r = np.empty((1024, 64))
-    assert_raises(ValueError, np.dot, f, v, r[:, ::2])
-    assert_raises(ValueError, np.dot, f, v, r[:, :32])
-
-    r = np.empty((1024, 32), dtype=np.float32)
-    assert_raises(ValueError, np.dot, f, v, r)
-
-    r = np.empty((1024, 32), dtype=int)
-    assert_raises(ValueError, np.dot, f, v, r)
-
-def test_dot_array_order():
-    """ Test numpy dot with different order C, F
-
-    Comparing results with multiarray dot.
-    Double and single precisions array are compared using relative
-    precision of 7 and 5 decimals respectively.
-    Use 30 decimal when comparing exact operations like:
-        (a.b)' = b'.a'
-    """
-    _dot = np.core.multiarray.dot
-    a_dim, b_dim, c_dim = 10, 4, 7
-    orders = ["C", "F"]
-    dtypes_prec = {np.float64: 7, np.float32: 5}
-    np.random.seed(7)
-
-    for arr_type, prec in dtypes_prec.items():
-        for a_order in orders:
-            a = np.asarray(np.random.randn(a_dim, a_dim),
-                dtype=arr_type, order=a_order)
-            assert_array_equal(np.dot(a, a), a.dot(a))
-            # (a.a)' = a'.a', note that mse~=1e-31 needs almost_equal
-            assert_almost_equal(a.dot(a), a.T.dot(a.T).T, decimal=prec)
-
-            #
-            # Check with making explicit copy
-            #
-            a_T = a.T.copy(order=a_order)
-            assert_almost_equal(a_T.dot(a_T), a.T.dot(a.T), decimal=prec)
-            assert_almost_equal(a.dot(a_T), a.dot(a.T), decimal=prec)
-            assert_almost_equal(a_T.dot(a), a.T.dot(a), decimal=prec)
-
-            #
-            # Compare with multiarray dot
-            #
-            assert_almost_equal(a.dot(a), _dot(a, a), decimal=prec)
-            assert_almost_equal(a.T.dot(a), _dot(a.T, a), decimal=prec)
-            assert_almost_equal(a.dot(a.T), _dot(a, a.T), decimal=prec)
-            assert_almost_equal(a.T.dot(a.T), _dot(a.T, a.T), decimal=prec)
-            for res in a.dot(a), a.T.dot(a), a.dot(a.T), a.T.dot(a.T):
-                assert res.flags.c_contiguous
-
-            for b_order in orders:
-                b = np.asarray(np.random.randn(a_dim, b_dim),
-                    dtype=arr_type, order=b_order)
-                b_T = b.T.copy(order=b_order)
-                assert_almost_equal(a_T.dot(b), a.T.dot(b), decimal=prec)
-                assert_almost_equal(b_T.dot(a), b.T.dot(a), decimal=prec)
-                # (b'.a)' = a'.b
-                assert_almost_equal(b.T.dot(a), a.T.dot(b).T, decimal=prec)
-                assert_almost_equal(a.dot(b), _dot(a, b), decimal=prec)
-                assert_almost_equal(b.T.dot(a), _dot(b.T, a), decimal=prec)
-
-
-                for c_order in orders:
-                    c = np.asarray(np.random.randn(b_dim, c_dim),
-                        dtype=arr_type, order=c_order)
-                    c_T = c.T.copy(order=c_order)
-                    assert_almost_equal(c.T.dot(b.T), c_T.dot(b_T), decimal=prec)
-                    assert_almost_equal(c.T.dot(b.T).T, b.dot(c), decimal=prec)
-                    assert_almost_equal(b.dot(c), _dot(b, c), decimal=prec)
-                    assert_almost_equal(c.T.dot(b.T), _dot(c.T, b.T), decimal=prec)
-
-def test_dot_override():
-    class A(object):
-        def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
-            return "A"
-
-    class B(object):
-        def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
-            return NotImplemented
-
-    a = A()
-    b = B()
-    c = np.array([[1]])
-
-    assert_equal(np.dot(a, b), "A")
-    assert_equal(c.dot(a), "A")
-    assert_raises(TypeError, np.dot, b, c)
-    assert_raises(TypeError, c.dot, b)
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index ef56766f5f41..9e2248205431 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -5,13 +5,11 @@
 """
 from __future__ import division, absolute_import, print_function
 
-import sys
 import operator
 import warnings
-from nose.plugins.skip import SkipTest
 
 import numpy as np
-from numpy.testing import (dec, run_module_suite, assert_raises,
+from numpy.testing import (run_module_suite, assert_raises,
                            assert_warns, assert_array_equal, assert_)
 
 
@@ -34,11 +32,9 @@ def setUp(self):
         warnings.filterwarnings("always", message=self.message,
                                     category=DeprecationWarning)
 
-
     def tearDown(self):
         self.warn_ctx.__exit__()
 
-
     def assert_deprecated(self, function, num=1, ignore_others=False,
                         function_fails=False,
                         exceptions=(DeprecationWarning,), args=(), kwargs={}):
@@ -102,7 +98,6 @@ def assert_deprecated(self, function, num=1, ignore_others=False,
                 if exceptions == tuple():
                     raise AssertionError("Error raised during function call")
 
-
     def assert_not_deprecated(self, function, args=(), kwargs={}):
         """Test if DeprecationWarnings are given and raised.
 
@@ -143,6 +138,7 @@ class TestFloatNonIntegerArgumentDeprecation(_DeprecationTestCase):
 
     def test_indexing(self):
         a = np.array([[[5]]])
+
         def assert_deprecated(*args, **kwargs):
             self.assert_deprecated(*args, exceptions=(IndexError,), **kwargs)
 
@@ -172,7 +168,6 @@ def assert_deprecated(*args, **kwargs):
         assert_deprecated(lambda: a[0.0:, 0.0], num=2)
         assert_deprecated(lambda: a[0.0:, 0.0,:], num=2)
 
-
     def test_valid_indexing(self):
         a = np.array([[[5]]])
         assert_not_deprecated = self.assert_not_deprecated
@@ -183,9 +178,9 @@ def test_valid_indexing(self):
         assert_not_deprecated(lambda: a[:, 0,:])
         assert_not_deprecated(lambda: a[:,:,:])
 
-
     def test_slicing(self):
         a = np.array([[5]])
+
         def assert_deprecated(*args, **kwargs):
             self.assert_deprecated(*args, exceptions=(IndexError,), **kwargs)
 
@@ -217,7 +212,6 @@ def assert_deprecated(*args, **kwargs):
         # should still get the DeprecationWarning if step = 0.
         assert_deprecated(lambda: a[::0.0], function_fails=True)
 
-
     def test_valid_slicing(self):
         a = np.array([[[5]]])
         assert_not_deprecated = self.assert_not_deprecated
@@ -231,7 +225,6 @@ def test_valid_slicing(self):
         assert_not_deprecated(lambda: a[:2:2])
         assert_not_deprecated(lambda: a[1:2:2])
 
-
     def test_non_integer_argument_deprecations(self):
         a = np.array([[5]])
 
@@ -240,7 +233,6 @@ def test_non_integer_argument_deprecations(self):
         self.assert_deprecated(np.take, args=(a, [0], 1.))
         self.assert_deprecated(np.take, args=(a, [0], np.float64(1.)))
 
-
     def test_non_integer_sequence_multiplication(self):
         # Numpy scalar sequence multiply should not work with non-integers
         def mult(a, b):
@@ -248,7 +240,6 @@ def mult(a, b):
         self.assert_deprecated(mult, args=([1], np.float_(3)))
         self.assert_not_deprecated(mult, args=([1], np.int_(3)))
 
-
     def test_reduce_axis_float_index(self):
         d = np.zeros((3,3,3))
         self.assert_deprecated(np.min, args=(d, 0.5))
@@ -303,7 +294,6 @@ def test_array_to_index_deprecation(self):
         # Check slicing. Normal indexing checks arrays specifically.
         self.assert_deprecated(lambda: a[a:a:a], exceptions=(), num=3)
 
-
 class TestNonIntegerArrayLike(_DeprecationTestCase):
     """Tests that array likes, i.e. lists give a deprecation warning
     when they cannot be safely cast to an integer.
@@ -320,7 +310,6 @@ def test_basic(self):
 
         self.assert_not_deprecated(a.__getitem__, ([],))
 
-
     def test_boolean_futurewarning(self):
         a = np.arange(10)
         with warnings.catch_warnings():
@@ -378,12 +367,13 @@ class TestRankDeprecation(_DeprecationTestCase):
     """Test that np.rank is deprecated. The function should simply be
     removed. The VisibleDeprecationWarning may become unnecessary.
     """
+
     def test(self):
         a = np.arange(10)
         assert_warns(np.VisibleDeprecationWarning, np.rank, a)
 
 
-class TestComparisonDepreactions(_DeprecationTestCase):
+class TestComparisonDeprecations(_DeprecationTestCase):
     """This tests the deprecation, for non-elementwise comparison logic.
     This used to mean that when an error occured during element-wise comparison
     (i.e. broadcasting) NotImplemented was returned, but also in the comparison
@@ -408,7 +398,6 @@ def test_normal_types(self):
             b = np.array([1, np.array([1,2,3])], dtype=object)
             self.assert_deprecated(op, args=(a, b), num=None)
 
-
     def test_string(self):
         # For two string arrays, strings always raised the broadcasting error:
         a = np.array(['a', 'b'])
@@ -420,7 +409,6 @@ def test_string(self):
         # following works (and returns False) due to dtype mismatch:
         a == []
 
-
     def test_none_comparison(self):
         # Test comparison of None, which should result in elementwise
         # comparison in the future. [1, 2] == None should be [False, False].
@@ -455,14 +443,14 @@ def test_scalar_none_comparison(self):
         assert_(np.equal(np.datetime64('NaT'), None))
 
 
-class TestIdentityComparisonDepreactions(_DeprecationTestCase):
+class TestIdentityComparisonDeprecations(_DeprecationTestCase):
     """This tests the equal and not_equal object ufuncs identity check
     deprecation. This was due to the usage of PyObject_RichCompareBool.
 
     This tests that for example for `a = np.array([np.nan], dtype=object)`
     `a == a` it is warned that False and not `np.nan is np.nan` is returned.
 
-    Should be kept in sync with TestComparisonDepreactions and new tests
+    Should be kept in sync with TestComparisonDeprecations and new tests
     added when the deprecation is over. Requires only removing of @identity@
     (and blocks) from the ufunc loops.c.src of the OBJECT comparisons.
     """
@@ -488,11 +476,11 @@ def test_identity_equality_mismatch(self):
                 np.less_equal(a, a)
                 np.greater_equal(a, a)
 
-
     def test_comparison_error(self):
         class FunkyType(object):
             def __eq__(self, other):
                 raise TypeError("I won't compare")
+
             def __ne__(self, other):
                 raise TypeError("I won't compare")
 
@@ -500,7 +488,6 @@ def __ne__(self, other):
         self.assert_deprecated(np.equal, args=(a, a))
         self.assert_deprecated(np.not_equal, args=(a, a))
 
-
     def test_bool_error(self):
         # The comparison result cannot be interpreted as a bool
         a = np.array([np.array([1, 2, 3]), None], dtype=object)
@@ -508,5 +495,18 @@ def test_bool_error(self):
         self.assert_deprecated(np.not_equal, args=(a, a))
 
 
+class TestAlterdotRestoredotDeprecations(_DeprecationTestCase):
+    """The alterdot/restoredot functions are deprecated.
+
+    These functions no longer do anything as of numpy 1.10, so they should
+    not be used.
+
+    """
+
+    def test_alterdot_restoredot_deprecation(self):
+        self.assert_deprecated(np.alterdot)
+        self.assert_deprecated(np.restoredot)
+
+
 if __name__ == "__main__":
     run_module_suite()
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 96db8cde3cb5..059842c51d91 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -3355,6 +3355,44 @@ def __array_finalize__(self, obj):
         res = dat.var(1)
         assert_(res.info == dat.info)
 
+class TestVdot(TestCase):
+    def test_basic(self):
+        dt_numeric = np.typecodes['AllFloat'] + np.typecodes['AllInteger']
+        dt_complex = np.typecodes['Complex']
+
+        # test real
+        a = np.eye(3)
+        for dt in dt_numeric + 'O':
+            b = a.astype(dt)
+            res = np.vdot(b, b)
+            assert_(np.isscalar(res))
+            assert_equal(np.vdot(b, b), 3)
+
+        # test complex
+        a = np.eye(3) * 1j
+        for dt in dt_complex + 'O':
+            b = a.astype(dt)
+            res = np.vdot(b, b)
+            assert_(np.isscalar(res))
+            assert_equal(np.vdot(b, b), 3)
+
+        # test boolean
+        b = np.eye(3, dtype=np.bool)
+        res = np.vdot(b, b)
+        assert_(np.isscalar(res))
+        assert_equal(np.vdot(b, b), True)
+
+    def test_vdot_array_order(self):
+        a = array([[1, 2], [3, 4]], order='C')
+        b = array([[1, 2], [3, 4]], order='F')
+        res = np.vdot(a, a)
+
+        # integer arrays are exact
+        assert_equal(np.vdot(a, b), res)
+        assert_equal(np.vdot(b, a), res)
+        assert_equal(np.vdot(b, b), res)
+
+
 class TestDot(TestCase):
     def test_dot_2args(self):
         from numpy.core.multiarray import dot
@@ -3417,6 +3455,16 @@ def test_dot_3args_errors(self):
         r = np.empty((1024, 32), dtype=int)
         assert_raises(ValueError, dot, f, v, r)
 
+    def test_dot_array_order(self):
+        a = array([[1, 2], [3, 4]], order='C')
+        b = array([[1, 2], [3, 4]], order='F')
+        res = np.dot(a, a)
+
+        # integer arrays are exact
+        assert_equal(np.dot(a, b), res)
+        assert_equal(np.dot(b, a), res)
+        assert_equal(np.dot(b, b), res)
+
     def test_dot_scalar_and_matrix_of_objects(self):
         # Ticket #2469
         arr = np.matrix([1, 2], dtype=object)
@@ -3424,6 +3472,24 @@ def test_dot_scalar_and_matrix_of_objects(self):
         assert_equal(np.dot(arr, 3), desired)
         assert_equal(np.dot(3, arr), desired)
 
+    def test_dot_override(self):
+        class A(object):
+            def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
+                return "A"
+
+        class B(object):
+            def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
+                return NotImplemented
+
+        a = A()
+        b = B()
+        c = np.array([[1]])
+
+        assert_equal(np.dot(a, b), "A")
+        assert_equal(c.dot(a), "A")
+        assert_raises(TypeError, np.dot, b, c)
+        assert_raises(TypeError, c.dot, b)
+
 
 class TestInner(TestCase):
 
@@ -3434,6 +3500,14 @@ def test_inner_scalar_and_matrix_of_objects(self):
         assert_equal(np.inner(arr, 3), desired)
         assert_equal(np.inner(3, arr), desired)
 
+    def test_vecself(self):
+        # Ticket 844.
+        # Inner product of a vector with itself used to segfault or give
+        # a meaningless result.
+        a = zeros(shape = (1, 80), dtype = float64)
+        p = inner(a, a)
+        assert_almost_equal(p, 0, decimal=14)
+
 
 class TestSummarization(TestCase):
     def test_1d(self):
@@ -3454,7 +3528,6 @@ def test_2d(self):
                 '       [ 501,  502,  503, ...,  999, 1000, 1001]])'
         assert_(repr(A) == reprA)
 
-
 class TestChoose(TestCase):
     def setUp(self):
         self.x = 2*ones((3,), dtype=int)
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 3030c68abb08..e32519316eb6 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -333,11 +333,10 @@ def test_log1p(self):
         assert_almost_equal(ncu.log1p(1e-6), ncu.log(1+1e-6))
 
     def test_special(self):
-        assert_equal(ncu.log1p(np.nan), np.nan)
-        assert_equal(ncu.log1p(np.inf), np.inf)
-        with np.errstate(divide="ignore"):
+        with np.errstate(invalid="ignore", divide="ignore"):
+            assert_equal(ncu.log1p(np.nan), np.nan)
+            assert_equal(ncu.log1p(np.inf), np.inf)
             assert_equal(ncu.log1p(-1.), -np.inf)
-        with np.errstate(invalid="ignore"):
             assert_equal(ncu.log1p(-2.), np.nan)
             assert_equal(ncu.log1p(-np.inf), np.nan)
 
diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py
index 48c92c548224..ddb1513c4258 100644
--- a/numpy/distutils/system_info.py
+++ b/numpy/distutils/system_info.py
@@ -434,7 +434,7 @@ class UmfpackNotFoundError(NotFoundError):
     the UMFPACK environment variable."""
 
 
-class system_info:
+class system_info(object):
 
     """ get_info() is the only public method. Don't use others.
     """
@@ -962,7 +962,8 @@ def calc_info(self):
         if info is None:
             return
         dict_append(info,
-                    define_macros=[('SCIPY_MKL_H', None)],
+                    define_macros=[('SCIPY_MKL_H', None),
+                                   ('HAVE_CBLAS', None)],
                     include_dirs=incl_dirs)
         if sys.platform == 'win32':
             pass  # win32 has no pthread library
@@ -1120,6 +1121,7 @@ def calc_info(self):
             h = os.path.dirname(h)
             dict_append(info, include_dirs=[h])
         info['language'] = 'c'
+        info['define_macros'] = [('HAVE_CBLAS', None)]
 
         atlas_version, atlas_extra_info = get_atlas_version(**atlas)
         dict_append(atlas, **atlas_extra_info)
@@ -1414,7 +1416,8 @@ def calc_info(self):
             if args:
                 self.set_info(extra_compile_args=args,
                               extra_link_args=link_args,
-                              define_macros=[('NO_ATLAS_INFO', 3)])
+                              define_macros=[('NO_ATLAS_INFO', 3),
+                                             ('HAVE_CBLAS', None)])
                 return
 
         #atlas_info = {} ## uncomment for testing
@@ -1515,7 +1518,8 @@ def calc_info(self):
             if args:
                 self.set_info(extra_compile_args=args,
                               extra_link_args=link_args,
-                              define_macros=[('NO_ATLAS_INFO', 3)])
+                              define_macros=[('NO_ATLAS_INFO', 3),
+                                             ('HAVE_CBLAS', None)])
                 return
 
         need_blas = 0
@@ -1556,9 +1560,33 @@ def calc_info(self):
         info = self.check_libs(lib_dirs, blas_libs, [])
         if info is None:
             return
-        info['language'] = 'f77'  # XXX: is it generally true?
+        if self.has_cblas():
+            info['language'] = 'c'
+            info['define_macros'] = [('HAVE_CBLAS', None)]
+        else:
+            info['language'] = 'f77'  # XXX: is it generally true?
         self.set_info(**info)
 
+    def has_cblas(self):
+        # primitive cblas check: try to compile a file that includes cblas.h
+        res = False
+        c = distutils.ccompiler.new_compiler()
+        tmpdir = tempfile.mkdtemp()
+        s = """#include <cblas.h>"""
+        src = os.path.join(tmpdir, 'source.c')
+        try:
+            with open(src, 'wt') as f:
+                f.write(s)
+            try:
+                c.compile([src], output_dir=tmpdir,
+                          include_dirs=self.get_include_dirs())
+                res = True
+            except distutils.ccompiler.CompileError:
+                res = False
+        finally:
+            shutil.rmtree(tmpdir)
+        return res
+
 
 class openblas_info(blas_info):
     section = 'openblas'
@@ -1580,9 +1608,10 @@ def calc_info(self):
             return
 
         if not self.check_embedded_lapack(info):
-            return None
+            return
 
-        info['language'] = 'f77'  # XXX: is it generally true?
+        info['language'] = 'c'
+        info['define_macros'] = [('HAVE_CBLAS', None)]
         self.set_info(**info)
 
 
diff --git a/numpy/linalg/bscript b/numpy/linalg/bscript
index deed4fd72fec..70fdd9de3b5d 100644
--- a/numpy/linalg/bscript
+++ b/numpy/linalg/bscript
@@ -20,7 +20,7 @@ def pbuild(context):
         return context.default_builder(extension,
                                        includes=includes,
                                        **kw)
- 
+
     context.register_builder("lapack_lite", build_lapack_lite)
     context.register_builder("_umath_linalg", build_lapack_lite)
 
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index aa0a2669fd7d..5189f7e633f4 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -449,6 +449,15 @@ def test_min_int(self):
         # Should not raise:
         assert_allclose(a, a)
 
+    def test_report_fail_percentage(self):
+        a = np.array([1, 1, 1, 1])
+        b = np.array([1, 1, 1, 2])
+        try:
+            assert_allclose(a, b)
+            msg = ''
+        except AssertionError as exc:
+            msg = exc.args[0]
+        self.assertTrue("mismatch 25.0%" in msg)
 
 class TestArrayAlmostEqualNulp(unittest.TestCase):
     @dec.knownfailureif(True, "Github issue #347")
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index bd184d922041..71c7145f9c51 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -1289,7 +1289,7 @@ def assert_allclose(actual, desired, rtol=1e-7, atol=0,
     """
     import numpy as np
     def compare(x, y):
-        return np.allclose(x, y, rtol=rtol, atol=atol)
+        return np.core.numeric._allclose_points(x, y, rtol=rtol, atol=atol)
 
     actual, desired = np.asanyarray(actual), np.asanyarray(desired)
     header = 'Not equal to tolerance rtol=%g, atol=%g' % (rtol, atol)
diff --git a/tools/travis-test.sh b/tools/travis-test.sh
index d970daab3d3e..f342aa7b13b1 100755
--- a/tools/travis-test.sh
+++ b/tools/travis-test.sh
@@ -59,9 +59,9 @@ setup_bento()
   cd ..
 
   # Waf
-  wget http://waf.googlecode.com/files/waf-1.7.13.tar.bz2
-  tar xjvf waf-1.7.13.tar.bz2
-  cd waf-1.7.13
+  wget http://ftp.waf.io/pub/release/waf-1.7.16.tar.bz2
+  tar xjvf waf-1.7.16.tar.bz2
+  cd waf-1.7.16
   python waf-light
   export WAFDIR=$PWD
   cd ..