From 2f6da63938e516bb653b95f1de7b59b33c05a2fc Mon Sep 17 00:00:00 2001
From: Julian Taylor <jtaylor.debian@googlemail.com>
Date: Mon, 31 Mar 2014 21:58:10 +0200
Subject: [PATCH 01/19] ENH: optimize STRING_compare by using memcmp

---
 numpy/core/src/multiarray/arraytypes.c.src | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index ffd741deaab4..bd089cbddc51 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -20,6 +20,7 @@
 #include "arrayobject.h"
 
 #include "numpyos.h"
+#include <string.h>
 
 
 /*
@@ -2598,12 +2599,14 @@ STRING_compare(char *ip1, char *ip2, PyArrayObject *ap)
     const unsigned char *c1 = (unsigned char *)ip1;
     const unsigned char *c2 = (unsigned char *)ip2;
     const size_t len = PyArray_DESCR(ap)->elsize;
-    size_t i;
+    int i;
 
-    for(i = 0; i < len; ++i) {
-        if (c1[i] != c2[i]) {
-            return (c1[i] > c2[i]) ? 1 : -1;
-        }
+    i = memcmp(c1, c2, len);
+    if (i > 0) {
+        return 1;
+    }
+    else if (i < 0) {
+        return -1;
     }
     return 0;
 }

From f615c6c6b3cecc5b9e37c3c5271084d00438e3d3 Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Mon, 11 Aug 2014 13:20:08 -0600
Subject: [PATCH 02/19] ENH: Add 'HAVE_CBLAS' macro for build purposes.

The current system works for MKL and OpenBLAS by default because the
mkl_info and openblas_info classes in numpy/distutils/system_info do not
define the macro 'NO_ATLAS_INFO=1' that currently signals the absence of
CBLAS. This PR declares the presence of CBLAS directly by defining the
'HAVE_CBLAS' macro.
---
 numpy/core/setup.py            |  8 ++++----
 numpy/distutils/system_info.py | 17 +++++++++++------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 15f66fa6cb35..b81374d143cc 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -951,10 +951,10 @@ def generate_umath_c(ext, build_dir):
     #blas_info = {}
     def get_dotblas_sources(ext, build_dir):
         if blas_info:
-            if ('NO_ATLAS_INFO', 1) in blas_info.get('define_macros', []):
-                return None # dotblas needs ATLAS, Fortran compiled blas will not be sufficient.
-            return ext.depends[:1]
-        return None # no extension module will be built
+            if ('HAVE_CBLAS', None) in blas_info.get('define_macros', []):
+                return ext.depends[:1]
+        # dotblas needs CBLAS, Fortran compiled BLAS will not work.
+        return None
 
     config.add_extension('_dotblas',
                          sources = [get_dotblas_sources],
diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py
index 48c92c548224..75c2c0ea5201 100644
--- a/numpy/distutils/system_info.py
+++ b/numpy/distutils/system_info.py
@@ -434,7 +434,7 @@ class UmfpackNotFoundError(NotFoundError):
     the UMFPACK environment variable."""
 
 
-class system_info:
+class system_info(object):
 
     """ get_info() is the only public method. Don't use others.
     """
@@ -962,7 +962,8 @@ def calc_info(self):
         if info is None:
             return
         dict_append(info,
-                    define_macros=[('SCIPY_MKL_H', None)],
+                    define_macros=[('SCIPY_MKL_H', None),
+                                   ('HAVE_CBLAS', None)],
                     include_dirs=incl_dirs)
         if sys.platform == 'win32':
             pass  # win32 has no pthread library
@@ -1120,6 +1121,7 @@ def calc_info(self):
             h = os.path.dirname(h)
             dict_append(info, include_dirs=[h])
         info['language'] = 'c'
+        info['define_macros'] = [('HAVE_CBLAS', None)]
 
         atlas_version, atlas_extra_info = get_atlas_version(**atlas)
         dict_append(atlas, **atlas_extra_info)
@@ -1414,7 +1416,8 @@ def calc_info(self):
             if args:
                 self.set_info(extra_compile_args=args,
                               extra_link_args=link_args,
-                              define_macros=[('NO_ATLAS_INFO', 3)])
+                              define_macros=[('NO_ATLAS_INFO', 3),
+                                             ('HAVE_CBLAS', None)])
                 return
 
         #atlas_info = {} ## uncomment for testing
@@ -1515,7 +1518,8 @@ def calc_info(self):
             if args:
                 self.set_info(extra_compile_args=args,
                               extra_link_args=link_args,
-                              define_macros=[('NO_ATLAS_INFO', 3)])
+                              define_macros=[('NO_ATLAS_INFO', 3),
+                                             ('HAVE_CBLAS', None)])
                 return
 
         need_blas = 0
@@ -1580,9 +1584,10 @@ def calc_info(self):
             return
 
         if not self.check_embedded_lapack(info):
-            return None
+            return
 
-        info['language'] = 'f77'  # XXX: is it generally true?
+        info['language'] = 'c'
+        info['define_macros'] = [('HAVE_CBLAS', None)]
         self.set_info(**info)
 
 

From ace49c2ebcd42b9d41bdac11729f3c58e525480e Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Wed, 13 Aug 2014 18:16:26 -0600
Subject: [PATCH 03/19] ENH: When cblas is available use it in descr->f->dot.

Importing _dotblas currently executes _dotblas.alterdot, which replaces
the default descr->f->dot function with a cblas based version for float,
double, complex float, and complex double data types. This PR changes
the default descr->f->dot to use cblas whenever it is available. After
this change, the alterdot and restoredot functions serve no purpose, so
are changed to do nothing and deprecated. Note that those functions were
already doing nothing when _dotblas was not available.
---
 numpy/core/blasdot/_dotblas.c              | 135 +--------------
 numpy/core/bscript                         |  16 +-
 numpy/core/numeric.py                      |  18 +-
 numpy/core/setup.py                        |  21 ++-
 numpy/core/src/multiarray/arraytypes.c.src | 191 ++++++++++++++++++---
 numpy/core/tests/test_blasdot.py           |   4 +-
 numpy/core/tests/test_deprecations.py      |  42 ++---
 7 files changed, 228 insertions(+), 199 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index 48aa39ff87df..c1b4e7b05aff 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -24,121 +24,6 @@ static char module_doc[] =
 
 static PyArray_DotFunc *oldFunctions[NPY_NTYPES];
 
-#define MIN(a, b)   ((a) < (b) ? (a) : (b))
-
-/*
- * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
- * (BLAS won't handle negative or zero strides the way we want).
- */
-static NPY_INLINE int
-blas_stride(npy_intp stride, unsigned itemsize)
-{
-    if (stride <= 0 || stride % itemsize != 0) {
-        return 0;
-    }
-    stride /= itemsize;
-
-    if (stride > INT_MAX) {
-        return 0;
-    }
-    return stride;
-}
-
-/*
- * The following functions do a "chunked" dot product using BLAS when
- * sizeof(npy_intp) > sizeof(int), because BLAS libraries can typically not
- * handle more than INT_MAX elements per call.
- *
- * The chunksize is the greatest power of two less than INT_MAX.
- */
-#if NPY_MAX_INTP > INT_MAX
-# define CHUNKSIZE  (INT_MAX / 2 + 1)
-#else
-# define CHUNKSIZE  NPY_MAX_INTP
-#endif
-
-static void
-FLOAT_dot(void *a, npy_intp stridea, void *b, npy_intp strideb, void *res,
-          npy_intp n, void *tmp)
-{
-    int na = blas_stride(stridea, sizeof(float));
-    int nb = blas_stride(strideb, sizeof(float));
-
-    if (na && nb) {
-        double r = 0.;          /* double for stability */
-        float *fa = a, *fb = b;
-
-        while (n > 0) {
-            int chunk = MIN(n, CHUNKSIZE);
-
-            r += cblas_sdot(chunk, fa, na, fb, nb);
-            fa += chunk * na;
-            fb += chunk * nb;
-            n -= chunk;
-        }
-        *((float *)res) = r;
-    }
-    else {
-        oldFunctions[NPY_FLOAT](a, stridea, b, strideb, res, n, tmp);
-    }
-}
-
-static void
-DOUBLE_dot(void *a, npy_intp stridea, void *b, npy_intp strideb, void *res,
-           npy_intp n, void *tmp)
-{
-    int na = blas_stride(stridea, sizeof(double));
-    int nb = blas_stride(strideb, sizeof(double));
-
-    if (na && nb) {
-        double r = 0.;
-        double *da = a, *db = b;
-
-        while (n > 0) {
-            int chunk = MIN(n, CHUNKSIZE);
-
-            r += cblas_ddot(chunk, da, na, db, nb);
-            da += chunk * na;
-            db += chunk * nb;
-            n -= chunk;
-        }
-        *((double *)res) = r;
-    }
-    else {
-        oldFunctions[NPY_DOUBLE](a, stridea, b, strideb, res, n, tmp);
-    }
-}
-
-static void
-CFLOAT_dot(void *a, npy_intp stridea, void *b, npy_intp strideb, void *res,
-           npy_intp n, void *tmp)
-{
-    int na = blas_stride(stridea, sizeof(npy_cfloat));
-    int nb = blas_stride(strideb, sizeof(npy_cfloat));
-
-    if (na && nb) {
-        cblas_cdotu_sub((int)n, (float *)a, na, (float *)b, nb, (float *)res);
-    }
-    else {
-        oldFunctions[NPY_CFLOAT](a, stridea, b, strideb, res, n, tmp);
-    }
-}
-
-static void
-CDOUBLE_dot(void *a, npy_intp stridea, void *b, npy_intp strideb, void *res,
-            npy_intp n, void *tmp)
-{
-    int na = blas_stride(stridea, sizeof(npy_cdouble));
-    int nb = blas_stride(strideb, sizeof(npy_cdouble));
-
-    if (na && nb) {
-        cblas_zdotu_sub((int)n, (double *)a, na, (double *)b, nb,
-                        (double *)res);
-    }
-    else {
-        oldFunctions[NPY_CDOUBLE](a, stridea, b, strideb, res, n, tmp);
-    }
-}
 
 /*
  * Helper: call appropriate BLAS dot function for typenum.
@@ -149,20 +34,8 @@ blas_dot(int typenum, npy_intp n,
          void *a, npy_intp stridea, void *b, npy_intp strideb, void *res)
 {
     PyArray_DotFunc *dot = NULL;
-    switch (typenum) {
-        case NPY_DOUBLE:
-            dot = DOUBLE_dot;
-            break;
-        case NPY_FLOAT:
-            dot = FLOAT_dot;
-            break;
-        case NPY_CDOUBLE:
-            dot = CDOUBLE_dot;
-            break;
-        case NPY_CFLOAT:
-            dot = CFLOAT_dot;
-            break;
-    }
+
+    dot = oldFunctions[typenum];
     assert(dot != NULL);
     dot(a, stridea, b, strideb, res, n, NULL);
 }
@@ -257,19 +130,15 @@ dotblas_alterdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
     if (!altered) {
         descr = PyArray_DescrFromType(NPY_FLOAT);
         oldFunctions[NPY_FLOAT] = descr->f->dotfunc;
-        descr->f->dotfunc = (PyArray_DotFunc *)FLOAT_dot;
 
         descr = PyArray_DescrFromType(NPY_DOUBLE);
         oldFunctions[NPY_DOUBLE] = descr->f->dotfunc;
-        descr->f->dotfunc = (PyArray_DotFunc *)DOUBLE_dot;
 
         descr = PyArray_DescrFromType(NPY_CFLOAT);
         oldFunctions[NPY_CFLOAT] = descr->f->dotfunc;
-        descr->f->dotfunc = (PyArray_DotFunc *)CFLOAT_dot;
 
         descr = PyArray_DescrFromType(NPY_CDOUBLE);
         oldFunctions[NPY_CDOUBLE] = descr->f->dotfunc;
-        descr->f->dotfunc = (PyArray_DotFunc *)CDOUBLE_dot;
 
         altered = NPY_TRUE;
     }
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 416e16524bb2..3306c534188d 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -488,14 +488,22 @@ def pre_build(context):
                 pjoin('src', 'multiarray', 'usertypes.c')]
         else:
             sources = extension.sources
+
+        use = 'npysort npymath'
+        defines = ['_FILE_OFFSET_BITS=64',
+                   '_LARGEFILE_SOURCE=1',
+                   '_LARGEFILE64_SOURCE=1']
+
+        if bld.env.HAS_CBLAS:
+            use += ' CBLAS'
+            defines.append('HAVE_CBLAS')
+
         includes = ["src/multiarray", "src/private"]
         return context.default_builder(extension,
                                        includes=includes,
                                        source=sources,
-                                       use="npysort npymath",
-                                       defines=['_FILE_OFFSET_BITS=64',
-                                                '_LARGEFILE_SOURCE=1',
-                                                '_LARGEFILE64_SOURCE=1']
+                                       use=use,
+                                       defines=defines
                                        )
     context.register_builder("multiarray", builder_multiarray)
 
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 5d7407ce0de9..9d64e6753f8c 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -1078,31 +1078,33 @@ def outer(a, b, out=None):
 # try to import blas optimized dot if available
 envbak = os.environ.copy()
 try:
-    # importing this changes the dot function for basic 4 types
-    # to blas-optimized versions.
-
     # disables openblas affinity setting of the main thread that limits
     # python threads or processes to one core
     if 'OPENBLAS_MAIN_FREE' not in os.environ:
         os.environ['OPENBLAS_MAIN_FREE'] = '1'
     if 'GOTOBLAS_MAIN_FREE' not in os.environ:
         os.environ['GOTOBLAS_MAIN_FREE'] = '1'
-    from ._dotblas import dot, vdot, inner, alterdot, restoredot
+    from ._dotblas import dot, vdot, inner
 except ImportError:
     # docstrings are in add_newdocs.py
     inner = multiarray.inner
     dot = multiarray.dot
     def vdot(a, b):
         return dot(asarray(a).ravel().conj(), asarray(b).ravel())
-    def alterdot():
-        pass
-    def restoredot():
-        pass
 finally:
     os.environ.clear()
     os.environ.update(envbak)
     del envbak
 
+
+def alterdot():
+    warnings.warn("alterdot no longer does anything.", DeprecationWarning)
+
+
+def restoredot():
+    warnings.warn("restoredot no longer does anything.", DeprecationWarning)
+
+
 def tensordot(a, b, axes=2):
     """
     Compute tensor dot product along specified axes for arrays >= 1-D.
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index b81374d143cc..4c2af5e62610 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -846,15 +846,22 @@ def generate_multiarray_templated_sources(ext, build_dir):
         multiarray_src = [join('src', 'multiarray', 'multiarraymodule_onefile.c')]
         multiarray_src.append(generate_multiarray_templated_sources)
 
+    blas_info = get_info('blas_opt', 0)
+    if blas_info and  ('HAVE_CBLAS', None) in blas_info.get('define_macros', []):
+        extra_info = blas_info
+    else:
+        extra_info = {}
+
     config.add_extension('multiarray',
-                         sources = multiarray_src +
+                         sources=multiarray_src +
                                  [generate_config_h,
-                                 generate_numpyconfig_h,
-                                 generate_numpy_api,
-                                 join(codegen_dir, 'generate_numpy_api.py'),
-                                 join('*.py')],
-                         depends = deps + multiarray_deps,
-                         libraries = ['npymath', 'npysort'])
+                                  generate_numpyconfig_h,
+                                  generate_numpy_api,
+                                  join(codegen_dir, 'generate_numpy_api.py'),
+                                  join('*.py')],
+                         depends=deps + multiarray_deps,
+                         libraries=['npymath', 'npysort'],
+                         extra_info=extra_info)
 
     #######################################################################
     #                           umath module                              #
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 8a0b1826b46d..54747004518b 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3,8 +3,15 @@
 #include "Python.h"
 #include "structmember.h"
 
+#include <limits.h>
+#if defined(HAVE_CBLAS)
+#include <cblas.h>
+#endif
+
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+
+
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
 #include "npy_pycompat.h"
@@ -3074,6 +3081,149 @@ static int
  * dot means inner product
  */
 
+/************************** MAYBE USE CBLAS *********************************/
+
+/*
+ * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
+ * (BLAS won't handle negative or zero strides the way we want).
+ */
+#if defined(HAVE_CBLAS)
+static NPY_INLINE int
+blas_stride(npy_intp stride, unsigned itemsize)
+{
+    /*
+     * Should probably check pointer alignment also, but this may cause
+     * problems if we require complex to be 16 byte aligned.
+     */
+    if (stride > 0 && npy_is_aligned((void *)stride, itemsize)) {
+        stride /= itemsize;
+        if (stride <= INT_MAX) {
+            return stride;
+        }
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * The following functions do a "chunked" dot product using BLAS when
+ * sizeof(npy_intp) > sizeof(int), because BLAS libraries can typically not
+ * handle more than INT_MAX elements per call.
+ *
+ * The chunksize is the greatest power of two less than INT_MAX.
+ */
+#if NPY_MAX_INTP > INT_MAX
+# define CHUNKSIZE  (INT_MAX / 2 + 1)
+#else
+# define CHUNKSIZE  NPY_MAX_INTP
+#endif
+
+/**begin repeat
+ *
+ * #name = FLOAT, DOUBLE#
+ * #type = npy_float, npy_double#
+ * #prefix = s, d#
+ */
+static void
+@name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
+           npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    int is1b = blas_stride(is1, sizeof(@type@));
+    int is2b = blas_stride(is2, sizeof(@type@));
+
+    if (is1b && is2b)
+    {
+        double sum = 0.;  /* double for stability */
+
+        while (n > 0) {
+            int chunk = n < CHUNKSIZE ? n : CHUNKSIZE;
+
+            sum += cblas_@prefix@dot(chunk,
+                                     (@type@ *) ip1, is1b,
+                                     (@type@ *) ip2, is2b);
+            /* use char strides here */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        *((@type@ *)op) = (@type@)sum;
+    }
+    else
+#endif
+    {
+        @type@ sum = (@type@)0;  /* could make this double */
+        npy_intp i;
+
+        for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+            const @type@ ip1r = *((@type@ *)ip1);
+            const @type@ ip2r = *((@type@ *)ip2);
+
+            sum += ip1r * ip2r;
+        }
+        *((@type@ *)op) = sum;
+    }
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #name = CFLOAT, CDOUBLE#
+ * #ctype = npy_cfloat, npy_cdouble#
+ * #type = npy_float, npy_double#
+ * #prefix = c, z#
+ */
+static void
+@name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+           char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    int is1b = blas_stride(is1, sizeof(@ctype@));
+    int is2b = blas_stride(is2, sizeof(@ctype@));
+
+    if (is1b && is2b) {
+        double sum[2] = {0., 0.};  /* double for stability */
+
+        while (n > 0) {
+            int chunk = n < CHUNKSIZE ? n : CHUNKSIZE;
+            @type@ tmp[2];
+
+            cblas_@prefix@dotu_sub((int)n, ip1, is1b, ip2, is2b, tmp);
+            sum[0] += (double)tmp[0];
+            sum[1] += (double)tmp[1];
+            /* use char strides here */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        ((@type@ *)op)[0] = (@type@)sum[0];
+        ((@type@ *)op)[1] = (@type@)sum[1];
+    }
+    else
+#endif
+    {
+        @type@ sumr = (@type@)0.0;
+        @type@ sumi = (@type@)0.0;
+        npy_intp i;
+
+        for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+            const @type@ ip1r = ((@type@ *)ip1)[0];
+            const @type@ ip1i = ((@type@ *)ip1)[1];
+            const @type@ ip2r = ((@type@ *)ip2)[0];
+            const @type@ ip2i = ((@type@ *)ip2)[1];
+
+            sumr += ip1r * ip2r - ip1i * ip2i;
+            sumi += ip1i * ip2r + ip1r * ip2i;
+        }
+        ((@type@ *)op)[0] = sumr;
+        ((@type@ *)op)[1] = sumi;
+    }
+}
+/**end repeat**/
+
+/**************************** NO CBLAS VERSIONS *****************************/
+
 static void
 BOOL_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
          void *NPY_UNUSED(ignore))
@@ -3094,16 +3244,13 @@ BOOL_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
  *
  * #name = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
  *         LONG, ULONG, LONGLONG, ULONGLONG,
- *         FLOAT, DOUBLE, LONGDOUBLE,
- *         DATETIME, TIMEDELTA#
+ *         LONGDOUBLE, DATETIME, TIMEDELTA#
  * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *         npy_float, npy_double, npy_longdouble,
- *         npy_datetime, npy_timedelta#
+ *         npy_longdouble, npy_datetime, npy_timedelta#
  * #out = npy_long, npy_ulong, npy_long, npy_ulong, npy_long, npy_ulong,
  *        npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *        npy_float, npy_double, npy_longdouble,
- *        npy_datetime, npy_timedelta#
+ *        npy_longdouble, npy_datetime, npy_timedelta#
  */
 static void
 @name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
@@ -3121,8 +3268,8 @@ static void
 /**end repeat**/
 
 static void
-HALF_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
-           void *NPY_UNUSED(ignore))
+HALF_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
+         npy_intp n, void *NPY_UNUSED(ignore))
 {
     float tmp = 0.0f;
     npy_intp i;
@@ -3134,28 +3281,26 @@ HALF_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
     *((npy_half *)op) = npy_float_to_half(tmp);
 }
 
-/**begin repeat
- *
- * #name = CFLOAT, CDOUBLE, CLONGDOUBLE#
- * #type = npy_float, npy_double, npy_longdouble#
- */
-static void @name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
-                       char *op, npy_intp n, void *NPY_UNUSED(ignore))
+static void CLONGDOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+                            char *op, npy_intp n, void *NPY_UNUSED(ignore))
 {
-    @type@ tmpr = (@type@)0.0, tmpi=(@type@)0.0;
+    npy_longdouble tmpr = 0.0L;
+    npy_longdouble tmpi = 0.0L;
     npy_intp i;
 
     for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
-        tmpr += ((@type@ *)ip1)[0] * ((@type@ *)ip2)[0]
-                - ((@type@ *)ip1)[1] * ((@type@ *)ip2)[1];
-        tmpi += ((@type@ *)ip1)[1] * ((@type@ *)ip2)[0]
-                + ((@type@ *)ip1)[0] * ((@type@ *)ip2)[1];
+        const npy_longdouble ip1r = ((npy_longdouble *)ip1)[0];
+        const npy_longdouble ip1i = ((npy_longdouble *)ip1)[1];
+        const npy_longdouble ip2r = ((npy_longdouble *)ip2)[0];
+        const npy_longdouble ip2i = ((npy_longdouble *)ip2)[1];
+
+        tmpr += ip1r * ip2r - ip1i * ip2i;
+        tmpi += ip1i * ip2r + ip1r * ip2i;
     }
-    ((@type@ *)op)[0] = tmpr; ((@type@ *)op)[1] = tmpi;
+    ((npy_longdouble *)op)[0] = tmpr;
+    ((npy_longdouble *)op)[1] = tmpi;
 }
 
-/**end repeat**/
-
 static void
 OBJECT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
            void *NPY_UNUSED(ignore))
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index caa576abcf7e..7c2ede982863 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -25,12 +25,10 @@ def test_vecself(self):
 
 @dec.skipif(_dotblas is None, "Numpy is not compiled with _dotblas")
 def test_blasdot_used():
-    from numpy.core import dot, vdot, inner, alterdot, restoredot
+    from numpy.core import dot, vdot, inner
     assert_(dot is _dotblas.dot)
     assert_(vdot is _dotblas.vdot)
     assert_(inner is _dotblas.inner)
-    assert_(alterdot is _dotblas.alterdot)
-    assert_(restoredot is _dotblas.restoredot)
 
 
 def test_dot_2args():
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index ef56766f5f41..9e2248205431 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -5,13 +5,11 @@
 """
 from __future__ import division, absolute_import, print_function
 
-import sys
 import operator
 import warnings
-from nose.plugins.skip import SkipTest
 
 import numpy as np
-from numpy.testing import (dec, run_module_suite, assert_raises,
+from numpy.testing import (run_module_suite, assert_raises,
                            assert_warns, assert_array_equal, assert_)
 
 
@@ -34,11 +32,9 @@ def setUp(self):
         warnings.filterwarnings("always", message=self.message,
                                     category=DeprecationWarning)
 
-
     def tearDown(self):
         self.warn_ctx.__exit__()
 
-
     def assert_deprecated(self, function, num=1, ignore_others=False,
                         function_fails=False,
                         exceptions=(DeprecationWarning,), args=(), kwargs={}):
@@ -102,7 +98,6 @@ def assert_deprecated(self, function, num=1, ignore_others=False,
                 if exceptions == tuple():
                     raise AssertionError("Error raised during function call")
 
-
     def assert_not_deprecated(self, function, args=(), kwargs={}):
         """Test if DeprecationWarnings are given and raised.
 
@@ -143,6 +138,7 @@ class TestFloatNonIntegerArgumentDeprecation(_DeprecationTestCase):
 
     def test_indexing(self):
         a = np.array([[[5]]])
+
         def assert_deprecated(*args, **kwargs):
             self.assert_deprecated(*args, exceptions=(IndexError,), **kwargs)
 
@@ -172,7 +168,6 @@ def assert_deprecated(*args, **kwargs):
         assert_deprecated(lambda: a[0.0:, 0.0], num=2)
         assert_deprecated(lambda: a[0.0:, 0.0,:], num=2)
 
-
     def test_valid_indexing(self):
         a = np.array([[[5]]])
         assert_not_deprecated = self.assert_not_deprecated
@@ -183,9 +178,9 @@ def test_valid_indexing(self):
         assert_not_deprecated(lambda: a[:, 0,:])
         assert_not_deprecated(lambda: a[:,:,:])
 
-
     def test_slicing(self):
         a = np.array([[5]])
+
         def assert_deprecated(*args, **kwargs):
             self.assert_deprecated(*args, exceptions=(IndexError,), **kwargs)
 
@@ -217,7 +212,6 @@ def assert_deprecated(*args, **kwargs):
         # should still get the DeprecationWarning if step = 0.
         assert_deprecated(lambda: a[::0.0], function_fails=True)
 
-
     def test_valid_slicing(self):
         a = np.array([[[5]]])
         assert_not_deprecated = self.assert_not_deprecated
@@ -231,7 +225,6 @@ def test_valid_slicing(self):
         assert_not_deprecated(lambda: a[:2:2])
         assert_not_deprecated(lambda: a[1:2:2])
 
-
     def test_non_integer_argument_deprecations(self):
         a = np.array([[5]])
 
@@ -240,7 +233,6 @@ def test_non_integer_argument_deprecations(self):
         self.assert_deprecated(np.take, args=(a, [0], 1.))
         self.assert_deprecated(np.take, args=(a, [0], np.float64(1.)))
 
-
     def test_non_integer_sequence_multiplication(self):
         # Numpy scalar sequence multiply should not work with non-integers
         def mult(a, b):
@@ -248,7 +240,6 @@ def mult(a, b):
         self.assert_deprecated(mult, args=([1], np.float_(3)))
         self.assert_not_deprecated(mult, args=([1], np.int_(3)))
 
-
     def test_reduce_axis_float_index(self):
         d = np.zeros((3,3,3))
         self.assert_deprecated(np.min, args=(d, 0.5))
@@ -303,7 +294,6 @@ def test_array_to_index_deprecation(self):
         # Check slicing. Normal indexing checks arrays specifically.
         self.assert_deprecated(lambda: a[a:a:a], exceptions=(), num=3)
 
-
 class TestNonIntegerArrayLike(_DeprecationTestCase):
     """Tests that array likes, i.e. lists give a deprecation warning
     when they cannot be safely cast to an integer.
@@ -320,7 +310,6 @@ def test_basic(self):
 
         self.assert_not_deprecated(a.__getitem__, ([],))
 
-
     def test_boolean_futurewarning(self):
         a = np.arange(10)
         with warnings.catch_warnings():
@@ -378,12 +367,13 @@ class TestRankDeprecation(_DeprecationTestCase):
     """Test that np.rank is deprecated. The function should simply be
     removed. The VisibleDeprecationWarning may become unnecessary.
     """
+
     def test(self):
         a = np.arange(10)
         assert_warns(np.VisibleDeprecationWarning, np.rank, a)
 
 
-class TestComparisonDepreactions(_DeprecationTestCase):
+class TestComparisonDeprecations(_DeprecationTestCase):
     """This tests the deprecation, for non-elementwise comparison logic.
     This used to mean that when an error occured during element-wise comparison
     (i.e. broadcasting) NotImplemented was returned, but also in the comparison
@@ -408,7 +398,6 @@ def test_normal_types(self):
             b = np.array([1, np.array([1,2,3])], dtype=object)
             self.assert_deprecated(op, args=(a, b), num=None)
 
-
     def test_string(self):
         # For two string arrays, strings always raised the broadcasting error:
         a = np.array(['a', 'b'])
@@ -420,7 +409,6 @@ def test_string(self):
         # following works (and returns False) due to dtype mismatch:
         a == []
 
-
     def test_none_comparison(self):
         # Test comparison of None, which should result in elementwise
         # comparison in the future. [1, 2] == None should be [False, False].
@@ -455,14 +443,14 @@ def test_scalar_none_comparison(self):
         assert_(np.equal(np.datetime64('NaT'), None))
 
 
-class TestIdentityComparisonDepreactions(_DeprecationTestCase):
+class TestIdentityComparisonDeprecations(_DeprecationTestCase):
     """This tests the equal and not_equal object ufuncs identity check
     deprecation. This was due to the usage of PyObject_RichCompareBool.
 
     This tests that for example for `a = np.array([np.nan], dtype=object)`
     `a == a` it is warned that False and not `np.nan is np.nan` is returned.
 
-    Should be kept in sync with TestComparisonDepreactions and new tests
+    Should be kept in sync with TestComparisonDeprecations and new tests
     added when the deprecation is over. Requires only removing of @identity@
     (and blocks) from the ufunc loops.c.src of the OBJECT comparisons.
     """
@@ -488,11 +476,11 @@ def test_identity_equality_mismatch(self):
                 np.less_equal(a, a)
                 np.greater_equal(a, a)
 
-
     def test_comparison_error(self):
         class FunkyType(object):
             def __eq__(self, other):
                 raise TypeError("I won't compare")
+
             def __ne__(self, other):
                 raise TypeError("I won't compare")
 
@@ -500,7 +488,6 @@ def __ne__(self, other):
         self.assert_deprecated(np.equal, args=(a, a))
         self.assert_deprecated(np.not_equal, args=(a, a))
 
-
     def test_bool_error(self):
         # The comparison result cannot be interpreted as a bool
         a = np.array([np.array([1, 2, 3]), None], dtype=object)
@@ -508,5 +495,18 @@ def test_bool_error(self):
         self.assert_deprecated(np.not_equal, args=(a, a))
 
 
+class TestAlterdotRestoredotDeprecations(_DeprecationTestCase):
+    """The alterdot/restoredot functions are deprecated.
+
+    These functions no longer do anything in numpy 1.10, so should not be
+    used.
+
+    """
+
+    def test_alterdot_restoredot_deprecation(self):
+        self.assert_deprecated(np.alterdot)
+        self.assert_deprecated(np.restoredot)
+
+
 if __name__ == "__main__":
     run_module_suite()

From efd023f95af5cc665880658cff751a655ea5fe4c Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Fri, 15 Aug 2014 13:33:23 -0600
Subject: [PATCH 04/19] MAINT, STY: Remove use of alterdot, restoredot in
 _dotblas.c.

These are no longer needed. Also do C style cleanups.
---
 numpy/core/blasdot/_dotblas.c | 259 ++++++++++++++--------------------
 1 file changed, 108 insertions(+), 151 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index c1b4e7b05aff..812d947b7888 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -19,8 +19,11 @@
 #include <limits.h>
 #include <stdio.h>
 
+
 static char module_doc[] =
-"This module provides a BLAS optimized\nmatrix multiply, inner product and dot for numpy arrays";
+    "This module provides a BLAS optimized\n"
+    "matrix multiply, inner product and dot for numpy arrays";
+
 
 static PyArray_DotFunc *oldFunctions[NPY_NTYPES];
 
@@ -33,7 +36,7 @@ static void
 blas_dot(int typenum, npy_intp n,
          void *a, npy_intp stridea, void *b, npy_intp strideb, void *res)
 {
-    PyArray_DotFunc *dot = NULL;
+    PyArray_DotFunc *dot;
 
     dot = oldFunctions[typenum];
     assert(dot != NULL);
@@ -44,6 +47,7 @@ blas_dot(int typenum, npy_intp n,
 static const double oneD[2] = {1.0, 0.0}, zeroD[2] = {0.0, 0.0};
 static const float oneF[2] = {1.0, 0.0}, zeroF[2] = {0.0, 0.0};
 
+
 /*
  * Helper: dispatch to appropriate cblas_?gemm for typenum.
  */
@@ -113,80 +117,37 @@ gemv(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
 }
 
 
-static npy_bool altered=NPY_FALSE;
-
 /*
- * alterdot() changes all dot functions to use blas.
+ * Initialize oldFunctions table.
  */
-static PyObject *
-dotblas_alterdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
+static void
+init_oldFunctions(void)
 {
     PyArray_Descr *descr;
+    int i;
 
-    if (!PyArg_ParseTuple(args, "")) return NULL;
-
-    /* Replace the dot functions to the ones using blas */
-
-    if (!altered) {
-        descr = PyArray_DescrFromType(NPY_FLOAT);
-        oldFunctions[NPY_FLOAT] = descr->f->dotfunc;
-
-        descr = PyArray_DescrFromType(NPY_DOUBLE);
-        oldFunctions[NPY_DOUBLE] = descr->f->dotfunc;
+    /* Initialise the array of dot functions */
+    for (i = 0; i < NPY_NTYPES; i++)
+        oldFunctions[i] = NULL;
 
-        descr = PyArray_DescrFromType(NPY_CFLOAT);
-        oldFunctions[NPY_CFLOAT] = descr->f->dotfunc;
+    /* index dot functions we want to use here */
+    descr = PyArray_DescrFromType(NPY_FLOAT);
+    oldFunctions[NPY_FLOAT] = descr->f->dotfunc;
 
-        descr = PyArray_DescrFromType(NPY_CDOUBLE);
-        oldFunctions[NPY_CDOUBLE] = descr->f->dotfunc;
+    descr = PyArray_DescrFromType(NPY_DOUBLE);
+    oldFunctions[NPY_DOUBLE] = descr->f->dotfunc;
 
-        altered = NPY_TRUE;
-    }
+    descr = PyArray_DescrFromType(NPY_CFLOAT);
+    oldFunctions[NPY_CFLOAT] = descr->f->dotfunc;
 
-    Py_INCREF(Py_None);
-    return Py_None;
+    descr = PyArray_DescrFromType(NPY_CDOUBLE);
+    oldFunctions[NPY_CDOUBLE] = descr->f->dotfunc;
 }
 
-/*
- * restoredot() restores dots to defaults.
- */
-static PyObject *
-dotblas_restoredot(PyObject *NPY_UNUSED(dummy), PyObject *args)
-{
-    PyArray_Descr *descr;
-
-    if (!PyArg_ParseTuple(args, "")) return NULL;
-
-    if (altered) {
-        descr = PyArray_DescrFromType(NPY_FLOAT);
-        descr->f->dotfunc = oldFunctions[NPY_FLOAT];
-        oldFunctions[NPY_FLOAT] = NULL;
-        Py_XDECREF(descr);
-
-        descr = PyArray_DescrFromType(NPY_DOUBLE);
-        descr->f->dotfunc = oldFunctions[NPY_DOUBLE];
-        oldFunctions[NPY_DOUBLE] = NULL;
-        Py_XDECREF(descr);
-
-        descr = PyArray_DescrFromType(NPY_CFLOAT);
-        descr->f->dotfunc = oldFunctions[NPY_CFLOAT];
-        oldFunctions[NPY_CFLOAT] = NULL;
-        Py_XDECREF(descr);
-
-        descr = PyArray_DescrFromType(NPY_CDOUBLE);
-        descr->f->dotfunc = oldFunctions[NPY_CDOUBLE];
-        oldFunctions[NPY_CDOUBLE] = NULL;
-        Py_XDECREF(descr);
-
-        altered = NPY_FALSE;
-    }
-
-    Py_INCREF(Py_None);
-    return Py_None;
-}
 
 typedef enum {_scalar, _column, _row, _matrix} MatrixShape;
 
+
 static MatrixShape
 _select_matrix_shape(PyArrayObject *array)
 {
@@ -212,21 +173,24 @@ _select_matrix_shape(PyArrayObject *array)
 }
 
 
-/* This also makes sure that the data segment is aligned with
-   an itemsize address as well by returning one if not true.
-*/
+/*
+ * This also makes sure that the data segment is aligned with
+ * an itemsize address as well by returning one if not true.
+ */
 static int
 _bad_strides(PyArrayObject *ap)
 {
-    register int itemsize = PyArray_ITEMSIZE(ap);
-    register int i, N=PyArray_NDIM(ap);
-    register npy_intp *strides = PyArray_STRIDES(ap);
+    int itemsize = PyArray_ITEMSIZE(ap);
+    int i, N=PyArray_NDIM(ap);
+    npy_intp *strides = PyArray_STRIDES(ap);
 
-    if (((npy_intp)(PyArray_DATA(ap)) % itemsize) != 0)
+    if (((npy_intp)(PyArray_DATA(ap)) % itemsize) != 0) {
         return 1;
-    for (i=0; i<N; i++) {
-        if ((strides[i] < 0) || (strides[i] % itemsize) != 0)
+    }
+    for (i = 0; i < N; i++) {
+        if ((strides[i] < 0) || (strides[i] % itemsize) != 0) {
             return 1;
+        }
     }
 
     return 0;
@@ -268,7 +232,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         Py_DECREF(module);
     }
 
-    errval = PyUFunc_CheckOverride(cached_npy_dot, "__call__", args, kwargs, 
+    errval = PyUFunc_CheckOverride(cached_npy_dot, "__call__", args, kwargs,
                                    &override, 2);
     if (errval) {
         return NULL;
@@ -295,7 +259,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
 
     /* This function doesn't handle other types */
     if ((typenum != NPY_DOUBLE && typenum != NPY_CDOUBLE &&
-         typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
+            typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
         return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct2(
                                                     (PyObject *)op1,
                                                     (PyObject *)op2,
@@ -319,19 +283,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
     }
 
     if ((PyArray_NDIM(ap1) > 2) || (PyArray_NDIM(ap2) > 2)) {
-        /*
-         * This function doesn't handle dimensions greater than 2
-         * (or negative striding)  -- other
-         * than to ensure the dot function is altered
-         */
-        if (!altered) {
-            /* need to alter dot product */
-            PyObject *tmp1, *tmp2;
-            tmp1 = PyTuple_New(0);
-            tmp2 = dotblas_alterdot(NULL, tmp1);
-            Py_DECREF(tmp1);
-            Py_DECREF(tmp2);
-        }
+        /* This function doesn't handle dimensions greater than 2 */
         ret = (PyArrayObject *)PyArray_MatrixProduct2((PyObject *)ap1,
                                                       (PyObject *)ap2,
                                                       out);
@@ -363,7 +315,8 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         PyArrayObject *oap1, *oap2;
         oap1 = ap1; oap2 = ap2;
         /* One of ap1 or ap2 is a scalar */
-        if (ap1shape == _scalar) {              /* Make ap2 the scalar */
+        if (ap1shape == _scalar) {
+            /* Make ap2 the scalar */
             PyArrayObject *t = ap1;
             ap1 = ap2;
             ap2 = t;
@@ -453,8 +406,9 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         }
         nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
 
-        if (nd == 1)
+        if (nd == 1) {
             dimensions[0] = (PyArray_NDIM(ap1) == 2) ? PyArray_DIM(ap1, 0) : PyArray_DIM(ap2, 1);
+        }
         else if (nd == 2) {
             dimensions[0] = PyArray_DIM(ap1, 0);
             dimensions[1] = PyArray_DIM(ap2, 1);
@@ -474,6 +428,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
 
     if (out) {
         int d;
+
         /* verify that out is usable */
         if (Py_TYPE(out) != subtype ||
             PyArray_NDIM(out) != nd ||
@@ -506,7 +461,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
     }
     numbytes = PyArray_NBYTES(ret);
     memset(PyArray_DATA(ret), 0, numbytes);
-    if (numbytes==0 || l == 0) {
+    if (numbytes == 0 || l == 0) {
             Py_DECREF(ap1);
             Py_DECREF(ap2);
             return PyArray_Return(ret);
@@ -523,7 +478,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         if (typenum == NPY_DOUBLE) {
             if (l == 1) {
                 *((double *)PyArray_DATA(ret)) = *((double *)PyArray_DATA(ap2)) *
-                    *((double *)PyArray_DATA(ap1));
+                                                 *((double *)PyArray_DATA(ap1));
             }
             else if (ap1shape != _matrix) {
                 cblas_daxpy(l, *((double *)PyArray_DATA(ap2)), (double *)PyArray_DATA(ap1),
@@ -815,7 +770,9 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
     PyTypeObject *subtype;
     double prior1, prior2;
 
-    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL;
+    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
+        return NULL;
+    }
 
     /*
      * Inner product using the BLAS.  The product sum is taken along the last
@@ -829,28 +786,22 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 
     /* This function doesn't handle other types */
     if ((typenum != NPY_DOUBLE && typenum != NPY_CDOUBLE &&
-         typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
-            return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(op1, op2));
+            typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
+        return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(op1, op2));
     }
 
     ret = NULL;
     ap1 = (PyArrayObject *)PyArray_ContiguousFromObject(op1, typenum, 0, 0);
-    if (ap1 == NULL) return NULL;
+    if (ap1 == NULL) {
+        return NULL;
+    }
     ap2 = (PyArrayObject *)PyArray_ContiguousFromObject(op2, typenum, 0, 0);
-    if (ap2 == NULL) goto fail;
+    if (ap2 == NULL) {
+        goto fail;
+    }
 
     if ((PyArray_NDIM(ap1) > 2) || (PyArray_NDIM(ap2) > 2)) {
-        /* This function doesn't handle dimensions greater than 2 -- other
-           than to ensure the dot function is altered
-        */
-        if (!altered) {
-            /* need to alter dot product */
-            PyObject *tmp1, *tmp2;
-            tmp1 = PyTuple_New(0);
-            tmp2 = dotblas_alterdot(NULL, tmp1);
-            Py_DECREF(tmp1);
-            Py_DECREF(tmp2);
-        }
+        /* This function doesn't handle dimensions greater than 2 */
         ret = (PyArrayObject *)PyArray_InnerProduct((PyObject *)ap1,
                                                     (PyObject *)ap2);
         Py_DECREF(ap1);
@@ -860,7 +811,8 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 
     if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
         /* One of ap1 or ap2 is a scalar */
-        if (PyArray_NDIM(ap1) == 0) {             /* Make ap2 the scalar */
+        if (PyArray_NDIM(ap1) == 0) {
+            /* Make ap2 the scalar */
             PyArrayObject *t = ap1;
             ap1 = ap2;
             ap2 = t;
@@ -871,8 +823,11 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
         }
         nd = PyArray_NDIM(ap1);
     }
-    else { /* (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2) */
-        /*  Both ap1 and ap2 are vectors or matrices */
+    else {
+        /*
+         * (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2)
+         *  Both ap1 and ap2 are vectors or matrices
+         */
         l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
 
         if (PyArray_DIM(ap2, PyArray_NDIM(ap2)-1) != l) {
@@ -899,7 +854,9 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
                                        (PyObject *)\
                                        (prior2 > prior1 ? ap2 : ap1));
 
-    if (ret == NULL) goto fail;
+    if (ret == NULL) {
+        goto fail;
+    }
     NPY_BEGIN_ALLOW_THREADS
     memset(PyArray_DATA(ret), 0, PyArray_NBYTES(ret));
 
@@ -937,8 +894,11 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
         lda = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
         gemv(typenum, CblasRowMajor, CblasNoTrans, ap2, lda, ap1, 1, ret);
     }
-    else { /* (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 2) */
-        /* Matrix matrix multiplication -- Level 3 BLAS */
+    else {
+        /*
+         * (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 2)
+         * Matrix matrix multiplication -- Level 3 BLAS
+         */
         lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
         ldb = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
         gemm(typenum, CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -966,51 +926,56 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
  */
 static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
     PyObject *op1, *op2;
-    PyArrayObject *ap1=NULL, *ap2=NULL, *ret=NULL;
+    PyArrayObject *ap1 = NULL, *ap2  = NULL, *ret = NULL;
     int l;
     int typenum;
     npy_intp dimensions[NPY_MAXDIMS];
     PyArray_Descr *type;
 
-    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL;
+    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
+        return NULL;
+    }
 
     /*
      * Conjugating dot product using the BLAS for vectors.
      * Multiplies op1 and op2, each of which must be vector.
      */
-
     typenum = PyArray_ObjectType(op1, 0);
     typenum = PyArray_ObjectType(op2, typenum);
 
     type = PyArray_DescrFromType(typenum);
     Py_INCREF(type);
     ap1 = (PyArrayObject *)PyArray_FromAny(op1, type, 0, 0, 0, NULL);
-    if (ap1==NULL) {Py_DECREF(type); goto fail;}
+    if (ap1 == NULL) {
+        Py_DECREF(type);
+        goto fail;
+    }
     op1 = PyArray_Flatten(ap1, 0);
-    if (op1==NULL) {Py_DECREF(type); goto fail;}
+    if (op1 == NULL) {
+        Py_DECREF(type);
+        goto fail;
+    }
     Py_DECREF(ap1);
     ap1 = (PyArrayObject *)op1;
 
     ap2 = (PyArrayObject *)PyArray_FromAny(op2, type, 0, 0, 0, NULL);
-    if (ap2==NULL) goto fail;
+    if (ap2 == NULL) {
+        goto fail;
+    }
     op2 = PyArray_Flatten(ap2, 0);
-    if (op2 == NULL) goto fail;
+    if (op2 == NULL) {
+        goto fail;
+    }
     Py_DECREF(ap2);
     ap2 = (PyArrayObject *)op2;
 
     if (typenum != NPY_FLOAT && typenum != NPY_DOUBLE &&
         typenum != NPY_CFLOAT && typenum != NPY_CDOUBLE) {
-        if (!altered) {
-            /* need to alter dot product */
-            PyObject *tmp1, *tmp2;
-            tmp1 = PyTuple_New(0);
-            tmp2 = dotblas_alterdot(NULL, tmp1);
-            Py_DECREF(tmp1);
-            Py_DECREF(tmp2);
-        }
         if (PyTypeNum_ISCOMPLEX(typenum)) {
             op1 = PyArray_Conjugate(ap1, NULL);
-            if (op1==NULL) goto fail;
+            if (op1 == NULL) {
+                goto fail;
+            }
             Py_DECREF(ap1);
             ap1 = (PyArrayObject *)op1;
         }
@@ -1028,7 +993,9 @@ static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
     l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
 
     ret = (PyArrayObject *)PyArray_SimpleNew(0, dimensions, typenum);
-    if (ret == NULL) goto fail;
+    if (ret == NULL) {
+        goto fail;
+    }
 
     NPY_BEGIN_ALLOW_THREADS;
 
@@ -1060,11 +1027,15 @@ static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
 }
 
 static struct PyMethodDef dotblas_module_methods[] = {
-    {"dot",  (PyCFunction)dotblas_matrixproduct, METH_VARARGS|METH_KEYWORDS, NULL},
-    {"inner",   (PyCFunction)dotblas_innerproduct,  1, NULL},
-    {"vdot", (PyCFunction)dotblas_vdot, 1, NULL},
-    {"alterdot", (PyCFunction)dotblas_alterdot, 1, NULL},
-    {"restoredot", (PyCFunction)dotblas_restoredot, 1, NULL},
+    {"dot",
+        (PyCFunction)dotblas_matrixproduct,
+        METH_VARARGS|METH_KEYWORDS, NULL},
+    {"inner",
+        (PyCFunction)dotblas_innerproduct,
+        1, NULL},
+    {"vdot",
+        (PyCFunction)dotblas_vdot,
+        1, NULL},
     {NULL, NULL, 0, NULL}               /* sentinel */
 };
 
@@ -1092,31 +1063,17 @@ PyMODINIT_FUNC init_dotblas(void)
 #endif
 {
 #if defined(NPY_PY3K)
-    int i;
-
-    PyObject *d, *s, *m;
+    PyObject *m;
     m = PyModule_Create(&moduledef);
 #else
-    int i;
-
-    PyObject *d, *s;
     Py_InitModule3("_dotblas", dotblas_module_methods, module_doc);
 #endif
 
-    /* add the functions */
-
     /* Import the array object */
     import_array();
 
-    /* Initialise the array of dot functions */
-    for (i = 0; i < NPY_NTYPES; i++)
-        oldFunctions[i] = NULL;
-
-    /* alterdot at load */
-    d = PyTuple_New(0);
-    s = dotblas_alterdot(NULL, d);
-    Py_DECREF(d);
-    Py_DECREF(s);
+    /* initialize oldFunctions table */
+    init_oldFunctions();
 
     return RETVAL;
 }

From 4e6066b35e3698e2c74c76092d4315c8fd6cc382 Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Sat, 16 Aug 2014 11:53:02 -0600
Subject: [PATCH 05/19] ENH: Move dotblas_matrixproduct down into multiarray.

Move the dotblas_matrixproduct function from the _dotblas.c file to a
new cblasfuncs.c file in multiarray and rename it cblas_matrixproduct.
Modify it so that it can called directly from PyArray_MatrixProduct2 and
do so.  Fix up numeric.py and core/__init__.py to reflect these changes.
---
 numpy/core/__init__.py                        |  14 +
 numpy/core/blasdot/_dotblas.c                 | 585 +----------
 numpy/core/bscript                            |   3 +
 numpy/core/numeric.py                         |  12 +-
 numpy/core/setup.py                           |  11 +-
 numpy/core/src/multiarray/arraytypes.c.src    |   4 +-
 numpy/core/src/multiarray/arraytypes.h        |  15 +
 numpy/core/src/multiarray/cblasfuncs.c        | 945 ++++++++++++++++++
 numpy/core/src/multiarray/cblasfuncs.h        |   7 +
 numpy/core/src/multiarray/multiarraymodule.c  |  47 +-
 .../src/multiarray/multiarraymodule_onefile.c |   9 +-
 numpy/core/tests/test_blasdot.py              |   7 +-
 numpy/linalg/bscript                          |   2 +-
 13 files changed, 1049 insertions(+), 612 deletions(-)
 create mode 100644 numpy/core/src/multiarray/cblasfuncs.c
 create mode 100644 numpy/core/src/multiarray/cblasfuncs.h

diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index 0b8d5bb17786..976977621cda 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -3,7 +3,21 @@
 from .info import __doc__
 from numpy.version import version as __version__
 
+
+# disables openblas affinity setting of the main thread that limits
+# python threads or processes to one core
+import os
+envbak = os.environ.copy()
+if 'openblas_main_free' not in os.environ:
+    os.environ['openblas_main_free'] = '1'
+if 'gotoblas_main_free' not in os.environ:
+    os.environ['gotoblas_main_free'] = '1'
 from . import multiarray
+os.environ.clear()
+os.environ.update(envbak)
+del envbak
+del os
+
 from . import umath
 from . import _internal # for freeze programs
 from . import numerictypes as nt
diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index 812d947b7888..4fc645358b5f 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -152,22 +152,22 @@ static MatrixShape
 _select_matrix_shape(PyArrayObject *array)
 {
     switch (PyArray_NDIM(array)) {
-    case 0:
-        return _scalar;
-    case 1:
-        if (PyArray_DIM(array, 0) > 1)
-            return _column;
-        return _scalar;
-    case 2:
-        if (PyArray_DIM(array, 0) > 1) {
-            if (PyArray_DIM(array, 1) == 1)
+        case 0:
+            return _scalar;
+        case 1:
+            if (PyArray_DIM(array, 0) > 1)
                 return _column;
-            else
-                return _matrix;
-        }
-        if (PyArray_DIM(array, 1) == 1)
             return _scalar;
-        return _row;
+        case 2:
+            if (PyArray_DIM(array, 0) > 1) {
+                if (PyArray_DIM(array, 1) == 1)
+                    return _column;
+                else
+                    return _matrix;
+            }
+            if (PyArray_DIM(array, 1) == 1)
+                return _scalar;
+            return _row;
     }
     return _matrix;
 }
@@ -196,560 +196,6 @@ _bad_strides(PyArrayObject *ap)
     return 0;
 }
 
-/*
- * dot(a,b)
- * Returns the dot product of a and b for arrays of floating point types.
- * Like the generic numpy equivalent the product sum is over
- * the last dimension of a and the second-to-last dimension of b.
- * NB: The first argument is not conjugated.;
- */
-static PyObject *
-dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwargs)
-{
-    static PyObject *cached_npy_dot = NULL;
-    PyObject *override = NULL;
-    PyObject *module;
-    PyObject *op1, *op2;
-    PyArrayObject *ap1 = NULL, *ap2 = NULL, *out = NULL, *ret = NULL;
-    int errval;
-    int j, lda, ldb;
-    npy_intp l;
-    int typenum, nd;
-    npy_intp ap1stride = 0;
-    npy_intp dimensions[NPY_MAXDIMS];
-    npy_intp numbytes;
-    double prior1, prior2;
-    PyTypeObject *subtype;
-    PyArray_Descr *dtype;
-    MatrixShape ap1shape, ap2shape;
-    char* kwords[] = {"a", "b", "out", NULL };
-
-    if (cached_npy_dot == NULL) {
-        module = PyImport_ImportModule("numpy.core._dotblas");
-        cached_npy_dot = PyDict_GetItemString(PyModule_GetDict(module), "dot");
-
-        Py_INCREF(cached_npy_dot);
-        Py_DECREF(module);
-    }
-
-    errval = PyUFunc_CheckOverride(cached_npy_dot, "__call__", args, kwargs,
-                                   &override, 2);
-    if (errval) {
-        return NULL;
-    }
-    else if (override) {
-        return override;
-    }
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O", kwords,
-                                    &op1, &op2, &out)) {
-        return NULL;
-    }
-    if ((PyObject *)out == Py_None) {
-        out = NULL;
-    }
-
-    /*
-     * "Matrix product" using the BLAS.
-     * Only works for float double and complex types.
-     */
-
-    typenum = PyArray_ObjectType(op1, 0);
-    typenum = PyArray_ObjectType(op2, typenum);
-
-    /* This function doesn't handle other types */
-    if ((typenum != NPY_DOUBLE && typenum != NPY_CDOUBLE &&
-            typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
-        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct2(
-                                                    (PyObject *)op1,
-                                                    (PyObject *)op2,
-                                                    out));
-    }
-
-    dtype = PyArray_DescrFromType(typenum);
-    if (dtype == NULL) {
-        return NULL;
-    }
-    Py_INCREF(dtype);
-    ap1 = (PyArrayObject *)PyArray_FromAny(op1, dtype, 0, 0, NPY_ARRAY_ALIGNED, NULL);
-    if (ap1 == NULL) {
-        Py_DECREF(dtype);
-        return NULL;
-    }
-    ap2 = (PyArrayObject *)PyArray_FromAny(op2, dtype, 0, 0, NPY_ARRAY_ALIGNED, NULL);
-    if (ap2 == NULL) {
-        Py_DECREF(ap1);
-        return NULL;
-    }
-
-    if ((PyArray_NDIM(ap1) > 2) || (PyArray_NDIM(ap2) > 2)) {
-        /* This function doesn't handle dimensions greater than 2 */
-        ret = (PyArrayObject *)PyArray_MatrixProduct2((PyObject *)ap1,
-                                                      (PyObject *)ap2,
-                                                      out);
-        Py_DECREF(ap1);
-        Py_DECREF(ap2);
-        return PyArray_Return(ret);
-    }
-
-    if (_bad_strides(ap1)) {
-            op1 = PyArray_NewCopy(ap1, NPY_ANYORDER);
-            Py_DECREF(ap1);
-            ap1 = (PyArrayObject *)op1;
-            if (ap1 == NULL) {
-                goto fail;
-            }
-    }
-    if (_bad_strides(ap2)) {
-            op2 = PyArray_NewCopy(ap2, NPY_ANYORDER);
-            Py_DECREF(ap2);
-            ap2 = (PyArrayObject *)op2;
-            if (ap2 == NULL) {
-                goto fail;
-            }
-    }
-    ap1shape = _select_matrix_shape(ap1);
-    ap2shape = _select_matrix_shape(ap2);
-
-    if (ap1shape == _scalar || ap2shape == _scalar) {
-        PyArrayObject *oap1, *oap2;
-        oap1 = ap1; oap2 = ap2;
-        /* One of ap1 or ap2 is a scalar */
-        if (ap1shape == _scalar) {
-            /* Make ap2 the scalar */
-            PyArrayObject *t = ap1;
-            ap1 = ap2;
-            ap2 = t;
-            ap1shape = ap2shape;
-            ap2shape = _scalar;
-        }
-
-        if (ap1shape == _row) {
-            ap1stride = PyArray_STRIDE(ap1, 1);
-        }
-        else if (PyArray_NDIM(ap1) > 0) {
-            ap1stride = PyArray_STRIDE(ap1, 0);
-        }
-
-        if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
-            npy_intp *thisdims;
-            if (PyArray_NDIM(ap1) == 0) {
-                nd = PyArray_NDIM(ap2);
-                thisdims = PyArray_DIMS(ap2);
-            }
-            else {
-                nd = PyArray_NDIM(ap1);
-                thisdims = PyArray_DIMS(ap1);
-            }
-            l = 1;
-            for (j = 0; j < nd; j++) {
-                dimensions[j] = thisdims[j];
-                l *= dimensions[j];
-            }
-        }
-        else {
-            l = PyArray_DIM(oap1, PyArray_NDIM(oap1) - 1);
-
-            if (PyArray_DIM(oap2, 0) != l) {
-                PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
-                goto fail;
-            }
-            nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
-            /*
-             * nd = 0 or 1 or 2. If nd == 0 do nothing ...
-             */
-            if (nd == 1) {
-                /*
-                 * Either PyArray_NDIM(ap1) is 1 dim or PyArray_NDIM(ap2) is 1 dim
-                 * and the other is 2-dim
-                 */
-                dimensions[0] = (PyArray_NDIM(oap1) == 2) ? PyArray_DIM(oap1, 0) : PyArray_DIM(oap2, 1);
-                l = dimensions[0];
-                /*
-                 * Fix it so that dot(shape=(N,1), shape=(1,))
-                 * and dot(shape=(1,), shape=(1,N)) both return
-                 * an (N,) array (but use the fast scalar code)
-                 */
-            }
-            else if (nd == 2) {
-                dimensions[0] = PyArray_DIM(oap1, 0);
-                dimensions[1] = PyArray_DIM(oap2, 1);
-                /*
-                 * We need to make sure that dot(shape=(1,1), shape=(1,N))
-                 * and dot(shape=(N,1),shape=(1,1)) uses
-                 * scalar multiplication appropriately
-                 */
-                if (ap1shape == _row) {
-                    l = dimensions[1];
-                }
-                else {
-                    l = dimensions[0];
-                }
-            }
-
-            /* Check if the summation dimension is 0-sized */
-            if (PyArray_DIM(oap1, PyArray_NDIM(oap1) - 1) == 0) {
-                l = 0;
-            }
-        }
-    }
-    else {
-        /*
-         * (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2)
-         * Both ap1 and ap2 are vectors or matrices
-         */
-        l = PyArray_DIM(ap1, PyArray_NDIM(ap1) - 1);
-
-        if (PyArray_DIM(ap2, 0) != l) {
-            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
-            goto fail;
-        }
-        nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
-
-        if (nd == 1) {
-            dimensions[0] = (PyArray_NDIM(ap1) == 2) ? PyArray_DIM(ap1, 0) : PyArray_DIM(ap2, 1);
-        }
-        else if (nd == 2) {
-            dimensions[0] = PyArray_DIM(ap1, 0);
-            dimensions[1] = PyArray_DIM(ap2, 1);
-        }
-    }
-
-    /* Choose which subtype to return */
-    if (Py_TYPE(ap1) != Py_TYPE(ap2)) {
-        prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0);
-        prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0);
-        subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1));
-    }
-    else {
-        prior1 = prior2 = 0.0;
-        subtype = Py_TYPE(ap1);
-    }
-
-    if (out) {
-        int d;
-
-        /* verify that out is usable */
-        if (Py_TYPE(out) != subtype ||
-            PyArray_NDIM(out) != nd ||
-            PyArray_TYPE(out) != typenum ||
-            !PyArray_ISCARRAY(out)) {
-
-            PyErr_SetString(PyExc_ValueError,
-                "output array is not acceptable "
-                "(must have the right type, nr dimensions, and be a C-Array)");
-            goto fail;
-        }
-        for (d = 0; d < nd; ++d) {
-            if (dimensions[d] != PyArray_DIM(out, d)) {
-                PyErr_SetString(PyExc_ValueError,
-                    "output array has wrong dimensions");
-                goto fail;
-            }
-        }
-        Py_INCREF(out);
-        ret = out;
-    } else {
-        ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
-                                           typenum, NULL, NULL, 0, 0,
-                                           (PyObject *)
-                                           (prior2 > prior1 ? ap2 : ap1));
-    }
-
-    if (ret == NULL) {
-        goto fail;
-    }
-    numbytes = PyArray_NBYTES(ret);
-    memset(PyArray_DATA(ret), 0, numbytes);
-    if (numbytes == 0 || l == 0) {
-            Py_DECREF(ap1);
-            Py_DECREF(ap2);
-            return PyArray_Return(ret);
-    }
-
-    if (ap2shape == _scalar) {
-        /*
-         * Multiplication by a scalar -- Level 1 BLAS
-         * if ap1shape is a matrix and we are not contiguous, then we can't
-         * just blast through the entire array using a single striding factor
-         */
-        NPY_BEGIN_ALLOW_THREADS;
-
-        if (typenum == NPY_DOUBLE) {
-            if (l == 1) {
-                *((double *)PyArray_DATA(ret)) = *((double *)PyArray_DATA(ap2)) *
-                                                 *((double *)PyArray_DATA(ap1));
-            }
-            else if (ap1shape != _matrix) {
-                cblas_daxpy(l, *((double *)PyArray_DATA(ap2)), (double *)PyArray_DATA(ap1),
-                            ap1stride/sizeof(double), (double *)PyArray_DATA(ret), 1);
-            }
-            else {
-                int maxind, oind, i, a1s, rets;
-                char *ptr, *rptr;
-                double val;
-
-                maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
-                oind = 1-maxind;
-                ptr = PyArray_DATA(ap1);
-                rptr = PyArray_DATA(ret);
-                l = PyArray_DIM(ap1, maxind);
-                val = *((double *)PyArray_DATA(ap2));
-                a1s = PyArray_STRIDE(ap1, maxind) / sizeof(double);
-                rets = PyArray_STRIDE(ret, maxind) / sizeof(double);
-                for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
-                    cblas_daxpy(l, val, (double *)ptr, a1s,
-                                (double *)rptr, rets);
-                    ptr += PyArray_STRIDE(ap1, oind);
-                    rptr += PyArray_STRIDE(ret, oind);
-                }
-            }
-        }
-        else if (typenum == NPY_CDOUBLE) {
-            if (l == 1) {
-                npy_cdouble *ptr1, *ptr2, *res;
-
-                ptr1 = (npy_cdouble *)PyArray_DATA(ap2);
-                ptr2 = (npy_cdouble *)PyArray_DATA(ap1);
-                res = (npy_cdouble *)PyArray_DATA(ret);
-                res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag;
-                res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
-            }
-            else if (ap1shape != _matrix) {
-                cblas_zaxpy(l, (double *)PyArray_DATA(ap2), (double *)PyArray_DATA(ap1),
-                            ap1stride/sizeof(npy_cdouble), (double *)PyArray_DATA(ret), 1);
-            }
-            else {
-                int maxind, oind, i, a1s, rets;
-                char *ptr, *rptr;
-                double *pval;
-
-                maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
-                oind = 1-maxind;
-                ptr = PyArray_DATA(ap1);
-                rptr = PyArray_DATA(ret);
-                l = PyArray_DIM(ap1, maxind);
-                pval = (double *)PyArray_DATA(ap2);
-                a1s = PyArray_STRIDE(ap1, maxind) / sizeof(npy_cdouble);
-                rets = PyArray_STRIDE(ret, maxind) / sizeof(npy_cdouble);
-                for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
-                    cblas_zaxpy(l, pval, (double *)ptr, a1s,
-                                (double *)rptr, rets);
-                    ptr += PyArray_STRIDE(ap1, oind);
-                    rptr += PyArray_STRIDE(ret, oind);
-                }
-            }
-        }
-        else if (typenum == NPY_FLOAT) {
-            if (l == 1) {
-                *((float *)PyArray_DATA(ret)) = *((float *)PyArray_DATA(ap2)) *
-                    *((float *)PyArray_DATA(ap1));
-            }
-            else if (ap1shape != _matrix) {
-                cblas_saxpy(l, *((float *)PyArray_DATA(ap2)), (float *)PyArray_DATA(ap1),
-                            ap1stride/sizeof(float), (float *)PyArray_DATA(ret), 1);
-            }
-            else {
-                int maxind, oind, i, a1s, rets;
-                char *ptr, *rptr;
-                float val;
-
-                maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
-                oind = 1-maxind;
-                ptr = PyArray_DATA(ap1);
-                rptr = PyArray_DATA(ret);
-                l = PyArray_DIM(ap1, maxind);
-                val = *((float *)PyArray_DATA(ap2));
-                a1s = PyArray_STRIDE(ap1, maxind) / sizeof(float);
-                rets = PyArray_STRIDE(ret, maxind) / sizeof(float);
-                for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
-                    cblas_saxpy(l, val, (float *)ptr, a1s,
-                                (float *)rptr, rets);
-                    ptr += PyArray_STRIDE(ap1, oind);
-                    rptr += PyArray_STRIDE(ret, oind);
-                }
-            }
-        }
-        else if (typenum == NPY_CFLOAT) {
-            if (l == 1) {
-                npy_cfloat *ptr1, *ptr2, *res;
-
-                ptr1 = (npy_cfloat *)PyArray_DATA(ap2);
-                ptr2 = (npy_cfloat *)PyArray_DATA(ap1);
-                res = (npy_cfloat *)PyArray_DATA(ret);
-                res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag;
-                res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
-            }
-            else if (ap1shape != _matrix) {
-                cblas_caxpy(l, (float *)PyArray_DATA(ap2), (float *)PyArray_DATA(ap1),
-                            ap1stride/sizeof(npy_cfloat), (float *)PyArray_DATA(ret), 1);
-            }
-            else {
-                int maxind, oind, i, a1s, rets;
-                char *ptr, *rptr;
-                float *pval;
-
-                maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
-                oind = 1-maxind;
-                ptr = PyArray_DATA(ap1);
-                rptr = PyArray_DATA(ret);
-                l = PyArray_DIM(ap1, maxind);
-                pval = (float *)PyArray_DATA(ap2);
-                a1s = PyArray_STRIDE(ap1, maxind) / sizeof(npy_cfloat);
-                rets = PyArray_STRIDE(ret, maxind) / sizeof(npy_cfloat);
-                for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
-                    cblas_caxpy(l, pval, (float *)ptr, a1s,
-                                (float *)rptr, rets);
-                    ptr += PyArray_STRIDE(ap1, oind);
-                    rptr += PyArray_STRIDE(ret, oind);
-                }
-            }
-        }
-        NPY_END_ALLOW_THREADS;
-    }
-    else if ((ap2shape == _column) && (ap1shape != _matrix)) {
-        NPY_BEGIN_ALLOW_THREADS;
-
-        /* Dot product between two vectors -- Level 1 BLAS */
-        blas_dot(typenum, l,
-                 PyArray_DATA(ap1), PyArray_STRIDE(ap1, (ap1shape == _row)),
-                 PyArray_DATA(ap2), PyArray_STRIDE(ap2, 0),
-                 PyArray_DATA(ret));
-        NPY_END_ALLOW_THREADS;
-    }
-    else if (ap1shape == _matrix && ap2shape != _matrix) {
-        /* Matrix vector multiplication -- Level 2 BLAS */
-        /* lda must be MAX(M,1) */
-        enum CBLAS_ORDER Order;
-        int ap2s;
-
-        if (!PyArray_ISONESEGMENT(ap1)) {
-            PyObject *new;
-            new = PyArray_Copy(ap1);
-            Py_DECREF(ap1);
-            ap1 = (PyArrayObject *)new;
-            if (new == NULL) {
-                goto fail;
-            }
-        }
-        NPY_BEGIN_ALLOW_THREADS
-        if (PyArray_ISCONTIGUOUS(ap1)) {
-            Order = CblasRowMajor;
-            lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
-        }
-        else {
-            Order = CblasColMajor;
-            lda = (PyArray_DIM(ap1, 0) > 1 ? PyArray_DIM(ap1, 0) : 1);
-        }
-        ap2s = PyArray_STRIDE(ap2, 0) / PyArray_ITEMSIZE(ap2);
-        gemv(typenum, Order, CblasNoTrans, ap1, lda, ap2, ap2s, ret);
-        NPY_END_ALLOW_THREADS;
-    }
-    else if (ap1shape != _matrix && ap2shape == _matrix) {
-        /* Vector matrix multiplication -- Level 2 BLAS */
-        enum CBLAS_ORDER Order;
-        int ap1s;
-
-        if (!PyArray_ISONESEGMENT(ap2)) {
-            PyObject *new;
-            new = PyArray_Copy(ap2);
-            Py_DECREF(ap2);
-            ap2 = (PyArrayObject *)new;
-            if (new == NULL) {
-                goto fail;
-            }
-        }
-        NPY_BEGIN_ALLOW_THREADS
-        if (PyArray_ISCONTIGUOUS(ap2)) {
-            Order = CblasRowMajor;
-            lda = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
-        }
-        else {
-            Order = CblasColMajor;
-            lda = (PyArray_DIM(ap2, 0) > 1 ? PyArray_DIM(ap2, 0) : 1);
-        }
-        if (ap1shape == _row) {
-            ap1s = PyArray_STRIDE(ap1, 1) / PyArray_ITEMSIZE(ap1);
-        }
-        else {
-            ap1s = PyArray_STRIDE(ap1, 0) / PyArray_ITEMSIZE(ap1);
-        }
-        gemv(typenum, Order, CblasTrans, ap2, lda, ap1, ap1s, ret);
-        NPY_END_ALLOW_THREADS;
-    }
-    else {
-        /*
-         * (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 2)
-         * Matrix matrix multiplication -- Level 3 BLAS
-         *  L x M  multiplied by M x N
-         */
-        enum CBLAS_ORDER Order;
-        enum CBLAS_TRANSPOSE Trans1, Trans2;
-        int M, N, L;
-
-        /* Optimization possible: */
-        /*
-         * We may be able to handle single-segment arrays here
-         * using appropriate values of Order, Trans1, and Trans2.
-         */
-        if (!PyArray_IS_C_CONTIGUOUS(ap2) && !PyArray_IS_F_CONTIGUOUS(ap2)) {
-            PyObject *new = PyArray_Copy(ap2);
-
-            Py_DECREF(ap2);
-            ap2 = (PyArrayObject *)new;
-            if (new == NULL) {
-                goto fail;
-            }
-        }
-        if (!PyArray_IS_C_CONTIGUOUS(ap1) && !PyArray_IS_F_CONTIGUOUS(ap1)) {
-            PyObject *new = PyArray_Copy(ap1);
-
-            Py_DECREF(ap1);
-            ap1 = (PyArrayObject *)new;
-            if (new == NULL) {
-                goto fail;
-            }
-        }
-
-        NPY_BEGIN_ALLOW_THREADS;
-
-        Order = CblasRowMajor;
-        Trans1 = CblasNoTrans;
-        Trans2 = CblasNoTrans;
-        L = PyArray_DIM(ap1, 0);
-        N = PyArray_DIM(ap2, 1);
-        M = PyArray_DIM(ap2, 0);
-        lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
-        ldb = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
-
-        /*
-         * Avoid temporary copies for arrays in Fortran order
-         */
-        if (PyArray_IS_F_CONTIGUOUS(ap1)) {
-            Trans1 = CblasTrans;
-            lda = (PyArray_DIM(ap1, 0) > 1 ? PyArray_DIM(ap1, 0) : 1);
-        }
-        if (PyArray_IS_F_CONTIGUOUS(ap2)) {
-            Trans2 = CblasTrans;
-            ldb = (PyArray_DIM(ap2, 0) > 1 ? PyArray_DIM(ap2, 0) : 1);
-        }
-        gemm(typenum, Order, Trans1, Trans2, L, N, M, ap1, lda, ap2, ldb, ret);
-        NPY_END_ALLOW_THREADS;
-    }
-
-
-    Py_DECREF(ap1);
-    Py_DECREF(ap2);
-    return PyArray_Return(ret);
-
- fail:
-    Py_XDECREF(ap1);
-    Py_XDECREF(ap2);
-    Py_XDECREF(ret);
-    return NULL;
-}
-
-
 /*
  * innerproduct(a,b)
  *
@@ -1027,9 +473,6 @@ static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
 }
 
 static struct PyMethodDef dotblas_module_methods[] = {
-    {"dot",
-        (PyCFunction)dotblas_matrixproduct,
-        METH_VARARGS|METH_KEYWORDS, NULL},
     {"inner",
         (PyCFunction)dotblas_innerproduct,
         1, NULL},
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 3306c534188d..260fba4e7d4c 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -486,6 +486,9 @@ def pre_build(context):
                 pjoin('src', 'multiarray', 'shape.c'),
                 pjoin('src', 'multiarray', 'ucsnarrow.c'),
                 pjoin('src', 'multiarray', 'usertypes.c')]
+
+            if bld.env.HAS_CBLAS:
+                sources.append(pjoin('src', 'multiarray', 'cblasfuncs.c'))
         else:
             sources = extension.sources
 
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 9d64e6753f8c..d60756c2e6ec 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -1080,15 +1080,14 @@ def outer(a, b, out=None):
 try:
     # disables openblas affinity setting of the main thread that limits
     # python threads or processes to one core
-    if 'OPENBLAS_MAIN_FREE' not in os.environ:
-        os.environ['OPENBLAS_MAIN_FREE'] = '1'
-    if 'GOTOBLAS_MAIN_FREE' not in os.environ:
-        os.environ['GOTOBLAS_MAIN_FREE'] = '1'
-    from ._dotblas import dot, vdot, inner
+    if 'openblas_main_free' not in os.environ:
+        os.environ['openblas_main_free'] = '1'
+    if 'gotoblas_main_free' not in os.environ:
+        os.environ['gotoblas_main_free'] = '1'
+    from ._dotblas import vdot, inner
 except ImportError:
     # docstrings are in add_newdocs.py
     inner = multiarray.inner
-    dot = multiarray.dot
     def vdot(a, b):
         return dot(asarray(a).ravel().conj(), asarray(b).ravel())
 finally:
@@ -1096,6 +1095,7 @@ def vdot(a, b):
     os.environ.update(envbak)
     del envbak
 
+dot = multiarray.dot
 
 def alterdot():
     warnings.warn("alterdot no longer does anything.", DeprecationWarning)
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 4c2af5e62610..48d5c255b67f 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -840,17 +840,18 @@ def generate_multiarray_templated_sources(ext, build_dir):
             join('src', 'multiarray', 'usertypes.c'),
             join('src', 'multiarray', 'ucsnarrow.c')]
 
+    blas_info = get_info('blas_opt', 0)
+    if blas_info and  ('HAVE_CBLAS', None) in blas_info.get('define_macros', []):
+        extra_info = blas_info
+        multiarray_src.append(join('src', 'multiarray', 'cblasfuncs.c'))
+    else:
+        extra_info = {}
 
     if not ENABLE_SEPARATE_COMPILATION:
         multiarray_deps.extend(multiarray_src)
         multiarray_src = [join('src', 'multiarray', 'multiarraymodule_onefile.c')]
         multiarray_src.append(generate_multiarray_templated_sources)
 
-    blas_info = get_info('blas_opt', 0)
-    if blas_info and  ('HAVE_CBLAS', None) in blas_info.get('define_macros', []):
-        extra_info = blas_info
-    else:
-        extra_info = {}
 
     config.add_extension('multiarray',
                          sources=multiarray_src +
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 54747004518b..953734ddc8a1 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3125,7 +3125,7 @@ blas_stride(npy_intp stride, unsigned itemsize)
  * #type = npy_float, npy_double#
  * #prefix = s, d#
  */
-static void
+NPY_NO_EXPORT void
 @name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
            npy_intp n, void *NPY_UNUSED(ignore))
 {
@@ -3174,7 +3174,7 @@ static void
  * #type = npy_float, npy_double#
  * #prefix = c, z#
  */
-static void
+NPY_NO_EXPORT void
 @name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
            char *op, npy_intp n, void *NPY_UNUSED(ignore))
 {
diff --git a/numpy/core/src/multiarray/arraytypes.h b/numpy/core/src/multiarray/arraytypes.h
index ff7d4ae408b6..9626ad9cbc83 100644
--- a/numpy/core/src/multiarray/arraytypes.h
+++ b/numpy/core/src/multiarray/arraytypes.h
@@ -1,10 +1,25 @@
 #ifndef _NPY_ARRAYTYPES_H_
 #define _NPY_ARRAYTYPES_H_
 
+#include "common.h"
+
 #ifdef NPY_ENABLE_SEPARATE_COMPILATION
 extern NPY_NO_EXPORT PyArray_Descr LONGLONG_Descr;
 extern NPY_NO_EXPORT PyArray_Descr LONG_Descr;
 extern NPY_NO_EXPORT PyArray_Descr INT_Descr;
+
+/* needed for blasfuncs */
+NPY_NO_EXPORT void
+FLOAT_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CFLOAT_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+DOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CDOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
 #endif
 
 NPY_NO_EXPORT int
diff --git a/numpy/core/src/multiarray/cblasfuncs.c b/numpy/core/src/multiarray/cblasfuncs.c
new file mode 100644
index 000000000000..3de9d083841b
--- /dev/null
+++ b/numpy/core/src/multiarray/cblasfuncs.c
@@ -0,0 +1,945 @@
+/*
+ * This module provides a BLAS optimized matrix multiply,
+ * inner product and dot for numpy arrays
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include <Python.h>
+#include <assert.h>
+#include <limits.h>
+#include <cblas.h>
+#include <numpy/arrayobject.h>
+#include "arraytypes.h"
+
+
+/*
+ * Helper: call appropriate BLAS dot function for typenum.
+ * Strides are NumPy strides.
+ */
+static void
+blas_dot(int typenum, npy_intp n,
+         void *a, npy_intp stridea, void *b, npy_intp strideb, void *res)
+{
+    switch (typenum) {
+        case NPY_DOUBLE:
+            DOUBLE_dot(a, stridea, b, strideb, res, n, NULL);
+            break;
+        case NPY_FLOAT:
+            FLOAT_dot(a, stridea, b, strideb, res, n, NULL);
+            break;
+        case NPY_CDOUBLE:
+            CDOUBLE_dot(a, stridea, b, strideb, res, n, NULL);
+            break;
+        case NPY_CFLOAT:
+            CFLOAT_dot(a, stridea, b, strideb, res, n, NULL);
+            break;
+    }
+}
+
+
+static const double oneD[2] = {1.0, 0.0}, zeroD[2] = {0.0, 0.0};
+static const float oneF[2] = {1.0, 0.0}, zeroF[2] = {0.0, 0.0};
+
+
+/*
+ * Helper: dispatch to appropriate cblas_?gemm for typenum.
+ */
+static void
+gemm(int typenum, enum CBLAS_ORDER order,
+     enum CBLAS_TRANSPOSE transA, enum CBLAS_TRANSPOSE transB,
+     int m, int n, int k,
+     PyArrayObject *A, int lda, PyArrayObject *B, int ldb, PyArrayObject *R)
+{
+    const void *Adata = PyArray_DATA(A), *Bdata = PyArray_DATA(B);
+    void *Rdata = PyArray_DATA(R);
+    int ldc = PyArray_DIM(R, 1) > 1 ? PyArray_DIM(R, 1) : 1;
+
+    switch (typenum) {
+        case NPY_DOUBLE:
+            cblas_dgemm(order, transA, transB, m, n, k, 1.,
+                        Adata, lda, Bdata, ldb, 0., Rdata, ldc);
+            break;
+        case NPY_FLOAT:
+            cblas_sgemm(order, transA, transB, m, n, k, 1.f,
+                        Adata, lda, Bdata, ldb, 0.f, Rdata, ldc);
+            break;
+        case NPY_CDOUBLE:
+            cblas_zgemm(order, transA, transB, m, n, k, oneD,
+                        Adata, lda, Bdata, ldb, zeroD, Rdata, ldc);
+            break;
+        case NPY_CFLOAT:
+            cblas_cgemm(order, transA, transB, m, n, k, oneF,
+                        Adata, lda, Bdata, ldb, zeroF, Rdata, ldc);
+            break;
+    }
+}
+
+
+/*
+ * Helper: dispatch to appropriate cblas_?gemv for typenum.
+ */
+static void
+gemv(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
+     PyArrayObject *A, int lda, PyArrayObject *X, int incX,
+     PyArrayObject *R)
+{
+    const void *Adata = PyArray_DATA(A), *Xdata = PyArray_DATA(X);
+    void *Rdata = PyArray_DATA(R);
+
+    int m = PyArray_DIM(A, 0), n = PyArray_DIM(A, 1);
+
+    switch (typenum) {
+        case NPY_DOUBLE:
+            cblas_dgemv(order, trans, m, n, 1., Adata, lda, Xdata, incX,
+                        0., Rdata, 1);
+            break;
+        case NPY_FLOAT:
+            cblas_sgemv(order, trans, m, n, 1.f, Adata, lda, Xdata, incX,
+                        0.f, Rdata, 1);
+            break;
+        case NPY_CDOUBLE:
+            cblas_zgemv(order, trans, m, n, oneD, Adata, lda, Xdata, incX,
+                        zeroD, Rdata, 1);
+            break;
+        case NPY_CFLOAT:
+            cblas_cgemv(order, trans, m, n, oneF, Adata, lda, Xdata, incX,
+                        zeroF, Rdata, 1);
+            break;
+    }
+}
+
+
+typedef enum {_scalar, _column, _row, _matrix} MatrixShape;
+
+
+static MatrixShape
+_select_matrix_shape(PyArrayObject *array)
+{
+    switch (PyArray_NDIM(array)) {
+        case 0:
+            return _scalar;
+        case 1:
+            if (PyArray_DIM(array, 0) > 1)
+                return _column;
+            return _scalar;
+        case 2:
+            if (PyArray_DIM(array, 0) > 1) {
+                if (PyArray_DIM(array, 1) == 1)
+                    return _column;
+                else
+                    return _matrix;
+            }
+            if (PyArray_DIM(array, 1) == 1)
+                return _scalar;
+            return _row;
+    }
+    return _matrix;
+}
+
+
+/*
+ * This also makes sure that the data segment is aligned with
+ * an itemsize address as well by returning one if not true.
+ */
+static int
+_bad_strides(PyArrayObject *ap)
+{
+    int itemsize = PyArray_ITEMSIZE(ap);
+    int i, N=PyArray_NDIM(ap);
+    npy_intp *strides = PyArray_STRIDES(ap);
+
+    if (((npy_intp)(PyArray_DATA(ap)) % itemsize) != 0) {
+        return 1;
+    }
+    for (i = 0; i < N; i++) {
+        if ((strides[i] < 0) || (strides[i] % itemsize) != 0) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * dot(a,b)
+ * Returns the dot product of a and b for arrays of floating point types.
+ * Like the generic numpy equivalent the product sum is over
+ * the last dimension of a and the second-to-last dimension of b.
+ * NB: The first argument is not conjugated.;
+ *
+ * This is for use by PyArray_MatrixProduct2. It is assumed on entry that
+ * the arrays ap1 and ap2 have a common data type given by typenum that is
+ * float, double, cfloat, or cdouble and have dimension <= 2. The
+ * __numpy_ufunc__ nonsense is also assumed to have been taken care of.
+ */
+NPY_NO_EXPORT PyObject *
+cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
+                    PyArrayObject *out)
+{
+    PyArrayObject *ret = NULL;
+    int j, lda, ldb;
+    npy_intp l;
+    int nd;
+    npy_intp ap1stride = 0;
+    npy_intp dimensions[NPY_MAXDIMS];
+    npy_intp numbytes;
+    double prior1, prior2;
+    PyTypeObject *subtype;
+    MatrixShape ap1shape, ap2shape;
+
+    if (_bad_strides(ap1)) {
+            PyObject *op1 = PyArray_NewCopy(ap1, NPY_ANYORDER);
+
+            Py_DECREF(ap1);
+            ap1 = (PyArrayObject *)op1;
+            if (ap1 == NULL) {
+                goto fail;
+            }
+    }
+    if (_bad_strides(ap2)) {
+            PyObject *op2 = PyArray_NewCopy(ap2, NPY_ANYORDER);
+
+            Py_DECREF(ap2);
+            ap2 = (PyArrayObject *)op2;
+            if (ap2 == NULL) {
+                goto fail;
+            }
+    }
+    ap1shape = _select_matrix_shape(ap1);
+    ap2shape = _select_matrix_shape(ap2);
+
+    if (ap1shape == _scalar || ap2shape == _scalar) {
+        PyArrayObject *oap1, *oap2;
+        oap1 = ap1; oap2 = ap2;
+        /* One of ap1 or ap2 is a scalar */
+        if (ap1shape == _scalar) {
+            /* Make ap2 the scalar */
+            PyArrayObject *t = ap1;
+            ap1 = ap2;
+            ap2 = t;
+            ap1shape = ap2shape;
+            ap2shape = _scalar;
+        }
+
+        if (ap1shape == _row) {
+            ap1stride = PyArray_STRIDE(ap1, 1);
+        }
+        else if (PyArray_NDIM(ap1) > 0) {
+            ap1stride = PyArray_STRIDE(ap1, 0);
+        }
+
+        if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
+            npy_intp *thisdims;
+            if (PyArray_NDIM(ap1) == 0) {
+                nd = PyArray_NDIM(ap2);
+                thisdims = PyArray_DIMS(ap2);
+            }
+            else {
+                nd = PyArray_NDIM(ap1);
+                thisdims = PyArray_DIMS(ap1);
+            }
+            l = 1;
+            for (j = 0; j < nd; j++) {
+                dimensions[j] = thisdims[j];
+                l *= dimensions[j];
+            }
+        }
+        else {
+            l = PyArray_DIM(oap1, PyArray_NDIM(oap1) - 1);
+
+            if (PyArray_DIM(oap2, 0) != l) {
+                PyErr_SetString(PyExc_ValueError,
+                        "matrices are not aligned");
+                goto fail;
+            }
+            nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
+            /*
+             * nd = 0 or 1 or 2. If nd == 0 do nothing ...
+             */
+            if (nd == 1) {
+                /*
+                 * Either PyArray_NDIM(ap1) is 1 dim or PyArray_NDIM(ap2) is 1 dim
+                 * and the other is 2-dim
+                 */
+                dimensions[0] = (PyArray_NDIM(oap1) == 2) ?
+                                PyArray_DIM(oap1, 0) : PyArray_DIM(oap2, 1);
+                l = dimensions[0];
+                /*
+                 * Fix it so that dot(shape=(N,1), shape=(1,))
+                 * and dot(shape=(1,), shape=(1,N)) both return
+                 * an (N,) array (but use the fast scalar code)
+                 */
+            }
+            else if (nd == 2) {
+                dimensions[0] = PyArray_DIM(oap1, 0);
+                dimensions[1] = PyArray_DIM(oap2, 1);
+                /*
+                 * We need to make sure that dot(shape=(1,1), shape=(1,N))
+                 * and dot(shape=(N,1),shape=(1,1)) uses
+                 * scalar multiplication appropriately
+                 */
+                if (ap1shape == _row) {
+                    l = dimensions[1];
+                }
+                else {
+                    l = dimensions[0];
+                }
+            }
+
+            /* Check if the summation dimension is 0-sized */
+            if (PyArray_DIM(oap1, PyArray_NDIM(oap1) - 1) == 0) {
+                l = 0;
+            }
+        }
+    }
+    else {
+        /*
+         * (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2)
+         * Both ap1 and ap2 are vectors or matrices
+         */
+        l = PyArray_DIM(ap1, PyArray_NDIM(ap1) - 1);
+
+        if (PyArray_DIM(ap2, 0) != l) {
+            PyErr_SetString(PyExc_ValueError,
+                    "matrices are not aligned");
+            goto fail;
+        }
+        nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
+
+        if (nd == 1) {
+            dimensions[0] = (PyArray_NDIM(ap1) == 2) ?
+                            PyArray_DIM(ap1, 0) : PyArray_DIM(ap2, 1);
+        }
+        else if (nd == 2) {
+            dimensions[0] = PyArray_DIM(ap1, 0);
+            dimensions[1] = PyArray_DIM(ap2, 1);
+        }
+    }
+
+    /* Choose which subtype to return */
+    if (Py_TYPE(ap1) != Py_TYPE(ap2)) {
+        prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0);
+        prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0);
+        subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1));
+    }
+    else {
+        prior1 = prior2 = 0.0;
+        subtype = Py_TYPE(ap1);
+    }
+
+    if (out != NULL) {
+        int d;
+
+        /* verify that out is usable */
+        if (Py_TYPE(out) != subtype ||
+            PyArray_NDIM(out) != nd ||
+            PyArray_TYPE(out) != typenum ||
+            !PyArray_ISCARRAY(out)) {
+
+            PyErr_SetString(PyExc_ValueError,
+                "output array is not acceptable "
+                "(must have the right type, nr dimensions, and be a C-Array)");
+            goto fail;
+        }
+        for (d = 0; d < nd; ++d) {
+            if (dimensions[d] != PyArray_DIM(out, d)) {
+                PyErr_SetString(PyExc_ValueError,
+                    "output array has wrong dimensions");
+                goto fail;
+            }
+        }
+        Py_INCREF(out);
+        ret = out;
+    }
+    else {
+        PyObject *tmp = (PyObject *)(prior2 > prior1 ? ap2 : ap1);
+
+        ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
+                                           typenum, NULL, NULL, 0, 0, tmp);
+    }
+
+    if (ret == NULL) {
+        goto fail;
+    }
+    numbytes = PyArray_NBYTES(ret);
+    memset(PyArray_DATA(ret), 0, numbytes);
+    if (numbytes == 0 || l == 0) {
+            Py_DECREF(ap1);
+            Py_DECREF(ap2);
+            return PyArray_Return(ret);
+    }
+
+    if (ap2shape == _scalar) {
+        /*
+         * Multiplication by a scalar -- Level 1 BLAS
+         * if ap1shape is a matrix and we are not contiguous, then we can't
+         * just blast through the entire array using a single striding factor
+         */
+        NPY_BEGIN_ALLOW_THREADS;
+
+        if (typenum == NPY_DOUBLE) {
+            if (l == 1) {
+                *((double *)PyArray_DATA(ret)) = *((double *)PyArray_DATA(ap2)) *
+                                                 *((double *)PyArray_DATA(ap1));
+            }
+            else if (ap1shape != _matrix) {
+                cblas_daxpy(l,
+                            *((double *)PyArray_DATA(ap2)),
+                            (double *)PyArray_DATA(ap1),
+                            ap1stride/sizeof(double),
+                            (double *)PyArray_DATA(ret), 1);
+            }
+            else {
+                int maxind, oind, i, a1s, rets;
+                char *ptr, *rptr;
+                double val;
+
+                maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
+                oind = 1 - maxind;
+                ptr = PyArray_DATA(ap1);
+                rptr = PyArray_DATA(ret);
+                l = PyArray_DIM(ap1, maxind);
+                val = *((double *)PyArray_DATA(ap2));
+                a1s = PyArray_STRIDE(ap1, maxind) / sizeof(double);
+                rets = PyArray_STRIDE(ret, maxind) / sizeof(double);
+                for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
+                    cblas_daxpy(l, val, (double *)ptr, a1s,
+                                (double *)rptr, rets);
+                    ptr += PyArray_STRIDE(ap1, oind);
+                    rptr += PyArray_STRIDE(ret, oind);
+                }
+            }
+        }
+        else if (typenum == NPY_CDOUBLE) {
+            if (l == 1) {
+                npy_cdouble *ptr1, *ptr2, *res;
+
+                ptr1 = (npy_cdouble *)PyArray_DATA(ap2);
+                ptr2 = (npy_cdouble *)PyArray_DATA(ap1);
+                res = (npy_cdouble *)PyArray_DATA(ret);
+                res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag;
+                res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
+            }
+            else if (ap1shape != _matrix) {
+                cblas_zaxpy(l,
+                            (double *)PyArray_DATA(ap2),
+                            (double *)PyArray_DATA(ap1),
+                            ap1stride/sizeof(npy_cdouble),
+                            (double *)PyArray_DATA(ret), 1);
+            }
+            else {
+                int maxind, oind, i, a1s, rets;
+                char *ptr, *rptr;
+                double *pval;
+
+                maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
+                oind = 1 - maxind;
+                ptr = PyArray_DATA(ap1);
+                rptr = PyArray_DATA(ret);
+                l = PyArray_DIM(ap1, maxind);
+                pval = (double *)PyArray_DATA(ap2);
+                a1s = PyArray_STRIDE(ap1, maxind) / sizeof(npy_cdouble);
+                rets = PyArray_STRIDE(ret, maxind) / sizeof(npy_cdouble);
+                for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
+                    cblas_zaxpy(l, pval, (double *)ptr, a1s,
+                                (double *)rptr, rets);
+                    ptr += PyArray_STRIDE(ap1, oind);
+                    rptr += PyArray_STRIDE(ret, oind);
+                }
+            }
+        }
+        else if (typenum == NPY_FLOAT) {
+            if (l == 1) {
+                *((float *)PyArray_DATA(ret)) = *((float *)PyArray_DATA(ap2)) *
+                    *((float *)PyArray_DATA(ap1));
+            }
+            else if (ap1shape != _matrix) {
+                cblas_saxpy(l,
+                            *((float *)PyArray_DATA(ap2)),
+                            (float *)PyArray_DATA(ap1),
+                            ap1stride/sizeof(float),
+                            (float *)PyArray_DATA(ret), 1);
+            }
+            else {
+                int maxind, oind, i, a1s, rets;
+                char *ptr, *rptr;
+                float val;
+
+                maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
+                oind = 1 - maxind;
+                ptr = PyArray_DATA(ap1);
+                rptr = PyArray_DATA(ret);
+                l = PyArray_DIM(ap1, maxind);
+                val = *((float *)PyArray_DATA(ap2));
+                a1s = PyArray_STRIDE(ap1, maxind) / sizeof(float);
+                rets = PyArray_STRIDE(ret, maxind) / sizeof(float);
+                for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
+                    cblas_saxpy(l, val, (float *)ptr, a1s,
+                                (float *)rptr, rets);
+                    ptr += PyArray_STRIDE(ap1, oind);
+                    rptr += PyArray_STRIDE(ret, oind);
+                }
+            }
+        }
+        else if (typenum == NPY_CFLOAT) {
+            if (l == 1) {
+                npy_cfloat *ptr1, *ptr2, *res;
+
+                ptr1 = (npy_cfloat *)PyArray_DATA(ap2);
+                ptr2 = (npy_cfloat *)PyArray_DATA(ap1);
+                res = (npy_cfloat *)PyArray_DATA(ret);
+                res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag;
+                res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
+            }
+            else if (ap1shape != _matrix) {
+                cblas_caxpy(l,
+                            (float *)PyArray_DATA(ap2),
+                            (float *)PyArray_DATA(ap1),
+                            ap1stride/sizeof(npy_cfloat),
+                            (float *)PyArray_DATA(ret), 1);
+            }
+            else {
+                int maxind, oind, i, a1s, rets;
+                char *ptr, *rptr;
+                float *pval;
+
+                maxind = (PyArray_DIM(ap1, 0) >= PyArray_DIM(ap1, 1) ? 0 : 1);
+                oind = 1 - maxind;
+                ptr = PyArray_DATA(ap1);
+                rptr = PyArray_DATA(ret);
+                l = PyArray_DIM(ap1, maxind);
+                pval = (float *)PyArray_DATA(ap2);
+                a1s = PyArray_STRIDE(ap1, maxind) / sizeof(npy_cfloat);
+                rets = PyArray_STRIDE(ret, maxind) / sizeof(npy_cfloat);
+                for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
+                    cblas_caxpy(l, pval, (float *)ptr, a1s,
+                                (float *)rptr, rets);
+                    ptr += PyArray_STRIDE(ap1, oind);
+                    rptr += PyArray_STRIDE(ret, oind);
+                }
+            }
+        }
+        NPY_END_ALLOW_THREADS;
+    }
+    else if ((ap2shape == _column) && (ap1shape != _matrix)) {
+        NPY_BEGIN_ALLOW_THREADS;
+
+        /* Dot product between two vectors -- Level 1 BLAS */
+        blas_dot(typenum, l,
+                 PyArray_DATA(ap1), PyArray_STRIDE(ap1, (ap1shape == _row)),
+                 PyArray_DATA(ap2), PyArray_STRIDE(ap2, 0),
+                 PyArray_DATA(ret));
+        NPY_END_ALLOW_THREADS;
+    }
+    else if (ap1shape == _matrix && ap2shape != _matrix) {
+        /* Matrix vector multiplication -- Level 2 BLAS */
+        /* lda must be MAX(M,1) */
+        enum CBLAS_ORDER Order;
+        int ap2s;
+
+        if (!PyArray_ISONESEGMENT(ap1)) {
+            PyObject *new;
+            new = PyArray_Copy(ap1);
+            Py_DECREF(ap1);
+            ap1 = (PyArrayObject *)new;
+            if (new == NULL) {
+                goto fail;
+            }
+        }
+        NPY_BEGIN_ALLOW_THREADS
+        if (PyArray_ISCONTIGUOUS(ap1)) {
+            Order = CblasRowMajor;
+            lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
+        }
+        else {
+            Order = CblasColMajor;
+            lda = (PyArray_DIM(ap1, 0) > 1 ? PyArray_DIM(ap1, 0) : 1);
+        }
+        ap2s = PyArray_STRIDE(ap2, 0) / PyArray_ITEMSIZE(ap2);
+        gemv(typenum, Order, CblasNoTrans, ap1, lda, ap2, ap2s, ret);
+        NPY_END_ALLOW_THREADS;
+    }
+    else if (ap1shape != _matrix && ap2shape == _matrix) {
+        /* Vector matrix multiplication -- Level 2 BLAS */
+        enum CBLAS_ORDER Order;
+        int ap1s;
+
+        if (!PyArray_ISONESEGMENT(ap2)) {
+            PyObject *new;
+            new = PyArray_Copy(ap2);
+            Py_DECREF(ap2);
+            ap2 = (PyArrayObject *)new;
+            if (new == NULL) {
+                goto fail;
+            }
+        }
+        NPY_BEGIN_ALLOW_THREADS
+        if (PyArray_ISCONTIGUOUS(ap2)) {
+            Order = CblasRowMajor;
+            lda = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
+        }
+        else {
+            Order = CblasColMajor;
+            lda = (PyArray_DIM(ap2, 0) > 1 ? PyArray_DIM(ap2, 0) : 1);
+        }
+        if (ap1shape == _row) {
+            ap1s = PyArray_STRIDE(ap1, 1) / PyArray_ITEMSIZE(ap1);
+        }
+        else {
+            ap1s = PyArray_STRIDE(ap1, 0) / PyArray_ITEMSIZE(ap1);
+        }
+        gemv(typenum, Order, CblasTrans, ap2, lda, ap1, ap1s, ret);
+        NPY_END_ALLOW_THREADS;
+    }
+    else {
+        /*
+         * (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 2)
+         * Matrix matrix multiplication -- Level 3 BLAS
+         *  L x M  multiplied by M x N
+         */
+        enum CBLAS_ORDER Order;
+        enum CBLAS_TRANSPOSE Trans1, Trans2;
+        int M, N, L;
+
+        /* Optimization possible: */
+        /*
+         * We may be able to handle single-segment arrays here
+         * using appropriate values of Order, Trans1, and Trans2.
+         */
+        if (!PyArray_IS_C_CONTIGUOUS(ap2) && !PyArray_IS_F_CONTIGUOUS(ap2)) {
+            PyObject *new = PyArray_Copy(ap2);
+
+            Py_DECREF(ap2);
+            ap2 = (PyArrayObject *)new;
+            if (new == NULL) {
+                goto fail;
+            }
+        }
+        if (!PyArray_IS_C_CONTIGUOUS(ap1) && !PyArray_IS_F_CONTIGUOUS(ap1)) {
+            PyObject *new = PyArray_Copy(ap1);
+
+            Py_DECREF(ap1);
+            ap1 = (PyArrayObject *)new;
+            if (new == NULL) {
+                goto fail;
+            }
+        }
+
+        NPY_BEGIN_ALLOW_THREADS;
+
+        Order = CblasRowMajor;
+        Trans1 = CblasNoTrans;
+        Trans2 = CblasNoTrans;
+        L = PyArray_DIM(ap1, 0);
+        N = PyArray_DIM(ap2, 1);
+        M = PyArray_DIM(ap2, 0);
+        lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
+        ldb = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
+
+        /*
+         * Avoid temporary copies for arrays in Fortran order
+         */
+        if (PyArray_IS_F_CONTIGUOUS(ap1)) {
+            Trans1 = CblasTrans;
+            lda = (PyArray_DIM(ap1, 0) > 1 ? PyArray_DIM(ap1, 0) : 1);
+        }
+        if (PyArray_IS_F_CONTIGUOUS(ap2)) {
+            Trans2 = CblasTrans;
+            ldb = (PyArray_DIM(ap2, 0) > 1 ? PyArray_DIM(ap2, 0) : 1);
+        }
+        gemm(typenum, Order, Trans1, Trans2, L, N, M, ap1, lda, ap2, ldb, ret);
+        NPY_END_ALLOW_THREADS;
+    }
+
+
+    Py_DECREF(ap1);
+    Py_DECREF(ap2);
+    return PyArray_Return(ret);
+
+fail:
+    Py_XDECREF(ap1);
+    Py_XDECREF(ap2);
+    Py_XDECREF(ret);
+    return NULL;
+}
+
+#if 0
+
+/*
+ * innerproduct(a,b)
+ *
+ * Returns the inner product of a and b for arrays of
+ * floating point types. Like the generic NumPy equivalent the product
+ * sum is over the last dimension of a and b.
+ * NB: The first argument is not conjugated.
+ */
+
+NPY_NO_EXPORT PyObject *
+cblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+    PyObject *op1, *op2;
+    PyArrayObject *ap1, *ap2, *ret;
+    int j, l, lda, ldb;
+    int typenum, nd;
+    npy_intp dimensions[NPY_MAXDIMS];
+    PyTypeObject *subtype;
+    double prior1, prior2;
+
+    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
+        return NULL;
+    }
+
+    /*
+     * Inner product using the BLAS.  The product sum is taken along the last
+     * dimensions of the two arrays.
+     * Only speeds things up for float double and complex types.
+     */
+
+
+    typenum = PyArray_ObjectType(op1, 0);
+    typenum = PyArray_ObjectType(op2, typenum);
+
+    /* This function doesn't handle other types */
+    if ((typenum != NPY_DOUBLE && typenum != NPY_CDOUBLE &&
+            typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
+        return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(op1, op2));
+    }
+
+    ret = NULL;
+    ap1 = (PyArrayObject *)PyArray_ContiguousFromObject(op1, typenum, 0, 0);
+    if (ap1 == NULL) {
+        return NULL;
+    }
+    ap2 = (PyArrayObject *)PyArray_ContiguousFromObject(op2, typenum, 0, 0);
+    if (ap2 == NULL) {
+        goto fail;
+    }
+
+    if ((PyArray_NDIM(ap1) > 2) || (PyArray_NDIM(ap2) > 2)) {
+        /* This function doesn't handle dimensions greater than 2 */
+        ret = (PyArrayObject *)PyArray_InnerProduct((PyObject *)ap1,
+                                                    (PyObject *)ap2);
+        Py_DECREF(ap1);
+        Py_DECREF(ap2);
+        return PyArray_Return(ret);
+    }
+
+    if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
+        /* One of ap1 or ap2 is a scalar */
+        if (PyArray_NDIM(ap1) == 0) {
+            /* Make ap2 the scalar */
+            PyArrayObject *t = ap1;
+            ap1 = ap2;
+            ap2 = t;
+        }
+        for (l = 1, j = 0; j < PyArray_NDIM(ap1); j++) {
+            dimensions[j] = PyArray_DIM(ap1, j);
+            l *= dimensions[j];
+        }
+        nd = PyArray_NDIM(ap1);
+    }
+    else {
+        /*
+         * (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2)
+         *  Both ap1 and ap2 are vectors or matrices
+         */
+        l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
+
+        if (PyArray_DIM(ap2, PyArray_NDIM(ap2)-1) != l) {
+            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+            goto fail;
+        }
+        nd = PyArray_NDIM(ap1)+PyArray_NDIM(ap2)-2;
+
+        if (nd == 1)
+            dimensions[0] = (PyArray_NDIM(ap1) == 2) ?
+                            PyArray_DIM(ap1, 0) : PyArray_DIM(ap2, 0);
+        else if (nd == 2) {
+            dimensions[0] = PyArray_DIM(ap1, 0);
+            dimensions[1] = PyArray_DIM(ap2, 0);
+        }
+    }
+
+    /* Choose which subtype to return */
+    prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0);
+    prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0);
+    subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1));
+
+    ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
+                                       typenum, NULL, NULL, 0, 0,
+                                       (PyObject *)(prior2 > prior1 ? ap2 : ap1));
+
+    if (ret == NULL) {
+        goto fail;
+    }
+    NPY_BEGIN_ALLOW_THREADS
+    memset(PyArray_DATA(ret), 0, PyArray_NBYTES(ret));
+
+    if (PyArray_NDIM(ap2) == 0) {
+        /* Multiplication by a scalar -- Level 1 BLAS */
+        if (typenum == NPY_DOUBLE) {
+            cblas_daxpy(l, *((double *)PyArray_DATA(ap2)), (double *)PyArray_DATA(ap1), 1,
+                        (double *)PyArray_DATA(ret), 1);
+        }
+        else if (typenum == NPY_CDOUBLE) {
+            cblas_zaxpy(l, (double *)PyArray_DATA(ap2), (double *)PyArray_DATA(ap1), 1,
+                        (double *)PyArray_DATA(ret), 1);
+        }
+        else if (typenum == NPY_FLOAT) {
+            cblas_saxpy(l, *((float *)PyArray_DATA(ap2)), (float *)PyArray_DATA(ap1), 1,
+                        (float *)PyArray_DATA(ret), 1);
+        }
+        else if (typenum == NPY_CFLOAT) {
+            cblas_caxpy(l, (float *)PyArray_DATA(ap2), (float *)PyArray_DATA(ap1), 1,
+                        (float *)PyArray_DATA(ret), 1);
+        }
+    }
+    else if (PyArray_NDIM(ap1) == 1 && PyArray_NDIM(ap2) == 1) {
+        /* Dot product between two vectors -- Level 1 BLAS */
+        blas_dot(typenum, l, PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
+                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2), PyArray_DATA(ret));
+    }
+    else if (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 1) {
+        /* Matrix-vector multiplication -- Level 2 BLAS */
+        lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
+        gemv(typenum, CblasRowMajor, CblasNoTrans, ap1, lda, ap2, 1, ret);
+    }
+    else if (PyArray_NDIM(ap1) == 1 && PyArray_NDIM(ap2) == 2) {
+        /* Vector matrix multiplication -- Level 2 BLAS */
+        lda = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
+        gemv(typenum, CblasRowMajor, CblasNoTrans, ap2, lda, ap1, 1, ret);
+    }
+    else {
+        /*
+         * (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 2)
+         * Matrix matrix multiplication -- Level 3 BLAS
+         */
+        lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
+        ldb = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
+        gemm(typenum, CblasRowMajor, CblasNoTrans, CblasTrans,
+             PyArray_DIM(ap1, 0), PyArray_DIM(ap2, 0), PyArray_DIM(ap1, 1),
+             ap1, lda, ap2, ldb, ret);
+    }
+    NPY_END_ALLOW_THREADS
+    Py_DECREF(ap1);
+    Py_DECREF(ap2);
+    return PyArray_Return(ret);
+
+ fail:
+    Py_XDECREF(ap1);
+    Py_XDECREF(ap2);
+    Py_XDECREF(ret);
+    return NULL;
+}
+
+
+/*
+ * vdot(a,b)
+ *
+ * Returns the dot product of a and b for scalars and vectors of
+ * floating point and complex types.  The first argument, a, is conjugated.
+ */
+static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
+    PyObject *op1, *op2;
+    PyArrayObject *ap1 = NULL, *ap2  = NULL, *ret = NULL;
+    int l;
+    int typenum;
+    npy_intp dimensions[NPY_MAXDIMS];
+    PyArray_Descr *type;
+
+    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
+        return NULL;
+    }
+
+    /*
+     * Conjugating dot product using the BLAS for vectors.
+     * Multiplies op1 and op2, each of which must be vector.
+     */
+    typenum = PyArray_ObjectType(op1, 0);
+    typenum = PyArray_ObjectType(op2, typenum);
+
+    type = PyArray_DescrFromType(typenum);
+    Py_INCREF(type);
+    ap1 = (PyArrayObject *)PyArray_FromAny(op1, type, 0, 0, 0, NULL);
+    if (ap1 == NULL) {
+        Py_DECREF(type);
+        goto fail;
+    }
+    op1 = PyArray_Flatten(ap1, 0);
+    if (op1 == NULL) {
+        Py_DECREF(type);
+        goto fail;
+    }
+    Py_DECREF(ap1);
+    ap1 = (PyArrayObject *)op1;
+
+    ap2 = (PyArrayObject *)PyArray_FromAny(op2, type, 0, 0, 0, NULL);
+    if (ap2 == NULL) {
+        goto fail;
+    }
+    op2 = PyArray_Flatten(ap2, 0);
+    if (op2 == NULL) {
+        goto fail;
+    }
+    Py_DECREF(ap2);
+    ap2 = (PyArrayObject *)op2;
+
+    if (typenum != NPY_FLOAT && typenum != NPY_DOUBLE &&
+        typenum != NPY_CFLOAT && typenum != NPY_CDOUBLE) {
+        if (PyTypeNum_ISCOMPLEX(typenum)) {
+            op1 = PyArray_Conjugate(ap1, NULL);
+            if (op1 == NULL) {
+                goto fail;
+            }
+            Py_DECREF(ap1);
+            ap1 = (PyArrayObject *)op1;
+        }
+        ret = (PyArrayObject *)PyArray_InnerProduct((PyObject *)ap1,
+                                                    (PyObject *)ap2);
+        Py_DECREF(ap1);
+        Py_DECREF(ap2);
+        return PyArray_Return(ret);
+    }
+
+    if (PyArray_DIM(ap2, 0) != PyArray_DIM(ap1, PyArray_NDIM(ap1)-1)) {
+        PyErr_SetString(PyExc_ValueError, "vectors have different lengths");
+        goto fail;
+    }
+    l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
+
+    ret = (PyArrayObject *)PyArray_SimpleNew(0, dimensions, typenum);
+    if (ret == NULL) {
+        goto fail;
+    }
+
+    NPY_BEGIN_ALLOW_THREADS;
+
+    /* Dot product between two vectors -- Level 1 BLAS */
+    if (typenum == NPY_DOUBLE || typenum == NPY_FLOAT) {
+        blas_dot(typenum, l, PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
+                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2), PyArray_DATA(ret));
+    }
+    else if (typenum == NPY_CDOUBLE) {
+        cblas_zdotc_sub(l, (double *)PyArray_DATA(ap1), 1,
+                        (double *)PyArray_DATA(ap2), 1, (double *)PyArray_DATA(ret));
+    }
+    else if (typenum == NPY_CFLOAT) {
+        cblas_cdotc_sub(l, (float *)PyArray_DATA(ap1), 1,
+                        (float *)PyArray_DATA(ap2), 1, (float *)PyArray_DATA(ret));
+    }
+
+    NPY_END_ALLOW_THREADS;
+
+    Py_DECREF(ap1);
+    Py_DECREF(ap2);
+    return PyArray_Return(ret);
+
+ fail:
+    Py_XDECREF(ap1);
+    Py_XDECREF(ap2);
+    Py_XDECREF(ret);
+    return NULL;
+}
+#endif
diff --git a/numpy/core/src/multiarray/cblasfuncs.h b/numpy/core/src/multiarray/cblasfuncs.h
new file mode 100644
index 000000000000..66ce4ca5becb
--- /dev/null
+++ b/numpy/core/src/multiarray/cblasfuncs.h
@@ -0,0 +1,7 @@
+#ifndef _NPY_CBLASFUNCS_H_
+#define _NPY_CBLASFUNCS_H_
+
+NPY_NO_EXPORT PyObject *
+cblas_matrixproduct(int, PyArrayObject *, PyArrayObject *, PyArrayObject *);
+
+#endif
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 8e9b656cf336..8b8c1b8a7fe7 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -57,6 +57,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "ufunc_override.h"
 #include "scalarmathmodule.h" /* for npy_mul_with_overflow_intp */
 #include "multiarraymodule.h"
+#include "cblasfuncs.h"
 
 /* Only here for API compatibility */
 NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
@@ -915,7 +916,17 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
 }
 
 /*NUMPY_API
- * Numeric.matrixproduct(a,v,out)
+ * Numeric.matrixproduct(a,v)
+ * just like inner product but does the swapaxes stuff on the fly
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
+{
+    return PyArray_MatrixProduct2(op1, op2, NULL);
+}
+
+/*NUMPY_API
+ * Numeric.matrixproduct2(a,v,out)
  * just like inner product but does the swapaxes stuff on the fly
  */
 NPY_NO_EXPORT PyObject *
@@ -950,8 +961,18 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     ap2 = (PyArrayObject *)PyArray_FromAny(op2, typec, 0, 0,
                                         NPY_ARRAY_ALIGNED, NULL);
     if (ap2 == NULL) {
-        goto fail;
+        Py_DECREF(ap1);
+        return NULL;
+    }
+
+#if defined(HAVE_CBLAS)
+    if (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2 &&
+            (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
+             NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
+        return cblas_matrixproduct(typenum, ap1, ap2, out);
     }
+#endif
+
     if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
         ret = (PyArray_NDIM(ap1) == 0 ? ap1 : ap2);
         ret = (PyArrayObject *)Py_TYPE(ret)->tp_as_number->nb_multiply(
@@ -986,14 +1007,9 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     if(PyArray_NDIM(ap2) > 1) {
         dimensions[j++] = PyArray_DIMS(ap2)[PyArray_NDIM(ap2)-1];
     }
-    /*
-      fprintf(stderr, "nd=%d dimensions=", nd);
-      for(i=0; i<j; i++)
-      fprintf(stderr, "%d ", dimensions[i]);
-      fprintf(stderr, "\n");
-    */
 
-    is1 = PyArray_STRIDES(ap1)[PyArray_NDIM(ap1)-1]; is2 = PyArray_STRIDES(ap2)[matchDim];
+    is1 = PyArray_STRIDES(ap1)[PyArray_NDIM(ap1)-1];
+    is2 = PyArray_STRIDES(ap2)[matchDim];
     /* Choose which subtype to return */
     ret = new_array_for_sum(ap1, ap2, out, nd, dimensions, typenum);
     if (ret == NULL) {
@@ -1052,15 +1068,6 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     return NULL;
 }
 
-/*NUMPY_API
- *Numeric.matrixproduct(a,v)
- * just like inner product but does the swapaxes stuff on the fly
- */
-NPY_NO_EXPORT PyObject *
-PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
-{
-    return PyArray_MatrixProduct2(op1, op2, NULL);
-}
 
 /*NUMPY_API
  * Copy and Transpose
@@ -2203,6 +2210,7 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
     static PyUFuncObject *cached_npy_dot = NULL;
     PyObject *override = NULL;
     PyObject *v, *a, *o = NULL;
+    PyArrayObject *ret;
     char* kwlist[] = {"a", "b", "out", NULL };
     PyObject *module;
 
@@ -2235,7 +2243,8 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
                         "'out' must be an array");
         return NULL;
     }
-    return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct2(a, v, (PyArrayObject *)o));
+    ret = (PyArrayObject *)PyArray_MatrixProduct2(a, v, (PyArrayObject *)o);
+    return PyArray_Return(ret);
 }
 
 static int
diff --git a/numpy/core/src/multiarray/multiarraymodule_onefile.c b/numpy/core/src/multiarray/multiarraymodule_onefile.c
index 2d05c20ef007..010b88ce1def 100644
--- a/numpy/core/src/multiarray/multiarraymodule_onefile.c
+++ b/numpy/core/src/multiarray/multiarraymodule_onefile.c
@@ -2,7 +2,7 @@
  * This file includes all the .c files needed for a complete multiarray module.
  * This is used in the case where separate compilation is not enabled
  *
- * Note that the order of the includs matters
+ * Note that the order of the includes matters
  */
 
 #include "common.c"
@@ -50,9 +50,10 @@
 #include "array_assign_scalar.c"
 #include "array_assign_array.c"
 #include "ucsnarrow.c"
-
 #include "arrayobject.c"
-
 #include "numpymemoryview.c"
-
 #include "multiarraymodule.c"
+
+#if defined(HAVE_CBLAS)
+#include "cblasfuncs.c"
+#endif
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index 7c2ede982863..e40deeef1dec 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -25,10 +25,9 @@ def test_vecself(self):
 
 @dec.skipif(_dotblas is None, "Numpy is not compiled with _dotblas")
 def test_blasdot_used():
-    from numpy.core import dot, vdot, inner
-    assert_(dot is _dotblas.dot)
-    assert_(vdot is _dotblas.vdot)
-    assert_(inner is _dotblas.inner)
+    from numpy.core import vdot, inner
+    assert_(vdot is _dotblas.vdot, "vdot")
+    assert_(inner is _dotblas.inner, "inner")
 
 
 def test_dot_2args():
diff --git a/numpy/linalg/bscript b/numpy/linalg/bscript
index deed4fd72fec..70fdd9de3b5d 100644
--- a/numpy/linalg/bscript
+++ b/numpy/linalg/bscript
@@ -20,7 +20,7 @@ def pbuild(context):
         return context.default_builder(extension,
                                        includes=includes,
                                        **kw)
- 
+
     context.register_builder("lapack_lite", build_lapack_lite)
     context.register_builder("_umath_linalg", build_lapack_lite)
 

From 028d2dec1c91668156efe6a10725f1e4ad488929 Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Sat, 16 Aug 2014 19:01:58 -0600
Subject: [PATCH 06/19] MAINT: Update waf to 1.7.16

---
 tools/travis-test.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/travis-test.sh b/tools/travis-test.sh
index d970daab3d3e..f342aa7b13b1 100755
--- a/tools/travis-test.sh
+++ b/tools/travis-test.sh
@@ -59,9 +59,9 @@ setup_bento()
   cd ..
 
   # Waf
-  wget http://waf.googlecode.com/files/waf-1.7.13.tar.bz2
-  tar xjvf waf-1.7.13.tar.bz2
-  cd waf-1.7.13
+  wget http://ftp.waf.io/pub/release/waf-1.7.16.tar.bz2
+  tar xjvf waf-1.7.16.tar.bz2
+  cd waf-1.7.16
   python waf-light
   export WAFDIR=$PWD
   cd ..

From 52b8ab30b1866ee15fc956946b981db7b525958e Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Sun, 17 Aug 2014 15:10:08 -0600
Subject: [PATCH 07/19] ENH: Move dotblas_innerproduct down into multiarray.

Move the dotblas_innerproduct function from the _dotblas.c file to a new
cblasfuncs.c file in multiarray and rename it cblas_innerproduct.
Modify it so that it can called directly from PyArray_InnerProduct and
do so.  Fix up numeric.py and the tests to reflect these changes.
---
 numpy/core/blasdot/_dotblas.c                | 289 -------------------
 numpy/core/numeric.py                        |   5 +-
 numpy/core/src/multiarray/cblasfuncs.c       |  83 ++----
 numpy/core/src/multiarray/cblasfuncs.h       |   3 +
 numpy/core/src/multiarray/multiarraymodule.c |  12 +-
 numpy/core/tests/test_blasdot.py             |   1 -
 6 files changed, 48 insertions(+), 345 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index 4fc645358b5f..7d75266263ad 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -9,7 +9,6 @@
 #include "numpy/arrayobject.h"
 #include "npy_config.h"
 #include "npy_pycompat.h"
-#include "ufunc_override.h"
 #ifndef CBLAS_HEADER
 #define CBLAS_HEADER "cblas.h"
 #endif
@@ -48,75 +47,6 @@ static const double oneD[2] = {1.0, 0.0}, zeroD[2] = {0.0, 0.0};
 static const float oneF[2] = {1.0, 0.0}, zeroF[2] = {0.0, 0.0};
 
 
-/*
- * Helper: dispatch to appropriate cblas_?gemm for typenum.
- */
-static void
-gemm(int typenum, enum CBLAS_ORDER order,
-     enum CBLAS_TRANSPOSE transA, enum CBLAS_TRANSPOSE transB,
-     int m, int n, int k,
-     PyArrayObject *A, int lda, PyArrayObject *B, int ldb, PyArrayObject *R)
-{
-    const void *Adata = PyArray_DATA(A), *Bdata = PyArray_DATA(B);
-    void *Rdata = PyArray_DATA(R);
-
-    int ldc = PyArray_DIM(R, 1) > 1 ? PyArray_DIM(R, 1) : 1;
-
-    switch (typenum) {
-        case NPY_DOUBLE:
-            cblas_dgemm(order, transA, transB, m, n, k, 1.,
-                        Adata, lda, Bdata, ldb, 0., Rdata, ldc);
-            break;
-        case NPY_FLOAT:
-            cblas_sgemm(order, transA, transB, m, n, k, 1.f,
-                        Adata, lda, Bdata, ldb, 0.f, Rdata, ldc);
-            break;
-        case NPY_CDOUBLE:
-            cblas_zgemm(order, transA, transB, m, n, k, oneD,
-                        Adata, lda, Bdata, ldb, zeroD, Rdata, ldc);
-            break;
-        case NPY_CFLOAT:
-            cblas_cgemm(order, transA, transB, m, n, k, oneF,
-                        Adata, lda, Bdata, ldb, zeroF, Rdata, ldc);
-            break;
-    }
-}
-
-
-/*
- * Helper: dispatch to appropriate cblas_?gemv for typenum.
- */
-static void
-gemv(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
-     PyArrayObject *A, int lda, PyArrayObject *X, int incX,
-     PyArrayObject *R)
-{
-    const void *Adata = PyArray_DATA(A), *Xdata = PyArray_DATA(X);
-    void *Rdata = PyArray_DATA(R);
-
-    int m = PyArray_DIM(A, 0), n = PyArray_DIM(A, 1);
-
-    switch (typenum) {
-        case NPY_DOUBLE:
-            cblas_dgemv(order, trans, m, n, 1., Adata, lda, Xdata, incX,
-                        0., Rdata, 1);
-            break;
-        case NPY_FLOAT:
-            cblas_sgemv(order, trans, m, n, 1.f, Adata, lda, Xdata, incX,
-                        0.f, Rdata, 1);
-            break;
-        case NPY_CDOUBLE:
-            cblas_zgemv(order, trans, m, n, oneD, Adata, lda, Xdata, incX,
-                        zeroD, Rdata, 1);
-            break;
-        case NPY_CFLOAT:
-            cblas_cgemv(order, trans, m, n, oneF, Adata, lda, Xdata, incX,
-                        zeroF, Rdata, 1);
-            break;
-    }
-}
-
-
 /*
  * Initialize oldFunctions table.
  */
@@ -148,222 +78,6 @@ init_oldFunctions(void)
 typedef enum {_scalar, _column, _row, _matrix} MatrixShape;
 
 
-static MatrixShape
-_select_matrix_shape(PyArrayObject *array)
-{
-    switch (PyArray_NDIM(array)) {
-        case 0:
-            return _scalar;
-        case 1:
-            if (PyArray_DIM(array, 0) > 1)
-                return _column;
-            return _scalar;
-        case 2:
-            if (PyArray_DIM(array, 0) > 1) {
-                if (PyArray_DIM(array, 1) == 1)
-                    return _column;
-                else
-                    return _matrix;
-            }
-            if (PyArray_DIM(array, 1) == 1)
-                return _scalar;
-            return _row;
-    }
-    return _matrix;
-}
-
-
-/*
- * This also makes sure that the data segment is aligned with
- * an itemsize address as well by returning one if not true.
- */
-static int
-_bad_strides(PyArrayObject *ap)
-{
-    int itemsize = PyArray_ITEMSIZE(ap);
-    int i, N=PyArray_NDIM(ap);
-    npy_intp *strides = PyArray_STRIDES(ap);
-
-    if (((npy_intp)(PyArray_DATA(ap)) % itemsize) != 0) {
-        return 1;
-    }
-    for (i = 0; i < N; i++) {
-        if ((strides[i] < 0) || (strides[i] % itemsize) != 0) {
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
-/*
- * innerproduct(a,b)
- *
- * Returns the inner product of a and b for arrays of
- * floating point types. Like the generic NumPy equivalent the product
- * sum is over the last dimension of a and b.
- * NB: The first argument is not conjugated.
- */
-
-static PyObject *
-dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
-{
-    PyObject *op1, *op2;
-    PyArrayObject *ap1, *ap2, *ret;
-    int j, l, lda, ldb;
-    int typenum, nd;
-    npy_intp dimensions[NPY_MAXDIMS];
-    PyTypeObject *subtype;
-    double prior1, prior2;
-
-    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
-        return NULL;
-    }
-
-    /*
-     * Inner product using the BLAS.  The product sum is taken along the last
-     * dimensions of the two arrays.
-     * Only speeds things up for float double and complex types.
-     */
-
-
-    typenum = PyArray_ObjectType(op1, 0);
-    typenum = PyArray_ObjectType(op2, typenum);
-
-    /* This function doesn't handle other types */
-    if ((typenum != NPY_DOUBLE && typenum != NPY_CDOUBLE &&
-            typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
-        return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(op1, op2));
-    }
-
-    ret = NULL;
-    ap1 = (PyArrayObject *)PyArray_ContiguousFromObject(op1, typenum, 0, 0);
-    if (ap1 == NULL) {
-        return NULL;
-    }
-    ap2 = (PyArrayObject *)PyArray_ContiguousFromObject(op2, typenum, 0, 0);
-    if (ap2 == NULL) {
-        goto fail;
-    }
-
-    if ((PyArray_NDIM(ap1) > 2) || (PyArray_NDIM(ap2) > 2)) {
-        /* This function doesn't handle dimensions greater than 2 */
-        ret = (PyArrayObject *)PyArray_InnerProduct((PyObject *)ap1,
-                                                    (PyObject *)ap2);
-        Py_DECREF(ap1);
-        Py_DECREF(ap2);
-        return PyArray_Return(ret);
-    }
-
-    if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
-        /* One of ap1 or ap2 is a scalar */
-        if (PyArray_NDIM(ap1) == 0) {
-            /* Make ap2 the scalar */
-            PyArrayObject *t = ap1;
-            ap1 = ap2;
-            ap2 = t;
-        }
-        for (l = 1, j = 0; j < PyArray_NDIM(ap1); j++) {
-            dimensions[j] = PyArray_DIM(ap1, j);
-            l *= dimensions[j];
-        }
-        nd = PyArray_NDIM(ap1);
-    }
-    else {
-        /*
-         * (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2)
-         *  Both ap1 and ap2 are vectors or matrices
-         */
-        l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
-
-        if (PyArray_DIM(ap2, PyArray_NDIM(ap2)-1) != l) {
-            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
-            goto fail;
-        }
-        nd = PyArray_NDIM(ap1)+PyArray_NDIM(ap2)-2;
-
-        if (nd == 1)
-            dimensions[0] = (PyArray_NDIM(ap1) == 2) ? PyArray_DIM(ap1, 0) : PyArray_DIM(ap2, 0);
-        else if (nd == 2) {
-            dimensions[0] = PyArray_DIM(ap1, 0);
-            dimensions[1] = PyArray_DIM(ap2, 0);
-        }
-    }
-
-    /* Choose which subtype to return */
-    prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0);
-    prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0);
-    subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1));
-
-    ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
-                                       typenum, NULL, NULL, 0, 0,
-                                       (PyObject *)\
-                                       (prior2 > prior1 ? ap2 : ap1));
-
-    if (ret == NULL) {
-        goto fail;
-    }
-    NPY_BEGIN_ALLOW_THREADS
-    memset(PyArray_DATA(ret), 0, PyArray_NBYTES(ret));
-
-    if (PyArray_NDIM(ap2) == 0) {
-        /* Multiplication by a scalar -- Level 1 BLAS */
-        if (typenum == NPY_DOUBLE) {
-            cblas_daxpy(l, *((double *)PyArray_DATA(ap2)), (double *)PyArray_DATA(ap1), 1,
-                        (double *)PyArray_DATA(ret), 1);
-        }
-        else if (typenum == NPY_CDOUBLE) {
-            cblas_zaxpy(l, (double *)PyArray_DATA(ap2), (double *)PyArray_DATA(ap1), 1,
-                        (double *)PyArray_DATA(ret), 1);
-        }
-        else if (typenum == NPY_FLOAT) {
-            cblas_saxpy(l, *((float *)PyArray_DATA(ap2)), (float *)PyArray_DATA(ap1), 1,
-                        (float *)PyArray_DATA(ret), 1);
-        }
-        else if (typenum == NPY_CFLOAT) {
-            cblas_caxpy(l, (float *)PyArray_DATA(ap2), (float *)PyArray_DATA(ap1), 1,
-                        (float *)PyArray_DATA(ret), 1);
-        }
-    }
-    else if (PyArray_NDIM(ap1) == 1 && PyArray_NDIM(ap2) == 1) {
-        /* Dot product between two vectors -- Level 1 BLAS */
-        blas_dot(typenum, l, PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
-                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2), PyArray_DATA(ret));
-    }
-    else if (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 1) {
-        /* Matrix-vector multiplication -- Level 2 BLAS */
-        lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
-        gemv(typenum, CblasRowMajor, CblasNoTrans, ap1, lda, ap2, 1, ret);
-    }
-    else if (PyArray_NDIM(ap1) == 1 && PyArray_NDIM(ap2) == 2) {
-        /* Vector matrix multiplication -- Level 2 BLAS */
-        lda = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
-        gemv(typenum, CblasRowMajor, CblasNoTrans, ap2, lda, ap1, 1, ret);
-    }
-    else {
-        /*
-         * (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 2)
-         * Matrix matrix multiplication -- Level 3 BLAS
-         */
-        lda = (PyArray_DIM(ap1, 1) > 1 ? PyArray_DIM(ap1, 1) : 1);
-        ldb = (PyArray_DIM(ap2, 1) > 1 ? PyArray_DIM(ap2, 1) : 1);
-        gemm(typenum, CblasRowMajor, CblasNoTrans, CblasTrans,
-             PyArray_DIM(ap1, 0), PyArray_DIM(ap2, 0), PyArray_DIM(ap1, 1),
-             ap1, lda, ap2, ldb, ret);
-    }
-    NPY_END_ALLOW_THREADS
-    Py_DECREF(ap1);
-    Py_DECREF(ap2);
-    return PyArray_Return(ret);
-
- fail:
-    Py_XDECREF(ap1);
-    Py_XDECREF(ap2);
-    Py_XDECREF(ret);
-    return NULL;
-}
-
-
 /*
  * vdot(a,b)
  *
@@ -473,9 +187,6 @@ static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
 }
 
 static struct PyMethodDef dotblas_module_methods[] = {
-    {"inner",
-        (PyCFunction)dotblas_innerproduct,
-        1, NULL},
     {"vdot",
         (PyCFunction)dotblas_vdot,
         1, NULL},
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index d60756c2e6ec..d2fc520f4e4e 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -4,7 +4,7 @@
 import sys
 import warnings
 import collections
-from . import multiarray
+from numpy.core import multiarray
 from . import umath
 from .umath import (invert, sin, UFUNC_BUFSIZE_DEFAULT, ERR_IGNORE,
                     ERR_WARN, ERR_RAISE, ERR_CALL, ERR_PRINT, ERR_LOG,
@@ -1084,7 +1084,7 @@ def outer(a, b, out=None):
         os.environ['openblas_main_free'] = '1'
     if 'gotoblas_main_free' not in os.environ:
         os.environ['gotoblas_main_free'] = '1'
-    from ._dotblas import vdot, inner
+    from ._dotblas import vdot
 except ImportError:
     # docstrings are in add_newdocs.py
     inner = multiarray.inner
@@ -1096,6 +1096,7 @@ def vdot(a, b):
     del envbak
 
 dot = multiarray.dot
+inner = multiarray.inner
 
 def alterdot():
     warnings.warn("alterdot no longer does anything.", DeprecationWarning)
diff --git a/numpy/core/src/multiarray/cblasfuncs.c b/numpy/core/src/multiarray/cblasfuncs.c
index 3de9d083841b..36504c3f940c 100644
--- a/numpy/core/src/multiarray/cblasfuncs.c
+++ b/numpy/core/src/multiarray/cblasfuncs.c
@@ -665,7 +665,6 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
     return NULL;
 }
 
-#if 0
 
 /*
  * innerproduct(a,b)
@@ -674,57 +673,24 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
  * floating point types. Like the generic NumPy equivalent the product
  * sum is over the last dimension of a and b.
  * NB: The first argument is not conjugated.
+ *
+ * This is for use by PyArray_InnerProduct. It is assumed on entry that the
+ * arrays ap1 and ap2 have a common data type given by typenum that is
+ * float, double, cfloat, or cdouble and have dimension <= 2, and have the
+ * contiguous flag set. The * __numpy_ufunc__ nonsense is also assumed to
+ * have been taken care of.
  */
 
 NPY_NO_EXPORT PyObject *
-cblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
+cblas_innerproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2)
 {
-    PyObject *op1, *op2;
-    PyArrayObject *ap1, *ap2, *ret;
     int j, l, lda, ldb;
-    int typenum, nd;
+    int nd;
+    double prior1, prior2;
+    PyArrayObject *ret;
     npy_intp dimensions[NPY_MAXDIMS];
     PyTypeObject *subtype;
-    double prior1, prior2;
-
-    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
-        return NULL;
-    }
 
-    /*
-     * Inner product using the BLAS.  The product sum is taken along the last
-     * dimensions of the two arrays.
-     * Only speeds things up for float double and complex types.
-     */
-
-
-    typenum = PyArray_ObjectType(op1, 0);
-    typenum = PyArray_ObjectType(op2, typenum);
-
-    /* This function doesn't handle other types */
-    if ((typenum != NPY_DOUBLE && typenum != NPY_CDOUBLE &&
-            typenum != NPY_FLOAT && typenum != NPY_CFLOAT)) {
-        return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(op1, op2));
-    }
-
-    ret = NULL;
-    ap1 = (PyArrayObject *)PyArray_ContiguousFromObject(op1, typenum, 0, 0);
-    if (ap1 == NULL) {
-        return NULL;
-    }
-    ap2 = (PyArrayObject *)PyArray_ContiguousFromObject(op2, typenum, 0, 0);
-    if (ap2 == NULL) {
-        goto fail;
-    }
-
-    if ((PyArray_NDIM(ap1) > 2) || (PyArray_NDIM(ap2) > 2)) {
-        /* This function doesn't handle dimensions greater than 2 */
-        ret = (PyArrayObject *)PyArray_InnerProduct((PyObject *)ap1,
-                                                    (PyObject *)ap2);
-        Py_DECREF(ap1);
-        Py_DECREF(ap2);
-        return PyArray_Return(ret);
-    }
 
     if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
         /* One of ap1 or ap2 is a scalar */
@@ -748,7 +714,8 @@ cblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
         l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
 
         if (PyArray_DIM(ap2, PyArray_NDIM(ap2)-1) != l) {
-            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+            PyErr_SetString(PyExc_ValueError,
+                    "matrices are not aligned");
             goto fail;
         }
         nd = PyArray_NDIM(ap1)+PyArray_NDIM(ap2)-2;
@@ -769,7 +736,8 @@ cblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 
     ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
                                        typenum, NULL, NULL, 0, 0,
-                                       (PyObject *)(prior2 > prior1 ? ap2 : ap1));
+                                       (PyObject *)
+                                           (prior2 > prior1 ? ap2 : ap1));
 
     if (ret == NULL) {
         goto fail;
@@ -780,26 +748,36 @@ cblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
     if (PyArray_NDIM(ap2) == 0) {
         /* Multiplication by a scalar -- Level 1 BLAS */
         if (typenum == NPY_DOUBLE) {
-            cblas_daxpy(l, *((double *)PyArray_DATA(ap2)), (double *)PyArray_DATA(ap1), 1,
+            cblas_daxpy(l,
+                        *((double *)PyArray_DATA(ap2)),
+                        (double *)PyArray_DATA(ap1), 1,
                         (double *)PyArray_DATA(ret), 1);
         }
         else if (typenum == NPY_CDOUBLE) {
-            cblas_zaxpy(l, (double *)PyArray_DATA(ap2), (double *)PyArray_DATA(ap1), 1,
+            cblas_zaxpy(l,
+                        (double *)PyArray_DATA(ap2),
+                        (double *)PyArray_DATA(ap1), 1,
                         (double *)PyArray_DATA(ret), 1);
         }
         else if (typenum == NPY_FLOAT) {
-            cblas_saxpy(l, *((float *)PyArray_DATA(ap2)), (float *)PyArray_DATA(ap1), 1,
+            cblas_saxpy(l,
+                        *((float *)PyArray_DATA(ap2)),
+                        (float *)PyArray_DATA(ap1), 1,
                         (float *)PyArray_DATA(ret), 1);
         }
         else if (typenum == NPY_CFLOAT) {
-            cblas_caxpy(l, (float *)PyArray_DATA(ap2), (float *)PyArray_DATA(ap1), 1,
+            cblas_caxpy(l,
+                        (float *)PyArray_DATA(ap2),
+                        (float *)PyArray_DATA(ap1), 1,
                         (float *)PyArray_DATA(ret), 1);
         }
     }
     else if (PyArray_NDIM(ap1) == 1 && PyArray_NDIM(ap2) == 1) {
         /* Dot product between two vectors -- Level 1 BLAS */
-        blas_dot(typenum, l, PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
-                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2), PyArray_DATA(ret));
+        blas_dot(typenum, l,
+                 PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
+                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2),
+                 PyArray_DATA(ret));
     }
     else if (PyArray_NDIM(ap1) == 2 && PyArray_NDIM(ap2) == 1) {
         /* Matrix-vector multiplication -- Level 2 BLAS */
@@ -834,6 +812,7 @@ cblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
     return NULL;
 }
 
+#if 0
 
 /*
  * vdot(a,b)
diff --git a/numpy/core/src/multiarray/cblasfuncs.h b/numpy/core/src/multiarray/cblasfuncs.h
index 66ce4ca5becb..d3ec08db608b 100644
--- a/numpy/core/src/multiarray/cblasfuncs.h
+++ b/numpy/core/src/multiarray/cblasfuncs.h
@@ -4,4 +4,7 @@
 NPY_NO_EXPORT PyObject *
 cblas_matrixproduct(int, PyArrayObject *, PyArrayObject *, PyArrayObject *);
 
+NPY_NO_EXPORT PyObject *
+cblas_innerproduct(int, PyArrayObject *, PyArrayObject *);
+
 #endif
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 8b8c1b8a7fe7..90955985131a 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -836,8 +836,18 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
     ap2 = (PyArrayObject *)PyArray_FromAny(op2, typec, 0, 0,
                                         NPY_ARRAY_ALIGNED, NULL);
     if (ap2 == NULL) {
-        goto fail;
+        Py_DECREF(ap1);
+        return NULL;
     }
+
+#if defined(HAVE_CBLAS)
+    if (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2 &&
+            (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
+             NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
+        return cblas_innerproduct(typenum, ap1, ap2);
+    }
+#endif
+
     if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
         ret = (PyArray_NDIM(ap1) == 0 ? ap1 : ap2);
         ret = (PyArrayObject *)Py_TYPE(ret)->tp_as_number->nb_multiply(
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index e40deeef1dec..9a177c2fd408 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -27,7 +27,6 @@ def test_vecself(self):
 def test_blasdot_used():
     from numpy.core import vdot, inner
     assert_(vdot is _dotblas.vdot, "vdot")
-    assert_(inner is _dotblas.inner, "inner")
 
 
 def test_dot_2args():

From df0b65c03caeacdda6cb9de89ef5a10579186680 Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Sun, 17 Aug 2014 16:51:57 -0600
Subject: [PATCH 08/19] MAINT: Refactor ndarray.dot method to call
 PyArray_MatrixProduct2.

Now that PyArray_MatrixProduct2 is blas enabled, call it directly.
---
 numpy/core/src/multiarray/methods.c          | 60 +++++++++++++-------
 numpy/core/src/multiarray/multiarraymodule.c |  5 +-
 2 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index a791c2c222aa..79475d97cbc6 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -9,9 +9,8 @@
 #include "numpy/arrayscalars.h"
 
 #include "npy_config.h"
-
 #include "npy_pycompat.h"
-
+#include "ufunc_override.h"
 #include "common.h"
 #include "ctors.h"
 #include "calculation.h"
@@ -1998,31 +1997,52 @@ array_cumprod(PyArrayObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 array_dot(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    PyObject *fname, *ret, *b, *out = NULL;
-    static PyObject *numpycore = NULL;
-    char * kwords[] = {"b", "out", NULL };
+    static PyUFuncObject *cached_npy_dot = NULL;
+    int errval;
+    PyObject *override = NULL;
+    PyObject *a = (PyObject *)self, *b, *o = Py_None;
+    PyObject *newargs;
+    PyArrayObject *ret;
+    char* kwlist[] = {"b", "out", NULL };
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwords, &b, &out)) {
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist, &b, &o)) {
         return NULL;
     }
 
-    /* Since blas-dot is exposed only on the Python side, we need to grab it
-     * from there */
-    if (numpycore == NULL) {
-        numpycore = PyImport_ImportModule("numpy.core");
-        if (numpycore == NULL) {
-            return NULL;
-        }
+    if (cached_npy_dot == NULL) {
+        PyObject *module = PyImport_ImportModule("numpy.core.multiarray");
+        cached_npy_dot = (PyUFuncObject*)PyDict_GetItemString(
+                                              PyModule_GetDict(module), "dot");
+
+        Py_INCREF(cached_npy_dot);
+        Py_DECREF(module);
     }
-    fname = PyUString_FromString("dot");
-    if (out == NULL) {
-        ret = PyObject_CallMethodObjArgs(numpycore, fname, self, b, NULL);
+
+    if ((newargs = PyTuple_Pack(3, a, b, o)) == NULL) {
+        return NULL;
     }
-    else {
-        ret = PyObject_CallMethodObjArgs(numpycore, fname, self, b, out, NULL);
+    errval = PyUFunc_CheckOverride(cached_npy_dot, "__call__",
+                                   newargs, NULL, &override, 2);
+    Py_DECREF(newargs);
+
+    if (errval) {
+        return NULL;
     }
-    Py_DECREF(fname);
-    return ret;
+    else if (override) {
+        return override;
+    }
+
+    if (o == Py_None) {
+        o = NULL;
+    }
+    if (o != NULL && !PyArray_Check(o)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "'out' must be an array");
+        return NULL;
+    }
+    ret = (PyArrayObject *)PyArray_MatrixProduct2(a, b, (PyArrayObject *)o);
+    return PyArray_Return(ret);
 }
 
 
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 90955985131a..a1c1cc11a9ca 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -2216,16 +2216,15 @@ array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 static PyObject *
 array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
 {
-    int errval;
     static PyUFuncObject *cached_npy_dot = NULL;
+    int errval;
     PyObject *override = NULL;
     PyObject *v, *a, *o = NULL;
     PyArrayObject *ret;
     char* kwlist[] = {"a", "b", "out", NULL };
-    PyObject *module;
 
     if (cached_npy_dot == NULL) {
-        module = PyImport_ImportModule("numpy.core.multiarray");
+        PyObject *module = PyImport_ImportModule("numpy.core.multiarray");
         cached_npy_dot = (PyUFuncObject*)PyDict_GetItemString(
                                               PyModule_GetDict(module), "dot");
 

From 63cc74ef7e951bea898d8bc0f7c728de9e62de5e Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Wed, 20 Aug 2014 12:39:40 -0600
Subject: [PATCH 09/19] ENH: Move vdot to multiarray.

Remove vdot from _dotblas and implement it in multiarray.
Remove the files in core/blasdot as they are no longer needed.
Fix tests and build to reflect the changes.

The vdot function is now a bit faster in the common case as ravel
is used instead of flatten. There is also no need to conjugate the
files for clongdouble.
---
 numpy/core/bento.info                         |   3 -
 numpy/core/blasdot/_dotblas.c                 | 233 -------
 numpy/core/blasdot/cblas.h                    | 578 ------------------
 numpy/core/bscript                            |  13 +-
 numpy/core/numeric.py                         |  19 +-
 numpy/core/setup.py                           |  27 +-
 numpy/core/src/multiarray/arraytypes.c.src    |  58 +-
 numpy/core/src/multiarray/arraytypes.h        |   8 +-
 numpy/core/src/multiarray/cblasfuncs.c        | 120 +---
 numpy/core/src/multiarray/common.h            |  30 +
 numpy/core/src/multiarray/multiarraymodule.c  | 122 +++-
 .../src/multiarray/multiarraymodule_onefile.c |   1 +
 numpy/core/src/multiarray/vdot.c              | 183 ++++++
 numpy/core/src/multiarray/vdot.h              |  18 +
 numpy/core/tests/test_blasdot.py              |  10 -
 15 files changed, 383 insertions(+), 1040 deletions(-)
 delete mode 100644 numpy/core/blasdot/_dotblas.c
 delete mode 100644 numpy/core/blasdot/cblas.h
 create mode 100644 numpy/core/src/multiarray/vdot.c
 create mode 100644 numpy/core/src/multiarray/vdot.h

diff --git a/numpy/core/bento.info b/numpy/core/bento.info
index 299bd8ca516c..0a22dc710e3a 100644
--- a/numpy/core/bento.info
+++ b/numpy/core/bento.info
@@ -33,9 +33,6 @@ Library:
         Sources:
             src/private/scalarmathmodule.h.src,
             src/scalarmathmodule.c.src
-    Extension: _dotblas
-        Sources:
-            blasdot/_dotblas.c
     Extension: test_rational
         Sources:
             src/umath/test_rational.c.src
diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
deleted file mode 100644
index 7d75266263ad..000000000000
--- a/numpy/core/blasdot/_dotblas.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * This module provides a BLAS optimized matrix multiply,
- * inner product and dot for numpy arrays
- */
-
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-#include "Python.h"
-
-#include "numpy/arrayobject.h"
-#include "npy_config.h"
-#include "npy_pycompat.h"
-#ifndef CBLAS_HEADER
-#define CBLAS_HEADER "cblas.h"
-#endif
-#include CBLAS_HEADER
-
-#include <assert.h>
-#include <limits.h>
-#include <stdio.h>
-
-
-static char module_doc[] =
-    "This module provides a BLAS optimized\n"
-    "matrix multiply, inner product and dot for numpy arrays";
-
-
-static PyArray_DotFunc *oldFunctions[NPY_NTYPES];
-
-
-/*
- * Helper: call appropriate BLAS dot function for typenum.
- * Strides are NumPy strides.
- */
-static void
-blas_dot(int typenum, npy_intp n,
-         void *a, npy_intp stridea, void *b, npy_intp strideb, void *res)
-{
-    PyArray_DotFunc *dot;
-
-    dot = oldFunctions[typenum];
-    assert(dot != NULL);
-    dot(a, stridea, b, strideb, res, n, NULL);
-}
-
-
-static const double oneD[2] = {1.0, 0.0}, zeroD[2] = {0.0, 0.0};
-static const float oneF[2] = {1.0, 0.0}, zeroF[2] = {0.0, 0.0};
-
-
-/*
- * Initialize oldFunctions table.
- */
-static void
-init_oldFunctions(void)
-{
-    PyArray_Descr *descr;
-    int i;
-
-    /* Initialise the array of dot functions */
-    for (i = 0; i < NPY_NTYPES; i++)
-        oldFunctions[i] = NULL;
-
-    /* index dot functions we want to use here */
-    descr = PyArray_DescrFromType(NPY_FLOAT);
-    oldFunctions[NPY_FLOAT] = descr->f->dotfunc;
-
-    descr = PyArray_DescrFromType(NPY_DOUBLE);
-    oldFunctions[NPY_DOUBLE] = descr->f->dotfunc;
-
-    descr = PyArray_DescrFromType(NPY_CFLOAT);
-    oldFunctions[NPY_CFLOAT] = descr->f->dotfunc;
-
-    descr = PyArray_DescrFromType(NPY_CDOUBLE);
-    oldFunctions[NPY_CDOUBLE] = descr->f->dotfunc;
-}
-
-
-typedef enum {_scalar, _column, _row, _matrix} MatrixShape;
-
-
-/*
- * vdot(a,b)
- *
- * Returns the dot product of a and b for scalars and vectors of
- * floating point and complex types.  The first argument, a, is conjugated.
- */
-static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
-    PyObject *op1, *op2;
-    PyArrayObject *ap1 = NULL, *ap2  = NULL, *ret = NULL;
-    int l;
-    int typenum;
-    npy_intp dimensions[NPY_MAXDIMS];
-    PyArray_Descr *type;
-
-    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
-        return NULL;
-    }
-
-    /*
-     * Conjugating dot product using the BLAS for vectors.
-     * Multiplies op1 and op2, each of which must be vector.
-     */
-    typenum = PyArray_ObjectType(op1, 0);
-    typenum = PyArray_ObjectType(op2, typenum);
-
-    type = PyArray_DescrFromType(typenum);
-    Py_INCREF(type);
-    ap1 = (PyArrayObject *)PyArray_FromAny(op1, type, 0, 0, 0, NULL);
-    if (ap1 == NULL) {
-        Py_DECREF(type);
-        goto fail;
-    }
-    op1 = PyArray_Flatten(ap1, 0);
-    if (op1 == NULL) {
-        Py_DECREF(type);
-        goto fail;
-    }
-    Py_DECREF(ap1);
-    ap1 = (PyArrayObject *)op1;
-
-    ap2 = (PyArrayObject *)PyArray_FromAny(op2, type, 0, 0, 0, NULL);
-    if (ap2 == NULL) {
-        goto fail;
-    }
-    op2 = PyArray_Flatten(ap2, 0);
-    if (op2 == NULL) {
-        goto fail;
-    }
-    Py_DECREF(ap2);
-    ap2 = (PyArrayObject *)op2;
-
-    if (typenum != NPY_FLOAT && typenum != NPY_DOUBLE &&
-        typenum != NPY_CFLOAT && typenum != NPY_CDOUBLE) {
-        if (PyTypeNum_ISCOMPLEX(typenum)) {
-            op1 = PyArray_Conjugate(ap1, NULL);
-            if (op1 == NULL) {
-                goto fail;
-            }
-            Py_DECREF(ap1);
-            ap1 = (PyArrayObject *)op1;
-        }
-        ret = (PyArrayObject *)PyArray_InnerProduct((PyObject *)ap1,
-                                                    (PyObject *)ap2);
-        Py_DECREF(ap1);
-        Py_DECREF(ap2);
-        return PyArray_Return(ret);
-    }
-
-    if (PyArray_DIM(ap2, 0) != PyArray_DIM(ap1, PyArray_NDIM(ap1)-1)) {
-        PyErr_SetString(PyExc_ValueError, "vectors have different lengths");
-        goto fail;
-    }
-    l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
-
-    ret = (PyArrayObject *)PyArray_SimpleNew(0, dimensions, typenum);
-    if (ret == NULL) {
-        goto fail;
-    }
-
-    NPY_BEGIN_ALLOW_THREADS;
-
-    /* Dot product between two vectors -- Level 1 BLAS */
-    if (typenum == NPY_DOUBLE || typenum == NPY_FLOAT) {
-        blas_dot(typenum, l, PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
-                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2), PyArray_DATA(ret));
-    }
-    else if (typenum == NPY_CDOUBLE) {
-        cblas_zdotc_sub(l, (double *)PyArray_DATA(ap1), 1,
-                        (double *)PyArray_DATA(ap2), 1, (double *)PyArray_DATA(ret));
-    }
-    else if (typenum == NPY_CFLOAT) {
-        cblas_cdotc_sub(l, (float *)PyArray_DATA(ap1), 1,
-                        (float *)PyArray_DATA(ap2), 1, (float *)PyArray_DATA(ret));
-    }
-
-    NPY_END_ALLOW_THREADS;
-
-    Py_DECREF(ap1);
-    Py_DECREF(ap2);
-    return PyArray_Return(ret);
-
- fail:
-    Py_XDECREF(ap1);
-    Py_XDECREF(ap2);
-    Py_XDECREF(ret);
-    return NULL;
-}
-
-static struct PyMethodDef dotblas_module_methods[] = {
-    {"vdot",
-        (PyCFunction)dotblas_vdot,
-        1, NULL},
-    {NULL, NULL, 0, NULL}               /* sentinel */
-};
-
-#if defined(NPY_PY3K)
-static struct PyModuleDef moduledef = {
-        PyModuleDef_HEAD_INIT,
-        "_dotblas",
-        NULL,
-        -1,
-        dotblas_module_methods,
-        NULL,
-        NULL,
-        NULL,
-        NULL
-};
-#endif
-
-/* Initialization function for the module */
-#if defined(NPY_PY3K)
-#define RETVAL m
-PyMODINIT_FUNC PyInit__dotblas(void)
-#else
-#define RETVAL
-PyMODINIT_FUNC init_dotblas(void)
-#endif
-{
-#if defined(NPY_PY3K)
-    PyObject *m;
-    m = PyModule_Create(&moduledef);
-#else
-    Py_InitModule3("_dotblas", dotblas_module_methods, module_doc);
-#endif
-
-    /* Import the array object */
-    import_array();
-
-    /* initialize oldFunctions table */
-    init_oldFunctions();
-
-    return RETVAL;
-}
diff --git a/numpy/core/blasdot/cblas.h b/numpy/core/blasdot/cblas.h
deleted file mode 100644
index 25de09edfe8c..000000000000
--- a/numpy/core/blasdot/cblas.h
+++ /dev/null
@@ -1,578 +0,0 @@
-#ifndef CBLAS_H
-#define CBLAS_H
-#include <stddef.h>
-
-/* Allow the use in C++ code.  */
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-/*
- * Enumerated and derived types
- */
-#define CBLAS_INDEX size_t  /* this may vary between platforms */
-
-enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
-enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
-enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
-enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
-enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
-
-/*
- * ===========================================================================
- * Prototypes for level 1 BLAS functions (complex are recast as routines)
- * ===========================================================================
- */
-float  cblas_sdsdot(const int N, const float alpha, const float *X,
-                    const int incX, const float *Y, const int incY);
-double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
-                   const int incY);
-float  cblas_sdot(const int N, const float  *X, const int incX,
-                  const float  *Y, const int incY);
-double cblas_ddot(const int N, const double *X, const int incX,
-                  const double *Y, const int incY);
-
-/*
- * Functions having prefixes Z and C only
- */
-void   cblas_cdotu_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotu);
-void   cblas_cdotc_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotc);
-
-void   cblas_zdotu_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotu);
-void   cblas_zdotc_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotc);
-
-
-/*
- * Functions having prefixes S D SC DZ
- */
-float  cblas_snrm2(const int N, const float *X, const int incX);
-float  cblas_sasum(const int N, const float *X, const int incX);
-
-double cblas_dnrm2(const int N, const double *X, const int incX);
-double cblas_dasum(const int N, const double *X, const int incX);
-
-float  cblas_scnrm2(const int N, const void *X, const int incX);
-float  cblas_scasum(const int N, const void *X, const int incX);
-
-double cblas_dznrm2(const int N, const void *X, const int incX);
-double cblas_dzasum(const int N, const void *X, const int incX);
-
-
-/*
- * Functions having standard 4 prefixes (S D C Z)
- */
-CBLAS_INDEX cblas_isamax(const int N, const float  *X, const int incX);
-CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
-CBLAS_INDEX cblas_icamax(const int N, const void   *X, const int incX);
-CBLAS_INDEX cblas_izamax(const int N, const void   *X, const int incX);
-
-/*
- * ===========================================================================
- * Prototypes for level 1 BLAS routines
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (s, d, c, z)
- */
-void cblas_sswap(const int N, float *X, const int incX,
-                 float *Y, const int incY);
-void cblas_scopy(const int N, const float *X, const int incX,
-                 float *Y, const int incY);
-void cblas_saxpy(const int N, const float alpha, const float *X,
-                 const int incX, float *Y, const int incY);
-
-void cblas_dswap(const int N, double *X, const int incX,
-                 double *Y, const int incY);
-void cblas_dcopy(const int N, const double *X, const int incX,
-                 double *Y, const int incY);
-void cblas_daxpy(const int N, const double alpha, const double *X,
-                 const int incX, double *Y, const int incY);
-
-void cblas_cswap(const int N, void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_ccopy(const int N, const void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_caxpy(const int N, const void *alpha, const void *X,
-                 const int incX, void *Y, const int incY);
-
-void cblas_zswap(const int N, void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_zcopy(const int N, const void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_zaxpy(const int N, const void *alpha, const void *X,
-                 const int incX, void *Y, const int incY);
-
-
-/*
- * Routines with S and D prefix only
- */
-void cblas_srotg(float *a, float *b, float *c, float *s);
-void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
-void cblas_srot(const int N, float *X, const int incX,
-                float *Y, const int incY, const float c, const float s);
-void cblas_srotm(const int N, float *X, const int incX,
-                float *Y, const int incY, const float *P);
-
-void cblas_drotg(double *a, double *b, double *c, double *s);
-void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
-void cblas_drot(const int N, double *X, const int incX,
-                double *Y, const int incY, const double c, const double  s);
-void cblas_drotm(const int N, double *X, const int incX,
-                double *Y, const int incY, const double *P);
-
-
-/*
- * Routines with S D C Z CS and ZD prefixes
- */
-void cblas_sscal(const int N, const float alpha, float *X, const int incX);
-void cblas_dscal(const int N, const double alpha, double *X, const int incX);
-void cblas_cscal(const int N, const void *alpha, void *X, const int incX);
-void cblas_zscal(const int N, const void *alpha, void *X, const int incX);
-void cblas_csscal(const int N, const float alpha, void *X, const int incX);
-void cblas_zdscal(const int N, const double alpha, void *X, const int incX);
-
-/*
- * ===========================================================================
- * Prototypes for level 2 BLAS
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (S, D, C, Z)
- */
-void cblas_sgemv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 const float *X, const int incX, const float beta,
-                 float *Y, const int incY);
-void cblas_sgbmv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const float alpha,
-                 const float *A, const int lda, const float *X,
-                 const int incX, const float beta, float *Y, const int incY);
-void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *A, const int lda,
-                 float *X, const int incX);
-void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const float *A, const int lda,
-                 float *X, const int incX);
-void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *Ap, float *X, const int incX);
-void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *A, const int lda, float *X,
-                 const int incX);
-void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const float *A, const int lda,
-                 float *X, const int incX);
-void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *Ap, float *X, const int incX);
-
-void cblas_dgemv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 const double *X, const int incX, const double beta,
-                 double *Y, const int incY);
-void cblas_dgbmv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const double alpha,
-                 const double *A, const int lda, const double *X,
-                 const int incX, const double beta, double *Y, const int incY);
-void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *A, const int lda,
-                 double *X, const int incX);
-void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const double *A, const int lda,
-                 double *X, const int incX);
-void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *Ap, double *X, const int incX);
-void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *A, const int lda, double *X,
-                 const int incX);
-void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const double *A, const int lda,
-                 double *X, const int incX);
-void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *Ap, double *X, const int incX);
-
-void cblas_cgemv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *X, const int incX, const void *beta,
-                 void *Y, const int incY);
-void cblas_cgbmv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const void *alpha,
-                 const void *A, const int lda, const void *X,
-                 const int incX, const void *beta, void *Y, const int incY);
-void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda, void *X,
-                 const int incX);
-void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-
-void cblas_zgemv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *X, const int incX, const void *beta,
-                 void *Y, const int incY);
-void cblas_zgbmv(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const void *alpha,
-                 const void *A, const int lda, const void *X,
-                 const int incX, const void *beta, void *Y, const int incY);
-void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda, void *X,
-                 const int incX);
-void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-
-
-/*
- * Routines with S and D prefixes only
- */
-void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const float alpha, const float *A,
-                 const int lda, const float *X, const int incX,
-                 const float beta, float *Y, const int incY);
-void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const float alpha, const float *A,
-                 const int lda, const float *X, const int incX,
-                 const float beta, float *Y, const int incY);
-void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const float alpha, const float *Ap,
-                 const float *X, const int incX,
-                 const float beta, float *Y, const int incY);
-void cblas_sger(const enum CBLAS_ORDER order, const int M, const int N,
-                const float alpha, const float *X, const int incX,
-                const float *Y, const int incY, float *A, const int lda);
-void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, float *A, const int lda);
-void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, float *Ap);
-void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, const float *Y, const int incY, float *A,
-                const int lda);
-void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, const float *Y, const int incY, float *A);
-
-void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const double alpha, const double *A,
-                 const int lda, const double *X, const int incX,
-                 const double beta, double *Y, const int incY);
-void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const double alpha, const double *A,
-                 const int lda, const double *X, const int incX,
-                 const double beta, double *Y, const int incY);
-void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const double alpha, const double *Ap,
-                 const double *X, const int incX,
-                 const double beta, double *Y, const int incY);
-void cblas_dger(const enum CBLAS_ORDER order, const int M, const int N,
-                const double alpha, const double *X, const int incX,
-                const double *Y, const int incY, double *A, const int lda);
-void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, double *A, const int lda);
-void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, double *Ap);
-void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, const double *Y, const int incY, double *A,
-                const int lda);
-void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, const double *Y, const int incY, double *A);
-
-
-/*
- * Routines with C and Z prefixes only
- */
-void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *Ap,
-                 const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_cgeru(const enum CBLAS_ORDER order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_cgerc(const enum CBLAS_ORDER order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const void *X, const int incX,
-                void *A, const int lda);
-void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const void *X,
-                const int incX, void *A);
-void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *A, const int lda);
-void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *Ap);
-
-void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *Ap,
-                 const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_zgeru(const enum CBLAS_ORDER order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_zgerc(const enum CBLAS_ORDER order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const void *X, const int incX,
-                void *A, const int lda);
-void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const void *X,
-                const int incX, void *A);
-void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *A, const int lda);
-void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *Ap);
-
-/*
- * ===========================================================================
- * Prototypes for level 3 BLAS
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (S, D, C, Z)
- */
-void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const float alpha, const float *A,
-                 const int lda, const float *B, const int ldb,
-                 const float beta, float *C, const int ldc);
-void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 const float *B, const int ldb, const float beta,
-                 float *C, const int ldc);
-void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const float alpha, const float *A, const int lda,
-                 const float beta, float *C, const int ldc);
-void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const float alpha, const float *A, const int lda,
-                  const float *B, const int ldb, const float beta,
-                  float *C, const int ldc);
-void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb);
-void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb);
-
-void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const double alpha, const double *A,
-                 const int lda, const double *B, const int ldb,
-                 const double beta, double *C, const int ldc);
-void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 const double *B, const int ldb, const double beta,
-                 double *C, const int ldc);
-void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const double alpha, const double *A, const int lda,
-                 const double beta, double *C, const int ldc);
-void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const double alpha, const double *A, const int lda,
-                  const double *B, const int ldb, const double beta,
-                  double *C, const int ldc);
-void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb);
-void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb);
-
-void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const void *alpha, const void *A,
-                 const int lda, const void *B, const int ldb,
-                 const void *beta, void *C, const int ldc);
-void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const void *alpha, const void *A, const int lda,
-                 const void *beta, void *C, const int ldc);
-void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const void *beta,
-                  void *C, const int ldc);
-void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-
-void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const void *alpha, const void *A,
-                 const int lda, const void *B, const int ldb,
-                 const void *beta, void *C, const int ldc);
-void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const void *alpha, const void *A, const int lda,
-                 const void *beta, void *C, const int ldc);
-void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const void *beta,
-                  void *C, const int ldc);
-void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-
-
-/*
- * Routines with prefixes C and Z only
- */
-void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const float alpha, const void *A, const int lda,
-                 const float beta, void *C, const int ldc);
-void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const float beta,
-                  void *C, const int ldc);
-
-void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const double alpha, const void *A, const int lda,
-                 const double beta, void *C, const int ldc);
-void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const double beta,
-                  void *C, const int ldc);
-
-void cblas_xerbla(int p, const char *rout, const char *form, ...);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 260fba4e7d4c..da9ff2799e92 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -443,7 +443,8 @@ def pre_build(context):
                 "src/multiarray/einsum.c.src"]
         bld(target="multiarray_templates", source=multiarray_templates)
         if ENABLE_SEPARATE_COMPILATION:
-            sources = [pjoin('src', 'multiarray', 'arrayobject.c'),
+            sources = [
+                pjoin('src', 'multiarray', 'arrayobject.c'),
                 pjoin('src', 'multiarray', 'alloc.c'),
                 pjoin('src', 'multiarray', 'arraytypes.c.src'),
                 pjoin('src', 'multiarray', 'array_assign.c'),
@@ -485,7 +486,9 @@ def pre_build(context):
                 pjoin('src', 'multiarray', 'sequence.c'),
                 pjoin('src', 'multiarray', 'shape.c'),
                 pjoin('src', 'multiarray', 'ucsnarrow.c'),
-                pjoin('src', 'multiarray', 'usertypes.c')]
+                pjoin('src', 'multiarray', 'usertypes.c'),
+                pjoin('src', 'multiarray', 'vdot.c'),
+                ]
 
             if bld.env.HAS_CBLAS:
                 sources.append(pjoin('src', 'multiarray', 'cblasfuncs.c'))
@@ -548,9 +551,3 @@ def pre_build(context):
     context.tweak_extension("scalarmath", use="npymath", includes=["src/private"])
     context.tweak_extension("multiarray_tests", use="npymath", includes=["src/private"])
     context.tweak_extension("umath_tests", use="npymath", includes=["src/private"])
-
-    def build_dotblas(extension):
-        if bld.env.HAS_CBLAS:
-            return context.default_builder(extension, use="CBLAS",
-                                           includes=["src/multiarray", "src/private"])
-    context.register_builder("_dotblas", build_dotblas)
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index d2fc520f4e4e..72b8e1b68ff5 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -1077,26 +1077,9 @@ def outer(a, b, out=None):
 
 # try to import blas optimized dot if available
 envbak = os.environ.copy()
-try:
-    # disables openblas affinity setting of the main thread that limits
-    # python threads or processes to one core
-    if 'openblas_main_free' not in os.environ:
-        os.environ['openblas_main_free'] = '1'
-    if 'gotoblas_main_free' not in os.environ:
-        os.environ['gotoblas_main_free'] = '1'
-    from ._dotblas import vdot
-except ImportError:
-    # docstrings are in add_newdocs.py
-    inner = multiarray.inner
-    def vdot(a, b):
-        return dot(asarray(a).ravel().conj(), asarray(b).ravel())
-finally:
-    os.environ.clear()
-    os.environ.update(envbak)
-    del envbak
-
 dot = multiarray.dot
 inner = multiarray.inner
+vdot = multiarray.vdot
 
 def alterdot():
     warnings.warn("alterdot no longer does anything.", DeprecationWarning)
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 48d5c255b67f..2da2e837a9f7 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -772,6 +772,7 @@ def generate_multiarray_templated_sources(ext, build_dir):
             join('src', 'multiarray', 'shape.h'),
             join('src', 'multiarray', 'ucsnarrow.h'),
             join('src', 'multiarray', 'usertypes.h'),
+            join('src', 'multiarray', 'vdot.h'),
             join('src', 'private', 'lowlevel_strided_loops.h'),
             join('include', 'numpy', 'arrayobject.h'),
             join('include', 'numpy', '_neighborhood_iterator_imp.h'),
@@ -838,7 +839,9 @@ def generate_multiarray_templated_sources(ext, build_dir):
             join('src', 'multiarray', 'scalarapi.c'),
             join('src', 'multiarray', 'scalartypes.c.src'),
             join('src', 'multiarray', 'usertypes.c'),
-            join('src', 'multiarray', 'ucsnarrow.c')]
+            join('src', 'multiarray', 'ucsnarrow.c'),
+            join('src', 'multiarray', 'vdot.c'),
+            ]
 
     blas_info = get_info('blas_opt', 0)
     if blas_info and  ('HAVE_CBLAS', None) in blas_info.get('define_macros', []):
@@ -950,28 +953,6 @@ def generate_umath_c(ext, build_dir):
                          libraries = ['npymath'],
                          )
 
-    #######################################################################
-    #                          _dotblas module                            #
-    #######################################################################
-
-    # Configure blasdot
-    blas_info = get_info('blas_opt', 0)
-    #blas_info = {}
-    def get_dotblas_sources(ext, build_dir):
-        if blas_info:
-            if ('HAVE_CBLAS', None) in blas_info.get('define_macros', []):
-                return ext.depends[:1]
-        # dotblas needs CBLAS, Fortran compiled BLAS will not work.
-        return None
-
-    config.add_extension('_dotblas',
-                         sources = [get_dotblas_sources],
-                         depends = [join('blasdot', '_dotblas.c'),
-                                  join('blasdot', 'cblas.h'),
-                                  ],
-                         include_dirs = ['blasdot'],
-                         extra_info = blas_info
-                         )
 
     #######################################################################
     #                        umath_tests module                           #
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 953734ddc8a1..6fa8a65dacdf 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3,15 +3,10 @@
 #include "Python.h"
 #include "structmember.h"
 
-#include <limits.h>
-#if defined(HAVE_CBLAS)
-#include <cblas.h>
-#endif
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 
-
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
 #include "npy_pycompat.h"
@@ -29,6 +24,12 @@
 
 #include "numpyos.h"
 
+#if defined(HAVE_CBLAS)
+#include "cblasfuncs.h"
+#include <cblas.h>
+#include <limits.h>
+#endif
+
 
 /*
  *****************************************************************************
@@ -3083,41 +3084,6 @@ static int
 
 /************************** MAYBE USE CBLAS *********************************/
 
-/*
- * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
- * (BLAS won't handle negative or zero strides the way we want).
- */
-#if defined(HAVE_CBLAS)
-static NPY_INLINE int
-blas_stride(npy_intp stride, unsigned itemsize)
-{
-    /*
-     * Should probably check pointer alignment also, but this may cause
-     * problems if we require complex to be 16 byte aligned.
-     */
-    if (stride > 0 && npy_is_aligned((void *)stride, itemsize)) {
-        stride /= itemsize;
-        if (stride <= INT_MAX) {
-            return stride;
-        }
-    }
-    return 0;
-}
-#endif
-
-
-/*
- * The following functions do a "chunked" dot product using BLAS when
- * sizeof(npy_intp) > sizeof(int), because BLAS libraries can typically not
- * handle more than INT_MAX elements per call.
- *
- * The chunksize is the greatest power of two less than INT_MAX.
- */
-#if NPY_MAX_INTP > INT_MAX
-# define CHUNKSIZE  (INT_MAX / 2 + 1)
-#else
-# define CHUNKSIZE  NPY_MAX_INTP
-#endif
 
 /**begin repeat
  *
@@ -3138,7 +3104,7 @@ NPY_NO_EXPORT void
         double sum = 0.;  /* double for stability */
 
         while (n > 0) {
-            int chunk = n < CHUNKSIZE ? n : CHUNKSIZE;
+            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
 
             sum += cblas_@prefix@dot(chunk,
                                      (@type@ *) ip1, is1b,
@@ -3186,7 +3152,7 @@ NPY_NO_EXPORT void
         double sum[2] = {0., 0.};  /* double for stability */
 
         while (n > 0) {
-            int chunk = n < CHUNKSIZE ? n : CHUNKSIZE;
+            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
             @type@ tmp[2];
 
             cblas_@prefix@dotu_sub((int)n, ip1, is1b, ip2, is2b, tmp);
@@ -3214,12 +3180,13 @@ NPY_NO_EXPORT void
             const @type@ ip2i = ((@type@ *)ip2)[1];
 
             sumr += ip1r * ip2r - ip1i * ip2i;
-            sumi += ip1i * ip2r + ip1r * ip2i;
+            sumi += ip1r * ip2i + ip1i * ip2r;
         }
         ((@type@ *)op)[0] = sumr;
         ((@type@ *)op)[1] = sumi;
     }
 }
+
 /**end repeat**/
 
 /**************************** NO CBLAS VERSIONS *****************************/
@@ -3281,7 +3248,8 @@ HALF_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
     *((npy_half *)op) = npy_float_to_half(tmp);
 }
 
-static void CLONGDOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+static void
+CLONGDOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
                             char *op, npy_intp n, void *NPY_UNUSED(ignore))
 {
     npy_longdouble tmpr = 0.0L;
@@ -3295,7 +3263,7 @@ static void CLONGDOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
         const npy_longdouble ip2i = ((npy_longdouble *)ip2)[1];
 
         tmpr += ip1r * ip2r - ip1i * ip2i;
-        tmpi += ip1i * ip2r + ip1r * ip2i;
+        tmpi += ip1r * ip2i + ip1i * ip2r;
     }
     ((npy_longdouble *)op)[0] = tmpr;
     ((npy_longdouble *)op)[1] = tmpi;
diff --git a/numpy/core/src/multiarray/arraytypes.h b/numpy/core/src/multiarray/arraytypes.h
index 9626ad9cbc83..9eed3e92b462 100644
--- a/numpy/core/src/multiarray/arraytypes.h
+++ b/numpy/core/src/multiarray/arraytypes.h
@@ -7,6 +7,10 @@
 extern NPY_NO_EXPORT PyArray_Descr LONGLONG_Descr;
 extern NPY_NO_EXPORT PyArray_Descr LONG_Descr;
 extern NPY_NO_EXPORT PyArray_Descr INT_Descr;
+#endif
+
+NPY_NO_EXPORT int
+set_typeinfo(PyObject *dict);
 
 /* needed for blasfuncs */
 NPY_NO_EXPORT void
@@ -20,9 +24,5 @@ DOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
 
 NPY_NO_EXPORT void
 CDOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
-#endif
-
-NPY_NO_EXPORT int
-set_typeinfo(PyObject *dict);
 
 #endif
diff --git a/numpy/core/src/multiarray/cblasfuncs.c b/numpy/core/src/multiarray/cblasfuncs.c
index 36504c3f940c..42453890f41d 100644
--- a/numpy/core/src/multiarray/cblasfuncs.c
+++ b/numpy/core/src/multiarray/cblasfuncs.c
@@ -691,7 +691,6 @@ cblas_innerproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2)
     npy_intp dimensions[NPY_MAXDIMS];
     PyTypeObject *subtype;
 
-
     if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
         /* One of ap1 or ap2 is a scalar */
         if (PyArray_NDIM(ap1) == 0) {
@@ -711,14 +710,14 @@ cblas_innerproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2)
          * (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2)
          *  Both ap1 and ap2 are vectors or matrices
          */
-        l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
+        l = PyArray_DIM(ap1, PyArray_NDIM(ap1) - 1);
 
-        if (PyArray_DIM(ap2, PyArray_NDIM(ap2)-1) != l) {
+        if (PyArray_DIM(ap2, PyArray_NDIM(ap2) - 1) != l) {
             PyErr_SetString(PyExc_ValueError,
                     "matrices are not aligned");
             goto fail;
         }
-        nd = PyArray_NDIM(ap1)+PyArray_NDIM(ap2)-2;
+        nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
 
         if (nd == 1)
             dimensions[0] = (PyArray_NDIM(ap1) == 2) ?
@@ -742,7 +741,8 @@ cblas_innerproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2)
     if (ret == NULL) {
         goto fail;
     }
-    NPY_BEGIN_ALLOW_THREADS
+
+    NPY_BEGIN_ALLOW_THREADS;
     memset(PyArray_DATA(ret), 0, PyArray_NBYTES(ret));
 
     if (PyArray_NDIM(ap2) == 0) {
@@ -800,115 +800,6 @@ cblas_innerproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2)
              PyArray_DIM(ap1, 0), PyArray_DIM(ap2, 0), PyArray_DIM(ap1, 1),
              ap1, lda, ap2, ldb, ret);
     }
-    NPY_END_ALLOW_THREADS
-    Py_DECREF(ap1);
-    Py_DECREF(ap2);
-    return PyArray_Return(ret);
-
- fail:
-    Py_XDECREF(ap1);
-    Py_XDECREF(ap2);
-    Py_XDECREF(ret);
-    return NULL;
-}
-
-#if 0
-
-/*
- * vdot(a,b)
- *
- * Returns the dot product of a and b for scalars and vectors of
- * floating point and complex types.  The first argument, a, is conjugated.
- */
-static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
-    PyObject *op1, *op2;
-    PyArrayObject *ap1 = NULL, *ap2  = NULL, *ret = NULL;
-    int l;
-    int typenum;
-    npy_intp dimensions[NPY_MAXDIMS];
-    PyArray_Descr *type;
-
-    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
-        return NULL;
-    }
-
-    /*
-     * Conjugating dot product using the BLAS for vectors.
-     * Multiplies op1 and op2, each of which must be vector.
-     */
-    typenum = PyArray_ObjectType(op1, 0);
-    typenum = PyArray_ObjectType(op2, typenum);
-
-    type = PyArray_DescrFromType(typenum);
-    Py_INCREF(type);
-    ap1 = (PyArrayObject *)PyArray_FromAny(op1, type, 0, 0, 0, NULL);
-    if (ap1 == NULL) {
-        Py_DECREF(type);
-        goto fail;
-    }
-    op1 = PyArray_Flatten(ap1, 0);
-    if (op1 == NULL) {
-        Py_DECREF(type);
-        goto fail;
-    }
-    Py_DECREF(ap1);
-    ap1 = (PyArrayObject *)op1;
-
-    ap2 = (PyArrayObject *)PyArray_FromAny(op2, type, 0, 0, 0, NULL);
-    if (ap2 == NULL) {
-        goto fail;
-    }
-    op2 = PyArray_Flatten(ap2, 0);
-    if (op2 == NULL) {
-        goto fail;
-    }
-    Py_DECREF(ap2);
-    ap2 = (PyArrayObject *)op2;
-
-    if (typenum != NPY_FLOAT && typenum != NPY_DOUBLE &&
-        typenum != NPY_CFLOAT && typenum != NPY_CDOUBLE) {
-        if (PyTypeNum_ISCOMPLEX(typenum)) {
-            op1 = PyArray_Conjugate(ap1, NULL);
-            if (op1 == NULL) {
-                goto fail;
-            }
-            Py_DECREF(ap1);
-            ap1 = (PyArrayObject *)op1;
-        }
-        ret = (PyArrayObject *)PyArray_InnerProduct((PyObject *)ap1,
-                                                    (PyObject *)ap2);
-        Py_DECREF(ap1);
-        Py_DECREF(ap2);
-        return PyArray_Return(ret);
-    }
-
-    if (PyArray_DIM(ap2, 0) != PyArray_DIM(ap1, PyArray_NDIM(ap1)-1)) {
-        PyErr_SetString(PyExc_ValueError, "vectors have different lengths");
-        goto fail;
-    }
-    l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
-
-    ret = (PyArrayObject *)PyArray_SimpleNew(0, dimensions, typenum);
-    if (ret == NULL) {
-        goto fail;
-    }
-
-    NPY_BEGIN_ALLOW_THREADS;
-
-    /* Dot product between two vectors -- Level 1 BLAS */
-    if (typenum == NPY_DOUBLE || typenum == NPY_FLOAT) {
-        blas_dot(typenum, l, PyArray_DATA(ap1), PyArray_ITEMSIZE(ap1),
-                 PyArray_DATA(ap2), PyArray_ITEMSIZE(ap2), PyArray_DATA(ret));
-    }
-    else if (typenum == NPY_CDOUBLE) {
-        cblas_zdotc_sub(l, (double *)PyArray_DATA(ap1), 1,
-                        (double *)PyArray_DATA(ap2), 1, (double *)PyArray_DATA(ret));
-    }
-    else if (typenum == NPY_CFLOAT) {
-        cblas_cdotc_sub(l, (float *)PyArray_DATA(ap1), 1,
-                        (float *)PyArray_DATA(ap2), 1, (float *)PyArray_DATA(ret));
-    }
-
     NPY_END_ALLOW_THREADS;
 
     Py_DECREF(ap1);
@@ -921,4 +812,3 @@ static PyObject *dotblas_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) {
     Py_XDECREF(ret);
     return NULL;
 }
-#endif
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 6b49d6b4cf5c..37840ec655aa 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -3,6 +3,7 @@
 #include <numpy/npy_common.h>
 #include <numpy/npy_cpu.h>
 #include <numpy/ndarraytypes.h>
+#include <limits.h>
 
 #define error_converting(x)  (((x) == -1) && PyErr_Occurred())
 
@@ -208,6 +209,35 @@ _is_basic_python_type(PyObject * obj)
     return 0;
 }
 
+/*
+ * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
+ * (BLAS won't handle negative or zero strides the way we want).
+ */
+static NPY_INLINE int
+blas_stride(npy_intp stride, unsigned itemsize)
+{
+    /*
+     * Should probably check pointer alignment also, but this may cause
+     * problems if we require complex to be 16 byte aligned.
+     */
+    if (stride > 0 && npy_is_aligned((void *)stride, itemsize)) {
+        stride /= itemsize;
+        if (stride <= INT_MAX) {
+            return stride;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Define a chunksize for CBLAS. CBLAS counts in integers.
+ */
+#if NPY_MAX_INTP > INT_MAX
+# define NPY_CBLAS_CHUNK  (INT_MAX / 2 + 1)
+#else
+# define NPY_CBLAS_CHUNK  NPY_MAX_INTP
+#endif
+
 #include "ucsnarrow.h"
 
 #endif
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index a1c1cc11a9ca..2b3c8ec1985f 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -58,6 +58,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "scalarmathmodule.h" /* for npy_mul_with_overflow_intp */
 #include "multiarraymodule.h"
 #include "cblasfuncs.h"
+#include "vdot.h"
 
 /* Only here for API compatibility */
 NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
@@ -893,7 +894,8 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
     }
     is1 = PyArray_STRIDES(ap1)[PyArray_NDIM(ap1) - 1];
     is2 = PyArray_STRIDES(ap2)[PyArray_NDIM(ap2) - 1];
-    op = PyArray_DATA(ret); os = PyArray_DESCR(ret)->elsize;
+    op = PyArray_DATA(ret);
+    os = PyArray_DESCR(ret)->elsize;
     axis = PyArray_NDIM(ap1) - 1;
     it1 = (PyArrayIterObject *) PyArray_IterAllButAxis((PyObject *)ap1, &axis);
     axis = PyArray_NDIM(ap2) - 1;
@@ -1970,8 +1972,9 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
                 if (tmpobj == NULL) {
                     /* More informative error message */
                     PyErr_SetString(PyExc_ValueError,
-                                    ("Failed to encode Numpy scalar data string to latin1. "
-                                     "pickle.load(a, encoding='latin1') is assumed if unpickling."));
+                            "Failed to encode Numpy scalar data string to "
+                            "latin1,\npickle.load(a, encoding='latin1') is "
+                            "assumed if unpickling.");
                     return NULL;
                 }
             }
@@ -2256,6 +2259,116 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
     return PyArray_Return(ret);
 }
 
+
+static PyObject *
+array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+    int typenum;
+    char *ip1, *ip2, *op;
+    npy_intp n, stride;
+    PyObject *op1, *op2;
+    PyArrayObject *ap1 = NULL, *ap2  = NULL, *ret = NULL;
+    PyArray_Descr *type;
+    PyArray_DotFunc *vdot;
+    NPY_BEGIN_THREADS_DEF;
+
+    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) {
+        return NULL;
+    }
+
+    /*
+     * Conjugating dot product using the BLAS for vectors.
+     * Flattens both op1 and op2 before dotting.
+     */
+    typenum = PyArray_ObjectType(op1, 0);
+    typenum = PyArray_ObjectType(op2, typenum);
+
+    type = PyArray_DescrFromType(typenum);
+    Py_INCREF(type);
+    ap1 = (PyArrayObject *)PyArray_FromAny(op1, type, 0, 0, 0, NULL);
+    if (ap1 == NULL) {
+        Py_DECREF(type);
+        goto fail;
+    }
+    op1 = PyArray_Ravel(ap1, NPY_CORDER);
+    if (op1 == NULL) {
+        Py_DECREF(type);
+        goto fail;
+    }
+    Py_DECREF(ap1);
+    ap1 = (PyArrayObject *)op1;
+
+    ap2 = (PyArrayObject *)PyArray_FromAny(op2, type, 0, 0, 0, NULL);
+    if (ap2 == NULL) {
+        goto fail;
+    }
+    op2 = PyArray_Ravel(ap2, NPY_CORDER);
+    if (op2 == NULL) {
+        goto fail;
+    }
+    Py_DECREF(ap2);
+    ap2 = (PyArrayObject *)op2;
+
+    if (PyArray_DIM(ap2, 0) != PyArray_DIM(ap1, 0)) {
+        PyErr_SetString(PyExc_ValueError,
+                "vectors have different lengths");
+        goto fail;
+    }
+
+    /* array scalar output */
+    ret = new_array_for_sum(ap1, ap2, NULL, 0, (npy_intp *)NULL, typenum);
+    if (ret == NULL) {
+        goto fail;
+    }
+
+    n = PyArray_DIM(ap1, 0);
+    stride = type->elsize;
+    ip1 = PyArray_DATA(ap1);
+    ip2 = PyArray_DATA(ap2);
+    op = PyArray_DATA(ret);
+
+    switch (typenum) {
+        case NPY_CFLOAT:
+            vdot = (PyArray_DotFunc *)CFLOAT_vdot;
+            break;
+        case NPY_CDOUBLE:
+            vdot = (PyArray_DotFunc *)CDOUBLE_vdot;
+            break;
+        case NPY_CLONGDOUBLE:
+            vdot = (PyArray_DotFunc *)CLONGDOUBLE_vdot;
+            break;
+        case NPY_OBJECT:
+            vdot = (PyArray_DotFunc *)OBJECT_vdot;
+            break;
+        default:
+            vdot = type->f->dotfunc;
+            if (vdot == NULL) {
+                PyErr_SetString(PyExc_ValueError,
+                        "function not available for this data type");
+                goto fail;
+            }
+    }
+
+    if (n < 500) {
+        vdot(ip1, stride, ip2, stride, op, n, NULL);
+    }
+    else {
+        NPY_BEGIN_THREADS_DESCR(type);
+        vdot(ip1, stride, ip2, stride, op, n, NULL);
+        NPY_END_THREADS_DESCR(type);
+    }
+
+    Py_XDECREF(ap1);
+    Py_XDECREF(ap2);
+    return PyArray_Return(ret);
+fail:
+    Py_XDECREF(ap1);
+    Py_XDECREF(ap2);
+    Py_XDECREF(ret);
+    return NULL;
+}
+
+
 static int
 einsum_sub_op_from_str(PyObject *args, PyObject **str_obj, char **subscripts,
                        PyArrayObject **op)
@@ -3823,6 +3936,9 @@ static struct PyMethodDef array_module_methods[] = {
     {"dot",
         (PyCFunction)array_matrixproduct,
         METH_VARARGS | METH_KEYWORDS, NULL},
+    {"vdot",
+        (PyCFunction)array_vdot,
+        METH_VARARGS | METH_KEYWORDS, NULL},
     {"einsum",
         (PyCFunction)array_einsum,
         METH_VARARGS|METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/multiarray/multiarraymodule_onefile.c b/numpy/core/src/multiarray/multiarraymodule_onefile.c
index 010b88ce1def..04fef61ce0f3 100644
--- a/numpy/core/src/multiarray/multiarraymodule_onefile.c
+++ b/numpy/core/src/multiarray/multiarraymodule_onefile.c
@@ -15,6 +15,7 @@
 #include "datetime_busday.c"
 #include "datetime_busdaycal.c"
 #include "arraytypes.c"
+#include "vdot.c"
 
 #include "hashdescr.c"
 #include "numpyos.c"
diff --git a/numpy/core/src/multiarray/vdot.c b/numpy/core/src/multiarray/vdot.c
new file mode 100644
index 000000000000..0b80b6ccaa7c
--- /dev/null
+++ b/numpy/core/src/multiarray/vdot.c
@@ -0,0 +1,183 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include <Python.h>
+#include "common.h"
+#include "vdot.h"
+
+#if defined(HAVE_CBLAS)
+#include <cblas.h>
+#endif
+
+
+/*
+ * All data is assumed aligned.
+ */
+NPY_NO_EXPORT void
+CFLOAT_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+            char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    int is1b = blas_stride(is1, sizeof(npy_cfloat));
+    int is2b = blas_stride(is2, sizeof(npy_cfloat));
+
+    if (is1b && is2b) {
+        double sum[2] = {0., 0.};  /* double for stability */
+
+        while (n > 0) {
+            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            float tmp[2];
+
+            cblas_cdotc_sub((int)n, ip1, is1b, ip2, is2b, tmp);
+            sum[0] += (double)tmp[0];
+            sum[1] += (double)tmp[1];
+            /* use char strides here */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        ((float *)op)[0] = (float)sum[0];
+        ((float *)op)[1] = (float)sum[1];
+    }
+    else
+#endif
+    {
+        float sumr = (float)0.0;
+        float sumi = (float)0.0;
+        npy_intp i;
+
+        for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+            const float ip1r = ((float *)ip1)[0];
+            const float ip1i = ((float *)ip1)[1];
+            const float ip2r = ((float *)ip2)[0];
+            const float ip2i = ((float *)ip2)[1];
+
+            sumr += ip1r * ip2r + ip1i * ip2i;
+            sumi += ip1r * ip2i - ip1i * ip2r;
+        }
+        ((float *)op)[0] = sumr;
+        ((float *)op)[1] = sumi;
+    }
+}
+
+
+/*
+ * All data is assumed aligned.
+ */
+NPY_NO_EXPORT void
+CDOUBLE_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+             char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    int is1b = blas_stride(is1, sizeof(npy_cdouble));
+    int is2b = blas_stride(is2, sizeof(npy_cdouble));
+
+    if (is1b && is2b) {
+        double sum[2] = {0., 0.};  /* double for stability */
+
+        while (n > 0) {
+            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            double tmp[2];
+
+            cblas_zdotc_sub((int)n, ip1, is1b, ip2, is2b, tmp);
+            sum[0] += (double)tmp[0];
+            sum[1] += (double)tmp[1];
+            /* use char strides here */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        ((double *)op)[0] = (double)sum[0];
+        ((double *)op)[1] = (double)sum[1];
+    }
+    else
+#endif
+    {
+        double sumr = (double)0.0;
+        double sumi = (double)0.0;
+        npy_intp i;
+
+        for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+            const double ip1r = ((double *)ip1)[0];
+            const double ip1i = ((double *)ip1)[1];
+            const double ip2r = ((double *)ip2)[0];
+            const double ip2i = ((double *)ip2)[1];
+
+            sumr += ip1r * ip2r + ip1i * ip2i;
+            sumi += ip1r * ip2i - ip1i * ip2r;
+        }
+        ((double *)op)[0] = sumr;
+        ((double *)op)[1] = sumi;
+    }
+}
+
+
+/*
+ * All data is assumed aligned.
+ */
+NPY_NO_EXPORT void
+CLONGDOUBLE_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+                 char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+    npy_longdouble tmpr = 0.0L;
+    npy_longdouble tmpi = 0.0L;
+    npy_intp i;
+
+    for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+        const npy_longdouble ip1r = ((npy_longdouble *)ip1)[0];
+        const npy_longdouble ip1i = ((npy_longdouble *)ip1)[1];
+        const npy_longdouble ip2r = ((npy_longdouble *)ip2)[0];
+        const npy_longdouble ip2i = ((npy_longdouble *)ip2)[1];
+
+        tmpr += ip1r * ip2r + ip1i * ip2i;
+        tmpi += ip1r * ip2i - ip1i * ip2r;
+    }
+    ((npy_longdouble *)op)[0] = tmpr;
+    ((npy_longdouble *)op)[1] = tmpi;
+}
+
+/*
+ * All data is assumed aligned.
+ */
+NPY_NO_EXPORT void
+OBJECT_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+            void *NPY_UNUSED(ignore))
+{
+    npy_intp i;
+    PyObject *tmp0, *tmp1, *tmp2, *tmp = NULL;
+    PyObject **tmp3;
+    for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+        if ((*((PyObject **)ip1) == NULL) || (*((PyObject **)ip2) == NULL)) {
+            tmp1 = Py_False;
+            Py_INCREF(Py_False);
+        }
+        else {
+            tmp0 = PyObject_CallMethod(*((PyObject **)ip1), "conjugate", NULL);
+            if (tmp0 == NULL) {
+                Py_XDECREF(tmp);
+                return;
+            }
+            tmp1 = PyNumber_Multiply(tmp0, *((PyObject **)ip2));
+            Py_DECREF(tmp0);
+            if (tmp1 == NULL) {
+                Py_XDECREF(tmp);
+                return;
+            }
+        }
+        if (i == 0) {
+            tmp = tmp1;
+        }
+        else {
+            tmp2 = PyNumber_Add(tmp, tmp1);
+            Py_XDECREF(tmp);
+            Py_XDECREF(tmp1);
+            if (tmp2 == NULL) {
+                return;
+            }
+            tmp = tmp2;
+        }
+    }
+    tmp3 = (PyObject**) op;
+    tmp2 = *tmp3;
+    *((PyObject **)op) = tmp;
+    Py_XDECREF(tmp2);
+}
diff --git a/numpy/core/src/multiarray/vdot.h b/numpy/core/src/multiarray/vdot.h
new file mode 100644
index 000000000000..0f60ca6d19a5
--- /dev/null
+++ b/numpy/core/src/multiarray/vdot.h
@@ -0,0 +1,18 @@
+#ifndef _NPY_VDOT_H_
+#define _NPY_VDOT_H_
+
+#include "common.h"
+
+NPY_NO_EXPORT void
+CFLOAT_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CDOUBLE_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+OBJECT_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+#endif
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index 9a177c2fd408..87b9abaa7cb1 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -18,16 +18,6 @@ def test_vecself(self):
         p = inner_(a, a)
         assert_almost_equal(p, 0, decimal = DECPREC)
 
-try:
-    import numpy.core._dotblas as _dotblas
-except ImportError:
-    _dotblas = None
-
-@dec.skipif(_dotblas is None, "Numpy is not compiled with _dotblas")
-def test_blasdot_used():
-    from numpy.core import vdot, inner
-    assert_(vdot is _dotblas.vdot, "vdot")
-
 
 def test_dot_2args():
     from numpy.core import dot

From fad1377e3de77fa26574336e7985888e040a51e3 Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Fri, 22 Aug 2014 12:54:21 -0600
Subject: [PATCH 10/19] DOC: Update docs to reflect deprecation of alterdot and
 restoredot.

Also move docstrings into the versions in numpy/core/numeric.py as
the functions are no longer in the defunct _dotblas module.
---
 doc/release/1.10.0-notes.rst | 12 ++++++++++
 numpy/add_newdocs.py         | 37 ------------------------------
 numpy/core/numeric.py        | 44 ++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/doc/release/1.10.0-notes.rst b/doc/release/1.10.0-notes.rst
index 3f52bd5af4d2..386edac45a95 100644
--- a/doc/release/1.10.0-notes.rst
+++ b/doc/release/1.10.0-notes.rst
@@ -11,6 +11,7 @@ Highlights
 Dropped Support
 ===============
 * The polytemplate.py file has been removed.
+* The _dotblas module is no longer available.
 
 
 Future Changes
@@ -33,6 +34,17 @@ Improvements
 Changes
 =======
 
+dotblas functionality moved to multiarray
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The cblas versions of dot, inner, and vdot have been integrated into
+the multiarray module. In particular, vdot is now a multiarray function,
+which it was not before.
+
 
 Deprecations
 ============
+
+alterdot, restoredot Deprecated
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The alterdot and restoredot functions no longer do anything, and are
+deprecated.
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 09311a5364d4..6f3a10de4b0c 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -2174,43 +2174,6 @@ def luf(lamdaexpr, *args, **kwargs):
 
     """)
 
-add_newdoc('numpy.core', 'alterdot',
-    """
-    Change `dot`, `vdot`, and `inner` to use accelerated BLAS functions.
-
-    Typically, as a user of Numpy, you do not explicitly call this function. If
-    Numpy is built with an accelerated BLAS, this function is automatically
-    called when Numpy is imported.
-
-    When Numpy is built with an accelerated BLAS like ATLAS, these functions
-    are replaced to make use of the faster implementations.  The faster
-    implementations only affect float32, float64, complex64, and complex128
-    arrays. Furthermore, the BLAS API only includes matrix-matrix,
-    matrix-vector, and vector-vector products. Products of arrays with larger
-    dimensionalities use the built in functions and are not accelerated.
-
-    See Also
-    --------
-    restoredot : `restoredot` undoes the effects of `alterdot`.
-
-    """)
-
-add_newdoc('numpy.core', 'restoredot',
-    """
-    Restore `dot`, `vdot`, and `innerproduct` to the default non-BLAS
-    implementations.
-
-    Typically, the user will only need to call this when troubleshooting and
-    installation problem, reproducing the conditions of a build without an
-    accelerated BLAS, or when being very careful about benchmarking linear
-    algebra operations.
-
-    See Also
-    --------
-    alterdot : `restoredot` undoes the effects of `alterdot`.
-
-    """)
-
 add_newdoc('numpy.core', 'vdot',
     """
     vdot(a, b)
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 72b8e1b68ff5..5751c5c2fdaf 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -1082,10 +1082,54 @@ def outer(a, b, out=None):
 vdot = multiarray.vdot
 
 def alterdot():
+    """
+    Change `dot`, `vdot`, and `inner` to use accelerated BLAS functions.
+
+    Typically, as a user of Numpy, you do not explicitly call this
+    function. If Numpy is built with an accelerated BLAS, this function is
+    automatically called when Numpy is imported.
+
+    When Numpy is built with an accelerated BLAS like ATLAS, these
+    functions are replaced to make use of the faster implementations.  The
+    faster implementations only affect float32, float64, complex64, and
+    complex128 arrays. Furthermore, the BLAS API only includes
+    matrix-matrix, matrix-vector, and vector-vector products. Products of
+    arrays with larger dimensionalities use the built in functions and are
+    not accelerated.
+
+    .. note:: Deprecated in Numpy 1.10
+              The cblas functions have been integrated into the multarray
+              module and alterdot now longer does anything. It will be
+              removed in Numpy 1.11.0.
+
+    See Also
+    --------
+    restoredot : `restoredot` undoes the effects of `alterdot`.
+
+    """
     warnings.warn("alterdot no longer does anything.", DeprecationWarning)
 
 
 def restoredot():
+    """
+    Restore `dot`, `vdot`, and `innerproduct` to the default non-BLAS
+    implementations.
+
+    Typically, the user will only need to call this when troubleshooting
+    and installation problem, reproducing the conditions of a build without
+    an accelerated BLAS, or when being very careful about benchmarking
+    linear algebra operations.
+
+    .. note:: Deprecated in Numpy 1.10
+              The cblas functions have been integrated into the multarray
+              module and restoredot now longer does anything. It will be
+              removed in Numpy 1.11.0.
+
+    See Also
+    --------
+    alterdot : `restoredot` undoes the effects of `alterdot`.
+
+    """
     warnings.warn("restoredot no longer does anything.", DeprecationWarning)
 
 

From f6ab313b57a851391faf45d0b6cd1038f361bbce Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Fri, 22 Aug 2014 13:55:12 -0600
Subject: [PATCH 11/19] TST: Add vdot tests, move tests from test_blasdot to
 test_multiarray.

Not all tests from test_blasdot were relevant, as they checked the cblas
implementations against the non-cblas implementations. Those two are no
longer separate.

Remove test_blasdot.py as there nothing is left in it.
---
 numpy/core/tests/test_blasdot.py    | 157 ----------------------------
 numpy/core/tests/test_multiarray.py |  75 ++++++++++++-
 2 files changed, 74 insertions(+), 158 deletions(-)
 delete mode 100644 numpy/core/tests/test_blasdot.py

diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
deleted file mode 100644
index 87b9abaa7cb1..000000000000
--- a/numpy/core/tests/test_blasdot.py
+++ /dev/null
@@ -1,157 +0,0 @@
-from __future__ import division, absolute_import, print_function
-
-import numpy as np
-import sys
-from numpy.core import zeros, float64
-from numpy.testing import dec, TestCase, assert_almost_equal, assert_, \
-     assert_raises, assert_array_equal, assert_allclose, assert_equal
-from numpy.core.multiarray import inner as inner_
-
-DECPREC = 14
-
-class TestInner(TestCase):
-    def test_vecself(self):
-        """Ticket 844."""
-        # Inner product of a vector with itself segfaults or give meaningless
-        # result
-        a = zeros(shape = (1, 80), dtype = float64)
-        p = inner_(a, a)
-        assert_almost_equal(p, 0, decimal = DECPREC)
-
-
-def test_dot_2args():
-    from numpy.core import dot
-
-    a = np.array([[1, 2], [3, 4]], dtype=float)
-    b = np.array([[1, 0], [1, 1]], dtype=float)
-    c = np.array([[3, 2], [7, 4]], dtype=float)
-
-    d = dot(a, b)
-    assert_allclose(c, d)
-
-def test_dot_3args():
-    np.random.seed(22)
-    f = np.random.random_sample((1024, 16))
-    v = np.random.random_sample((16, 32))
-
-    r = np.empty((1024, 32))
-    for i in range(12):
-        np.dot(f, v, r)
-    assert_equal(sys.getrefcount(r), 2)
-    r2 = np.dot(f, v, out=None)
-    assert_array_equal(r2, r)
-    assert_(r is np.dot(f, v, out=r))
-
-    v = v[:, 0].copy() # v.shape == (16,)
-    r = r[:, 0].copy() # r.shape == (1024,)
-    r2 = np.dot(f, v)
-    assert_(r is np.dot(f, v, r))
-    assert_array_equal(r2, r)
-
-def test_dot_3args_errors():
-    np.random.seed(22)
-    f = np.random.random_sample((1024, 16))
-    v = np.random.random_sample((16, 32))
-
-    r = np.empty((1024, 31))
-    assert_raises(ValueError, np.dot, f, v, r)
-
-    r = np.empty((1024,))
-    assert_raises(ValueError, np.dot, f, v, r)
-
-    r = np.empty((32,))
-    assert_raises(ValueError, np.dot, f, v, r)
-
-    r = np.empty((32, 1024))
-    assert_raises(ValueError, np.dot, f, v, r)
-    assert_raises(ValueError, np.dot, f, v, r.T)
-
-    r = np.empty((1024, 64))
-    assert_raises(ValueError, np.dot, f, v, r[:, ::2])
-    assert_raises(ValueError, np.dot, f, v, r[:, :32])
-
-    r = np.empty((1024, 32), dtype=np.float32)
-    assert_raises(ValueError, np.dot, f, v, r)
-
-    r = np.empty((1024, 32), dtype=int)
-    assert_raises(ValueError, np.dot, f, v, r)
-
-def test_dot_array_order():
-    """ Test numpy dot with different order C, F
-
-    Comparing results with multiarray dot.
-    Double and single precisions array are compared using relative
-    precision of 7 and 5 decimals respectively.
-    Use 30 decimal when comparing exact operations like:
-        (a.b)' = b'.a'
-    """
-    _dot = np.core.multiarray.dot
-    a_dim, b_dim, c_dim = 10, 4, 7
-    orders = ["C", "F"]
-    dtypes_prec = {np.float64: 7, np.float32: 5}
-    np.random.seed(7)
-
-    for arr_type, prec in dtypes_prec.items():
-        for a_order in orders:
-            a = np.asarray(np.random.randn(a_dim, a_dim),
-                dtype=arr_type, order=a_order)
-            assert_array_equal(np.dot(a, a), a.dot(a))
-            # (a.a)' = a'.a', note that mse~=1e-31 needs almost_equal
-            assert_almost_equal(a.dot(a), a.T.dot(a.T).T, decimal=prec)
-
-            #
-            # Check with making explicit copy
-            #
-            a_T = a.T.copy(order=a_order)
-            assert_almost_equal(a_T.dot(a_T), a.T.dot(a.T), decimal=prec)
-            assert_almost_equal(a.dot(a_T), a.dot(a.T), decimal=prec)
-            assert_almost_equal(a_T.dot(a), a.T.dot(a), decimal=prec)
-
-            #
-            # Compare with multiarray dot
-            #
-            assert_almost_equal(a.dot(a), _dot(a, a), decimal=prec)
-            assert_almost_equal(a.T.dot(a), _dot(a.T, a), decimal=prec)
-            assert_almost_equal(a.dot(a.T), _dot(a, a.T), decimal=prec)
-            assert_almost_equal(a.T.dot(a.T), _dot(a.T, a.T), decimal=prec)
-            for res in a.dot(a), a.T.dot(a), a.dot(a.T), a.T.dot(a.T):
-                assert res.flags.c_contiguous
-
-            for b_order in orders:
-                b = np.asarray(np.random.randn(a_dim, b_dim),
-                    dtype=arr_type, order=b_order)
-                b_T = b.T.copy(order=b_order)
-                assert_almost_equal(a_T.dot(b), a.T.dot(b), decimal=prec)
-                assert_almost_equal(b_T.dot(a), b.T.dot(a), decimal=prec)
-                # (b'.a)' = a'.b
-                assert_almost_equal(b.T.dot(a), a.T.dot(b).T, decimal=prec)
-                assert_almost_equal(a.dot(b), _dot(a, b), decimal=prec)
-                assert_almost_equal(b.T.dot(a), _dot(b.T, a), decimal=prec)
-
-
-                for c_order in orders:
-                    c = np.asarray(np.random.randn(b_dim, c_dim),
-                        dtype=arr_type, order=c_order)
-                    c_T = c.T.copy(order=c_order)
-                    assert_almost_equal(c.T.dot(b.T), c_T.dot(b_T), decimal=prec)
-                    assert_almost_equal(c.T.dot(b.T).T, b.dot(c), decimal=prec)
-                    assert_almost_equal(b.dot(c), _dot(b, c), decimal=prec)
-                    assert_almost_equal(c.T.dot(b.T), _dot(c.T, b.T), decimal=prec)
-
-def test_dot_override():
-    class A(object):
-        def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
-            return "A"
-
-    class B(object):
-        def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
-            return NotImplemented
-
-    a = A()
-    b = B()
-    c = np.array([[1]])
-
-    assert_equal(np.dot(a, b), "A")
-    assert_equal(c.dot(a), "A")
-    assert_raises(TypeError, np.dot, b, c)
-    assert_raises(TypeError, c.dot, b)
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 70398ee84fce..5a2fafbfde37 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -3349,6 +3349,44 @@ def __array_finalize__(self, obj):
         res = dat.var(1)
         assert_(res.info == dat.info)
 
+class TestVdot(TestCase):
+    def test_basic(self):
+        dt_numeric = np.typecodes['AllFloat'] + np.typecodes['AllInteger']
+        dt_complex = np.typecodes['Complex']
+
+        # test real
+        a = np.eye(3)
+        for dt in dt_numeric + 'O':
+            b = a.astype(dt)
+            res = np.vdot(b, b)
+            assert_(np.isscalar(res))
+            assert_equal(np.vdot(b, b), 3)
+
+        # test complex
+        a = np.eye(3) * 1j
+        for dt in dt_complex + 'O':
+            b = a.astype(dt)
+            res = np.vdot(b, b)
+            assert_(np.isscalar(res))
+            assert_equal(np.vdot(b, b), 3)
+
+        # test boolean
+        b = np.eye(3, dtype=np.bool)
+        res = np.vdot(b, b)
+        assert_(np.isscalar(res))
+        assert_equal(np.vdot(b, b), True)
+
+    def test_vdot_array_order(self):
+        a = array([[1, 2], [3, 4]], order='C')
+        b = array([[1, 2], [3, 4]], order='F')
+        res = np.vdot(a, a)
+
+        # integer arrays are exact
+        assert_equal(np.vdot(a, b), res)
+        assert_equal(np.vdot(b, a), res)
+        assert_equal(np.vdot(b, b), res)
+
+
 class TestDot(TestCase):
     def test_dot_2args(self):
         from numpy.core.multiarray import dot
@@ -3411,6 +3449,16 @@ def test_dot_3args_errors(self):
         r = np.empty((1024, 32), dtype=int)
         assert_raises(ValueError, dot, f, v, r)
 
+    def test_dot_array_order(self):
+        a = array([[1, 2], [3, 4]], order='C')
+        b = array([[1, 2], [3, 4]], order='F')
+        res = np.dot(a, a)
+
+        # integer arrays are exact
+        assert_equal(np.dot(a, b), res)
+        assert_equal(np.dot(b, a), res)
+        assert_equal(np.dot(b, b), res)
+
     def test_dot_scalar_and_matrix_of_objects(self):
         # Ticket #2469
         arr = np.matrix([1, 2], dtype=object)
@@ -3418,6 +3466,24 @@ def test_dot_scalar_and_matrix_of_objects(self):
         assert_equal(np.dot(arr, 3), desired)
         assert_equal(np.dot(3, arr), desired)
 
+    def test_dot_override(self):
+        class A(object):
+            def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
+                return "A"
+
+        class B(object):
+            def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
+                return NotImplemented
+
+        a = A()
+        b = B()
+        c = np.array([[1]])
+
+        assert_equal(np.dot(a, b), "A")
+        assert_equal(c.dot(a), "A")
+        assert_raises(TypeError, np.dot, b, c)
+        assert_raises(TypeError, c.dot, b)
+
 
 class TestInner(TestCase):
 
@@ -3428,6 +3494,14 @@ def test_inner_scalar_and_matrix_of_objects(self):
         assert_equal(np.inner(arr, 3), desired)
         assert_equal(np.inner(3, arr), desired)
 
+    def test_vecself(self):
+        # Ticket 844.
+        # Inner product of a vector with itself segfaults or give
+        # meaningless result
+        a = zeros(shape = (1, 80), dtype = float64)
+        p = inner(a, a)
+        assert_almost_equal(p, 0, decimal=14)
+
 
 class TestSummarization(TestCase):
     def test_1d(self):
@@ -3448,7 +3522,6 @@ def test_2d(self):
                 '       [ 501,  502,  503, ...,  999, 1000, 1001]])'
         assert_(repr(A) == reprA)
 
-
 class TestChoose(TestCase):
     def setUp(self):
         self.x = 2*ones((3,), dtype=int)

From d8af083f3873da3fa041f20d1b981709cc61d972 Mon Sep 17 00:00:00 2001
From: Lars Buitinck <larsmans@gmail.com>
Date: Sat, 23 Aug 2014 18:22:22 +0200
Subject: [PATCH 12/19] ENH: np.dot: better "matrices not aligned" message

---
 numpy/core/blasdot/_dotblas.c                | 10 +++++++---
 numpy/core/src/multiarray/common.h           | 12 ++++++++++++
 numpy/core/src/multiarray/multiarraymodule.c |  6 ++++--
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index 48aa39ff87df..a16cbe5d0b91 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -9,6 +9,7 @@
 #include "numpy/arrayobject.h"
 #include "npy_config.h"
 #include "npy_pycompat.h"
+#include "common.h"
 #include "ufunc_override.h"
 #ifndef CBLAS_HEADER
 #define CBLAS_HEADER "cblas.h"
@@ -529,7 +530,8 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
             l = PyArray_DIM(oap1, PyArray_NDIM(oap1) - 1);
 
             if (PyArray_DIM(oap2, 0) != l) {
-                PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+                not_aligned(PyArray_NDIM(oap1) - 1, 0,
+                            l, PyArray_DIM(oap2, 0));
                 goto fail;
             }
             nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
@@ -579,7 +581,8 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         l = PyArray_DIM(ap1, PyArray_NDIM(ap1) - 1);
 
         if (PyArray_DIM(ap2, 0) != l) {
-            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+            not_aligned(PyArray_NDIM(ap1) - 1, 0,
+                        l, PyArray_DIM(ap2, 0));
             goto fail;
         }
         nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
@@ -1007,7 +1010,8 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
         l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
 
         if (PyArray_DIM(ap2, PyArray_NDIM(ap2)-1) != l) {
-            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+            not_aligned(PyArray_NDIM(ap1) - 1, PyArray_NDIM(ap2) - 1,
+                        l, PyArray_DIM(ap2, PyArray_NDIM(ap2) - 1));
             goto fail;
         }
         nd = PyArray_NDIM(ap1)+PyArray_NDIM(ap2)-2;
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 6b49d6b4cf5c..5edf4e6ba1ca 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -208,6 +208,18 @@ _is_basic_python_type(PyObject * obj)
     return 0;
 }
 
+/*
+ * Sets ValueError with "matrices not aligned" message for np.dot and friends
+ * when shape[i] = m doesn't match shape[j] = n.
+ */
+static NPY_INLINE void
+not_aligned(int i, int j, Py_ssize_t m, Py_ssize_t n)
+{
+    PyErr_Format(PyExc_ValueError,
+                 "matrices are not aligned: shape[%d] (%zd) != shape[%d] (%zd)",
+                 i, m, j, n);
+}
+
 #include "ucsnarrow.h"
 
 #endif
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 682705a1b8e4..a3165b67c647 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -841,7 +841,8 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
 
     l = PyArray_DIMS(ap1)[PyArray_NDIM(ap1) - 1];
     if (PyArray_DIMS(ap2)[PyArray_NDIM(ap2) - 1] != l) {
-        PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
+        not_aligned(PyArray_NDIM(ap1) - 1, PyArray_NDIM(ap2) - 1,
+                    l, PyArray_DIMS(ap2)[PyArray_NDIM(ap2) - 1]);
         goto fail;
     }
 
@@ -961,7 +962,8 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
         matchDim = 0;
     }
     if (PyArray_DIMS(ap2)[matchDim] != l) {
-        PyErr_SetString(PyExc_ValueError, "objects are not aligned");
+        not_aligned(PyArray_NDIM(ap1) - 1, matchDim,
+                    l, PyArray_DIMS(ap2)[matchDim]);
         goto fail;
     }
     nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;

From a746e3a0f6ca930d388ab8aff8304a456f3f6b7e Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Sat, 23 Aug 2014 12:23:17 -0600
Subject: [PATCH 13/19] BUG: Capitalize environmental variables in
 numpy/core/__init__.py.

---
 numpy/core/__init__.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index 976977621cda..371d34b588ee 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -3,15 +3,14 @@
 from .info import __doc__
 from numpy.version import version as __version__
 
-
-# disables openblas affinity setting of the main thread that limits
+# disables OpenBLAS affinity setting of the main thread that limits
 # python threads or processes to one core
 import os
 envbak = os.environ.copy()
-if 'openblas_main_free' not in os.environ:
-    os.environ['openblas_main_free'] = '1'
-if 'gotoblas_main_free' not in os.environ:
-    os.environ['gotoblas_main_free'] = '1'
+if 'OPENBLAS_MAIN_FREE' not in os.environ:
+    os.environ['OPENBLAS_MAIN_FREE'] = '1'
+if 'GOTOBLAS_MAIN_FREE' not in os.environ:
+    os.environ['GOTOBLAS_MAIN_FREE'] = '1'
 from . import multiarray
 os.environ.clear()
 os.environ.update(envbak)

From d07c4c72793fc120bf4cc807857567517e775c5b Mon Sep 17 00:00:00 2001
From: Lars Buitinck <larsmans@gmail.com>
Date: Sat, 23 Aug 2014 22:30:07 +0200
Subject: [PATCH 14/19] ENH: include shapes in "matrices not aligned" msg

I had to move convert_shape_to_string to common.h so that _dotblas.c
can access it.
---
 numpy/core/blasdot/_dotblas.c                |  10 +-
 numpy/core/src/multiarray/common.c           |  61 ----------
 numpy/core/src/multiarray/common.h           | 119 +++++++++++++++++--
 numpy/core/src/multiarray/multiarraymodule.c |   6 +-
 4 files changed, 117 insertions(+), 79 deletions(-)

diff --git a/numpy/core/blasdot/_dotblas.c b/numpy/core/blasdot/_dotblas.c
index a16cbe5d0b91..0679e38f8162 100644
--- a/numpy/core/blasdot/_dotblas.c
+++ b/numpy/core/blasdot/_dotblas.c
@@ -530,8 +530,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
             l = PyArray_DIM(oap1, PyArray_NDIM(oap1) - 1);
 
             if (PyArray_DIM(oap2, 0) != l) {
-                not_aligned(PyArray_NDIM(oap1) - 1, 0,
-                            l, PyArray_DIM(oap2, 0));
+                not_aligned(oap1, PyArray_NDIM(oap1) - 1, oap2, 0);
                 goto fail;
             }
             nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
@@ -581,8 +580,7 @@ dotblas_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwa
         l = PyArray_DIM(ap1, PyArray_NDIM(ap1) - 1);
 
         if (PyArray_DIM(ap2, 0) != l) {
-            not_aligned(PyArray_NDIM(ap1) - 1, 0,
-                        l, PyArray_DIM(ap2, 0));
+            not_aligned(ap1, PyArray_NDIM(ap1) - 1, ap2, 0);
             goto fail;
         }
         nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;
@@ -1010,8 +1008,8 @@ dotblas_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
         l = PyArray_DIM(ap1, PyArray_NDIM(ap1)-1);
 
         if (PyArray_DIM(ap2, PyArray_NDIM(ap2)-1) != l) {
-            not_aligned(PyArray_NDIM(ap1) - 1, PyArray_NDIM(ap2) - 1,
-                        l, PyArray_DIM(ap2, PyArray_NDIM(ap2) - 1));
+            not_aligned(ap1, PyArray_NDIM(ap1) - 1,
+                        ap2, PyArray_NDIM(ap2) - 1);
             goto fail;
         }
         nd = PyArray_NDIM(ap1)+PyArray_NDIM(ap2)-2;
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 54b9e3c1a101..02f16b9bce45 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -779,64 +779,3 @@ offset_bounds_from_strides(const int itemsize, const int nd,
     *lower_offset = lower;
     *upper_offset = upper;
 }
-
-
-/**
- * Convert an array shape to a string such as "(1, 2)".
- *
- * @param Dimensionality of the shape
- * @param npy_intp pointer to shape array
- * @param String to append after the shape `(1, 2)%s`.
- * 
- * @return Python unicode string
- */
-NPY_NO_EXPORT PyObject *
-convert_shape_to_string(npy_intp n, npy_intp *vals, char *ending)
-{
-    npy_intp i;
-    PyObject *ret, *tmp;
-
-    /*
-     * Negative dimension indicates "newaxis", which can
-     * be discarded for printing if it's a leading dimension.
-     * Find the first non-"newaxis" dimension.
-     */
-    for (i = 0; i < n && vals[i] < 0; i++);
-
-    if (i == n) {
-        return PyUString_FromFormat("()%s", ending);
-    }
-    else {
-        ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]);
-        if (ret == NULL) {
-            return NULL;
-        }
-    }
-
-    for (; i < n; ++i) {
-        if (vals[i] < 0) {
-            tmp = PyUString_FromString(",newaxis");
-        }
-        else {
-            tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]);
-        }
-        if (tmp == NULL) {
-            Py_DECREF(ret);
-            return NULL;
-        }
-
-        PyUString_ConcatAndDel(&ret, tmp);
-        if (ret == NULL) {
-            return NULL;
-        }
-    }
-
-    if (i == 1) {
-        tmp = PyUString_FromFormat(",)%s", ending);
-    }
-    else {
-        tmp = PyUString_FromFormat(")%s", ending);
-        }
-    PyUString_ConcatAndDel(&ret, tmp);
-    return ret;
-}
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 5edf4e6ba1ca..2de31e4674ae 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -3,6 +3,7 @@
 #include <numpy/npy_common.h>
 #include <numpy/npy_cpu.h>
 #include <numpy/ndarraytypes.h>
+#include <numpy/npy_3kcompat.h>
 
 #define error_converting(x)  (((x) == -1) && PyErr_Occurred())
 
@@ -69,9 +70,6 @@ offset_bounds_from_strides(const int itemsize, const int nd,
                            const npy_intp *dims, const npy_intp *strides,
                            npy_intp *lower_offset, npy_intp *upper_offset);
 
-NPY_NO_EXPORT PyObject *
-convert_shape_to_string(npy_intp n, npy_intp *vals, char *ending);
-
 
 /*
  * Returns -1 and sets an exception if *index is an invalid index for
@@ -208,16 +206,121 @@ _is_basic_python_type(PyObject * obj)
     return 0;
 }
 
+
+/**
+ * Convert an array shape to a string such as "(1, 2)".
+ *
+ * @param Dimensionality of the shape
+ * @param npy_intp pointer to shape array
+ * @param String to append after the shape `(1, 2)%s`.
+ *
+ * @return Python unicode string
+ */
+static NPY_INLINE PyObject *
+convert_shape_to_string(npy_intp n, npy_intp *vals, char *ending)
+{
+    npy_intp i;
+    PyObject *ret, *tmp;
+
+    /*
+     * Negative dimension indicates "newaxis", which can
+     * be discarded for printing if it's a leading dimension.
+     * Find the first non-"newaxis" dimension.
+     */
+    for (i = 0; i < n && vals[i] < 0; i++);
+
+    if (i == n) {
+        return PyUString_FromFormat("()%s", ending);
+    }
+    else {
+        ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]);
+        if (ret == NULL) {
+            return NULL;
+        }
+    }
+
+    for (; i < n; ++i) {
+        if (vals[i] < 0) {
+            tmp = PyUString_FromString(",newaxis");
+        }
+        else {
+            tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]);
+        }
+        if (tmp == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+
+        PyUString_ConcatAndDel(&ret, tmp);
+        if (ret == NULL) {
+            return NULL;
+        }
+    }
+
+    if (i == 1) {
+        tmp = PyUString_FromFormat(",)%s", ending);
+    }
+    else {
+        tmp = PyUString_FromFormat(")%s", ending);
+    }
+    PyUString_ConcatAndDel(&ret, tmp);
+    return ret;
+}
+
+
 /*
  * Sets ValueError with "matrices not aligned" message for np.dot and friends
- * when shape[i] = m doesn't match shape[j] = n.
+ * when a.shape[i] should match b.shape[j], but doesn't.
  */
 static NPY_INLINE void
-not_aligned(int i, int j, Py_ssize_t m, Py_ssize_t n)
+not_aligned(PyArrayObject *a, int i, PyArrayObject *b, int j)
 {
-    PyErr_Format(PyExc_ValueError,
-                 "matrices are not aligned: shape[%d] (%zd) != shape[%d] (%zd)",
-                 i, m, j, n);
+    PyObject *errmsg = NULL, *format = NULL, *fmt_args = NULL,
+             *i_obj = NULL, *j_obj = NULL,
+             *shape1 = NULL, *shape2 = NULL,
+             *shape1_i = NULL, *shape2_j = NULL;
+
+    format = PyUString_FromString("shapes %s and %s not aligned:"
+                                  " %d (dim %d) != %d (dim %d)");
+
+    shape1 = convert_shape_to_string(PyArray_NDIM(a), PyArray_DIMS(a), "");
+    shape2 = convert_shape_to_string(PyArray_NDIM(b), PyArray_DIMS(b), "");
+
+    i_obj = PyLong_FromLong(i);
+    j_obj = PyLong_FromLong(j);
+
+    shape1_i = PyLong_FromSsize_t(PyArray_DIM(a, i));
+    shape2_j = PyLong_FromSsize_t(PyArray_DIM(b, j));
+
+    if (!format || !shape1 || !shape2 || !i_obj || !j_obj ||
+            !shape1_i || !shape2_j) {
+        goto end;
+    }
+
+    fmt_args = PyTuple_Pack(6, shape1, shape2,
+                            shape1_i, i_obj, shape2_j, j_obj);
+    if (fmt_args == NULL) {
+        goto end;
+    }
+
+    errmsg = PyUString_Format(format, fmt_args);
+    if (errmsg != NULL) {
+        PyErr_SetObject(PyExc_ValueError, errmsg);
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError, "shapes are not aligned");
+    }
+
+end:
+    Py_XDECREF(errmsg);
+    Py_XDECREF(fmt_args);
+    Py_XDECREF(format);
+    Py_XDECREF(i_obj);
+    Py_XDECREF(j_obj);
+    Py_XDECREF(shape1);
+    Py_XDECREF(shape2);
+    Py_XDECREF(shape1_i);
+    Py_XDECREF(shape2_j);
 }
 
 #include "ucsnarrow.h"
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index a3165b67c647..8f5bb6d4abf9 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -841,8 +841,7 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
 
     l = PyArray_DIMS(ap1)[PyArray_NDIM(ap1) - 1];
     if (PyArray_DIMS(ap2)[PyArray_NDIM(ap2) - 1] != l) {
-        not_aligned(PyArray_NDIM(ap1) - 1, PyArray_NDIM(ap2) - 1,
-                    l, PyArray_DIMS(ap2)[PyArray_NDIM(ap2) - 1]);
+        not_aligned(ap1, PyArray_NDIM(ap1) - 1, ap2, PyArray_NDIM(ap2) - 1);
         goto fail;
     }
 
@@ -962,8 +961,7 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
         matchDim = 0;
     }
     if (PyArray_DIMS(ap2)[matchDim] != l) {
-        not_aligned(PyArray_NDIM(ap1) - 1, matchDim,
-                    l, PyArray_DIMS(ap2)[matchDim]);
+        not_aligned(ap1, PyArray_NDIM(ap1) - 1, ap2, matchDim);
         goto fail;
     }
     nd = PyArray_NDIM(ap1) + PyArray_NDIM(ap2) - 2;

From ce32d9e56c0f848da3f05147f0100f1a9761ec7e Mon Sep 17 00:00:00 2001
From: Julian Taylor <jtaylor.debian@googlemail.com>
Date: Sun, 24 Aug 2014 20:10:37 +0200
Subject: [PATCH 15/19] BLD: check for CBLAS header in "unoptimized" blas

Allows building against system installed blas library that might be
optimized.
---
 numpy/distutils/system_info.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py
index 75c2c0ea5201..ddb1513c4258 100644
--- a/numpy/distutils/system_info.py
+++ b/numpy/distutils/system_info.py
@@ -1560,9 +1560,33 @@ def calc_info(self):
         info = self.check_libs(lib_dirs, blas_libs, [])
         if info is None:
             return
-        info['language'] = 'f77'  # XXX: is it generally true?
+        if self.has_cblas():
+            info['language'] = 'c'
+            info['define_macros'] = [('HAVE_CBLAS', None)]
+        else:
+            info['language'] = 'f77'  # XXX: is it generally true?
         self.set_info(**info)
 
+    def has_cblas(self):
+        # primitive cblas check by looking for the header
+        res = False
+        c = distutils.ccompiler.new_compiler()
+        tmpdir = tempfile.mkdtemp()
+        s = """#include <cblas.h>"""
+        src = os.path.join(tmpdir, 'source.c')
+        try:
+            with open(src, 'wt') as f:
+                f.write(s)
+            try:
+                c.compile([src], output_dir=tmpdir,
+                          include_dirs=self.get_include_dirs())
+                res = True
+            except distutils.ccompiler.CompileError:
+                res = False
+        finally:
+            shutil.rmtree(tmpdir)
+        return res
+
 
 class openblas_info(blas_info):
     section = 'openblas'

From a3c70cc6c501a7554abade331f2834ca0532936a Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Mon, 25 Aug 2014 08:33:12 -0600
Subject: [PATCH 16/19] STY: Add spaces around '-'.

---
 numpy/core/src/multiarray/cblasfuncs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/core/src/multiarray/cblasfuncs.c b/numpy/core/src/multiarray/cblasfuncs.c
index 5584a6e71124..bc09ed6e8013 100644
--- a/numpy/core/src/multiarray/cblasfuncs.c
+++ b/numpy/core/src/multiarray/cblasfuncs.c
@@ -710,7 +710,7 @@ cblas_innerproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2)
          */
         l = PyArray_DIM(ap1, PyArray_NDIM(ap1) - 1);
 
-        if (PyArray_DIM(ap2, PyArray_NDIM(ap2)-1) != l) {
+        if (PyArray_DIM(ap2, PyArray_NDIM(ap2) - 1) != l) {
             dot_alignment_error(ap1, PyArray_NDIM(ap1) - 1,
                                 ap2, PyArray_NDIM(ap2) - 1);
             goto fail;

From affeaf54bd08f42c87d966f03cbdc8a896d4e90f Mon Sep 17 00:00:00 2001
From: Charles Harris <charlesr.harris@gmail.com>
Date: Fri, 29 Aug 2014 09:01:42 -0600
Subject: [PATCH 17/19] TST: Silence some warning that turns up on OpenBSD.

On OpenBSD with gcc-4.2 log1p(nan) raises an invalid value warning.
This PR disables both 'divide' and 'invalid' for the log1p tests.

Closes #5017.
---
 numpy/core/tests/test_umath.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index b3ddc239813b..72ba1d5c7554 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -333,11 +333,10 @@ def test_log1p(self):
         assert_almost_equal(ncu.log1p(1e-6), ncu.log(1+1e-6))
 
     def test_special(self):
-        assert_equal(ncu.log1p(np.nan), np.nan)
-        assert_equal(ncu.log1p(np.inf), np.inf)
-        with np.errstate(divide="ignore"):
+        with np.errstate(invalid="ignore", divide="ignore"):
+            assert_equal(ncu.log1p(np.nan), np.nan)
+            assert_equal(ncu.log1p(np.inf), np.inf)
             assert_equal(ncu.log1p(-1.), -np.inf)
-        with np.errstate(invalid="ignore"):
             assert_equal(ncu.log1p(-2.), np.nan)
             assert_equal(ncu.log1p(-np.inf), np.nan)
 

From 4097ec3ec10c41d399518867f4bebb0a53ee8a5c Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Sun, 31 Aug 2014 15:43:39 +0100
Subject: [PATCH 18/19] BUG: fix percentage reporting when
 testing.assert_allclose fails.

---
 numpy/core/numeric.py             | 62 ++++++++++++++++++-------------
 numpy/testing/tests/test_utils.py |  5 +++
 numpy/testing/utils.py            |  2 +-
 3 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index efd8af45dbd2..eb0c38b0b6fe 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -2153,6 +2153,41 @@ def identity(n, dtype=None):
     from numpy import eye
     return eye(n, dtype=dtype)
 
+def _allclose_points(a, b, rtol=1.e-5, atol=1.e-8):
+    """
+    This is the point-wise inner calculation of 'allclose', which is subtly
+    different from 'isclose'.  Provided as a comparison routine for use in
+    testing.assert_allclose.
+    See those routines for further details.
+
+    """
+    x = array(a, copy=False, ndmin=1)
+    y = array(b, copy=False, ndmin=1)
+
+    # make sure y is an inexact type to avoid abs(MIN_INT); will cause
+    # casting of x later.
+    dtype = multiarray.result_type(y, 1.)
+    y = array(y, dtype=dtype, copy=False)
+
+    xinf = isinf(x)
+    yinf = isinf(y)
+    if any(xinf) or any(yinf):
+        # Check that x and y have inf's only in the same positions
+        if not all(xinf == yinf):
+            return False
+        # Check that sign of inf's in x and y is the same
+        if not all(x[xinf] == y[xinf]):
+            return False
+
+        x = x[~xinf]
+        y = y[~xinf]
+
+    # ignore invalid fpe's
+    with errstate(invalid='ignore'):
+        r = less_equal(abs(x - y), atol + rtol * abs(y))
+
+    return r
+
 def allclose(a, b, rtol=1.e-5, atol=1.e-8):
     """
     Returns True if two arrays are element-wise equal within a tolerance.
@@ -2208,32 +2243,7 @@ def allclose(a, b, rtol=1.e-5, atol=1.e-8):
     False
 
     """
-    x = array(a, copy=False, ndmin=1)
-    y = array(b, copy=False, ndmin=1)
-
-    # make sure y is an inexact type to avoid abs(MIN_INT); will cause
-    # casting of x later.
-    dtype = multiarray.result_type(y, 1.)
-    y = array(y, dtype=dtype, copy=False)
-
-    xinf = isinf(x)
-    yinf = isinf(y)
-    if any(xinf) or any(yinf):
-        # Check that x and y have inf's only in the same positions
-        if not all(xinf == yinf):
-            return False
-        # Check that sign of inf's in x and y is the same
-        if not all(x[xinf] == y[xinf]):
-            return False
-
-        x = x[~xinf]
-        y = y[~xinf]
-
-    # ignore invalid fpe's
-    with errstate(invalid='ignore'):
-        r = all(less_equal(abs(x - y), atol + rtol * abs(y)))
-
-    return r
+    return all(_allclose_points(a, b, rtol=rtol, atol=atol))
 
 def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
     """
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index aa0a2669fd7d..c1d0e83daf3b 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -449,6 +449,11 @@ def test_min_int(self):
         # Should not raise:
         assert_allclose(a, a)
 
+    def test_report_fail_percentage(self):
+        a = np.array([1, 1, 1, 1])
+        b = np.array([1, 1, 1, 2])
+        with self.assertRaisesRegexp(AssertionError, "25.0%"):
+            assert_allclose(a, b)
 
 class TestArrayAlmostEqualNulp(unittest.TestCase):
     @dec.knownfailureif(True, "Github issue #347")
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index bd184d922041..71c7145f9c51 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -1289,7 +1289,7 @@ def assert_allclose(actual, desired, rtol=1e-7, atol=0,
     """
     import numpy as np
     def compare(x, y):
-        return np.allclose(x, y, rtol=rtol, atol=atol)
+        return np.core.numeric._allclose_points(x, y, rtol=rtol, atol=atol)
 
     actual, desired = np.asanyarray(actual), np.asanyarray(desired)
     header = 'Not equal to tolerance rtol=%g, atol=%g' % (rtol, atol)

From ea32c9057f47d634bde5370f373a88aa6bfa7478 Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Mon, 1 Sep 2014 17:22:12 +0100
Subject: [PATCH 19/19] Use more portable test methods.

---
 numpy/testing/tests/test_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index c1d0e83daf3b..5189f7e633f4 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -452,8 +452,12 @@ def test_min_int(self):
     def test_report_fail_percentage(self):
         a = np.array([1, 1, 1, 1])
         b = np.array([1, 1, 1, 2])
-        with self.assertRaisesRegexp(AssertionError, "25.0%"):
+        try:
             assert_allclose(a, b)
+            msg = ''
+        except AssertionError as exc:
+            msg = exc.args[0]
+        self.assertTrue("mismatch 25.0%" in msg)
 
 class TestArrayAlmostEqualNulp(unittest.TestCase):
     @dec.knownfailureif(True, "Github issue #347")