numpy · njsmith · Jan 17, 2016 · Dec 6, 2015 · Jan 7, 2016 · Jan 13, 2016
diff --git a/doc/release/1.11.0-notes.rst b/doc/release/1.11.0-notes.rst
@@ -204,6 +204,10 @@ extended to ``@``, ``numpy.dot``, ``numpy.inner``, and ``numpy.matmul``.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 This matches the behavior of ``assert_raises``.
 
+Speed improvement for np.random.shuffle
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+``np.random.shuffle`` is now much faster for 1d ndarrays.
+
 Changes
 =======
 Pyrex support was removed from ``numpy.distutils``.  The method

diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx
@@ -24,6 +24,8 @@
 include "Python.pxi"
 include "numpy.pxd"
 
+from libc cimport string
+
 cdef extern from "math.h":
     double exp(double x)
     double log(double x)
@@ -4979,33 +4981,52 @@ cdef class RandomState:
                [0, 1, 2]])
 
         """
-        cdef npy_intp i, j
-
-        i = len(x) - 1
-
-        # Logic adapted from random.shuffle()
-        if isinstance(x, np.ndarray) and \
-           (x.ndim > 1 or x.dtype.fields is not None):
-            # For a multi-dimensional ndarray, indexing returns a view onto
-            # each row. So we can't just use ordinary assignment to swap the
-            # rows; we need a bounce buffer.
+        cdef:
+            npy_intp i, j, n = len(x), stride, itemsize
+            char* x_ptr
+            char* buf_ptr
+
+        if type(x) is np.ndarray and x.ndim == 1 and x.size:
+            # Fast, statically typed path: shuffle the underlying buffer.
+            # Only for non-empty, 1d objects of class ndarray (subclasses such
+            # as MaskedArrays may not support this approach).
+            x_ptr = <char*><size_t>x.ctypes.data
+            stride = x.strides[0]
+            itemsize = x.dtype.itemsize
+            buf = np.empty_like(x[0])  # GC'd at function exit
+            buf_ptr = <char*><size_t>buf.ctypes.data
+            with self.lock:
+                # We trick gcc into providing a specialized implementation for
+                # the most common case, yielding a ~33% performance improvement.
+                # Note that apparently, only one branch can ever be specialized.
+                if itemsize == sizeof(npy_intp):
+                    self._shuffle_raw(n, sizeof(npy_intp), stride, x_ptr, buf_ptr)
+                else:
+                    self._shuffle_raw(n, itemsize, stride, x_ptr, buf_ptr)
+        elif isinstance(x, np.ndarray) and x.ndim > 1 and x.size:
+            # Multidimensional ndarrays require a bounce buffer.
             buf = np.empty_like(x[0])
             with self.lock:
-                while i > 0:
+                for i in reversed(range(1, n)):
                     j = rk_interval(i, self.internal_state)
                     buf[...] = x[j]
                     x[j] = x[i]
                     x[i] = buf
-                    i = i - 1
         else:
-            # For single-dimensional arrays, lists, and any other Python
-            # sequence types, indexing returns a real object that's
-            # independent of the array contents, so we can just swap directly.
+            # Untyped path.
             with self.lock:
-                while i > 0:
+                for i in reversed(range(1, n)):
                     j = rk_interval(i, self.internal_state)
                     x[i], x[j] = x[j], x[i]
-                    i = i - 1
+
+    cdef inline _shuffle_raw(self, npy_intp n, npy_intp itemsize,
+                             npy_intp stride, char* data, char* buf):
+        cdef npy_intp i, j
+        for i in reversed(range(1, n)):
+            j = rk_interval(i, self.internal_state)
+            string.memcpy(buf, data + j * stride, itemsize)
+            string.memcpy(data + j * stride, data + i * stride, itemsize)
+            string.memcpy(data + i * stride, buf, itemsize)
 
     def permutation(self, object x):
         """

diff --git a/numpy/random/tests/test_random.py b/numpy/random/tests/test_random.py
@@ -10,6 +10,7 @@
 import warnings
 
 
+
 class TestSeed(TestCase):
     def test_scalar(self):
         s = np.random.RandomState(0)
@@ -40,6 +41,7 @@ def test_invalid_array(self):
         assert_raises(ValueError, np.random.RandomState, [1, 2, 4294967296])
         assert_raises(ValueError, np.random.RandomState, [1, -2, 4294967296])
 
+
 class TestBinomial(TestCase):
     def test_n_zero(self):
         # Tests the corner case of n == 0 for the binomial distribution.
@@ -130,6 +132,7 @@ def test_negative_binomial(self):
         # arguments without truncation.
         self.prng.negative_binomial(0.5, 0.5)
 
+
 class TestRandint(TestCase):
 
     rfunc = np.random.randint
@@ -368,40 +371,41 @@ def test_bytes(self):
         np.testing.assert_equal(actual, desired)
 
     def test_shuffle(self):
-        # Test lists, arrays, and multidimensional versions of both:
-        for conv in [lambda x: x,
-                     np.asarray,
+        # Test lists, arrays (of various dtypes), and multidimensional versions
+        # of both, c-contiguous or not:
+        for conv in [lambda x: np.array([]),
+                     lambda x: x,
+                     lambda x: np.asarray(x).astype(np.int8),
+                     lambda x: np.asarray(x).astype(np.float32),
+                     lambda x: np.asarray(x).astype(np.complex64),
+                     lambda x: np.asarray(x).astype(object),
                      lambda x: [(i, i) for i in x],
-                     lambda x: np.asarray([(i, i) for i in x])]:
+                     lambda x: np.asarray([[i, i] for i in x]),
+                     lambda x: np.vstack([x, x]).T,
+                     # gh-4270
+                     lambda x: np.asarray([(i, i) for i in x],
+                                          [("a", object, 1),
+                                           ("b", np.int32, 1)])]:
             np.random.seed(self.seed)
             alist = conv([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])
             np.random.shuffle(alist)
             actual = alist
             desired = conv([0, 1, 9, 6, 2, 4, 5, 8, 7, 3])
             np.testing.assert_array_equal(actual, desired)
 
-    def test_shuffle_flexible(self):
-        # gh-4270
-        arr = [(0, 1), (2, 3)]
-        dt = np.dtype([('a', np.int32, 1), ('b', np.int32, 1)])
-        nparr = np.array(arr, dtype=dt)
-        a, b = nparr[0].copy(), nparr[1].copy()
-        for i in range(50):
-            np.random.shuffle(nparr)
-            assert_(a in nparr)
-            assert_(b in nparr)
-
     def test_shuffle_masked(self):
         # gh-3263
         a = np.ma.masked_values(np.reshape(range(20), (5,4)) % 3 - 1, -1)
         b = np.ma.masked_values(np.arange(20) % 3 - 1, -1)
-        ma = np.ma.count_masked(a)
-        mb = np.ma.count_masked(b)
+        a_orig = a.copy()
+        b_orig = b.copy()
         for i in range(50):
             np.random.shuffle(a)
-            self.assertEqual(ma, np.ma.count_masked(a))
+            assert_equal(
+                sorted(a.data[~a.mask]), sorted(a_orig.data[~a_orig.mask]))
             np.random.shuffle(b)
-            self.assertEqual(mb, np.ma.count_masked(b))
+            assert_equal(
+                sorted(b.data[~b.mask]), sorted(b_orig.data[~b_orig.mask]))
 
     def test_beta(self):
         np.random.seed(self.seed)