8000 memcpy-based fast, typed shuffle. by anntzer · Pull Request #6933 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

memcpy-based fast, typed shuffle. #6933

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 17, 2016
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Top shuffle speed for machine-sized ints/floats.
Apparently gcc only specializes one branch (the last one) so I went for
another 33% performance increase (matching #6776) in what's likely the
most common use case.
  • Loading branch information
anntzer committed Jan 17, 2016
commit b8cf7f904974294d4e3af43c68ef23f87385f2f6
26 changes: 18 additions & 8 deletions numpy/random/mtrand/mtrand.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4982,8 +4982,7 @@ cdef class RandomState:

"""
cdef:
npy_intp i, j, n = len(x)
size_t stride, nbytes
npy_intp i, j, n = len(x), stride, itemsize
char* x_ptr
char* buf_ptr

Expand All @@ -4993,15 +4992,17 @@ cdef class RandomState:
# as MaskedArrays may not support this approach).
x_ptr = <char*><size_t>x.ctypes.data
stride = x.strides[0]
nbytes = x[:1].nbytes
itemsize = x.dtype.itemsize
buf = np.empty_like(x[0]) # GC'd at function exit
buf_ptr = <char*><size_t>buf.ctypes.data
with self.lock:
for i in reversed(range(1, n)):
j = rk_interval(i, self.internal_state)
string.memcpy(buf_ptr, x_ptr + j * stride, nbytes)
string.memcpy(x_ptr + j * stride, x_ptr + i * stride, nbytes)
string.memcpy(x_ptr + i * stride, buf_ptr, nbytes)
# We trick gcc into providing a specialized implementation for
# the most common case, yielding a ~33% performance improvement.
# Note that apparently, only one branch can ever be specialized.
if itemsize == sizeof(npy_intp):
self._shuffle_raw(n, sizeof(npy_intp), stride, x_ptr, buf_ptr)
else:
self._shuffle_raw(n, itemsize, stride, x_ptr, buf_ptr)
elif isinstance(x, np.ndarray) and x.ndim > 1 and x.size:
# Multidimensional ndarrays require a bounce buffer.
buf = np.empty_like(x[0])
Expand All @@ -5018,6 +5019,15 @@ cdef class RandomState:
j = rk_interval(i, self.internal_state)
x[i], x[j] = x[j], x[i]

cdef inline _shuffle_raw(self, npy_intp n, npy_intp itemsize,
npy_intp stride, char* data, char* buf):
cdef n 5583 py_intp i, j
for i in reversed(range(1, n)):
j = rk_interval(i, self.internal_state)
string.memcpy(buf, data + j * stride, itemsize)
string.memcpy(data + j * stride, data + i * stride, itemsize)
string.memcpy(data + i * stride, buf, itemsize)

def permutation(self, object x):
"""
permutation(x)
Expand Down
0