8000 ENH: add inplace cases to fast ufunc loop macros by juliantaylor · Pull Request #7999 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: add inplace cases to fast ufunc loop macros #7999

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 2, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions benchmarks/benchmarks/bench_ufunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,45 @@ def time_or_bool(self):
(self.b | self.b)


class CustomInplace(Benchmark):
    """Benchmarks for binary ufuncs that combine an array with a scalar.

    Every dtype gets a pair of timings: one that writes the result back
    into the input array through ``out=``, and a ``_temp`` counterpart that
    evaluates the same two operations with operators and no ``out=``.
    """

    def setup(self):
        # The lengths keep every buffer at roughly 500-600 kB regardless of
        # the item size (1, 4, 4 and 8 bytes respectively).
        self.c = np.ones(500000, dtype=np.int8)
        self.i = np.ones(150000, dtype=np.int32)
        self.f = np.zeros(150000, dtype=np.float32)
        self.d = np.zeros(75000, dtype=np.float64)
        # fault memory
        # Only the np.zeros arrays are written here; presumably their pages
        # may not be mapped until first touched -- TODO confirm.
        self.f *= 1.
        self.d *= 1.

    def time_char_or(self):
        # OR with 0 leaves the values unchanged; the scalar is tried as the
        # second and then as the first operand, output aliasing the input.
        np.bitwise_or(self.c, 0, out=self.c)
        np.bitwise_or(0, self.c, out=self.c)

    def time_char_or_temp(self):
        # Same two operations written with operators, i.e. without out=.
        0 | self.c | 0

    def time_int_or(self):
        # int32 variant of time_char_or.
        np.bitwise_or(self.i, 0, out=self.i)
        np.bitwise_or(0, self.i, out=self.i)

    def time_int_or_temp(self):
        # int32 variant of time_char_or_temp.
        0 | self.i | 0

    def time_float_add(self):
        # Scalar as second, then as first operand; result stored in self.f,
        # so the values keep growing across calls.
        np.add(self.f, 1., out=self.f)
        np.add(1., self.f, out=self.f)

    def time_float_add_temp(self):
        # Same two additions written with operators, i.e. without out=.
        1. + self.f + 1.

    def time_double_add(self):
        # float64 variant of time_float_add.
        np.add(self.d, 1., out=self.d)
        np.add(1., self.d, out=self.d)

    def time_double_add_temp(self):
        # float64 variant of time_float_add_temp.
        1. + self.d + 1.


class CustomScalar(Benchmark):
params = [np.float32, np.float64]
param_names = ['dtype']
Expand Down
75 changes: 45 additions & 30 deletions numpy/core/src/umath/loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -87,22 +87,25 @@
* combine with NPY_GCC_OPT_3 to allow autovectorization
* should only be used where it's worthwhile to avoid code bloat
*/
#define BASE_UNARY_LOOP(tin, tout, op) \
UNARY_LOOP { \
const tin in = *(tin *)ip1; \
tout * out = (tout *)op1; \
op; \
}
#define UNARY_LOOP_FAST(tin, tout, op) \
do { \
/* condition allows compiler to optimize the generic macro */ \
if (IS_UNARY_CONT(tin, tout)) { \
UNARY_LOOP { \
const tin in = *(tin *)ip1; \
tout * out = (tout *)op1; \
op; \
if (args[0] == args[1]) { \
BASE_UNARY_LOOP(tin, tout, op) \
} \
else { \
BASE_UNARY_LOOP(tin, tout, op) \
} \
} \
else { \
UNARY_LOOP { \
const tin in = *(tin *)ip1; \
tout * out = (tout *)op1; \
op; \
} \
BASE_UNARY_LOOP(tin, tout, op) \
} \
} \
while (0)
Expand All @@ -128,40 +131,52 @@
* combine with NPY_GCC_OPT_3 to allow autovectorization
* should only be used where it's worthwhile to avoid code bloat
*/
#define BASE_BINARY_LOOP(tin, tout, op) \
BINARY_LOOP { \
const tin in1 = *(tin *)ip1; \
const tin in2 = *(tin *)ip2; \
tout * out = (tout *)op1; \
op; \
}
#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jeesh, this macro argument passing is starting to give me the creeps ;). By now I think it's correct, but it could maybe use a comment here to explain things a bit more in depth. I am not used to such macros, and it took me a bit to figure out that in1 always ends up pointing to the correct args[0], etc.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or maybe it is just one of those things you have to stare at until it makes sense...

const tin cin = *(tin *)cinp; \
BINARY_LOOP { \
const tin vin = *(tin *)vinp; \
tout * out = (tout *)op1; \
op; \
}
#define BINARY_LOOP_FAST(tin, tout, op) \
do { \
/* condition allows compiler to optimize the generic macro */ \
if (IS_BINARY_CONT(tin, tout)) { \
BINARY_LOOP { \
const tin in1 = *(tin *)ip1; \
const tin in2 = *(tin *)ip2; \
tout * out = (tout *)op1; \
op; \
if (args[2] == args[0]) { \
BASE_BINARY_LOOP(tin, tout, op) \
} \
else if (args[2] == args[1]) { \
BASE_BINARY_LOOP(tin, tout, op) \
} \
else { \
BASE_BINARY_LOOP(tin, tout, op) \
} \
} \
else if (IS_BINARY_CONT_S1(tin, tout)) { \
const tin in1 = *(tin *)args[0]; \
BINARY_LOOP { \
const tin in2 = *(tin *)ip2; \
tout * out = (tout *)op1; \
op; \
if (args[1] == args[2]) { \
BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
} \
else { \
BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
} \
} \
else if (IS_BINARY_CONT_S2(tin, tout)) { \
const tin in2 = *(tin *)args[1]; \
BINARY_LOOP { \
const tin in1 = *(tin *)ip1; \
tout * out = (tout *)op1; \
op; \
if (args[0] == args[2]) { \
BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
} \
else { \
BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
}\
} \
else { \
BINARY_LOOP { \
const tin in1 = *(tin *)ip1; \
const tin in2 = *(tin *)ip2; \
tout * out = (tout *)op1; \
op; \
} \
BASE_BINARY_LOOP(tin, tout, op) \
} \
} \
while (0)
Expand Down
20 changes: 11 additions & 9 deletions numpy/core/tests/test_scalarmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,15 @@ class TestBaseMath(TestCase):
def test_blocked(self):
# test alignments offsets for simd instructions
# alignments for vz + 2 * (vs - 1) + 1
for dt, sz in [(np.float32, 11), (np.float64, 7)]:
for dt, sz in [(np.float32, 11), (np.float64, 7), (np.int32, 11)]:
for out, inp1, inp2, msg in _gen_alignment_data(dtype=dt,
type='binary',
max_size=sz):
exp1 = np.ones_like(inp1)
inp1[...] = np.ones_like(inp1)
inp2[...] = np.zeros_like(inp2)
assert_almost_equal(np.add(inp1, inp2), exp1, err_msg=msg)
assert_almost_equal(np.add(inp1, 1), exp1 + 1, err_msg=msg)
assert_almost_equal(np.add(inp1, 2), exp1 + 2, err_msg=msg)
assert_almost_equal(np.add(1, inp2), exp1, err_msg=msg)

np.add(inp1, inp2, out=out)
Expand All @@ -82,15 +82,17 @@ def test_blocked(self):
inp2[...] += np.arange(inp2.size, dtype=dt) + 1
assert_almost_equal(np.square(inp2),
np.multiply(inp2, inp2), err_msg=msg)
assert_almost_equal(np.reciprocal(inp2),
np.divide(1, inp2), err_msg=msg)
# skip true divide for ints
if dt != np.int32 or sys.version_info.major < 3:
assert_almost_equal(np.reciprocal(inp2),
np.divide(1, inp2), err_msg=msg)

inp1[...] = np.ones_like(inp1)
inp2[...] = np.zeros_like(inp2)
np.add(inp1, 1, out=out)
assert_almost_equal(out, exp1 + 1, err_msg=msg)
np.add(1, inp2, out=out)
assert_almost_equal(out, exp1, err_msg=msg)
np.add(inp1, 2, out=out)
assert_almost_equal(out, exp1 + 2, err_msg=msg)
inp2[...] = np.ones_like(inp2)
np.add(2, inp2, out=out)
assert_almost_equal(out, exp1 + 2, err_msg=msg)

def test_lower_align(self):
# check data that is not aligned to element size
Expand Down
3 changes: 2 additions & 1 deletion numpy/core/tests/test_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -1202,8 +1202,9 @@ def test_abs_neg_blocked(self):
assert_array_equal(out, d, err_msg=msg)

assert_array_equal(-inp, -1*inp, err_msg=msg)
d = -1 * inp
np.negative(inp, out=out)
assert_array_equal(out, -1*inp, err_msg=msg)
assert_array_equal(out, d, err_msg=msg)

def test_lower_align(self):
# check data that is not aligned to element size
Expand Down
9 changes: 6 additions & 3 deletions numpy/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1794,7 +1794,8 @@ def _gen_alignment_data(dtype=float32, type='binary', max_size=24):
inp = lambda: arange(s, dtype=dtype)[o:]
out = empty((s,), dtype=dtype)[o:]
yield out, inp(), ufmt % (o, o, s, dtype, 'out of place')
yield inp(), inp(), ufmt % (o, o, s, dtype, 'in place')
d = inp()
yield d, d, ufmt % (o, o, s, dtype, 'in place')
yield out[1:], inp()[:-1], ufmt % \
(o + 1, o, s - 1, dtype, 'out of place')
yield out[:-1], inp()[1:], ufmt % \
Expand All @@ -1809,9 +1810,11 @@ def _gen_alignment_data(dtype=float32, type='binary', max_size=24):
out = empty((s,), dtype=dtype)[o:]
yield out, inp1(), inp2(), bfmt % \
(o, o, o, s, dtype, 'out of place')
yield inp1(), inp1(), inp2(), bfmt % \
d = inp1()
yield d, d, inp2(), bfmt % \
(o, o, o, s, dtype, 'in place1')
yield inp2(), inp1(), inp2(), bfmt % \
d = inp2()
yield d, inp1(), d, bfmt % \
(o, o, o, s, dtype, 'in place2')
yield out[1:], inp1()[:-1], inp2()[:-1], bfmt % \
(o + 1, o, o, s - 1, dtype, 'out of place')
Expand Down
0