From e6c397b974a72d6970a9ec3a7858355951d43e0a Mon Sep 17 00:00:00 2001
From: Julian Taylor
Date: Tue, 30 Aug 2016 20:58:29 +0200
Subject: [PATCH 1/3] ENH: avoid temporary arrays in expressions

Temporary arrays generated in expressions are expensive, as they imply
extra memory bandwidth, which is the bottleneck in most numpy operations.
For example:

    r = -a + b

One can avoid temporaries when the reference count of one of the operands
is one, as it cannot be referenced by any other python code (an rvalue in
C++ terms). Python itself uses this method to optimize string
concatenation.
The tricky part is that via the C-API one can use the PyNumber_ functions
directly and skip increasing the reference count when it is not needed.
The python stack cannot be used to determine this, as python does not
export the stack size, only the current position.
To avoid the problem we collect the backtrace of the call until the python
frame evaluation function, and if it consists exclusively of functions
inside python or the C library we can assume no other library is using the
C-API in between.
One issue is that the reliability of backtraces is unknown on non-GNU
platforms. On GNU systems and amd64 it should be reliable enough, even
without frame pointers, as glibc will use GCC's stack unwinder via the
mandatory dwarf stack annotations. Other platforms with backtrace need to
be tested.
Another problem is that stack unwinding is very expensive. Unwinding a 100
function deep stack (which is not uncommon from python) takes around 35us,
so the elision can only be done for relatively large arrays. Heuristically
it seems to be beneficial around 256KiB array sizes (which is about the
typical L2 cache size).
The performance gain is quite significant: operations with temporaries
become around 1.5 to 2 times faster. The speed is similar to rewriting the
operations into inplace operations manually.
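For illustration, a minimal timing sketch of the pattern this change
targets (array size and iteration count are arbitrary); it compares the
plain expression against the manual inplace rewrite whose speed the
elision aims to match:

    import timeit
    import numpy as np

    a = np.ones(1000000)
    b = np.ones(1000000)

    def with_temporary():
        # -a creates a temporary; adding b would normally allocate again
        return -a + b

    def manual_inplace():
        # the rewrite the elision performs automatically when it is safe
        r = -a
        r += b
        return r

    print("expression:", timeit.timeit(with_temporary, number=100))
    print("inplace:   ", timeit.timeit(manual_inplace, number=100))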
---
 numpy/core/setup.py                    |   1 +
 numpy/core/setup_common.py             |   4 +-
 numpy/core/src/multiarray/number.c     |  83 +++++-
 numpy/core/src/multiarray/temp_elide.c | 391 +++++++++++++++++++++++++
 numpy/core/src/multiarray/temp_elide.h |  15 +
 numpy/core/tests/test_multiarray.py    |  47 ++-
 6 files changed, 536 insertions(+), 5 deletions(-)
 create mode 100644 numpy/core/src/multiarray/temp_elide.c
 create mode 100644 numpy/core/src/multiarray/temp_elide.h

diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index f45042bec8ff..891ba2a191bb 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -824,6 +824,7 @@ def generate_multiarray_templated_sources(ext, build_dir):
             join('src', 'multiarray', 'shape.c'),
             join('src', 'multiarray', 'scalarapi.c'),
             join('src', 'multiarray', 'scalartypes.c.src'),
+            join('src', 'multiarray', 'temp_elide.c'),
             join('src', 'multiarray', 'usertypes.c'),
             join('src', 'multiarray', 'ucsnarrow.c'),
             join('src', 'multiarray', 'vdot.c'),
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 357051cdb045..7691a2aeb2bb 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -107,7 +107,8 @@ def check_api_version(apiversion, codegen_dir):
 OPTIONAL_STDFUNCS = ["expm1", "log1p", "acosh", "asinh", "atanh",
                      "rint", "trunc", "exp2", "log2", "hypot", "atan2", "pow",
                      "copysign", "nextafter", "ftello", "fseeko",
-                     "strtoll", "strtoull", "cbrt", "strtold_l", "fallocate"]
+                     "strtoll", "strtoull", "cbrt", "strtold_l", "fallocate",
+                     "backtrace"]

 OPTIONAL_HEADERS = [
@@ -116,6 +117,7 @@ def check_api_version(apiversion, codegen_dir):
                 "emmintrin.h",  # SSE2
                 "features.h",  # for glibc version linux
                 "xlocale.h",  # see GH#8367
+                "dlfcn.h",  # dladdr
 ]

 # optional gcc compiler builtins and their call arguments and optional a
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index fec015a30955..c4e675430808 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -12,6 +12,7 @@
 #include "npy_import.h"
 #include "common.h"
 #include "number.h"
+#include "temp_elide.h"

 /*************************************************************************
 ****************   Implement Number Protocol ****************************
@@ -352,24 +353,61 @@ PyArray_GenericInplaceUnaryFunction(PyArrayObject *m1, PyObject *op)
     return PyObject_CallFunctionObjArgs(op, m1, m1, NULL);
 }

+static PyObject *
+array_inplace_add(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_subtract(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_multiply(PyArrayObject *m1, PyObject *m2);
+#if !defined(NPY_PY3K)
+static PyObject *
+array_inplace_divide(PyArrayObject *m1, PyObject *m2);
+#endif
+static PyObject *
+array_inplace_true_divide(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_floor_divide(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_bitwise_and(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_bitwise_or(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_bitwise_xor(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_left_shift(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_right_shift(PyArrayObject *m1, PyObject *m2);
+
 static PyObject *
 array_add(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__add__", "__radd__", 0, nb_add);
+    if (try_binary_elide(m1, m2, &array_inplace_add, &res, 1)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.add);
 }

 static PyObject *
 array_subtract(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__sub__", "__rsub__", 0, nb_subtract);
+    if (try_binary_elide(m1, m2, &array_inplace_subtract, &res, 0)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.subtract);
 }

 static PyObject *
 array_multiply(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__mul__", "__rmul__", 0, nb_multiply);
+    if (try_binary_elide(m1, m2, &array_inplace_multiply, &res, 1)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.multiply);
 }

@@ -377,7 +415,11 @@ array_multiply(PyArrayObject *m1, PyObject *m2)
 static PyObject *
 array_divide(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__div__", "__rdiv__", 0, nb_divide);
+    if (try_binary_elide(m1, m2, &array_inplace_divide, &res, 0)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.divide);
 }
 #endif
@@ -529,7 +571,7 @@ fast_scalar_power(PyArrayObject *a1, PyObject *o2, int inplace)
         return NULL;
     }

-    if (inplace) {
+    if (inplace || can_elide_temp_unary(a1)) {
         return PyArray_GenericInplaceUnaryFunction(a1, fastop);
     }
     else {
         return PyArray_GenericUnaryFunction(a1, fastop);
@@ -588,53 +630,82 @@ array_power(PyArrayObject *a1, PyObject *o2, PyObject *NPY_UNUSED(modulo))
 static PyObject *
 array_negative(PyArrayObject *m1)
 {
+    if (can_elide_temp_unary(m1)) {
+        return PyArray_GenericInplaceUnaryFunction(m1, n_ops.negative);
+    }
     return PyArray_GenericUnaryFunction(m1, n_ops.negative);
 }

 static PyObject *
 array_absolute(PyArrayObject *m1)
 {
+    if (can_elide_temp_unary(m1)) {
+        return PyArray_GenericInplaceUnaryFunction(m1, n_ops.absolute);
+    }
     return PyArray_GenericUnaryFunction(m1, n_ops.absolute);
 }

 static PyObject *
 array_invert(PyArrayObject *m1)
 {
+    if (can_elide_temp_unary(m1)) {
+        return PyArray_GenericInplaceUnaryFunction(m1, n_ops.invert);
+    }
     return PyArray_GenericUnaryFunction(m1, n_ops.invert);
 }

 static PyObject *
 array_left_shift(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__lshift__", "__rlshift__", 0, nb_lshift);
+    if (try_binary_elide(m1, m2, &array_inplace_left_shift, &res, 0)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.left_shift);
 }

 static PyObject *
 array_right_shift(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__rshift__", "__rrshift__", 0, nb_rshift);
+    if (try_binary_elide(m1, m2, &array_inplace_right_shift, &res, 0)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.right_shift);
 }

 static PyObject *
 array_bitwise_and(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__and__", "__rand__", 0, nb_and);
+    if (try_binary_elide(m1, m2, &array_inplace_bitwise_and, &res, 1)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.bitwise_and);
 }

 static PyObject *
 array_bitwise_or(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__or__", "__ror__", 0, nb_or);
+    if (try_binary_elide(m1, m2, &array_inplace_bitwise_or, &res, 1)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.bitwise_or);
 }

 static PyObject *
 array_bitwise_xor(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__xor__", "__rxor__", 0, nb_xor);
+    if (try_binary_elide(m1, m2, &array_inplace_bitwise_xor, &res, 1)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.bitwise_xor);
 }

@@ -726,14 +797,24 @@ array_inplace_bitwise_xor(PyArrayObject *m1, PyObject *m2)
 static PyObject *
 array_floor_divide(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__floordiv__", "__rfloordiv__", 0,
                                nb_floor_divide);
+    if (try_binary_elide(m1, m2, &array_inplace_floor_divide, &res, 0)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.floor_divide);
 }

 static PyObject *
 array_true_divide(PyArrayObject *m1, PyObject *m2)
 {
+    PyObject * res;
     GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__truediv__", "__rtruediv__", 0,
                                nb_true_divide);
+    if (PyArray_CheckExact(m1) &&
+            (PyArray_ISFLOAT(m1) || PyArray_ISCOMPLEX(m1)) &&
+            try_binary_elide(m1, m2, &array_inplace_true_divide, &res, 0)) {
+        return res;
+    }
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.true_divide);
 }
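The extra PyArray_ISFLOAT/PyArray_ISCOMPLEX guard in array_true_divide
above exists because true division of integer arrays changes the output
dtype, so an integer temporary is not a valid target for the inplace
operation. A small sketch of the user-visible behaviour (the sizes merely
exceed the elision threshold):

    import numpy as np

    i = np.ones(100000, dtype=np.int64)
    f = np.ones(100000, dtype=np.float64)

    # int / int promotes to float64, the temporary (i + i) must not be reused
    print(((i + i) / 2).dtype)   # float64
    # float / scalar keeps the dtype, the temporary (f + f) may be reused
    print(((f + f) / 2).dtype)   # float64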
diff --git a/numpy/core/src/multiarray/temp_elide.c b/numpy/core/src/multiarray/temp_elide.c
new file mode 100644
index 000000000000..5d18e1a08b80
--- /dev/null
+++ b/numpy/core/src/multiarray/temp_elide.c
@@ -0,0 +1,391 @@
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+
+#define NPY_NUMBER_MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/*
+ * Functions used to try to avoid/elide temporaries in python expressions
+ * of type a + b + b by translating some operations into inplace operations.
+ * This example translates to this bytecode:
+ *
+ *     0 LOAD_FAST    0 (a)
+ *     3 LOAD_FAST    1 (b)
+ *     6 BINARY_ADD
+ *     7 LOAD_FAST    1 (b)
+ *    10 BINARY_ADD
+ *
+ * The two named variables get their reference count increased by the load
+ * instructions, so they always have a reference count larger than 1.
+ * The temporary of the first BINARY_ADD on the other hand only has a count
+ * of 1. Only temporaries can have a count of 1 in python, so we can use
+ * this to transform the second operation into an inplace operation without
+ * affecting the output of the program.
+ * CPython does the same thing to resize memory instead of copying when
+ * doing string concatenation.
+ * The gain can be very significant (4x-6x) when avoiding the temporary
+ * allows the operation to remain in the cpu caches, and it can still be
+ * 50% faster for arrays larger than the cpu cache size.
+ *
+ * A complication is that a DSO (dynamic shared object) module (e.g.
+ * cython) could call the PyNumber_ functions directly on arrays with a
+ * reference count of 1.
+ * This is handled by checking the call stack to verify that we have been
+ * called directly from the cpython interpreter.
+ * To achieve this we check that all functions in the callstack until the
+ * cpython frame evaluation function are located in cpython or numpy.
+ * This is an expensive operation, so temporaries are only avoided for
+ * rather large arrays.
+ *
+ * A possible future improvement would be to change cpython to give us
+ * access to the top of the stack. Then we could just check that the
+ * objects involved are on the cpython stack instead of checking the
+ * function callstack.
+ *
+ * Elision can be applied to all operations that do have inplace variants
+ * and do not change types (addition, subtraction, multiplication, float
+ * division, logical and bitwise operations ...).
+ * For commutative operations (addition, multiplication, ...), if eliding
+ * into the lefthand side fails it can succeed on the righthand side by
+ * swapping the arguments, e.g. b * (a * 2) can be elided by changing it
+ * to (2 * a) * b.
+ *
+ * TODO only supports systems with backtrace(), windows can probably be
+ * supported too by using the appropriate windows apis.
+ */
+
+#if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! defined PYPY_VERSION
+/* 1 prints elided operations, 2 prints stacktraces */
+#define NPY_ELIDE_DEBUG 0
+#define NPY_MAX_STACKSIZE 10
+
+#if PY_VERSION_HEX >= 0x03060000
+/* TODO can pep523 be used here somehow? */
+#define PYFRAMEEVAL_FUNC "_PyEval_EvalFrameDefault"
+#else
+#define PYFRAMEEVAL_FUNC "PyEval_EvalFrameEx"
+#endif
+/*
+ * Heuristic size of the array in bytes at which the overhead of generating
+ * a backtrace becomes smaller than the speed gained by inplace operations.
+ * It depends on the stack depth being checked. Measurements with 10 stacks
+ * show it getting worthwhile around 100KiB, but to be conservative put it
+ * higher, around where the L2 cache spills.
+ */
+#ifndef Py_DEBUG
+#define NPY_MIN_ELIDE_BYTES (256 * 1024)
+#else
+/*
+ * in debug mode always elide, but skip scalars as these can convert to 0d
+ * arrays during inplace operations
+ */
+#define NPY_MIN_ELIDE_BYTES (32)
+#endif
+#include <dlfcn.h>
+#include <execinfo.h>
+
+/*
+ * linear search for a pointer in a table
+ * the number of pointers is usually quite small, but if a performance
+ * impact can be measured this could be converted to a binary search
+ */
+static int
+find_addr(void * addresses[], npy_intp naddr, void * addr)
+{
+    npy_intp j;
+    for (j = 0; j < naddr; j++) {
+        if (addr == addresses[j]) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+static int
+check_callers(int * cannot)
+{
+    /*
+     * get the base addresses of multiarray and python and check whether
+     * the backtrace is within these libraries, only calling dladdr if a
+     * new maximum address is found.
+     * When, after the initial multiarray frames, everything is inside
+     * python we can elide, as no C-API user could have messed up the
+     * reference counts.
+     * We only check until the python frame evaluation function is found;
+     * this has approx. 10us overhead for a stack size of 10.
+     *
+     * TODO some calls go over scalarmath in umath, but we cannot get the
+     * base address of it from multiarraymodule as it is not linked
+     * against it
+     */
+    static int init = 0;
+    /*
+     * measured DSO object memory start and end; if an address is located
+     * inside these bounds it is part of that library, so we don't need to
+     * call dladdr on it (assuming linear memory)
+     */
+    static void * pos_python_start;
+    static void * pos_python_end;
+    static void * pos_ma_start;
+    static void * pos_ma_end;
+
+    /* known address storage to save dladdr calls */
+    static void * py_addr[64];
+    static void * pyeval_addr[64];
+    static npy_intp n_py_addr = 0;
+    static npy_intp n_pyeval = 0;
+
+    void *buffer[NPY_MAX_STACKSIZE];
+    int i, nptrs;
+    int ok = 0;
+    /* cannot determine callers */
+    if (init == -1) {
+        *cannot = 1;
+        return 0;
+    }
+
+    nptrs = backtrace(buffer, NPY_MAX_STACKSIZE);
+    if (nptrs == 0) {
+        /* complete failure, disable elision */
+        init = -1;
+        *cannot = 1;
+        return 0;
+    }
+
+    /* setup DSO base addresses, ends updated later */
+    if (NPY_UNLIKELY(init == 0)) {
+        Dl_info info;
+        /* get python base address */
+        if (dladdr(&PyNumber_Or, &info)) {
+            pos_python_start = info.dli_fbase;
+            pos_python_end = info.dli_fbase;
+        }
+        else {
+            init = -1;
+            return 0;
+        }
+        /* get multiarray base address */
+        if (dladdr(&PyArray_SetNumericOps, &info)) {
+            pos_ma_start = info.dli_fbase;
+            pos_ma_end = info.dli_fbase;
+        }
+        else {
+            init = -1;
+            return 0;
+        }
+        init = 1;
+    }
+
+    /* loop over callstack addresses to check if they leave numpy or cpython */
+    for (i = 0; i < nptrs; i++) {
+        Dl_info info;
+        int in_python = 0;
+        int in_multiarray = 0;
+#if NPY_ELIDE_DEBUG >= 2
+        dladdr(buffer[i], &info);
+        printf("%s(%p) %s(%p)\n", info.dli_fname, info.dli_fbase,
+               info.dli_sname, info.dli_saddr);
+#endif
+
+        /* check stored DSO boundaries first */
+        if (buffer[i] >= pos_python_start && buffer[i] <= pos_python_end) {
+            in_python = 1;
+        }
+        else if (buffer[i] >= pos_ma_start && buffer[i] <= pos_ma_end) {
+            in_multiarray = 1;
+        }
+
+        /* update DSO boundaries via dladdr if necessary */
+        if (!in_python && !in_multiarray) {
+            if (dladdr(buffer[i], &info) == 0) {
+                init = -1;
+                ok = 0;
+                break;
+            }
+            /* update DSO end */
+            if (info.dli_fbase == pos_python_start) {
+                pos_python_end = NPY_NUMBER_MAX(buffer[i], pos_python_end);
+                in_python = 1;
+            }
+            else if (info.dli_fbase == pos_ma_start) {
+                pos_ma_end = NPY_NUMBER_MAX(buffer[i], pos_ma_end);
+                in_multiarray = 1;
+            }
+        }
+
+        /* no longer in ok libraries and not reached PyEval -> no elide */
+        if (!in_python && !in_multiarray) {
+            ok = 0;
+            break;
+        }
+
+        /* in python, check if the frame eval function was reached */
+        if (in_python) {
+            /* if we reached eval we are done */
+            if (find_addr(pyeval_addr, n_pyeval, buffer[i])) {
+                ok = 1;
+                break;
+            }
+            /*
+             * check if it is some other function; use the pointer lookup
+             * table to save expensive dladdr calls
+             */
+            if (find_addr(py_addr, n_py_addr, buffer[i])) {
+                continue;
+            }
+
+            /* new python address, check for PyEvalFrame */
+            if (dladdr(buffer[i], &info) == 0) {
+                init = -1;
+                ok = 0;
+                break;
+            }
+            if (info.dli_sname &&
+                    strcmp(info.dli_sname, PYFRAMEEVAL_FUNC) == 0) {
+                if (n_pyeval < sizeof(pyeval_addr) / sizeof(pyeval_addr[0])) {
+                    /* store address to not have to dladdr it again */
+                    pyeval_addr[n_pyeval++] = buffer[i];
+                }
+                ok = 1;
+                break;
+            }
+            else if (n_py_addr < sizeof(py_addr) / sizeof(py_addr[0])) {
+                /* store other py function to not have to dladdr it again */
+                py_addr[n_py_addr++] = buffer[i];
+            }
+        }
+    }
+
+    /* all frames after numpy are from python, we can elide */
+    if (ok) {
+        *cannot = 0;
+        return 1;
+    }
+    else {
+#if NPY_ELIDE_DEBUG != 0
+        puts("cannot elide due to c-api usage");
+#endif
+        *cannot = 1;
+        return 0;
+    }
+}
+
+/*
+ * check if in "alhs @op@ orhs" alhs is a temporary (refcnt == 1) so we
+ * can do inplace operations instead of creating a new temporary
+ * "cannot" is set to true if it cannot be done even with swapped arguments
+ */
+static int
+can_elide_temp(PyArrayObject * alhs, PyObject * orhs, int * cannot)
+{
+    /*
+     * to be a candidate the array needs to have a reference count of 1,
+     * be an exact array of a basic type, own its data and have a size
+     * larger than the threshold
+     */
+    if (Py_REFCNT(alhs) != 1 || !PyArray_CheckExact(alhs) ||
+            PyArray_DESCR(alhs)->type_num >= NPY_OBJECT ||
+            !(PyArray_FLAGS(alhs) & NPY_ARRAY_OWNDATA) ||
+            PyArray_NBYTES(alhs) < NPY_MIN_ELIDE_BYTES) {
+        return 0;
+    }
+    if (PyArray_CheckExact(orhs) ||
+        PyArray_CheckAnyScalar(orhs)) {
+        PyArrayObject * arhs;
+
+        /* create array from right hand side */
+        Py_INCREF(orhs);
+        arhs = (PyArrayObject *)PyArray_EnsureArray(orhs);
+        if (arhs == NULL) {
+            return 0;
+        }
+
+        /*
+         * if rhs is not a scalar the dimensions must match
+         * TODO: one could allow broadcasting on equal types
+         */
+        if (!(PyArray_NDIM(arhs) == 0 ||
+              (PyArray_NDIM(arhs) == PyArray_NDIM(alhs) &&
+               PyArray_CompareLists(PyArray_DIMS(alhs), PyArray_DIMS(arhs),
+                                    PyArray_NDIM(arhs))))) {
+            Py_DECREF(arhs);
+            return 0;
+        }
+
+        /* must be safe to cast (checks values for scalar in rhs) */
+        if (PyArray_CanCastArrayTo(arhs, PyArray_DESCR(alhs),
+                                   NPY_SAFE_CASTING)) {
+            Py_DECREF(arhs);
+            return check_callers(cannot);
+        }
+        Py_DECREF(arhs);
+    }
+
+    return 0;
+}
+
+/*
+ * try eliding a binary op; if commutative is true also try swapped arguments
+ */
+NPY_NO_EXPORT int
+try_binary_elide(PyArrayObject * m1, PyObject * m2,
+                 PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
+                 PyObject ** res, int commutative)
+{
+    /* set when no elision can be done independent of argument order */
+    int cannot = 0;
+    if (can_elide_temp(m1, m2, &cannot)) {
+        *res = inplace_op(m1, m2);
+#if NPY_ELIDE_DEBUG != 0
+        puts("elided temporary in binary op");
+#endif
+        return 1;
+    }
+    else if (commutative && !cannot) {
+        if (can_elide_temp((PyArrayObject *)m2, (PyObject *)m1, &cannot)) {
+            *res = inplace_op((PyArrayObject *)m2, (PyObject *)m1);
+#if NPY_ELIDE_DEBUG != 0
+            puts("elided temporary in commutative binary op");
+#endif
+            return 1;
+        }
+    }
+    *res = NULL;
+    return 0;
+}
+
+/* try to elide a unary temporary */
+NPY_NO_EXPORT int
+can_elide_temp_unary(PyArrayObject * m1)
+{
+    int cannot;
+    if (Py_REFCNT(m1) != 1 || !PyArray_CheckExact(m1) ||
+            PyArray_DESCR(m1)->type_num == NPY_VOID ||
+            !(PyArray_FLAGS(m1) & NPY_ARRAY_OWNDATA) ||
+            PyArray_NBYTES(m1) < NPY_MIN_ELIDE_BYTES) {
+        return 0;
+    }
+    if (check_callers(&cannot)) {
+#if NPY_ELIDE_DEBUG != 0
+        puts("elided temporary in unary op");
+#endif
+        return 1;
+    }
+    else {
+        return 0;
+    }
+}
+#else /* unsupported interpreter or missing backtrace */
+NPY_NO_EXPORT int
+can_elide_temp_unary(PyArrayObject * m1)
+{
+    return 0;
+}
+
+NPY_NO_EXPORT int
+try_binary_elide(PyArrayObject * m1, PyObject * m2,
+                 PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
+                 PyObject ** res, int commutative)
+{
+    *res = NULL;
+    return 0;
+}
+#endif
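The refcount-1 heuristic used by can_elide_temp above can be observed
from pure python; note that sys.getrefcount itself holds one reference to
its argument, so a plain temporary reports a count of 1 (CPython-specific
behaviour):

    import sys
    import numpy as np

    a = np.ones(3)
    # a named array: the binding 'a' plus the call argument -> at least 2
    print(sys.getrefcount(a))
    # a temporary: only the call argument references it -> 1
    print(sys.getrefcount(np.ones(3) + 1))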
diff --git a/numpy/core/src/multiarray/temp_elide.h b/numpy/core/src/multiarray/temp_elide.h
new file mode 100644
index 000000000000..d073adf28ec6
--- /dev/null
+++ b/numpy/core/src/multiarray/temp_elide.h
@@ -0,0 +1,15 @@
+#ifndef _NPY_ARRAY_TEMP_AVOID_H_
+#define _NPY_ARRAY_TEMP_AVOID_H_
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include <numpy/ndarraytypes.h>
+
+NPY_NO_EXPORT int
+can_elide_temp_unary(PyArrayObject * m1);
+
+NPY_NO_EXPORT int
+try_binary_elide(PyArrayObject * m1, PyObject * m2,
+                 PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
+                 PyObject ** res, int commutative);
+
+#endif
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index fa5051ba73fa..b780c55580f2 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -2743,8 +2743,9 @@ def test_extension_incref_elide(self):
         # d = input.copy() # refcount 1
         # return d, d + d # PyNumber_Add without increasing refcount
         from numpy.core.multiarray_tests import incref_elide
-        d = np.ones(5)
+        d = np.ones(100000)
         orig, res = incref_elide(d)
+        d + d
         # the return original should not be changed to an inplace operation
         assert_array_equal(orig, d)
         assert_array_equal(res, d + d)
@@ -2758,12 +2759,52 @@ def test_extension_incref_elide_stack(self):
         # return l[4] + l[4] # PyNumber_Add without increasing refcount
         from numpy.core.multiarray_tests import incref_elide_l
         # padding with 1 makes sure the object on the stack is not overwritten
-        l = [1, 1, 1, 1, np.ones(5)]
+        l = [1, 1, 1, 1, np.ones(100000)]
         res = incref_elide_l(l)
         # the return original should not be changed to an inplace operation
-        assert_array_equal(l[4], np.ones(5))
+        assert_array_equal(l[4], np.ones(100000))
         assert_array_equal(res, l[4] + l[4])

+    def test_temporary_with_cast(self):
+        # check that we don't elide into a temporary which would need casting
+        d = np.ones(200000, dtype=np.int64)
+        assert_equal(((d + d) + 2**222).dtype, np.dtype('O'))
+
+        r = ((d + d) / 2)
+        assert_equal(r.dtype, np.dtype('f8'))
+
+        r = np.true_divide((d + d), 2)
+        assert_equal(r.dtype, np.dtype('f8'))
+
+        r = ((d + d) / 2.)
+        assert_equal(r.dtype, np.dtype('f8'))
+
+        r = ((d + d) // 2)
+        assert_equal(r.dtype, np.dtype(np.int64))
+
+        # commutative elision into the astype result
+        f = np.ones(100000, dtype=np.float32)
+        assert_equal(((f + f) + f.astype(np.float64)).dtype, np.dtype('f8'))
+
+        # no elision into f + f
+        d = f.astype(np.float64)
+        assert_equal(((f + f) + d).dtype, np.dtype('f8'))
+
+    def test_elide_broadcast(self):
+        # test no elision on broadcast to higher dimension
+        # only triggers the elision code path in debug mode, as triggering
+        # it in normal mode needs a 256kb large matching dimension, so a
+        # lot of memory
+        d = np.ones((2000, 1), dtype=int)
+        b = np.ones((2000), dtype=np.bool)
+        r = (1 - d) + b
+        assert_equal(r, 1)
+        assert_equal(r.shape, (2000, 2000))
+
+    def test_elide_scalar(self):
+        # check that an inplace op does not create an ndarray from scalars
+        a = np.bool_()
+        assert_(type(~(a & a)) is np.bool_)
+
     def test_ufunc_override_rop_precedence(self):
         # 2016-01-29: NUMPY_UFUNC_DISABLED
         return

From 5b16efe145f82a751625cebbe5eb5e2cd10515a3 Mon Sep 17 00:00:00 2001
From: Julian Taylor
Date: Sat, 21 Jan 2017 12:32:16 +0100
Subject: [PATCH 2/3] DOC: add release note entry for the elision

---
 doc/release/1.13.0-notes.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/doc/release/1.13.0-notes.rst b/doc/release/1.13.0-notes.rst
index ae3c64fddcea..2a1509cbc758 100644
--- a/doc/release/1.13.0-notes.rst
+++ b/doc/release/1.13.0-notes.rst
@@ -7,6 +7,8 @@ This release supports Python 2.7 and 3.4 - 3.6.
 Highlights
 ==========

+* Operations like ``a + b + c`` will create fewer temporaries on some platforms
+
 Dropped Support
 ===============
@@ -84,6 +86,14 @@ C API
 New Features
 ============

+Temporary elision
+-----------------
+On platforms providing the ``backtrace`` function NumPy will now avoid
+creating temporaries in expressions when possible.
+For example ``d = a + b + c`` is transformed to ``d = a + b; d += c``, which
+can improve performance for large arrays as less memory bandwidth is
+required to perform the operation.
+
 ``axes`` argument for ``unique``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 In an N-dimensional array, the user can now choose the axis along which to look

From c3e24b2f2860ce1912f578cf996184198a3cfd25 Mon Sep 17 00:00:00 2001
From: Julian Taylor
Date: Sat, 28 Jan 2017 13:10:32 +0100
Subject: [PATCH 3/3] BENCH: add benchmarks for operations with temporaries

---
 benchmarks/benchmarks/bench_core.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index 1f7c2331063a..a4e3152ea056 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -76,6 +76,26 @@ def time_tril_l10x10(self):
         np.tril(self.l10x10)


+class Temporaries(Benchmark):
+    def setup(self):
+        self.amid = np.ones(50000)
+        self.bmid = np.ones(50000)
+        self.alarge = np.ones(1000000)
+        self.blarge = np.ones(1000000)
+
+    def time_mid(self):
+        (self.amid * 2) + self.bmid
+
+    def time_mid2(self):
+        (self.amid + self.bmid) - 2
+
+    def time_large(self):
+        (self.alarge * 2) + self.blarge
+
+    def time_large2(self):
+        (self.alarge + self.blarge) - 2
+
+
 class MA(Benchmark):
     def setup(self):
         self.l100 = range(100)
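To make the transformation described in the release note concrete, a
short sketch (values arbitrary) checking that the manual rewrite is
equivalent to the plain expression:

    import numpy as np

    a = np.ones(4)
    b = np.full(4, 2.0)
    c = np.full(4, 3.0)

    d1 = a + b + c     # may allocate a temporary for a + b

    d2 = a + b         # the rewrite from the release note
    d2 += c

    np.testing.assert_array_equal(d1, d2)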