diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index 552a59a0092e..344b33254980 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -17,13 +17,7 @@ #include "nditer_impl.h" #include "templ_common.h" #include "ctors.h" -#include "refcount.h" -/* Internal helper functions private to this file */ -static npy_intp -npyiter_checkreducesize(NpyIter *iter, npy_intp count, - npy_intp *reduce_innersize, - npy_intp *reduce_outerdim); /*NUMPY_API * Removes an axis from iteration. This requires that NPY_ITER_MULTI_INDEX @@ -828,6 +822,9 @@ NpyIter_IsFirstVisit(NpyIter *iter, int iop) /*NUMPY_API * Whether the iteration could be done with no buffering. + * + * Note that the iterator may use buffering to increase the inner loop size + * even when buffering is not required. */ NPY_NO_EXPORT npy_bool NpyIter_RequiresBuffering(NpyIter *iter) @@ -1329,8 +1326,10 @@ NpyIter_GetAxisStrideArray(NpyIter *iter, int axis) /*NUMPY_API * Get an array of strides which are fixed. Any strides which may - * change during iteration receive the value NPY_MAX_INTP. Once - * the iterator is ready to iterate, call this to get the strides + * change during iteration receive the value NPY_MAX_INTP + * (as of NumPy 2.3, `NPY_MAX_INTP` will never happen but must be supported; + * we could guarantee this, but not sure if we should). + * Once the iterator is ready to iterate, call this to get the strides * which will always be fixed in the inner loop, then choose optimized * inner loop functions which take advantage of those fixed strides. * @@ -1340,75 +1339,16 @@ NPY_NO_EXPORT void NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides) { npy_uint32 itflags = NIT_ITFLAGS(iter); - int ndim = NIT_NDIM(iter); - int iop, nop = NIT_NOP(iter); + int nop = NIT_NOP(iter); NpyIter_AxisData *axisdata0 = NIT_AXISDATA(iter); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); if (itflags&NPY_ITFLAG_BUFFER) { - NpyIter_BufferData *data = NIT_BUFFERDATA(iter); - npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); - npy_intp stride, *strides = NBF_STRIDES(data), - *ad_strides = NAD_STRIDES(axisdata0); - PyArray_Descr **dtypes = NIT_DTYPES(iter); - - for (iop = 0; iop < nop; ++iop) { - stride = strides[iop]; - /* - * Operands which are always/never buffered have fixed strides, - * and everything has fixed strides when ndim is 0 or 1 - */ - if (ndim <= 1 || (op_itflags[iop]& - (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_BUFNEVER))) { - out_strides[iop] = stride; - } - /* If it's a reduction, 0-stride inner loop may have fixed stride */ - else if (stride == 0 && (itflags&NPY_ITFLAG_REDUCE)) { - /* If it's a reduction operand, definitely fixed stride */ - if (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) { - out_strides[iop] = stride; - } - /* - * Otherwise it's guaranteed to be a fixed stride if the - * stride is 0 for all the dimensions. 
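/*
 * A minimal usage sketch (illustration only, not part of this patch) of the
 * pattern the NpyIter_GetInnerFixedStrideArray() documentation above
 * describes: fetch the fixed strides once after construction and use them to
 * pick a specialized inner loop.  The helper name `select_inner_loop` and the
 * loop-kind enum are made up for this sketch.
 */
#include <numpy/arrayobject.h>

typedef enum { LOOP_CONTIG_DOUBLE, LOOP_GENERIC } inner_loop_kind;

static inner_loop_kind
select_inner_loop(NpyIter *iter, int nop)
{
    npy_intp fixed_strides[NPY_MAXARGS];

    /* Strides that may change between buffered chunks come back
     * as NPY_MAX_INTP; everything else is fixed for the whole run. */
    NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);

    for (int iop = 0; iop < nop; iop++) {
        if (fixed_strides[iop] != sizeof(double)) {
            return LOOP_GENERIC;  /* fall back to a strided loop */
        }
    }
    return LOOP_CONTIG_DOUBLE;  /* all operands fixed and contiguous */
}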
- */ - else { - NpyIter_AxisData *axisdata = axisdata0; - int idim; - for (idim = 0; idim < ndim; ++idim) { - if (NAD_STRIDES(axisdata)[iop] != 0) { - break; - } - NIT_ADVANCE_AXISDATA(axisdata, 1); - } - /* If all the strides were 0, the stride won't change */ - if (idim == ndim) { - out_strides[iop] = stride; - } - else { - out_strides[iop] = NPY_MAX_INTP; - } - } - } - /* - * Inner loop contiguous array means its stride won't change when - * switching between buffering and not buffering - */ - else if (ad_strides[iop] == dtypes[iop]->elsize) { - out_strides[iop] = ad_strides[iop]; - } - /* - * Otherwise the strides can change if the operand is sometimes - * buffered, sometimes not. - */ - else { - out_strides[iop] = NPY_MAX_INTP; - } - } + /* If there is buffering we wrote the strides into the bufferdata. */ + memcpy(out_strides, NBF_STRIDES(NIT_BUFFERDATA(iter)), nop*NPY_SIZEOF_INTP); } else { - /* If there's no buffering, the strides are always fixed */ + /* If there's no buffering, the strides come from the operands. */ memcpy(out_strides, NAD_STRIDES(axisdata0), nop*NPY_SIZEOF_INTP); } } @@ -1573,6 +1513,10 @@ NpyIter_DebugPrint(NpyIter *iter) printf("VIRTUAL "); if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_WRITEMASKED) printf("WRITEMASKED "); + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_BUF_SINGLESTRIDE) + printf("BUF_SINGLESTRIDE "); + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_CONTIG) + printf("CONTIG "); printf("\n"); } printf("|\n"); @@ -1585,13 +1529,15 @@ NpyIter_DebugPrint(NpyIter *iter) printf("| BufferSize: %d\n", (int)NBF_BUFFERSIZE(bufferdata)); printf("| Size: %d\n", (int)NBF_SIZE(bufferdata)); printf("| BufIterEnd: %d\n", (int)NBF_BUFITEREND(bufferdata)); + printf("| BUFFER CoreSize: %d\n", + (int)NBF_CORESIZE(bufferdata)); if (itflags&NPY_ITFLAG_REDUCE) { printf("| REDUCE Pos: %d\n", (int)NBF_REDUCE_POS(bufferdata)); printf("| REDUCE OuterSize: %d\n", (int)NBF_REDUCE_OUTERSIZE(bufferdata)); printf("| REDUCE OuterDim: %d\n", - (int)NBF_REDUCE_OUTERDIM(bufferdata)); + (int)NBF_OUTERDIM(bufferdata)); } printf("| Strides: "); for (iop = 0; iop < nop; ++iop) @@ -1884,8 +1830,96 @@ npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex) NIT_ADVANCE_AXISDATA(axisdata, -1); } } + if (itflags&NPY_ITFLAG_BUFFER) { + /* Find the remainder if chunking to the buffers coresize */ + npy_intp fact = NIT_ITERINDEX(iter) / NIT_BUFFERDATA(iter)->coresize; + npy_intp offset = NIT_ITERINDEX(iter) - fact * NIT_BUFFERDATA(iter)->coresize; + NIT_BUFFERDATA(iter)->coreoffset = offset; + } +} + + +/* + * This helper fills the bufferdata copy information for an operand. It + * is very specific to copy from and to buffers. + */ +static inline void +npyiter_fill_buffercopy_params( + int nop, int iop, int ndim, npy_uint32 opitflags, npy_intp transfersize, + NpyIter_BufferData *bufferdata, + NpyIter_AxisData *axisdata, + NpyIter_AxisData *outer_axisdata, + int *ndim_transfer, + npy_intp *op_transfersize, + npy_intp *buf_stride, + npy_intp *op_strides[], npy_intp *op_shape[], npy_intp *op_coords[]) +{ + /* + * Set up if we had to do the full generic copy. + * NOTE: Except the transfersize itself everything here is fixed + * and we could create it once early on. + */ + *ndim_transfer = ndim; + *op_transfersize = transfersize; + + if ((opitflags & NPY_OP_ITFLAG_REDUCE) && (NAD_STRIDES(outer_axisdata)[iop] != 0)) { + /* + * Reduce with all inner strides ==0 (outer !=0). We buffer the outer + * stride which also means buffering only outersize items. 
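+ * For instance (illustrative example): in `arr.sum(-1)` from the buffering
+ * setup docstring in nditer_constr.c, the output operand has all core
+ * strides == 0 and a nonzero outer stride, so this branch transfers only
+ * NBF_REDUCE_OUTERSIZE(bufferdata) items, one per outer step, instead of
+ * the full transfersize.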
+ * (If the outer stride is 0, some inner ones are guaranteed nonzero.) + */ + assert(NAD_STRIDES(axisdata)[iop] == 0); + *ndim_transfer = 1; + *op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata); + *buf_stride = NBF_REDUCE_OUTERSTRIDES(bufferdata)[iop]; + + *op_shape = op_transfersize; + assert(**op_coords == 0); /* initialized by caller currently */ + *op_strides = &NAD_STRIDES(outer_axisdata)[iop]; + return; + } + + /* + * The copy is now a typical copy into a contiguous buffer. + * If it is a reduce, we only copy the inner part (i.e. less). + * The buffer strides are now always contiguous. + */ + *buf_stride = NBF_STRIDES(bufferdata)[iop]; + + if (opitflags & NPY_OP_ITFLAG_REDUCE) { + /* Outer dim is reduced, so omit it from copying */ + *ndim_transfer -= 1; + if (*op_transfersize > bufferdata->coresize) { + *op_transfersize = bufferdata->coresize; + } + /* copy setup is identical to non-reduced now. */ + } + + if (opitflags & NPY_OP_ITFLAG_BUF_SINGLESTRIDE) { + *ndim_transfer = 1; + *op_shape = op_transfersize; + assert(**op_coords == 0); /* initialized by caller currently */ + *op_strides = &NAD_STRIDES(axisdata)[iop]; + if ((*op_strides)[0] == 0 && ( + !(opitflags & NPY_OP_ITFLAG_CONTIG) || + (opitflags & NPY_OP_ITFLAG_WRITE))) { + /* + * If the user didn't force contig, optimize single element. + * (Unless CONTIG was requested and this is not a write/reduce!) + */ + *op_transfersize = 1; + *buf_stride = 0; + } + } + else { + /* We do a full multi-dimensional copy */ + *op_shape = &NAD_SHAPE(axisdata); + *op_coords = &NAD_INDEX(axisdata); + *op_strides = &NAD_STRIDES(axisdata)[iop]; + } } + /* * This gets called after the buffers have been exhausted, and * their data needs to be written back to the arrays. The multi-index @@ -1901,21 +1935,17 @@ npyiter_copy_from_buffers(NpyIter *iter) npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); - NpyIter_AxisData *axisdata = NIT_AXISDATA(iter), - *reduce_outeraxisdata = NULL; + NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); + NpyIter_AxisData *outer_axisdata = NULL; PyArray_Descr **dtypes = NIT_DTYPES(iter); + npy_intp *strides = NBF_STRIDES(bufferdata); npy_intp transfersize = NBF_SIZE(bufferdata); - npy_intp *strides = NBF_STRIDES(bufferdata), - *ad_strides = NAD_STRIDES(axisdata); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); + char **ad_ptrs = NAD_PTRS(axisdata); char **buffers = NBF_BUFFERS(bufferdata); char *buffer; - npy_intp reduce_outerdim = 0; - npy_intp *reduce_outerstrides = NULL; - npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, nop) / NPY_SIZEOF_INTP; @@ -1924,17 +1954,25 @@ npyiter_copy_from_buffers(NpyIter *iter) return 0; } - NPY_IT_DBG_PRINT("Iterator: Copying buffers to outputs\n"); - - if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerdim = NBF_REDUCE_OUTERDIM(bufferdata); - reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata); - reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim); + if (itflags & NPY_ITFLAG_REDUCE) { + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); + outer_axisdata = NIT_INDEX_AXISDATA(axisdata, NBF_OUTERDIM(bufferdata)); transfersize *= NBF_REDUCE_OUTERSIZE(bufferdata); } + NPY_IT_DBG_PRINT("Iterator: Copying buffers to outputs\n"); + NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); for (iop = 0; iop < nop; ++iop) { + if (op_itflags[iop]&NPY_OP_ITFLAG_BUFNEVER) { + continue; + } + + /* Currently, we always trash the buffer if there are references */ + 
if (PyDataType_REFCHK(dtypes[iop])) { + NIT_OPITFLAGS(iter)[iop] &= ~NPY_OP_ITFLAG_BUF_REUSABLE; + } + buffer = buffers[iop]; /* * Copy the data back to the arrays. If the type has refs, @@ -1943,73 +1981,27 @@ npyiter_copy_from_buffers(NpyIter *iter) * The flag USINGBUFFER is set when the buffer was used, so * only copy back when this flag is on. */ - if ((transferinfo[iop].write.func != NULL) && - (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { - npy_intp op_transfersize; - - npy_intp src_stride, *dst_strides, *dst_coords, *dst_shape; - int ndim_transfer; - + if (transferinfo[iop].write.func != NULL) { NPY_IT_DBG_PRINT1("Iterator: Operand %d was buffered\n", (int)iop); - /* - * If this operand is being reduced in the inner loop, - * its buffering stride was set to zero, and just - * one element was copied. - */ - if (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) { - if (strides[iop] == 0) { - if (reduce_outerstrides[iop] == 0) { - op_transfersize = 1; - src_stride = 0; - dst_strides = &src_stride; - dst_coords = &NAD_INDEX(reduce_outeraxisdata); - dst_shape = &NAD_SHAPE(reduce_outeraxisdata); - ndim_transfer = 1; - } - else { - op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata); - src_stride = reduce_outerstrides[iop]; - dst_strides = - &NAD_STRIDES(reduce_outeraxisdata)[iop]; - dst_coords = &NAD_INDEX(reduce_outeraxisdata); - dst_shape = &NAD_SHAPE(reduce_outeraxisdata); - ndim_transfer = ndim - reduce_outerdim; - } - } - else { - if (reduce_outerstrides[iop] == 0) { - op_transfersize = NBF_SIZE(bufferdata); - src_stride = strides[iop]; - dst_strides = &ad_strides[iop]; - dst_coords = &NAD_INDEX(axisdata); - dst_shape = &NAD_SHAPE(axisdata); - ndim_transfer = reduce_outerdim ? - reduce_outerdim : 1; - } - else { - op_transfersize = transfersize; - src_stride = strides[iop]; - dst_strides = &ad_strides[iop]; - dst_coords = &NAD_INDEX(axisdata); - dst_shape = &NAD_SHAPE(axisdata); - ndim_transfer = ndim; - } - } - } - else { - op_transfersize = transfersize; - src_stride = strides[iop]; - dst_strides = &ad_strides[iop]; - dst_coords = &NAD_INDEX(axisdata); - dst_shape = &NAD_SHAPE(axisdata); - ndim_transfer = ndim; - } + npy_intp zero = 0; /* used as coord for 1-D copies */ + int ndim_transfer; + npy_intp op_transfersize; + npy_intp src_stride; + npy_intp *dst_strides; + npy_intp *dst_coords = &zero; + npy_intp *dst_shape; - NPY_IT_DBG_PRINT2("Iterator: Copying buffer to " - "operand %d (%d items)\n", - (int)iop, (int)op_transfersize); + npyiter_fill_buffercopy_params(nop, iop, ndim, op_itflags[iop], + transfersize, bufferdata, axisdata, outer_axisdata, + &ndim_transfer, &op_transfersize, &src_stride, + &dst_strides, &dst_shape, &dst_coords); + + NPY_IT_DBG_PRINT( + "Iterator: Copying buffer to operand %d (%zd items):\n" + " transfer ndim: %d, inner stride: %zd, inner shape: %zd, buffer stride: %zd\n", + iop, op_transfersize, ndim_transfer, dst_strides[0], dst_shape[0], src_stride); /* WRITEMASKED operand */ if (op_itflags[iop] & NPY_OP_ITFLAG_WRITEMASKED) { @@ -2019,11 +2011,11 @@ npyiter_copy_from_buffers(NpyIter *iter) * The mask pointer may be in the buffer or in * the array, detect which one. 
*/ - if ((op_itflags[maskop]&NPY_OP_ITFLAG_USINGBUFFER) != 0) { - maskptr = (npy_bool *)buffers[maskop]; + if ((op_itflags[maskop]&NPY_OP_ITFLAG_BUFNEVER)) { + maskptr = (npy_bool *)ad_ptrs[maskop]; } else { - maskptr = (npy_bool *)ad_ptrs[maskop]; + maskptr = (npy_bool *)buffers[maskop]; } if (PyArray_TransferMaskedStridedToNDim(ndim_transfer, @@ -2057,11 +2049,11 @@ npyiter_copy_from_buffers(NpyIter *iter) * The flag USINGBUFFER is set when the buffer was used, so * only decrement refs when this flag is on. */ - else if (transferinfo[iop].clear.func != NULL && - (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { + else if (transferinfo[iop].clear.func != NULL) { NPY_IT_DBG_PRINT1( "Iterator: clearing refs of operand %d\n", (int)iop); npy_intp buf_stride = dtypes[iop]->elsize; + // TODO: transfersize is too large for reductions if (transferinfo[iop].clear.func( NULL, transferinfo[iop].clear.descr, buffer, transfersize, buf_stride, transferinfo[iop].clear.auxdata) < 0) { @@ -2090,510 +2082,165 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); - NpyIter_AxisData *axisdata = NIT_AXISDATA(iter), - *reduce_outeraxisdata = NULL; + NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); + NpyIter_AxisData *outer_axisdata = NULL; - PyArray_Descr **dtypes = NIT_DTYPES(iter); PyArrayObject **operands = NIT_OPERANDS(iter); - npy_intp *strides = NBF_STRIDES(bufferdata), - *ad_strides = NAD_STRIDES(axisdata); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata); char **buffers = NBF_BUFFERS(bufferdata); - npy_intp iterindex, iterend, transfersize, - singlestridesize, reduce_innersize = 0, reduce_outerdim = 0; - int is_onestride = 0, any_buffered = 0; - - npy_intp *reduce_outerstrides = NULL; - char **reduce_outerptrs = NULL; - - /* - * Have to get this flag before npyiter_checkreducesize sets - * it for the next iteration. - */ - npy_bool reuse_reduce_loops = (prev_dataptrs != NULL) && - ((itflags&NPY_ITFLAG_REUSE_REDUCE_LOOPS) != 0); + npy_intp iterindex, iterend, transfersize; npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, nop) / NPY_SIZEOF_INTP; - NPY_IT_DBG_PRINT("Iterator: Copying inputs to buffers\n"); - - /* Calculate the size if using any buffers */ + /* Fetch the maximum size we may wish to copy (or use if unbuffered) */ iterindex = NIT_ITERINDEX(iter); iterend = NIT_ITEREND(iter); transfersize = NBF_BUFFERSIZE(bufferdata); - if (transfersize > iterend - iterindex) { - transfersize = iterend - iterindex; - } + outer_axisdata = NIT_INDEX_AXISDATA(axisdata, bufferdata->outerdim); + npy_intp remaining_outersize = ( + outer_axisdata->shape - outer_axisdata->index); - /* If last time around, the reduce loop structure was full, we reuse it */ - if (reuse_reduce_loops) { - npy_intp full_transfersize, prev_reduce_outersize; - - prev_reduce_outersize = NBF_REDUCE_OUTERSIZE(bufferdata); - reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata); - reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata); - reduce_outerdim = NBF_REDUCE_OUTERDIM(bufferdata); - reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim); - reduce_innersize = NBF_SIZE(bufferdata); - NBF_REDUCE_POS(bufferdata) = 0; - /* - * Try to do make the outersize as big as possible. 
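/*
 * A small sketch (illustration, not part of the patch) of the corrected
 * WRITEMASKED lookup above: with USINGBUFFER gone, the mask is read from the
 * operand array only when that operand is never buffered (BUFNEVER);
 * otherwise it was copied into its buffer.  `locate_mask` is a made-up name;
 * the flag names are the internal ones from nditer_impl.h.
 */
#include "nditer_impl.h"

static npy_bool *
locate_mask(npyiter_opitflags mask_itflags, char *mask_ad_ptr, char *mask_buffer)
{
    if (mask_itflags & NPY_OP_ITFLAG_BUFNEVER) {
        return (npy_bool *)mask_ad_ptr;   /* read directly from the array */
    }
    return (npy_bool *)mask_buffer;       /* read from the mask's buffer */
}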
This allows - * it to shrink when processing the last bit of the outer reduce loop, - * then grow again at the beginning of the next outer reduce loop. - */ - NBF_REDUCE_OUTERSIZE(bufferdata) = (NAD_SHAPE(reduce_outeraxisdata)- - NAD_INDEX(reduce_outeraxisdata)); - full_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata)*reduce_innersize; - /* If the full transfer size doesn't fit in the buffer, truncate it */ - if (full_transfersize > NBF_BUFFERSIZE(bufferdata)) { - NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize/reduce_innersize; - transfersize = NBF_REDUCE_OUTERSIZE(bufferdata)*reduce_innersize; - } - else { - transfersize = full_transfersize; - } - if (prev_reduce_outersize < NBF_REDUCE_OUTERSIZE(bufferdata)) { - /* - * If the previous time around less data was copied it may not - * be safe to reuse the buffers even if the pointers match. - */ - reuse_reduce_loops = 0; - } - NBF_BUFITEREND(bufferdata) = iterindex + reduce_innersize; + NPY_IT_DBG_PRINT("Iterator: Copying inputs to buffers\n"); + NPY_IT_DBG_PRINT(" Max transfersize=%zd, coresize=%zd\n", + transfersize, bufferdata->coresize); - NPY_IT_DBG_PRINT3("Reused reduce transfersize: %d innersize: %d " - "itersize: %d\n", - (int)transfersize, - (int)reduce_innersize, - (int)NpyIter_GetIterSize(iter)); - NPY_IT_DBG_PRINT1("Reduced reduce outersize: %d", - (int)NBF_REDUCE_OUTERSIZE(bufferdata)); - } /* - * If there are any reduction operands, we may have to make - * the size smaller so we don't copy the same value into - * a buffer twice, as the buffering does not have a mechanism - * to combine values itself. + * If there is a coreoffset just copy to the end of a single coresize + * NB: Also if the size is shrunk, we definitely won't set buffer re-use. */ - else if (itflags&NPY_ITFLAG_REDUCE) { - NPY_IT_DBG_PRINT("Iterator: Calculating reduce loops\n"); - transfersize = npyiter_checkreducesize(iter, transfersize, - &reduce_innersize, - &reduce_outerdim); - NPY_IT_DBG_PRINT3("Reduce transfersize: %d innersize: %d " - "itersize: %d\n", - (int)transfersize, - (int)reduce_innersize, - (int)NpyIter_GetIterSize(iter)); - - reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata); - reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata); - reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim); - NBF_SIZE(bufferdata) = reduce_innersize; - NBF_REDUCE_POS(bufferdata) = 0; - NBF_REDUCE_OUTERDIM(bufferdata) = reduce_outerdim; - NBF_BUFITEREND(bufferdata) = iterindex + reduce_innersize; - if (reduce_innersize == 0) { - NBF_REDUCE_OUTERSIZE(bufferdata) = 0; - return 0; - } - else { - NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize/reduce_innersize; - } + if (bufferdata->coreoffset) { + prev_dataptrs = NULL; /* No way we can re-use the buffers safely. */ + transfersize = bufferdata->coresize - bufferdata->coreoffset; + NPY_IT_DBG_PRINT(" Shrunk transfersize due to coreoffset=%zd: %zd\n", + bufferdata->coreoffset, transfersize); } - else { - NBF_SIZE(bufferdata) = transfersize; - NBF_BUFITEREND(bufferdata) = iterindex + transfersize; + else if (transfersize > bufferdata->coresize * remaining_outersize) { + /* + * Shrink transfersize to not go beyond outer axis size. If not + * a reduction, it is unclear that this is necessary. 
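+ * (Worked example, illustrative, using the numbers from the setup docstring
+ * in nditer_constr.c: shape (100, 10, 10), coresize 10, buffersize 60.
+ * Starting at iterindex 75 gives coreoffset 5, so only 10 - 5 == 5 items
+ * are copied to finish that core; at (0, 6, 0) only 4 outer steps remain,
+ * so the copy below is clamped to 10 * 4 == 40 of the 60 possible elements.)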
+ */ + transfersize = bufferdata->coresize * remaining_outersize; + NPY_IT_DBG_PRINT(" Shrunk transfersize outer size: %zd\n", transfersize); } - /* Calculate the maximum size if using a single stride and no buffers */ - singlestridesize = NAD_SHAPE(axisdata)-NAD_INDEX(axisdata); - if (singlestridesize > iterend - iterindex) { - singlestridesize = iterend - iterindex; - } - if (singlestridesize >= transfersize) { - is_onestride = 1; + /* And ensure that we don't go beyond the iterator end (if ranged) */ + if (transfersize > iterend - iterindex) { + transfersize = iterend - iterindex; + NPY_IT_DBG_PRINT(" Shrunk transfersize to itersize: %zd\n", transfersize); } - NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); - for (iop = 0; iop < nop; ++iop) { - - switch (op_itflags[iop]& - (NPY_OP_ITFLAG_BUFNEVER| - NPY_OP_ITFLAG_CAST| - NPY_OP_ITFLAG_REDUCE)) { - /* Never need to buffer this operand */ - case NPY_OP_ITFLAG_BUFNEVER: - ptrs[iop] = ad_ptrs[iop]; - if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerstrides[iop] = reduce_innersize * - strides[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - } - /* - * Should not adjust the stride - ad_strides[iop] - * could be zero, but strides[iop] was initialized - * to the first non-trivial stride. - */ - /* The flag NPY_OP_ITFLAG_USINGBUFFER can be ignored here */ - assert(!(op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER)); - break; - /* Never need to buffer this operand */ - case NPY_OP_ITFLAG_BUFNEVER|NPY_OP_ITFLAG_REDUCE: - ptrs[iop] = ad_ptrs[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - reduce_outerstrides[iop] = 0; - /* - * Should not adjust the stride - ad_strides[iop] - * could be zero, but strides[iop] was initialized - * to the first non-trivial stride. - */ - /* The flag NPY_OP_ITFLAG_USINGBUFFER can be ignored here */ - assert(!(op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER)); - break; - /* Just a copy */ - case 0: - /* Do not reuse buffer if it did not exist */ - if (!(op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER) && - (prev_dataptrs != NULL)) { - prev_dataptrs[iop] = NULL; - } - /* - * No copyswap or cast was requested, so all we're - * doing is copying the data to fill the buffer and - * produce a single stride. If the underlying data - * already does that, no need to copy it. 
- */ - if (is_onestride) { - ptrs[iop] = ad_ptrs[iop]; - strides[iop] = ad_strides[iop]; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - /* If some other op is reduced, we have a double reduce loop */ - else if ((itflags&NPY_ITFLAG_REDUCE) && - (reduce_outerdim == 1) && - (transfersize/reduce_innersize <= - NAD_SHAPE(reduce_outeraxisdata) - - NAD_INDEX(reduce_outeraxisdata))) { - ptrs[iop] = ad_ptrs[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - strides[iop] = ad_strides[iop]; - reduce_outerstrides[iop] = - NAD_STRIDES(reduce_outeraxisdata)[iop]; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - else { - /* In this case, the buffer is being used */ - ptrs[iop] = buffers[iop]; - strides[iop] = dtypes[iop]->elsize; - if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerstrides[iop] = reduce_innersize * - strides[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - } - /* Signal that the buffer is being used */ - op_itflags[iop] |= NPY_OP_ITFLAG_USINGBUFFER; - } - break; - /* Just a copy, but with a reduction */ - case NPY_OP_ITFLAG_REDUCE: - /* Do not reuse buffer if it did not exist */ - if (!(op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER) && - (prev_dataptrs != NULL)) { - prev_dataptrs[iop] = NULL; - } - if (ad_strides[iop] == 0) { - strides[iop] = 0; - /* It's all in one stride in the inner loop dimension */ - if (is_onestride) { - NPY_IT_DBG_PRINT1("reduce op %d all one stride\n", (int)iop); - ptrs[iop] = ad_ptrs[iop]; - reduce_outerstrides[iop] = 0; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - /* It's all in one stride in the reduce outer loop */ - else if ((reduce_outerdim > 0) && - (transfersize/reduce_innersize <= - NAD_SHAPE(reduce_outeraxisdata) - - NAD_INDEX(reduce_outeraxisdata))) { - NPY_IT_DBG_PRINT1("reduce op %d all one outer stride\n", - (int)iop); - ptrs[iop] = ad_ptrs[iop]; - /* Outer reduce loop advances by one item */ - reduce_outerstrides[iop] = - NAD_STRIDES(reduce_outeraxisdata)[iop]; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - /* In this case, the buffer is being used */ - else { - NPY_IT_DBG_PRINT1("reduce op %d must buffer\n", (int)iop); - ptrs[iop] = buffers[iop]; - /* Both outer and inner reduce loops have stride 0 */ - if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { - reduce_outerstrides[iop] = 0; - } - /* Outer reduce loop advances by one item */ - else { - reduce_outerstrides[iop] = dtypes[iop]->elsize; - } - /* Signal that the buffer is being used */ - op_itflags[iop] |= NPY_OP_ITFLAG_USINGBUFFER; - } + bufferdata->size = transfersize; + NBF_BUFITEREND(bufferdata) = iterindex + transfersize; - } - else if (is_onestride) { - NPY_IT_DBG_PRINT1("reduce op %d all one stride in dim 0\n", (int)iop); - ptrs[iop] = ad_ptrs[iop]; - strides[iop] = ad_strides[iop]; - reduce_outerstrides[iop] = 0; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - else { - /* It's all in one stride in the reduce outer loop */ - if ((reduce_outerdim == 1) && - (transfersize/reduce_innersize <= - NAD_SHAPE(reduce_outeraxisdata) - - NAD_INDEX(reduce_outeraxisdata))) { - ptrs[iop] = ad_ptrs[iop]; - strides[iop] = ad_strides[iop]; - /* Outer reduce loop advances by one item */ - reduce_outerstrides[iop] = - NAD_STRIDES(reduce_outeraxisdata)[iop]; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= 
(~NPY_OP_ITFLAG_USINGBUFFER); - } - /* In this case, the buffer is being used */ - else { - ptrs[iop] = buffers[iop]; - strides[iop] = dtypes[iop]->elsize; - - if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { - /* Reduction in outer reduce loop */ - reduce_outerstrides[iop] = 0; - } - else { - /* Advance to next items in outer reduce loop */ - reduce_outerstrides[iop] = reduce_innersize * - dtypes[iop]->elsize; - } - /* Signal that the buffer is being used */ - op_itflags[iop] |= NPY_OP_ITFLAG_USINGBUFFER; - } - } - reduce_outerptrs[iop] = ptrs[iop]; - break; - default: - /* In this case, the buffer is always being used */ - any_buffered = 1; - - /* Signal that the buffer is being used */ - op_itflags[iop] |= NPY_OP_ITFLAG_USINGBUFFER; - - if (!(op_itflags[iop]&NPY_OP_ITFLAG_REDUCE)) { - ptrs[iop] = buffers[iop]; - strides[iop] = dtypes[iop]->elsize; - if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerstrides[iop] = reduce_innersize * - strides[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - } - } - /* The buffer is being used with reduction */ - else { - ptrs[iop] = buffers[iop]; - if (ad_strides[iop] == 0) { - NPY_IT_DBG_PRINT1("cast op %d has innermost stride 0\n", (int)iop); - strides[iop] = 0; - /* Both outer and inner reduce loops have stride 0 */ - if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride 0\n", (int)iop); - reduce_outerstrides[iop] = 0; - } - /* Outer reduce loop advances by one item */ - else { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride !=0\n", (int)iop); - reduce_outerstrides[iop] = dtypes[iop]->elsize; - } - } - else { - NPY_IT_DBG_PRINT1("cast op %d has innermost stride !=0\n", (int)iop); - strides[iop] = dtypes[iop]->elsize; - - if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride 0\n", (int)iop); - /* Reduction in outer reduce loop */ - reduce_outerstrides[iop] = 0; - } - else { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride !=0\n", (int)iop); - /* Advance to next items in outer reduce loop */ - reduce_outerstrides[iop] = reduce_innersize * - dtypes[iop]->elsize; - } - } - reduce_outerptrs[iop] = ptrs[iop]; - } - break; - } + if (transfersize == 0) { + return 0; + } - /* - * If OP_ITFLAG_USINGBUFFER is enabled and the read func is not NULL, - * the buffer needs to be read. 
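/*
 * A condensed sketch (illustration, not part of the patch) of the new
 * buffer re-use test above: a previously filled buffer may be skipped only
 * if the operand is flagged BUF_REUSABLE and its source pointer did not
 * move.  `can_skip_copy` is a made-up helper name; the flag is the internal
 * one from nditer_impl.h.
 */
#include "nditer_impl.h"

static int
can_skip_copy(npyiter_opitflags opitflags, char *prev_dataptr, char *dataptr)
{
    return (opitflags & NPY_OP_ITFLAG_BUF_REUSABLE)
           && prev_dataptr != NULL
           && prev_dataptr == dataptr;
}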
- */ - if (op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER && - transferinfo[iop].read.func != NULL) { - npy_intp src_itemsize; - npy_intp op_transfersize; + NPY_IT_DBG_PRINT("Iterator: Buffer transfersize=%zd\n", transfersize); - npy_intp dst_stride, *src_strides, *src_coords, *src_shape; - int ndim_transfer; + if (itflags & NPY_ITFLAG_REDUCE) { + NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize / bufferdata->coresize; + if (NBF_REDUCE_OUTERSIZE(bufferdata) > 1) { + /* WARNING: bufferdata->size does not include reduce-outersize */ + bufferdata->size = bufferdata->coresize; + NBF_BUFITEREND(bufferdata) = iterindex + bufferdata->coresize; + } + NBF_REDUCE_POS(bufferdata) = 0; + } - npy_bool skip_transfer = 0; + NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); + for (iop = 0; iop < nop; ++iop) { + NPY_IT_DBG_PRINT("Iterator: buffer prep for op=%d @ %p inner-stride=%zd\n", + iop, ad_ptrs[iop], NBF_STRIDES(bufferdata)[iop]); - src_itemsize = PyArray_DTYPE(operands[iop])->elsize; + if (op_itflags[iop]&NPY_OP_ITFLAG_BUFNEVER) { + ptrs[iop] = ad_ptrs[iop]; + NBF_REDUCE_OUTERPTRS(bufferdata)[iop] = ptrs[iop]; + NPY_IT_DBG_PRINT(" unbuffered op (skipping)\n"); + continue; + } + else { + /* Pointers currently never change here. */ + ptrs[iop] = buffers[iop]; + NBF_REDUCE_OUTERPTRS(bufferdata)[iop] = ptrs[iop]; + } - /* If we reach here, buffering is required */ - any_buffered = 1; + if (!(op_itflags[iop]&NPY_OP_ITFLAG_READ)) { + NPY_IT_DBG_PRINT(" non-reading op (skipping)\n"); + continue; + } + if (op_itflags[iop]&NPY_OP_ITFLAG_BUF_REUSABLE + && prev_dataptrs && prev_dataptrs[iop] == ad_ptrs[iop]) { + NPY_IT_DBG_PRINT2("Iterator: skipping operands %d " + "copy (%d items) because the data pointer didn't change\n", + (int)iop, (int)transfersize); + continue; + } + else if (transfersize == NBF_BUFFERSIZE(bufferdata) + || (transfersize >= NBF_CORESIZE(bufferdata) + && op_itflags[iop]&NPY_OP_ITFLAG_REDUCE + && NAD_STRIDES(outer_axisdata)[iop] == 0)) { /* - * If this operand is being reduced in the inner loop, - * set its buffering stride to zero, and just copy - * one element. + * If we have a full copy or a reduce with 0 stride outer and + * a copy larger than the coresize, this is now re-usable. + * NB: With a core-offset, we always copy less than the core-size. */ - if (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) { - if (ad_strides[iop] == 0) { - strides[iop] = 0; - if (reduce_outerstrides[iop] == 0) { - op_transfersize = 1; - dst_stride = 0; - src_strides = &dst_stride; - src_coords = &NAD_INDEX(reduce_outeraxisdata); - src_shape = &NAD_SHAPE(reduce_outeraxisdata); - ndim_transfer = 1; - - /* - * When we're reducing a single element, and - * it's still the same element, don't overwrite - * it even when reuse reduce loops is unset. - * This preserves the precision of the - * intermediate calculation. 
- */ - if (prev_dataptrs && - prev_dataptrs[iop] == ad_ptrs[iop]) { - NPY_IT_DBG_PRINT1("Iterator: skipping operand %d" - " copy because it's a 1-element reduce\n", - (int)iop); - - skip_transfer = 1; - } - } - else { - op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata); - dst_stride = reduce_outerstrides[iop]; - src_strides = &NAD_STRIDES(reduce_outeraxisdata)[iop]; - src_coords = &NAD_INDEX(reduce_outeraxisdata); - src_shape = &NAD_SHAPE(reduce_outeraxisdata); - ndim_transfer = ndim - reduce_outerdim; - } - } - else { - if (reduce_outerstrides[iop] == 0) { - op_transfersize = NBF_SIZE(bufferdata); - dst_stride = strides[iop]; - src_strides = &ad_strides[iop]; - src_coords = &NAD_INDEX(axisdata); - src_shape = &NAD_SHAPE(axisdata); - ndim_transfer = reduce_outerdim ? reduce_outerdim : 1; - } - else { - op_transfersize = transfersize; - dst_stride = strides[iop]; - src_strides = &ad_strides[iop]; - src_coords = &NAD_INDEX(axisdata); - src_shape = &NAD_SHAPE(axisdata); - ndim_transfer = ndim; - } - } - } - else { - op_transfersize = transfersize; - dst_stride = strides[iop]; - src_strides = &ad_strides[iop]; - src_coords = &NAD_INDEX(axisdata); - src_shape = &NAD_SHAPE(axisdata); - ndim_transfer = ndim; - } + NPY_IT_DBG_PRINT(" marking operand %d for buffer reuse\n", iop); + NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_BUF_REUSABLE; + } + else if (prev_dataptrs == NULL || prev_dataptrs[iop] != ad_ptrs[iop] || + bufferdata->coreoffset) { + /* Copying at a different offset, so must unset re-use. */ + NIT_OPITFLAGS(iter)[iop] &= ~NPY_OP_ITFLAG_BUF_REUSABLE; + } - /* - * If the whole buffered loop structure remains the same, - * and the source pointer for this data didn't change, - * we don't have to copy the data again. - */ - if (reuse_reduce_loops && prev_dataptrs[iop] == ad_ptrs[iop]) { - NPY_IT_DBG_PRINT2("Iterator: skipping operands %d " - "copy (%d items) because loops are reused and the data " - "pointer didn't change\n", - (int)iop, (int)op_transfersize); - skip_transfer = 1; - } + npy_intp zero = 0; /* used as coord for 1-D copies */ + int ndim_transfer; + npy_intp op_transfersize; + npy_intp dst_stride; + npy_intp *src_strides; + npy_intp *src_coords = &zero; + npy_intp *src_shape; + npy_intp src_itemsize = PyArray_DTYPE(operands[iop])->elsize; - /* - * Copy data to the buffers if necessary. - * - * We always copy if the operand has references. In that case - * a "write" function must be in use that either copies or clears - * the buffer. - * This write from buffer call does not check for skip-transfer - * so we have to assume the buffer is cleared. For dtypes that - * do not have references, we can assume that the write function - * will leave the source (buffer) unmodified. - */ - if (!skip_transfer || PyDataType_REFCHK(dtypes[iop])) { - NPY_IT_DBG_PRINT2("Iterator: Copying operand %d to " - "buffer (%d items)\n", - (int)iop, (int)op_transfersize); - - if (PyArray_TransferNDimToStrided( - ndim_transfer, ptrs[iop], dst_stride, - ad_ptrs[iop], src_strides, axisdata_incr, - src_coords, axisdata_incr, - src_shape, axisdata_incr, - op_transfersize, src_itemsize, - &transferinfo[iop].read) < 0) { - return -1; - } - } - } - } + npyiter_fill_buffercopy_params(nop, iop, ndim, op_itflags[iop], + transfersize, bufferdata, axisdata, outer_axisdata, + &ndim_transfer, &op_transfersize, &dst_stride, + &src_strides, &src_shape, &src_coords); - /* - * If buffering wasn't needed, we can grow the inner - * loop to as large as possible. - * - * TODO: Could grow REDUCE loop too with some more logic above. 
- */ - if (!any_buffered && (itflags&NPY_ITFLAG_GROWINNER) && - !(itflags&NPY_ITFLAG_REDUCE)) { - if (singlestridesize > transfersize) { - NPY_IT_DBG_PRINT2("Iterator: Expanding inner loop size " - "from %d to %d since buffering wasn't needed\n", - (int)NBF_SIZE(bufferdata), (int)singlestridesize); - NBF_SIZE(bufferdata) = singlestridesize; - NBF_BUFITEREND(bufferdata) = iterindex + singlestridesize; + /* + * Copy data to the buffers if necessary. + * + * We always copy if the operand has references. In that case + * a "write" function must be in use that either copies or clears + * the buffer. + * This write from buffer call does not check for skip-transfer + * so we have to assume the buffer is cleared. For dtypes that + * do not have references, we can assume that the write function + * will leave the source (buffer) unmodified. + */ + NPY_IT_DBG_PRINT( + "Iterator: Copying operand %d to buffer (%zd items):\n" + " transfer ndim: %d, inner stride: %zd, inner shape: %zd, buffer stride: %zd\n", + iop, op_transfersize, ndim_transfer, src_strides[0], src_shape[0], dst_stride); + + if (PyArray_TransferNDimToStrided( + ndim_transfer, ptrs[iop], dst_stride, + ad_ptrs[iop], src_strides, axisdata_incr, + src_coords, axisdata_incr, + src_shape, axisdata_incr, + op_transfersize, src_itemsize, + &transferinfo[iop].read) < 0) { + return -1; } } - NPY_IT_DBG_PRINT1("Any buffering needed: %d\n", any_buffered); - NPY_IT_DBG_PRINT1("Iterator: Finished copying inputs to buffers " - "(buffered size is %d)\n", (int)NBF_SIZE(bufferdata)); + "(buffered size is %zd)\n", transfersize); return 0; } @@ -2631,13 +2278,16 @@ npyiter_clear_buffers(NpyIter *iter) PyArray_Descr **dtypes = NIT_DTYPES(iter); npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); for (int iop = 0; iop < nop; ++iop, ++buffers) { - if (transferinfo[iop].clear.func == NULL || - !(op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { + if (transferinfo[iop].clear.func == NULL) { continue; } if (*buffers == 0) { continue; } + assert(!(op_itflags[iop]&NPY_OP_ITFLAG_BUFNEVER)); + /* Buffer cannot be re-used (not that we should ever try!) */ + op_itflags[iop] &= ~NPY_OP_ITFLAG_BUF_REUSABLE; + int itemsize = dtypes[iop]->elsize; if (transferinfo[iop].clear.func(NULL, dtypes[iop], *buffers, NBF_SIZE(bufferdata), itemsize, @@ -2652,236 +2302,6 @@ npyiter_clear_buffers(NpyIter *iter) } -/* - * This checks how much space can be buffered without encountering the - * same value twice, or for operands whose innermost stride is zero, - * without encountering a different value. By reducing the buffered - * amount to this size, reductions can be safely buffered. - * - * Reductions are buffered with two levels of looping, to avoid - * frequent copying to the buffers. The return value is the over-all - * buffer size, and when the flag NPY_ITFLAG_REDUCE is set, reduce_innersize - * receives the size of the inner of the two levels of looping. - * - * The value placed in reduce_outerdim is the index into the AXISDATA - * for where the second level of the double loop begins. - * - * The return value is always a multiple of the value placed in - * reduce_innersize. 
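/*
 * For orientation, a minimal consumer of the buffering machinery above
 * (illustration only, using just public C-API calls; `sum_doubles` is a
 * made-up name and a single operand of type double is assumed).  Each
 * iternext() call exposes one buffered core, or a grown inner loop when
 * GROWINNER applies.
 */
#include <numpy/arrayobject.h>

static int
sum_doubles(NpyIter *iter, double *result)
{
    /* Assumes iter was built with NPY_ITER_BUFFERED | NPY_ITER_EXTERNAL_LOOP. */
    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
    char **dataptr = NpyIter_GetDataPtrArray(iter);
    npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter);
    npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);

    if (iternext == NULL) {
        return -1;
    }
    *result = 0.0;
    do {
        char *data = dataptr[0];
        npy_intp stride = strideptr[0];
        /* One buffered "core" (or grown inner loop) per iternext() call. */
        for (npy_intp i = 0; i < *sizeptr; i++, data += stride) {
            *result += *(double *)data;
        }
    } while (iternext(iter));
    return 0;
}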
- */ -static npy_intp -npyiter_checkreducesize(NpyIter *iter, npy_intp count, - npy_intp *reduce_innersize, - npy_intp *reduce_outerdim) -{ - npy_uint32 itflags = NIT_ITFLAGS(iter); - int idim, ndim = NIT_NDIM(iter); - int iop, nop = NIT_NOP(iter); - - NpyIter_AxisData *axisdata; - npy_intp sizeof_axisdata; - npy_intp coord, shape, *strides; - npy_intp reducespace = 1, factor; - npy_bool nonzerocoord; - - npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); - char stride0op[NPY_MAXARGS]; - - /* Default to no outer axis */ - *reduce_outerdim = 0; - - /* If there's only one dimension, no need to calculate anything */ - if (ndim == 1 || count == 0) { - *reduce_innersize = count; - return count; - } - - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); - axisdata = NIT_AXISDATA(iter); - - /* Indicate which REDUCE operands have stride 0 in the inner loop */ - strides = NAD_STRIDES(axisdata); - for (iop = 0; iop < nop; ++iop) { - stride0op[iop] = (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) && - (strides[iop] == 0); - NPY_IT_DBG_PRINT2("Iterator: Operand %d has stride 0 in " - "the inner loop? %d\n", iop, (int)stride0op[iop]); - } - shape = NAD_SHAPE(axisdata); - coord = NAD_INDEX(axisdata); - reducespace += (shape-coord-1); - factor = shape; - NIT_ADVANCE_AXISDATA(axisdata, 1); - - /* Initialize nonzerocoord based on the first coordinate */ - nonzerocoord = (coord != 0); - - /* Go forward through axisdata, calculating the space available */ - for (idim = 1; idim < ndim && reducespace < count; - ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - NPY_IT_DBG_PRINT2("Iterator: inner loop reducespace %d, count %d\n", - (int)reducespace, (int)count); - - strides = NAD_STRIDES(axisdata); - for (iop = 0; iop < nop; ++iop) { - /* - * If a reduce stride switched from zero to non-zero, or - * vice versa, that's the point where the data will stop - * being the same element or will repeat, and if the - * buffer starts with an all zero multi-index up to this - * point, gives us the reduce_innersize. - */ - if((stride0op[iop] && (strides[iop] != 0)) || - (!stride0op[iop] && - (strides[iop] == 0) && - (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE))) { - NPY_IT_DBG_PRINT1("Iterator: Reduce operation limits " - "buffer to %d\n", (int)reducespace); - /* - * If we already found more elements than count, or - * the starting coordinate wasn't zero, the two-level - * looping is unnecessary/can't be done, so return. - */ - if (count <= reducespace) { - *reduce_innersize = count; - NIT_ITFLAGS(iter) |= NPY_ITFLAG_REUSE_REDUCE_LOOPS; - return count; - } - else if (nonzerocoord) { - if (reducespace < count) { - count = reducespace; - } - *reduce_innersize = count; - /* NOTE: This is similar to the (coord != 0) case below. */ - NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REUSE_REDUCE_LOOPS; - return count; - } - else { - *reduce_innersize = reducespace; - break; - } - } - } - /* If we broke out of the loop early, we found reduce_innersize */ - if (iop != nop) { - NPY_IT_DBG_PRINT2("Iterator: Found first dim not " - "reduce (%d of %d)\n", iop, nop); - break; - } - - shape = NAD_SHAPE(axisdata); - coord = NAD_INDEX(axisdata); - if (coord != 0) { - nonzerocoord = 1; - } - reducespace += (shape-coord-1) * factor; - factor *= shape; - } - - /* - * If there was any non-zero coordinate, the reduction inner - * loop doesn't fit in the buffersize, or the reduction inner loop - * covered the entire iteration size, can't do the double loop. 
- */ - if (nonzerocoord || count < reducespace || idim == ndim) { - if (reducespace < count) { - count = reducespace; - } - *reduce_innersize = count; - /* In this case, we can't reuse the reduce loops */ - NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REUSE_REDUCE_LOOPS; - return count; - } - - coord = NAD_INDEX(axisdata); - if (coord != 0) { - /* - * In this case, it is only safe to reuse the buffer if the amount - * of data copied is not more than the current axes, as is the - * case when reuse_reduce_loops was active already. - * It should be in principle OK when the idim loop returns immediately. - */ - NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REUSE_REDUCE_LOOPS; - } - else { - /* In this case, we can reuse the reduce loops */ - NIT_ITFLAGS(iter) |= NPY_ITFLAG_REUSE_REDUCE_LOOPS; - } - - *reduce_innersize = reducespace; - count /= reducespace; - - NPY_IT_DBG_PRINT2("Iterator: reduce_innersize %d count /ed %d\n", - (int)reducespace, (int)count); - - /* - * Continue through the rest of the dimensions. If there are - * two separated reduction axes, we may have to cut the buffer - * short again. - */ - *reduce_outerdim = idim; - reducespace = 1; - factor = 1; - /* Indicate which REDUCE operands have stride 0 at the current level */ - strides = NAD_STRIDES(axisdata); - for (iop = 0; iop < nop; ++iop) { - stride0op[iop] = (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) && - (strides[iop] == 0); - NPY_IT_DBG_PRINT2("Iterator: Operand %d has stride 0 in " - "the outer loop? %d\n", iop, (int)stride0op[iop]); - } - shape = NAD_SHAPE(axisdata); - reducespace += (shape-coord-1) * factor; - factor *= shape; - NIT_ADVANCE_AXISDATA(axisdata, 1); - ++idim; - - for (; idim < ndim && reducespace < count; - ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - NPY_IT_DBG_PRINT2("Iterator: outer loop reducespace %d, count %d\n", - (int)reducespace, (int)count); - strides = NAD_STRIDES(axisdata); - for (iop = 0; iop < nop; ++iop) { - /* - * If a reduce stride switched from zero to non-zero, or - * vice versa, that's the point where the data will stop - * being the same element or will repeat, and if the - * buffer starts with an all zero multi-index up to this - * point, gives us the reduce_innersize. - */ - if((stride0op[iop] && (strides[iop] != 0)) || - (!stride0op[iop] && - (strides[iop] == 0) && - (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE))) { - NPY_IT_DBG_PRINT1("Iterator: Reduce operation limits " - "buffer to %d\n", (int)reducespace); - /* - * This terminates the outer level of our double loop. 
- */ - if (count <= reducespace) { - return count * (*reduce_innersize); - } - else { - return reducespace * (*reduce_innersize); - } - } - } - - shape = NAD_SHAPE(axisdata); - coord = NAD_INDEX(axisdata); - if (coord != 0) { - nonzerocoord = 1; - } - reducespace += (shape-coord-1) * factor; - factor *= shape; - } - - if (reducespace < count) { - count = reducespace; - } - return count * (*reduce_innersize); -} - NPY_NO_EXPORT npy_bool npyiter_has_writeback(NpyIter *iter) { diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index d906e37e5dff..41060f6b206e 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -99,6 +99,8 @@ npyiter_get_priority_subtype(int nop, PyArrayObject **op, static int npyiter_allocate_transfer_functions(NpyIter *iter); +static void +npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize); /*NUMPY_API * Allocate a new iterator for multiple array objects, and advanced @@ -253,28 +255,6 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags, NPY_IT_TIME_POINT(c_fill_axisdata); - if (itflags & NPY_ITFLAG_BUFFER) { - /* - * If buffering is enabled and no buffersize was given, use a default - * chosen to be big enough to get some amortization benefits, but - * small enough to be cache-friendly. - */ - if (buffersize <= 0) { - buffersize = NPY_BUFSIZE; - } - /* No point in a buffer bigger than the iteration size */ - if (buffersize > NIT_ITERSIZE(iter)) { - buffersize = NIT_ITERSIZE(iter); - } - NBF_BUFFERSIZE(bufferdata) = buffersize; - - /* - * Initialize for use in FirstVisit, which may be called before - * the buffers are filled and the reduce pos is updated. - */ - NBF_REDUCE_POS(bufferdata) = 0; - } - /* * If an index was requested, compute the strides for it. * Note that we must do this before changing the order of the @@ -472,14 +452,22 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags, } } - /* If buffering is set without delayed allocation */ + /* If buffering is set prepare it */ if (itflags & NPY_ITFLAG_BUFFER) { + npyiter_find_buffering_setup(iter, buffersize); + + /* + * Initialize for use in FirstVisit, which may be called before + * the buffers are filled and the reduce pos is updated. + */ + NBF_REDUCE_POS(bufferdata) = 0; + if (!npyiter_allocate_transfer_functions(iter)) { NpyIter_Deallocate(iter); return NULL; } if (!(itflags & NPY_ITFLAG_DELAYBUF)) { - /* Allocate the buffers */ + /* Allocate the buffers if that is not delayed */ if (!npyiter_allocate_buffers(iter, NULL)) { NpyIter_Deallocate(iter); return NULL; @@ -1006,6 +994,10 @@ npyiter_check_per_op_flags(npy_uint32 op_flags, npyiter_opitflags *op_itflags) *op_itflags |= NPY_OP_ITFLAG_VIRTUAL; } + if (op_flags & NPY_ITER_CONTIG) { + *op_itflags |= NPY_OP_ITFLAG_CONTIG; + } + return 1; } @@ -1931,6 +1923,416 @@ operand_different_than_broadcast: { } } + +/* + * At this point we (presumably) use a buffered iterator and here we want + * to find out the best way to buffer the iterator in a fashion that we don't + * have to figure out a lot of things on every outer iteration. + * + * How do we iterate? + * ------------------ + * There are currently two modes of "buffered" iteration: + * 1. The normal mode, where we either buffer each operand or not and + * then do a 1-D loop on those buffers (or operands). + * 2. The "reduce" mode. 
In reduce mode (ITFLAG_REDUCE) we internally use a
+ *    double iteration where for "reduce" operands we have:
+ *    - One outer iteration with stride == 0 and a core with at least one
+ *      stride != 0 (all of them if this is a true reduce/writeable operand).
+ *    - One outer iteration with stride != 0 and a core of all strides == 0.
+ *    This setup allows filling the buffer with only the stride != 0 and then
+ *    doing the double loop.
+ *    An example for these two cases is:
+ *        arr = np.ones((100, 10, 10))[::2, :, :]
+ *        arr.sum(-1)
+ *        arr.sum(-2)
+ *    Where the slice prevents the iterator from collapsing axes and the
+ *    result has stride 0 either along the last or the second to last axis.
+ *    In both cases we can buffer 10x10 elements in reduce mode.
+ *    (This iteration needs no buffer; add a cast to ensure actual buffering.)
+ *
+ * Only writeable (reduce) operands require this reduce mode, because for
+ * reading it is OK if the buffer holds duplicated elements.
+ * The benefit of the reduce mode is that it allows for larger core sizes and
+ * buffers since the zero strides do not allow a single 1-D iteration.
+ * If we use reduce mode, we can apply it also to read-only operands as an
+ * optimization.
+ *
+ * The function here finds the first "outer" dimension and its "core" to use
+ * in a way that works with reductions.
+ * While iterating, we will fill the buffers making sure that we:
+ * - Never buffer beyond the first outer dimension (to optimize the chance
+ *   of re-use).
+ * - If the iterator is manually set to an offset into what is part of the
+ *   core (see second example below), then we only fill the buffer to finish
+ *   that one core.  This re-aligns us with the core and is necessary for
+ *   reductions.  (Such manual setting should be rare or happen exactly once
+ *   for splitting the iteration into worker chunks.)
+ *
+ * Examples for these two constraints:
+ * Given an iteration shape of (100, 10, 10) and a core size of 10 with a
+ * buffer size of 60 (due to limits), making dimension 1 the "outer" one,
+ * the first iterations/buffers would then range (excluding end-point):
+ * - (0, 0, 0) -> (0, 6, 0)
+ * - (0, 6, 0) -> (1, 0, 0)  # Buffer only holds 40 of 60 possible elements.
+ * - (1, 0, 0) -> (1, 6, 0)
+ * - ...
+ * If the user limits the range to start from 75, we use:
+ * - (0, 7, 5) -> (0, 8, 0)  # Only 5 elements to re-align with the core.
+ * - (0, 8, 0) -> (1, 0, 0)
+ * - ...  # continue as above
+ *
+ * This means that the data stored in the buffer always has the same structure
+ * (except when manually moved), which allows us to fill the buffer more simply
+ * and optimally in some cases, and makes it easier to determine whether buffer
+ * content is re-usable (e.g., because it represents broadcast operands).
+ *
+ * Best buffer and core size
+ * -------------------------
+ * To avoid having to figure out what to copy every time we fill buffers,
+ * we here want to find the outer iteration dimension such that:
+ * - its core size is <= the maximum buffersize if buffering is needed;
+ * - reductions are possible (with or without reduce mode);
+ * - iteration overhead is minimized.  We estimate the total overhead with
+ *   the number of "outer" iterations:
+ *
+ *     N_o = full_iterator_size / min(core_size * outer_dim_size, buffersize)
+ *
+ * This is approximately how often `iternext()` is called when the user
+ * is using an external-loop, and how often we would fill buffers.
+ * The total overhead is then estimated as:
+ *
+ *     (1 + n_buffers) * N_o
+ *
+ * Since the iterator size is a constant, we can estimate the overhead as:
+ *
+ *     (1 + n_buffers) / min(core_size * outer_dim_size, buffersize)
+ *
+ * When comparing two options, we multiply by the other's divisor/size to
+ * avoid the division.
+ *
+ * TODO: Probably should tweak or simplify?  The formula is clearly not
+ * the actual cost (buffers add a constant total cost as well).
+ * Right now, it mostly rejects growing the core size when we are already
+ * close to the maximum buffersize (even overhead-wise not worth it).
+ * That may be good enough, but maybe it can be spelled more simply?
+ *
+ * In theory, the reduction could also span multiple axes if other operands
+ * are buffered.  We do not try to discover this.
+ */
+static void
+npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize)
+{
+    int nop = iter->nop;
+    int ndim = iter->ndim;
+    npy_uint32 itflags = iter->itflags;
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+
+    /*
+     * We check two things here: first, how many operand dimensions can be
+     * iterated using a single stride (all dimensions are consistent), and
+     * second, whether we found a reduce dimension for the operand.
+     * That is, an outer dimension along which a reduce would have to take
+     * place.
+     */
+    int op_single_stride_dims[NPY_MAXARGS];
+    int op_reduce_outer_dim[NPY_MAXARGS];
+
+    npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+    npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter);
+
+    /*
+     * We can only continue as long as we are within the maximum allowed size.
+     * When no buffering is needed and GROWINNER is set, we don't have to
+     * worry about this maximum.
+     *
+     * If the user passed no buffersize, default to one small enough that it
+     * should be cache friendly and big enough to amortize overheads.
+     */
+    npy_intp maximum_size = buffersize <= 0 ? NPY_BUFSIZE : buffersize;
+
+    /* The cost factor defined by: (1 + n_buffered) */
+    int cost = 1;
+
+    for (int iop = 0; iop < nop; ++iop) {
+        op_single_stride_dims[iop] = 1;
+        op_reduce_outer_dim[iop] = 0;
+        if (op_itflags[iop] & NPY_OP_ITFLAG_CAST) {
+            cost += 1;
+        }
+    }
+
+    /*
+     * Once a reduce operand reaches a ==0/!=0 stride flip, this dimension
+     * becomes the outer reduce dimension.
+     */
+    int outer_reduce_dim = 0;
+
+    npy_intp size = axisdata->shape;  /* the current total size */
+
+    /* Note that there is always one axisdata that we use (even with ndim == 0) */
+    int best_dim = 0;
+    int best_cost = cost;
+    /* The size of the "outer" iteration and all previous dimensions: */
+    npy_intp best_size = size;
+    npy_intp best_coresize = 1;
+
+    NPY_IT_DBG_PRINT("Iterator: discovering best core size\n");
+    for (int idim = 1; idim < ndim; idim++) {
+        if (outer_reduce_dim) {
+            /* Cannot currently expand beyond the reduce dim! */
+            break;
+        }
+        if (size >= maximum_size &&
+                (cost > 1 || !(itflags & NPY_ITFLAG_GROWINNER))) {
+            /* Exceeded buffer size; can only improve without buffers and growinner. */
+            break;
+        }
+
+        npy_intp *prev_strides = NAD_STRIDES(axisdata);
+        npy_intp prev_shape = NAD_SHAPE(axisdata);
+        NIT_ADVANCE_AXISDATA(axisdata, 1);
+        npy_intp *strides = NAD_STRIDES(axisdata);
+
+        for (int iop = 0; iop < nop; iop++) {
+            /* Check that we set things up nicely (if shape is ever 1) */
+            assert((axisdata->shape == 1) ? (prev_strides[iop] == strides[iop]) : 1);
+
+            if (op_single_stride_dims[iop] == idim) {
+                /* Best case: the strides still collapse for this operand. 
*/ + if (prev_strides[iop] * prev_shape == strides[iop]) { + op_single_stride_dims[iop] += 1; + continue; + } + + /* + * Operand now requires buffering (if it was not already). + * NOTE: This is technically not true since we may still use + * an outer reduce at this point. + * So it prefers a non-reduce setup, which seems not + * ideal, but OK. + */ + if (!(op_itflags[iop] & NPY_OP_ITFLAG_CAST)) { + cost += 1; + } + } + + /* + * If this operand is a reduction operand and the stride swapped + * between !=0 and ==0 then this is the `outer_reduce_dim` and + * we will never continue further (see break at start of op loop). + */ + if ((op_itflags[iop] & NPY_OP_ITFLAG_REDUCE) + && (strides[iop] == 0 || prev_strides[iop] == 0)) { + assert(outer_reduce_dim == 0 || outer_reduce_dim == idim); + op_reduce_outer_dim[iop] = idim; + outer_reduce_dim = idim; + } + /* For clarity: op_reduce_outer_dim[iop] if set always matches. */ + assert(!op_reduce_outer_dim[iop] || op_reduce_outer_dim[iop] == outer_reduce_dim); + } + + npy_intp coresize = size; /* if we iterate here, this is the core */ + size *= axisdata->shape; + if (size == 0) { + break; /* Avoid a zero coresize. */ + } + + double bufsize = size; + if (bufsize > maximum_size && + (cost > 1 || !(itflags & NPY_ITFLAG_GROWINNER))) { + /* If we need buffering, limit size in cost calculation. */ + bufsize = maximum_size; + } + + NPY_IT_DBG_PRINT(" dim=%d, n_buffered=%d, cost=%g @bufsize=%g (prev scaled cost=%g)\n", + idim, cost - 1, cost * (double)best_size, bufsize, best_cost * bufsize); + + /* + * Compare cost (use double to avoid overflows), as explained above + * the cost is compared via the other buffersize. + */ + if (cost * (double)best_size <= best_cost * bufsize) { + /* This dimension is better! */ + best_cost = cost; + best_coresize = coresize; + best_size = size; + best_dim = idim; + } + } + + npy_bool using_reduce = outer_reduce_dim && (best_dim == outer_reduce_dim); + npy_bool iterator_must_buffer = 0; + + /* We found the best chunking store the information */ + assert(best_coresize != 0); + NIT_BUFFERDATA(iter)->coresize = best_coresize; + NIT_BUFFERDATA(iter)->outerdim = best_dim; + + /* + * We found the best dimensions to iterate on and now need to fill + * in all the buffer information related to the iteration. + * This includes filling in information about reduce outer dims + * (we do this even if it is not a reduce for simplicity). + */ + axisdata = NIT_AXISDATA(iter); + NpyIter_AxisData *reduce_axisdata = NIT_INDEX_AXISDATA(axisdata, outer_reduce_dim); + + NPY_IT_DBG_PRINT("Iterator: Found core size=%zd, outer=%zd at dim=%d:\n", + best_coresize, reduce_axisdata->shape, best_dim); + + /* If we are not using a reduce axes mark it and shrink. */ + if (using_reduce) { + assert(NIT_ITFLAGS(iter) & NPY_ITFLAG_REDUCE); + NPY_IT_DBG_PRINT(" using reduce logic\n"); + } + else { + NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REDUCE; + NPY_IT_DBG_PRINT(" not using reduce logic\n"); + } + + for (int iop = 0; iop < nop; iop++) { + /* We need to fill in the following information */ + npy_bool is_reduce_op; + npy_bool op_is_buffered = (op_itflags[iop]&NPY_OP_ITFLAG_CAST) != 0; + + /* If contig was requested and this is not writeable avoid zero strides */ + npy_bool avoid_zero_strides = ( + (op_itflags[iop] & NPY_OP_ITFLAG_CONTIG) + && !(op_itflags[iop] & NPY_OP_ITFLAG_WRITE)); + + /* + * Figure out if this is iterated as a reduce op. Even one marked + * for reduction may not be iterated as one. 
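+ * (Illustration: in `arr.sum(-1)` from the docstring above, the writeable
+ * output must use reduce semantics, while the read-only input may also be
+ * iterated with reduce semantics purely as an optimization, e.g. when its
+ * outer stride is 0 or it needs no buffering.)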
+         */
+        if (!using_reduce) {
+            is_reduce_op = 0;
+        }
+        else if (op_reduce_outer_dim[iop] == best_dim) {
+            /* This op *must* use reduce semantics. */
+            is_reduce_op = 1;
+        }
+        else if (op_single_stride_dims[iop] == best_dim && !op_is_buffered) {
+            /*
+             * Optimization: This operand is not buffered and we might as well
+             * iterate it as an unbuffered reduce operand.
+             */
+            is_reduce_op = 1;
+        }
+        else if (NAD_STRIDES(reduce_axisdata)[iop] == 0
+                    && op_single_stride_dims[iop] <= best_dim
+                    && !avoid_zero_strides) {
+            /*
+             * Optimization: If the outer (reduce) stride is 0 on the operand,
+             * then we can iterate this in a reduce way: buffer the core only
+             * and repeat it in the "outer" dimension.
+             * If the user requested contig, we may have to avoid 0 strides;
+             * that is incompatible with this reduce path.
+             */
+            is_reduce_op = 1;
+        }
+        else {
+            is_reduce_op = 0;
+        }
+
+        /*
+         * See if the operand can use a single stride (with reduce logic,
+         * we don't need to worry about the outermost dimension).
+         * If it cannot use a single stride, we must buffer the operand.
+         */
+        if (op_single_stride_dims[iop] + is_reduce_op > best_dim) {
+            NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_BUF_SINGLESTRIDE;
+        }
+        else {
+            op_is_buffered = 1;
+        }
+
+        npy_intp inner_stride;
+        npy_intp reduce_outer_stride;
+        if (op_is_buffered) {
+            npy_intp itemsize = NIT_DTYPES(iter)[iop]->elsize;
+            /*
+             * A buffered operand has a stride of itemsize unless we use
+             * reduce logic.  In that case, either the inner or outer stride
+             * is 0.
+             */
+            if (is_reduce_op) {
+                if (NAD_STRIDES(reduce_axisdata)[iop] == 0) {
+                    inner_stride = itemsize;
+                    reduce_outer_stride = 0;
+                }
+                else {
+                    inner_stride = 0;
+                    reduce_outer_stride = itemsize;
+                }
+            }
+            else {
+                if (NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_BUF_SINGLESTRIDE
+                        && NAD_STRIDES(axisdata)[iop] == 0
+                        && !avoid_zero_strides) {
+                    /* This op always has 0 strides, so the buffer does too. */
+                    inner_stride = 0;
+                    reduce_outer_stride = 0;
+                }
+                else {
+                    /* normal buffered op */
+                    inner_stride = itemsize;
+                    reduce_outer_stride = itemsize * best_coresize;
+                }
+            }
+        }
+        else {
+            inner_stride = NAD_STRIDES(axisdata)[iop];
+            reduce_outer_stride = NAD_STRIDES(reduce_axisdata)[iop];
+        }
+
+        if (!using_reduce) {
+            /* Invalidate for now, since we should not use it. */
+            reduce_outer_stride = NPY_MIN_INTP;
+        }
+
+        NPY_IT_DBG_PRINT(
+            "Iterator: op=%d (buffered=%d, reduce=%d, single-stride=%d):\n"
+            "    inner stride: %zd\n"
+            "    reduce outer stride: %zd  (if iterator uses reduce)\n",
+            iop, op_is_buffered, is_reduce_op,
+            (NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_BUF_SINGLESTRIDE) != 0,
+            inner_stride, reduce_outer_stride);
+
+        NBF_STRIDES(bufferdata)[iop] = inner_stride;
+        NBF_REDUCE_OUTERSTRIDES(bufferdata)[iop] = reduce_outer_stride;
+
+        /* The actual reduce usage may have changed! */
+        if (is_reduce_op) {
+            NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_REDUCE;
+        }
+        else {
+            NIT_OPITFLAGS(iter)[iop] &= ~NPY_OP_ITFLAG_REDUCE;
+        }
+
+        if (!op_is_buffered) {
+            NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_BUFNEVER;
+        }
+        else {
+            iterator_must_buffer = 1;
+        }
+    }
+
+    /*
+     * If we buffer or do not have grow-inner, make sure that the size is
+     * at most maximum_size, but a multiple of the coresize.
+     */
+    if (iterator_must_buffer || !(itflags & NPY_ITFLAG_GROWINNER)) {
+        if (maximum_size < best_size) {
+            best_size = best_coresize * (maximum_size / best_coresize);
+        }
+    }
+    NIT_BUFFERDATA(iter)->buffersize = best_size;
+    /*
+     * The core offset starts at 0 initially; if needed, it is set when
+     * seeking (goto index).
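+     * (A seek may land inside a core; the offset then records how far into
+     * the core the iteration starts, leaving a shorter first core.)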
*/ + NIT_BUFFERDATA(iter)->coreoffset = 0; + + return; +} + + /* * Replaces the AXISDATA for the iop'th operand, broadcasting * the dimensions as necessary. Assumes the replacement array is @@ -2688,18 +3090,13 @@ npyiter_allocate_arrays(NpyIter *iter, int **op_axes) { npy_uint32 itflags = NIT_ITFLAGS(iter); - int idim, ndim = NIT_NDIM(iter); + int ndim = NIT_NDIM(iter); int iop, nop = NIT_NOP(iter); int check_writemasked_reductions = 0; - NpyIter_BufferData *bufferdata = NULL; PyArrayObject **op = NIT_OPERANDS(iter); - if (itflags & NPY_ITFLAG_BUFFER) { - bufferdata = NIT_BUFFERDATA(iter); - } - if (flags & NPY_ITER_COPY_IF_OVERLAP) { /* * Perform operand memory overlap checks, if requested. @@ -2868,15 +3265,8 @@ npyiter_allocate_arrays(NpyIter *iter, */ npyiter_replace_axisdata(iter, iop, op[iop], 0, NULL); - /* - * New arrays need no cast, and in the case - * of scalars, always have stride 0 so never need buffering - */ - op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; + /* New arrays need no cast */ op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST; - if (itflags & NPY_ITFLAG_BUFFER) { - NBF_STRIDES(bufferdata)[iop] = 0; - } } /* * Make a temporary copy if, @@ -2951,11 +3341,16 @@ npyiter_allocate_arrays(NpyIter *iter, } /* Here we can finally check for contiguous iteration */ - if (op_flags[iop] & NPY_ITER_CONTIG) { + if (op_itflags[iop] & NPY_OP_ITFLAG_CONTIG) { NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); npy_intp stride = NAD_STRIDES(axisdata)[iop]; if (stride != op_dtype[iop]->elsize) { + /* + * Need to copy to buffer (cast) to ensure contiguous + * NOTE: This is the wrong place in case of axes reorder + * (there is an xfailing test for this). + */ NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST " "because of NPY_ITER_CONTIG\n"); op_itflags[iop] |= NPY_OP_ITFLAG_CAST; @@ -2968,63 +3363,6 @@ npyiter_allocate_arrays(NpyIter *iter, } } } - - /* - * If no alignment, byte swap, or casting is needed, - * the inner stride of this operand works for the whole - * array, we can set NPY_OP_ITFLAG_BUFNEVER. - */ - if ((itflags & NPY_ITFLAG_BUFFER) && - !(op_itflags[iop] & NPY_OP_ITFLAG_CAST)) { - NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); - if (ndim <= 1) { - op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; - NBF_STRIDES(bufferdata)[iop] = NAD_STRIDES(axisdata)[iop]; - } - else if (PyArray_NDIM(op[iop]) > 0) { - npy_intp stride, shape, innerstride = 0, innershape; - npy_intp sizeof_axisdata = - NIT_AXISDATA_SIZEOF(itflags, ndim, nop); - /* Find stride of the first non-empty shape */ - for (idim = 0; idim < ndim; ++idim) { - innershape = NAD_SHAPE(axisdata); - if (innershape != 1) { - innerstride = NAD_STRIDES(axisdata)[iop]; - break; - } - NIT_ADVANCE_AXISDATA(axisdata, 1); - } - ++idim; - NIT_ADVANCE_AXISDATA(axisdata, 1); - /* Check that everything could have coalesced together */ - for (; idim < ndim; ++idim) { - stride = NAD_STRIDES(axisdata)[iop]; - shape = NAD_SHAPE(axisdata); - if (shape != 1) { - /* - * If N times the inner stride doesn't equal this - * stride, the multi-dimensionality is needed. - */ - if (innerstride*innershape != stride) { - break; - } - else { - innershape *= shape; - } - } - NIT_ADVANCE_AXISDATA(axisdata, 1); - } - /* - * If we looped all the way to the end, one stride works. - * Set that stride, because it may not belong to the first - * dimension. 
-                 */
-                if (idim == ndim) {
-                    op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER;
-                    NBF_STRIDES(bufferdata)[iop] = innerstride;
-                }
-            }
-        }
     }
 
     if (check_writemasked_reductions) {
@@ -3095,18 +3433,28 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
     npy_intp *strides = NAD_STRIDES(axisdata), op_stride;
     NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata);
 
+    npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+    NpyIter_AxisData *reduce_axisdata = NIT_INDEX_AXISDATA(axisdata, bufferdata->outerdim);
+    npy_intp *reduce_strides = NAD_STRIDES(reduce_axisdata);
+
     /* combined cast flags, the new cast flags for each cast: */
     NPY_ARRAYMETHOD_FLAGS cflags = PyArrayMethod_MINIMAL_FLAGS;
     NPY_ARRAYMETHOD_FLAGS nc_flags;
 
     for (iop = 0; iop < nop; ++iop) {
         npyiter_opitflags flags = op_itflags[iop];
+
         /*
-         * Reduction operands may be buffered with a different stride,
-         * so we must pass NPY_MAX_INTP to the transfer function factory.
+         * Reduce operands buffer the outer stride if it is nonzero; compare
+         * `npyiter_fill_buffercopy_params`.
+         * (If the outer stride is zero, the inner strides cannot all be
+         * zero as well, though some may be.)
          */
-        op_stride = (flags & NPY_OP_ITFLAG_REDUCE) ? NPY_MAX_INTP :
-                                                     strides[iop];
+        if ((op_itflags[iop] & NPY_OP_ITFLAG_REDUCE) && reduce_strides[iop] != 0) {
+            op_stride = reduce_strides[iop];
+        }
+        else {
+            op_stride = strides[iop];
+        }
 
         /*
          * If we have determined that a buffer may be needed,
diff --git a/numpy/_core/src/multiarray/nditer_impl.h b/numpy/_core/src/multiarray/nditer_impl.h
index c907464ec53e..c8ac9e4fcce4 100644
--- a/numpy/_core/src/multiarray/nditer_impl.h
+++ b/numpy/_core/src/multiarray/nditer_impl.h
@@ -55,13 +55,14 @@
 /********** PRINTF DEBUG TRACING **************/
 #define NPY_IT_DBG_TRACING 0
 
+/* TODO: The n-arg macros can be removed; they only existed because C89 had no variadic macros. */
 #if NPY_IT_DBG_TRACING
-#define NPY_IT_DBG_PRINT(s) printf("%s", s)
-#define NPY_IT_DBG_PRINT1(s, p1) printf(s, p1)
-#define NPY_IT_DBG_PRINT2(s, p1, p2) printf(s, p1, p2)
-#define NPY_IT_DBG_PRINT3(s, p1, p2, p3) printf(s, p1, p2, p3)
+#define NPY_IT_DBG_PRINT(...) printf(__VA_ARGS__)
+#define NPY_IT_DBG_PRINT1(s, p1) NPY_IT_DBG_PRINT(s, p1)
+#define NPY_IT_DBG_PRINT2(s, p1, p2) NPY_IT_DBG_PRINT(s, p1, p2)
+#define NPY_IT_DBG_PRINT3(s, p1, p2, p3) NPY_IT_DBG_PRINT(s, p1, p2, p3)
 #else
-#define NPY_IT_DBG_PRINT(s)
+#define NPY_IT_DBG_PRINT(...)
 #define NPY_IT_DBG_PRINT1(s, p1)
 #define NPY_IT_DBG_PRINT2(s, p1, p2)
 #define NPY_IT_DBG_PRINT3(s, p1, p2, p3)
@@ -119,20 +120,27 @@
 #define NPY_OP_ITFLAG_READ 0x0002
 /* The operand needs type conversion/byte swapping/alignment */
 #define NPY_OP_ITFLAG_CAST 0x0004
-/* The operand never needs buffering */
+/* The operand never needs buffering (implies BUF_SINGLESTRIDE) */
 #define NPY_OP_ITFLAG_BUFNEVER 0x0008
+/* Whether buffer filling can use a single stride (ignoring the reduce outer dim, if any) */
+#define NPY_OP_ITFLAG_BUF_SINGLESTRIDE 0x0010
 /* The operand is being reduced */
 #define NPY_OP_ITFLAG_REDUCE 0x0020
 /* The operand is for temporary use, does not have a backing array */
 #define NPY_OP_ITFLAG_VIRTUAL 0x0040
 /* The operand requires masking when copying buffer -> array */
 #define NPY_OP_ITFLAG_WRITEMASKED 0x0080
-/* The operand's data pointer is pointing into its buffer */
-#define NPY_OP_ITFLAG_USINGBUFFER 0x0100
+/*
+ * Whether the buffer is *fully* filled and thus ready for reuse.
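+ * (I.e. the whole core was copied in, so a later pass over the same index
+ * range may skip the copy-into-buffer step.)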
+ * (Must still check that the start pointer matches until the
+ * copy-from-buffer checks run.)
+ */
+#define NPY_OP_ITFLAG_BUF_REUSABLE 0x0100
 /* The operand must be copied (with UPDATEIFCOPY if also ITFLAG_WRITE) */
 #define NPY_OP_ITFLAG_FORCECOPY 0x0200
 /* The operand has temporary data, write it back at dealloc */
 #define NPY_OP_ITFLAG_HAS_WRITEBACK 0x0400
+/* Whether the user requested a contiguous operand */
+#define NPY_OP_ITFLAG_CONTIG 0x0800
 
 /*
  * The data layout of the iterator is fully specified by
@@ -176,7 +184,7 @@ typedef npy_int16 npyiter_opitflags;
         (NPY_PTR_ALIGNED(sizeof(npyiter_opitflags) * nop))
 #define NIT_BUFFERDATA_SIZEOF(itflags, ndim, nop) \
     ((itflags&NPY_ITFLAG_BUFFER) ? ( \
-        (NPY_SIZEOF_PY_INTPTR_T)*(6 + 5*nop) + sizeof(NpyIter_TransferInfo) * nop) : 0)
+        (NPY_SIZEOF_PY_INTPTR_T)*(8 + 5*nop) + sizeof(NpyIter_TransferInfo) * nop) : 0)
 
 /* Byte offsets of the iterator members starting from iter->iter_flexdata */
 #define NIT_PERM_OFFSET() \
@@ -249,7 +257,7 @@ struct NpyIter_TransferInfo_tag {
 struct NpyIter_BufferData_tag {
     npy_intp buffersize, size, bufiterend,
-             reduce_pos, reduce_outersize, reduce_outerdim;
+             reduce_pos, coresize, outersize, coreoffset, outerdim;
     Py_intptr_t bd_flexdata;
 };
 
@@ -257,8 +265,10 @@ struct NpyIter_BufferData_tag {
 #define NBF_SIZE(bufferdata) ((bufferdata)->size)
 #define NBF_BUFITEREND(bufferdata) ((bufferdata)->bufiterend)
 #define NBF_REDUCE_POS(bufferdata) ((bufferdata)->reduce_pos)
-#define NBF_REDUCE_OUTERSIZE(bufferdata) ((bufferdata)->reduce_outersize)
-#define NBF_REDUCE_OUTERDIM(bufferdata) ((bufferdata)->reduce_outerdim)
+#define NBF_CORESIZE(bufferdata) ((bufferdata)->coresize)
+#define NBF_COREOFFSET(bufferdata) ((bufferdata)->coreoffset)
+#define NBF_REDUCE_OUTERSIZE(bufferdata) ((bufferdata)->outersize)
+#define NBF_OUTERDIM(bufferdata) ((bufferdata)->outerdim)
 #define NBF_STRIDES(bufferdata) ( \
         &(bufferdata)->bd_flexdata + 0)
 #define NBF_PTRS(bufferdata) ((char **) \
diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py
index 83aaa0a7795d..527265d4217d 100644
--- a/numpy/_core/tests/test_nditer.py
+++ b/numpy/_core/tests/test_nditer.py
@@ -2320,6 +2320,78 @@ def test_iter_buffering_growinner():
     assert_equal(i[0].size, a.size)
 
 
+@pytest.mark.parametrize("read_or_readwrite", ["readonly", "readwrite"])
+def test_iter_contig_flag_reduce_error(read_or_readwrite):
+    # Test that a non-contiguous operand is rejected without buffering.
+    # NOTE: This is true even for a reduction, where we return a 0-stride
+    # below!
+    with pytest.raises(TypeError, match="Iterator operand required buffering"):
+        it = np.nditer(
+            (np.zeros(()),), flags=["external_loop", "reduce_ok"],
+            op_flags=[(read_or_readwrite, "contig"),], itershape=(10,))
+
+
+@pytest.mark.parametrize("arr", [
+    lambda: np.zeros(()),
+    lambda: np.zeros((20, 1))[::20],
+    lambda: np.zeros((1, 20))[:, ::20]
+    ])
+def test_iter_contig_flag_single_operand_strides(arr):
+    """
+    Tests the strides with the contig flag for both broadcast and
+    non-broadcast operands in 3 cases where the logic is needed:
+    1. When everything has a zero stride, the broadcast op needs to be
+       repeated.
+    2. When the reduce axis is the last axis (first to iterate).
+    3. When the reduce axis is the first axis (last to iterate).
+
+    NOTE: The semantics of the contig flag are not clearly defined when
+    it comes to reduction.  It is unclear whether there are any users.
+    """
+    first_op = np.ones((10, 10))
+    broadcast_op = arr()
+    red_op = arr()
+    # Add a first operand to ensure no axis-reordering and the result shape.
+    iterator = np.nditer(
+        (first_op, broadcast_op, red_op),
+        flags=["external_loop", "reduce_ok", "buffered", "delay_bufalloc"],
+        op_flags=[("readonly", "contig")] * 2 + [("readwrite", "contig")])
+
+    with iterator:
+        iterator.reset()
+        for f, b, r in iterator:
+            # The first operand is contiguous, so we should have a view
+            assert np.shares_memory(f, first_op)
+            # Although broadcast, the second op always has a contiguous stride
+            assert b.strides[0] == 8
+            assert not np.shares_memory(b, broadcast_op)
+            # The reduction has a contiguous stride or a 0 stride
+            if red_op.ndim == 0 or red_op.shape[-1] == 1:
+                assert r.strides[0] == 0
+            else:
+                # The stride is 8, although it was not originally:
+                assert r.strides[0] == 8
+            # If the reduce stride is 0, buffering makes no difference, but we
+            # do it anyway right now:
+            assert not np.shares_memory(r, red_op)
+
+
+@pytest.mark.xfail(reason="The contig flag was always buggy.")
+def test_iter_contig_flag_incorrect():
+    # This case does the wrong thing...
+    iterator = np.nditer(
+        (np.ones((10, 10)).T, np.ones((1, 10))),
+        flags=["external_loop", "reduce_ok", "buffered", "delay_bufalloc"],
+        op_flags=[("readonly", "contig")] * 2)
+
+    with iterator:
+        iterator.reset()
+        for a, b in iterator:
+            # Keep only the strides of a and b (pytest may otherwise try to
+            # format the arrays on failure):
+            a, b = a.strides, b.strides
+            assert a == (8,)
+            assert b == (8,)  # should be (8,) but is (0,) due to axis reorder
+
+
 @pytest.mark.slow
 def test_iter_buffered_reduce_reuse():
     # large enough array for all views, including negative strides.
@@ -3296,7 +3368,7 @@ def test_debug_print(capfd):
     expected = """
 ------ BEGIN ITERATOR DUMP ------
 | Iterator Address:
-| ItFlags: BUFFER REDUCE REUSE_REDUCE_LOOPS
+| ItFlags: BUFFER REDUCE
 | NDim: 2
 | NOp: 2
 | IterSize: 50
@@ -3322,6 +3394,7 @@ def test_debug_print(capfd):
 | BufferSize: 50
 | Size: 5
 | BufIterEnd: 5
+| BUFFER CoreSize: 5
 | REDUCE Pos: 0
 | REDUCE OuterSize: 10
 | REDUCE OuterDim: 1
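
For illustration, a minimal Python sketch (not part of the patch) of the core-size heuristic documented above npyiter_find_buffering_setup: overhead is modeled as (1 + n_buffers) / min(size, buffersize), and two candidate core sizes are compared by cross-multiplying with the other's divisor to avoid the division. The name pick_core_dim and its inputs are hypothetical simplifications of what the C loop tracks incrementally.

    # Illustrative model (not NumPy source) of the cost comparison above.
    def pick_core_dim(shapes, cost_per_dim, buffersize):
        # shapes: axisdata shapes, innermost first.
        # cost_per_dim[i]: 1 + number of buffered operands if the core
        # spans dimensions 0..i (the C code updates this while walking).
        best_dim, best_cost, best_size = 0, cost_per_dim[0], shapes[0]
        size = shapes[0]
        for idim in range(1, len(shapes)):
            size *= shapes[idim]
            cost = cost_per_dim[idim]
            # A buffered candidate can use at most `buffersize` at once.
            bufsize = min(size, buffersize) if cost > 1 else size
            # Compare cost/bufsize <= best_cost/best_size, cross-multiplied:
            if cost * best_size <= best_cost * bufsize:
                best_dim, best_cost, best_size = idim, cost, size
        return best_dim

    # Growing to 10_000 elements is worth one extra buffer copy:
    assert pick_core_dim([100, 100], [1, 2], 8192) == 1
    # ... but not when the inner core already nearly fills the buffer:
    assert pick_core_dim([5000, 100], [1, 2], 8192) == 0

The second assert shows the rejection the TODO above mentions: once the core is already close to the maximum buffersize, paying for an extra buffered copy no longer reduces the modeled overhead.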