8000 BUG: Don't convert inputs to `np.float64` in digitize · eric-wieser/numpy@307dd76 · GitHub
[go: up one dir, main page]

Skip to content

Commit 307dd76

Browse files
committed
BUG: Don't convert inputs to np.float64 in digitize
This converts digitize to a pure-python function that falls back on searchsorted. Performance doesn't really matter here anyway - if you care about performance, then you should just call searchsorted directly, rather than checking the order of the bins. Partially fixes numpygh-11022
1 parent 7394436 commit 307dd76

File tree

6 files changed

+156
-186
lines changed

6 files changed

+156
-186
lines changed

numpy/core/_add_newdocs.py

Lines changed: 0 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -5296,99 +5296,6 @@ def luf(lamdaexpr, *args, **kwargs):
52965296
#
52975297
##############################################################################
52985298

5299-
add_newdoc('numpy.core.multiarray', 'digitize',
5300-
"""
5301-
digitize(x, bins, right=False)
5302-
5303-
Return the indices of the bins to which each value in input array belongs.
5304-
5305-
========= ============= ============================
5306-
`right` order of bins returned index `i` satisfies
5307-
========= ============= ============================
5308-
``False`` increasing ``bins[i-1] <= x < bins[i]``
5309-
``True`` increasing ``bins[i-1] < x <= bins[i]``
5310-
``False`` decreasing ``bins[i-1] > x >= bins[i]``
5311-
``True`` decreasing ``bins[i-1] >= x > bins[i]``
5312-
========= ============= ============================
5313-
5314-
If values in `x` are beyond the bounds of `bins`, 0 or ``len(bins)`` is
5315-
returned as appropriate.
5316-
5317-
Parameters
5318-
----------
5319-
x : array_like
5320-
Input array to be binned. Prior to NumPy 1.10.0, this array had to
5321-
be 1-dimensional, but can now have any shape.
5322-
bins : array_like
5323-
Array of bins. It has to be 1-dimensional and monotonic.
5324-
right : bool, optional
5325-
Indicating whether the intervals include the right or the left bin
5326-
edge. Default behavior is (right==False) indicating that the interval
5327-
does not include the right edge. The left bin end is open in this
5328-
case, i.e., bins[i-1] <= x < bins[i] is the default behavior for
5329-
monotonically increasing bins.
5330-
5331-
Returns
5332-
-------
5333-
indices : ndarray of ints
5334-
Output array of indices, of same shape as `x`.
5335-
5336-
Raises
5337-
------
5338-
ValueError
5339-
If `bins` is not monotonic.
5340-
TypeError
5341-
If the type of the input is complex.
5342-
5343-
See Also
5344-
--------
5345-
bincount, histogram, unique, searchsorted
5346-
5347-
Notes
5348-
-----
5349-
If values in `x` are such that they fall outside the bin range,
5350-
attempting to index `bins` with the indices that `digitize` returns
5351-
will result in an IndexError.
5352-
5353-
.. versionadded:: 1.10.0
5354-
5355-
`np.digitize` is implemented in terms of `np.searchsorted`. This means
5356-
that a binary search is used to bin the values, which scales much bette 10000 r
5357-
for larger number of bins than the previous linear search. It also removes
5358-
the requirement for the input array to be 1-dimensional.
5359-
5360-
For monotonically _increasing_ `bins`, the following are equivalent::
5361-
5362-
np.digitize(x, bins, right=True)
5363-
np.searchsorted(bins, x, side='left')
5364-
5365-
Note that as the order of the arguments are reversed, the side must be too.
5366-
The `searchsorted` call is marginally faster, as it does not do any
5367-
monotonicity checks. Perhaps more importantly, it supports all dtypes.
5368-
5369-
Examples
5370-
--------
5371-
>>> x = np.array([0.2, 6.4, 3.0, 1.6])
5372-
>>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0])
5373-
>>> inds = np.digitize(x, bins)
5374-
>>> inds
5375-
array([1, 4, 3, 2])
5376-
>>> for n in range(x.size):
5377-
... print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]])
5378-
...
5379-
0.0 <= 0.2 < 1.0
5380-
4.0 <= 6.4 < 10.0
5381-
2.5 <= 3.0 < 4.0
5382-
1.0 <= 1.6 < 2.5
5383-
5384-
>>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.])
5385-
>>> bins = np.array([0, 5, 10, 15, 20])
5386-
>>> np.digitize(x,bins,right=True)
5387-
array([1, 2, 3, 4, 4])
5388-
>>> np.digitize(x,bins,right=False)
5389-
array([1, 3, 3, 4, 5])
5390-
""")
5391-
53925299
add_newdoc('numpy.core.multiarray', 'bincount',
53935300
"""
53945301
bincount(x, weights=None, minlength=0)

numpy/core/src/multiarray/compiled_base.c

Lines changed: 31 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,17 @@
2121
* and 0 if the array is not monotonic.
2222
*/
2323
static int
24-
check_array_monotonic(const double *a, npy_int lena)
24+
check_array_monotonic(const double *a, npy_intp lena)
2525
{
2626
npy_intp i;
2727
double next;
28-
double last = a[0];
28+
double last;
29+
30+
if (lena == 0) {
31+
/* all bin edges hold the same value */
32+
return 1;
33+
}
34+
last = a[0];
2935

3036
/* Skip repeated values at the beginning of the array */
3137
for (i = 1; (i < lena) && (a[i] == last); i++);
@@ -209,106 +215,41 @@ arr_bincount(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
209215
return NULL;
210216
}
211217

212-
/*
213-
* digitize(x, bins, right=False) returns an array of integers the same length
214-
* as x. The values i returned are such that bins[i - 1] <= x < bins[i] if
215-
* bins is monotonically increasing, or bins[i - 1] > x >= bins[i] if bins
216-
* is monotonically decreasing. Beyond the bounds of bins, returns either
217-
* i = 0 or i = len(bins) as appropriate. If right == True the comparison
218-
* is bins [i - 1] < x <= bins[i] or bins [i - 1] >= x > bins[i]
219-
*/
218+
/* Internal function to expose check_array_monotonic to python */
220219
NPY_NO_EXPORT PyObject *
221-
arr_digitize(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
220+
arr__monotonicity(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
222221
{
222+
static char *kwlist[] = {"x", NULL};
223223
PyObject *obj_x = NULL;
224-
PyObject *obj_bins = NULL;
225224
PyArrayObject *arr_x = NULL;
226-
PyArrayObject *arr_bins = NULL;
227-
PyObject *ret = NULL;
228-
npy_intp len_bins;
229-
int monotonic, right = 0;
230-
NPY_BEGIN_THREADS_DEF
231-
232-
static char *kwlist[] = {"x", "bins", "right", NULL};
225+
long monotonic;
226+
npy_intp len_x;
227+
NPY_BEGIN_THREADS_DEF;
233228

234-
if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|i:digitize", kwlist,
235-
&obj_x, &obj_bins, &right)) {
236-
goto fail;
229+
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|_monotonicity", kwlist,
230+
&obj_x)) {
231+
return NULL;
237232
}
238233

239-
/* PyArray_SearchSorted will make `x` contiguous even if we don't */
240-
arr_x = (PyArrayObject *)PyArray_FROMANY(obj_x, NPY_DOUBLE, 0, 0,
241-
NPY_ARRAY_CARRAY_RO);
234+
/*
235+
* TODO:
236+
* `x` could be strided, needs change to check_array_monotonic
237+
* `x` is forced to double for this check
238+
*/
239+
arr_x = (PyArrayObject *)PyArray_FROMANY(
240+
obj_x, NPY_DOUBLE, 1, 1, NPY_ARRAY_CARRAY_RO);
242241
if (arr_x == NULL) {
243-
goto fail;
244-
}
245-
246-
/* TODO: `bins` could be strided, needs change to check_array_monotonic */
247-
arr_bins = (PyArrayObject *)PyArray_FROMANY(obj_bins, NPY_DOUBLE, 1, 1,
248-
NPY_ARRAY_CARRAY_RO);
249-
if (arr_bins == NULL) {
250-
goto fail;
251-
}
252-
253-
len_bins = PyArray_SIZE(arr_bins);
254-
if (len_bins == 0) {
255-
PyErr_SetString(PyExc_ValueError, "bins must have non-zero length");
256-
goto fail;
242+
return NULL;
257243
}
258244

259-
NPY_BEGIN_THREADS_THRESHOLDED(len_bins)
260-
monotonic = check_array_monotonic((const double *)PyArray_DATA(arr_bins),
261-
len_bins);
245+
len_x = PyArray_SIZE(arr_x);
246+
NPY_BEGIN_THREADS_THRESHOLDED(len_x)
247+
monotonic = check_array_monotonic(
248+
(const double *)PyArray_DATA(arr_x), len_x);
262249
NPY_END_THREADS
250+
Py_DECREF(arr_x);
263251

264-
if (monotonic == 0) {
265-
PyErr_SetString(PyExc_ValueError,
266-
"bins must be monotonically increasing or decreasing");
267-
goto fail;
268-
}
269-
270-
/* PyArray_SearchSorted needs an increasing array */
271-
if (monotonic == - 1) {
272-
PyArrayObject *arr_tmp = NULL;
273-
npy_intp shape = PyArray_DIM(arr_bins, 0);
274-
npy_intp stride = -PyArray_STRIDE(arr_bins, 0);
275-
void *data = (void *)(PyArray_BYTES(arr_bins) - stride * (shape - 1));
276-
277-
arr_tmp = (PyArrayObject *)PyArray_NewFromDescrAndBase(
278-
&PyArray_Type, PyArray_DescrFromType(NPY_DOUBLE),
279-
1, &shape, &stride, data,
280-
PyArray_FLAGS(arr_bins), NULL, (PyObject *)arr_bins);
281-
Py_DECREF(arr_bins);
282-
if (!arr_tmp) {
283-
goto fail;
284-
}
285-
arr_bins = arr_tmp;
286-
}
287-
288-
ret = PyArray_SearchSorted(arr_bins, (PyObject *)arr_x,
289-
right ? NPY_SEARCHLEFT : NPY_SEARCHRIGHT, NULL);
290-
if (!ret) {
291-
goto fail;
292-
}
293-
294-
/* If bins is decreasing, ret has bins from end, not start */
295-
if (monotonic == -1) {
296-
npy_intp *ret_data =
297-
(npy_intp *)PyArray_DATA((PyArrayObject *)ret);
298-
npy_intp len_ret = PyArray_SIZE((PyArrayObject *)ret);
299-
300-
NPY_BEGIN_THREADS_THRESHOLDED(len_ret)
301-
while (len_ret--) {
302-
*ret_data = len_bins - *ret_data;
303-
ret_data++;
304-
}
305-
NPY_END_THREADS
306-
}
307-
308-
fail:
309-
Py_XDECREF(arr_x);
310-
Py_XDECREF(arr_bins);
311-
return ret;
252+
return PyInt_FromLong(monotonic);
312253
}
313254

314255
/*

numpy/core/src/multiarray/compiled_base.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ arr_insert(PyObject *, PyObject *, PyObject *);
77
NPY_NO_EXPORT PyObject *
88
arr_bincount(PyObject *, PyObject *, PyObject *);
99
NPY_NO_EXPORT PyObject *
10-
arr_digitize(PyObject *, PyObject *, PyObject *kwds);
10+
arr__monotonicity(PyObject *, PyObject *, PyObject *kwds);
1111
NPY_NO_EXPORT PyObject *
1212
arr_interp(PyObject *, PyObject *, PyObject *);
1313
NPY_NO_EXPORT PyObject *

numpy/core/src/multiarray/multiarraymodule.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4345,7 +4345,7 @@ static struct PyMethodDef array_module_methods[] = {
43454345
"indicated by mask."},
43464346
{"bincount", (PyCFunction)arr_bincount,
43474347
METH_VARARGS | METH_KEYWORDS, NULL},
4348-
{"digitize", (PyCFunction)arr_digitize,
4348+
{"_monotonicity", (PyCFunction)arr__monotonicity,
43494349
METH_VARARGS | METH_KEYWORDS, NULL},
43504350
{"interp", (PyCFunction)arr_interp,
43514351
METH_VARARGS | METH_KEYWORDS, NULL},

numpy/lib/function_base.py

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from numpy.lib.twodim_base import diag
3232
from .utils import deprecate
3333
from numpy.core.multiarray import (
34-
_insert, add_docstring, digitize, bincount, normalize_axis_index,
34+
_insert, add_docstring, bincount, normalize_axis_index, _monotonicity,
3535
interp as compiled_interp, interp_complex as compiled_interp_complex
3636
)
3737
from numpy.core.umath import _add_newdoc_ufunc as add_newdoc_ufunc
@@ -4493,3 +4493,113 @@ def append(arr, values, axis=None):
44934493
values = ravel(values)
44944494
axis = arr.ndim-1
44954495
return concatenate((arr, values), axis=axis)
4496+
4497+
4498+
def digitize(x, bins, right=False):
4499+
"""
4500+
Return the indices of the bins to which each value in input array belongs.
4501+
4502+
========= ============= ============================
4503+
`right` order of bins returned index `i` satisfies
4504+
========= ============= ============================
4505+
``False`` increasing ``bins[i-1] <= x < bins[i]``
4506+
``True`` increasing ``bins[i-1] < x <= bins[i]``
4507+
``False`` decreasing ``bins[i-1] > x >= bins[i]``
4508+
``True`` decreasing ``bins[i-1] >= x > bins[i]``
4509+
========= ============= ============================
4510+
4511+
If values in `x` are beyond the bounds of `bins`, 0 or ``len(bins)`` is
4512+
returned as appropriate.
4513+
4514+
Parameters
4515+
----------
4516+
x : array_like
4517+
Input array to be binned. Prior to NumPy 1.10.0, this array had to
4518+
be 1-dimensional, but can now have any shape.
4519+
bins : array_like
4520+
Array of bins. It has to be 1-dimensional and monotonic.
4521+
right : bool, optional
4522+
Indicating whether the intervals include the right or the left bin
4523+
edge. Default behavior is (right==False) indicating that the interval
4524+
does not include the right edge. The left bin end is open in this
4525+
case, i.e., bins[i-1] <= x < bins[i] is the default behavior for
4526+
monotonically increasing bins.
4527+
4528+
Returns
4529+
-------
4530+
indices : ndarray of ints
4531+
Output array of indices, of same shape as `x`.
4532+
4533+
Raises
4534+
------
4535+
ValueError
4536+
If `bins` is not monotonic.
4537+
TypeError
4538+
If the type of the input is complex.
4539+
4540+
See Also
4541+
--------
4542+
bincount, histogram, unique, searchsorted
4543+
4544+
Notes
4545+
-----
4546+
If values in `x` are such that they fall outside the bin range,
4547+
attempting to index `bins` with the indices that `digitize` returns
4548+
will result in an IndexError.
4549+
4550+
.. versionadded:: 1.10.0
4551+
4552+
`np.digitize` is implemented in terms of `np.searchsorted`. This means
4553+
that a binary search is used to bin the values, which scales much better
4554+
for larger number of bins than the previous linear search. It also removes
4555+
the requirement for the input array to be 1-dimensional.
4556+
4557+
For monotonically _increasing_ `bins`, the following are equivalent::
4558+
4559+
np.digitize(x, bins, right=True)
4560+
np.searchsorted(bins, x, side='left')
4561+
4562+
Note that as the order of the arguments are reversed, the side must be too.
4563+
The `searchsorted` call is marginally faster, as it does not do any
4564+
monotonicity checks. Perhaps more importantly, it supports all dtypes.
4565+
4566+
Examples
4567+
--------
4568+
>>> x = np.array([0.2, 6.4, 3.0, 1.6])
4569+
>>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0])
4570+
>>> inds = np.digitize(x, bins)
4571+
>>> inds
4572+
array([1, 4, 3, 2])
4573+
>>> for n in range(x.size):
4574+
... print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]])
4575+
...
4576+
0.0 <= 0.2 < 1.0
4577+
4.0 <= 6.4 < 10.0
4578+
2.5 <= 3.0 < 4.0
4579+
1.0 <= 1.6 < 2.5
4580+
4581+
>>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.])
4582+
>>> bins = np.array([0, 5, 10, 15, 20])
4583+
>>> np.digitize(x,bins,right=True)
4584+
array([1, 2, 3, 4, 4])
4585+
>>> np.digitize(x,bins,right=False)
4586+
array([1, 3, 3, 4, 5])
4587+
"""
4588+
x = _nx.asarray(x)
4589+
bins = _nx.asarray(bins)
4590+
4591+
# here for compatibility, searchsorted below is happy to take this
4592+
if np.issubdtype(x.dtype, _nx.complexfloating):
4593+
raise TypeError("x may not be complex")
4594+
4595+
mono = _monotonicity(bins)
4596+
if mono == 0:
4597+
raise ValueError("bins must be monotonically increasing or decreasing")
4598+
4599+
# this is backwards because the arguments below are swapped
4600+
side = 'left' if right else 'right'
4601+
if mono == -1:
4602+
# reverse the bins, and invert the results
4603+
return len(bins) - _nx.searchsorted(bins[::-1], x, side=side)
4604+
else:
4605+
return _nx.searchsorted(bins, x, side=side)

0 commit comments

Comments
 (0)
0