8000 API: Optimize np.unique for integer arrays; add `kind` parameter by MilesCranmer · Pull Request #21843 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

API: Optimize np.unique for integer arrays; add kind parameter #21843

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 81 additions & 5 deletions numpy/lib/arraysetops.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,15 @@ def _unpack_tuple(x):


def _unique_dispatcher(ar, return_index=None, return_inverse=None,
return_counts=None, axis=None, *, equal_nan=None):
return_counts=None, axis=None, *, equal_nan=None,
kind=None):
return (ar,)


@array_function_dispatch(_unique_dispatcher)
def unique(ar, return_index=False, return_inverse=False,
return_counts=False, axis=None, *, equal_nan=True):
return_counts=False, axis=None, *, equal_nan=True,
kind=None):
"""
Find the unique elements of an array.

Expand Down Expand Up @@ -177,6 +179,11 @@ def unique(ar, return_index=False, return_inverse=False,

.. versionadded:: 1.24

kind : {None, 'sort', 'table'}, optional
The algorithm to use. This will not affect the final result,
but will affect the speed and memory use. The default, None,
will select automatically based on memory considerations.

Returns
-------
unique : ndarray
Expand Down Expand Up @@ -272,7 +279,7 @@ def unique(ar, return_index=False, return_inverse=False,
ar = np.asanyarray(ar)
if axis is None:
ret = _unique1d(ar, return_index, return_inverse, return_counts,
equal_nan=equal_nan)
equal_nan=equal_nan, kind=kind)
return _unpack_tuple(ret)

# axis was specified and not None
Expand Down Expand Up @@ -315,18 +322,87 @@ def reshape_uniq(uniq):
return uniq

output = _unique1d(consolidated, return_index,
return_inverse, return_counts, equal_nan=equal_nan)
return_inverse, return_counts,
equal_nan=equal_nan, kind=kind)
output = (reshape_uniq(output[0]),) + output[1:]
return _unpack_tuple(output)


def _unique1d(ar, return_index=False, return_inverse=False,
return_counts=False, *, equal_nan=True):
return_counts=False, *, equal_nan=True, kind=None):
"""
Find the unique elements of an array, ignoring shape.
"""
ar = np.asanyarray(ar).flatten()

integer_array = np.issubdtype(ar.dtype, np.integer)

if kind not in {None, 'sort', 'table'}:
raise ValueError(
f"Invalid kind: '{kind}'. Please use None, 'sort' or 'table'.")

use_table_method = (
integer_array
and kind in {'table', None}
and not return_index
and not return_inverse
and not isinstance(ar, np.ma.MaskedArray)
and not ar.size == 0
)
if use_table_method:
try:
int(np.min(ar))
except TypeError:
# Not an integer. e.g.,
# a datetime dtype is a subdtype,
# but can't be used as an integer.
use_table_method = False

if use_table_method:
# Can use lookup table-like approach.
# TODO HACK need solution for empty array.
ar_min = np.min(ar)
ar_max = np.max(ar)

ar_range = int(ar_max) - int(ar_min)

# Constraints on whether we can actually use the table method:
range_safe_from_overflow = ar_range < np.iinfo(ar.dtype).max
below_memory_constraint = ar_range <= 6 * ar.size

if (
range_safe_from_overflow and
(below_memory_constraint or kind == 'table')
):

number = np.arange(ar_min, ar_max + 1, dtype=ar.dtype)

if return_counts:
counts = np.zeros(ar_range + 1, dtype=np.intp)
# Use `at` because of repeated indices:
np.add.at(counts, ar - ar_min, 1)
mask = counts > 0
return (number[mask], counts[mask])

else:
exists = np.zeros(ar_range + 1, dtype=bool)
exists[ar - ar_min] = 1
return (number[exists],)

elif kind == 'table': # not range_safe_from_overflow
raise RuntimeError(
"You have specified kind='table', "
"but the range of values in `ar` exceeds the "
"maximum integer of the datatype. "
"Please set `kind` to None or 'sort'."
)
elif kind == 'table':
# TODO: Specify why.
raise ValueError(
"The 'table' method is not supported "
"supported for this input."
)

optional_indices = return_index or return_inverse

if optional_indices:
Expand Down
0