diff --git a/benchmarks/benchmarks/bench_lib.py b/benchmarks/benchmarks/bench_lib.py index c22ceaa5e42f..f7884cd6c309 100644 --- a/benchmarks/benchmarks/bench_lib.py +++ b/benchmarks/benchmarks/bench_lib.py @@ -53,6 +53,7 @@ def setup(self, shape, pad_width, mode): def time_pad(self, shape, pad_width, mode): np.pad(self.array, pad_width, mode) + class Nan(Benchmark): """Benchmarks for nan functions""" @@ -113,3 +114,26 @@ def time_nanquantile(self, array_size, percent_nans): def time_nanpercentile(self, array_size, percent_nans): np.nanpercentile(self.arr, q=50) + + +class Unique(Benchmark): + """Benchmark for np.unique with np.nan values.""" + + param_names = ["array_size", "percent_nans"] + params = [ + # sizes of the 1D arrays + [200, int(2e5)], + # percent of np.nan in arrays + [0, 0.1, 2., 50., 90.], + ] + + def setup(self, array_size, percent_nans): + np.random.seed(123) + # produce a randomly shuffled array with the + # approximate desired percentage np.nan content + base_array = np.random.uniform(size=array_size) + base_array[base_array < percent_nans / 100.] = np.nan + self.arr = base_array + + def time_unique(self, array_size, percent_nans): + np.unique(self.arr) diff --git a/doc/release/upcoming_changes/18070.improvement.rst b/doc/release/upcoming_changes/18070.improvement.rst new file mode 100644 index 000000000000..ae750fb1287c --- /dev/null +++ b/doc/release/upcoming_changes/18070.improvement.rst @@ -0,0 +1,12 @@ +``np.unique`` now returns single ``NaN`` +---------------------------------------- +When ``np.unique`` operated on an array with multiple ``NaN`` entries, +its return included a ``NaN`` for each entry that was ``NaN`` in the original array. +This is now improved such that the returned array contains just one ``NaN`` as the +last element. + +Also for complex arrays all ``NaN`` values are considered equivalent +(no matter whether the ``NaN`` is in the real or imaginary part). As the +representant for the returned array the smallest one in the +lexicographical order is chosen - see ``np.sort`` for how the lexicographical +order is defined for complex arrays. \ No newline at end of file diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index 6c6c1ff809ce..ef9d29ef7569 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -209,6 +209,16 @@ def unique(ar, return_index=False, return_inverse=False, flattened subarrays are sorted in lexicographic order starting with the first element. + .. versionchanged: NumPy 1.21 + If nan values are in the input array, a single nan is put + to the end of the sorted unique values. + + Also for complex arrays all NaN values are considered equivalent + (no matter whether the NaN is in the real or imaginary part). + As the representant for the returned array the smallest one in the + lexicographical order is chosen - see np.sort for how the lexicographical + order is defined for complex arrays. + Examples -------- >>> np.unique([1, 1, 2, 2, 3, 3]) @@ -324,7 +334,16 @@ def _unique1d(ar, return_index=False, return_inverse=False, aux = ar mask = np.empty(aux.shape, dtype=np.bool_) mask[:1] = True - mask[1:] = aux[1:] != aux[:-1] + if aux.shape[0] > 0 and aux.dtype.kind in "cfmM" and np.isnan(aux[-1]): + if aux.dtype.kind == "c": # for complex all NaNs are considered equivalent + aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left') + else: + aux_firstnan = np.searchsorted(aux, aux[-1], side='left') + mask[1:aux_firstnan] = (aux[1:aux_firstnan] != aux[:aux_firstnan - 1]) + mask[aux_firstnan] = True + mask[aux_firstnan + 1:] = False + else: + mask[1:] = aux[1:] != aux[:-1] ret = (aux[mask],) if return_index: diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py index 847e6cb8a3e7..9b4c2ecc162b 100644 --- a/numpy/lib/tests/test_arraysetops.py +++ b/numpy/lib/tests/test_arraysetops.py @@ -531,6 +531,52 @@ def check_all(a, b, i1, i2, c, dt): assert_equal(a3_idx.dtype, np.intp) assert_equal(a3_inv.dtype, np.intp) + # test for ticket 2111 - float + a = [2.0, np.nan, 1.0, np.nan] + ua = [1.0, 2.0, np.nan] + ua_idx = [2, 0, 1] + ua_inv = [1, 2, 0, 2] + ua_cnt = [1, 1, 2] + assert_equal(np.unique(a), ua) + assert_equal(np.unique(a, return_index=True), (ua, ua_idx)) + assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv)) + assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt)) + + # test for ticket 2111 - complex + a = [2.0-1j, np.nan, 1.0+1j, complex(0.0, np.nan), complex(1.0, np.nan)] + ua = [1.0+1j, 2.0-1j, complex(0.0, np.nan)] + ua_idx = [2, 0, 3] + ua_inv = [1, 2, 0, 2, 2] + ua_cnt = [1, 1, 3] + assert_equal(np.unique(a), ua) + assert_equal(np.unique(a, return_index=True), (ua, ua_idx)) + assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv)) + assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt)) + + # test for ticket 2111 - datetime64 + nat = np.datetime64('nat') + a = [np.datetime64('2020-12-26'), nat, np.datetime64('2020-12-24'), nat] + ua = [np.datetime64('2020-12-24'), np.datetime64('2020-12-26'), nat] + ua_idx = [2, 0, 1] + ua_inv = [1, 2, 0, 2] + ua_cnt = [1, 1, 2] + assert_equal(np.unique(a), ua) + assert_equal(np.unique(a, return_index=True), (ua, ua_idx)) + assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv)) + assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt)) + + # test for ticket 2111 - timedelta + nat = np.timedelta64('nat') + a = [np.timedelta64(1, 'D'), nat, np.timedelta64(1, 'h'), nat] + ua = [np.timedelta64(1, 'h'), np.timedelta64(1, 'D'), nat] + ua_idx = [2, 0, 1] + ua_inv = [1, 2, 0, 2] + ua_cnt = [1, 1, 2] + assert_equal(np.unique(a), ua) + assert_equal(np.unique(a, return_index=True), (ua, ua_idx)) + assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv)) + assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt)) + def test_unique_axis_errors(self): assert_raises(TypeError, self._run_axis_tests, object) assert_raises(TypeError, self._run_axis_tests,