8000 ENH: Recover separation between indexing and gamma in _QuantileMethod… · chunweiyuan/numpy@cdd13be · GitHub
[go: up one dir, main page]

Skip to content

Commit cdd13be

Browse files
committed
ENH: Recover separation between indexing and gamma in _QuantileMethods in lib.function_base (numpy#9211).
Add tests to ensure consistency 1.) between get_virtual_index/fix_gamma and discrite_shortcut in lib.function_base._QuantileMethods, and 2.) in decimal weights. (numpy#9211) Remove nan-index check in function_base._get_indexes. (numpy#9211)
1 parent db65fb3 commit cdd13be

File tree

3 files changed

+125
-76
lines changed

3 files changed

+125
-76
lines changed

numpy/lib/function_base.py

Lines changed: 65 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -69,84 +69,108 @@
6969
# --- HYNDMAN and FAN METHODS
7070
# Discrete methods
7171
inverted_cdf=dict(
72-
get_virtual_index=lambda n, quantiles: _inverted_cdf(n, quantiles),
73-
fix_gamma=lambda gamma, _: gamma, # should never be called
72+
get_virtual_index=lambda n, quantiles: (n * quantiles) - 1,
73+
fix_gamma=lambda g, _: _get_gamma_mask(
74+
shape=g.shape,
75+
default_value=1.,
76+
conditioned_value=0.0,
77+
where=g == 0),
78+
discrete_shortcut=lambda n, quantiles: _inverted_cdf(n, quantiles),
7479
),
7580
averaged_inverted_cdf=dict(
7681
get_virtual_index=lambda n, quantiles: (n * quantiles) - 1,
77-
fix_gamma=lambda gamma, _: _get_gamma_mask(
78-
shape=gamma.shape,
82+
fix_gamma=lambda g, _: _get_gamma_mask(
83+
shape=g.shape,
7984
default_value=1.,
8085
conditioned_value=0.5,
81-
where=gamma == 0),
86+
where=g == 0),
87+
discrete_shortcut=None,
8288
),
8389
closest_observation=dict(
84-
get_virtual_index=lambda n, quantiles: _closest_observation(n,
85-
quantiles),
86-
fix_gamma=lambda gamma, _: gamma, # should never be called
90+
get_virtual_index=lambda n, quantiles: (n * quantiles) - 1 - 0.5,
91+
fix_gamma=lambda g, index: _get_gamma_mask(
92+
shape=g.shape,
93+
default_value=1.,
94+
conditioned_value=0.0,
95+
where=(g == 0) & (np.floor(index) % 2 == 0)),
96+
discrete_shortcut=(lambda n, quantiles:
97+
_closest_observation(n, quantiles)),
8798
),
8899
# Continuous methods
89100
interpolated_inverted_cdf=dict(
90101
get_virtual_index=lambda n, quantiles:
91102
_compute_virtual_index(n, quantiles, 0, 1),
92-
fix_gamma=lambda gamma, _: gamma,
103+
fix_gamma=lambda g, _: g,
104+
discrete_shortcut=None,
93105
),
94106
hazen=dict(
95107
get_virtual_index=lambda n, quantiles:
96108
_compute_virtual_index(n, quantiles, 0.5, 0.5),
97-
fix_gamma=lambda gamma, _: gamma,
109+
fix_gamma=lambda g, _: g,
110+
discrete_shortcut=None,
98111
),
99112
weibull=dict(
100113
get_virtual_index=lambda n, quantiles:
101114
_compute_virtual_index(n, quantiles, 0, 0),
102-
fix_gamma=lambda gamma, _: gamma,
115+
fix_gamma=lambda g, _: g,
116+
discrete_shortcut=None,
103117
),
104118
# Default method.
105119
# To avoid some rounding issues, `(n-1) * quantiles` is preferred to
106120
# `_compute_virtual_index(n, quantiles, 1, 1)`.
107121
# They are mathematically equivalent.
108122
linear=dict(
109123
get_virtual_index=lambda n, quantiles: (n - 1) * quantiles,
110-
fix_gamma=lambda gamma, _: gamma,
124+
fix_gamma=lambda g, _: g,
125+
discrete_shortcut=None,
111126
),
112127
median_unbiased=dict(
113128
get_virtual_index=lambda n, quantiles:
114129
_compute_virtual_index(n, quantiles, 1 / 3.0, 1 / 3.0),
115-
fix_gamma=lambda gamma, _: gamma,
130+
fix_gamma=lambda g, _: g,
131+
discrete_shortcut=None,
116132
),
117133
normal_unbiased=dict(
118134
get_virtual_index=lambda n, quantiles:
119135
_compute_virtual_index(n, quantiles, 3 / 8.0, 3 / 8.0),
120-
fix_gamma=lambda gamma, _: gamma,
136+
fix_gamma=lambda g, _: g,
137+
discrete_shortcut=None,
121138
),
122139
# --- OTHER METHODS
123140
lower=dict(
124-
get_virtual_index=lambda n, quantiles: np.floor(
141+
get_virtual_index=lambda n, quantiles: (n - 1) * quantiles,
142+
fix_gamma=lambda g, _: np.floor(g),
143+
discrete_shortcut=lambda n, quantiles: np.floor(
125144
(n - 1) * quantiles).astype(np.intp),
126-
fix_gamma=lambda gamma, _: gamma,
127-
# should never be called, index dtype is int
128145
),
129146
higher=dict(
130-
get_virtual_index=lambda n, quantiles: np.ceil(
147+
get_virtual_index=lambda n, quantiles: (n - 1) * quantiles,
148+
fix_gamma=lambda g, _: np.ceil(g),
149+
discrete_shortcut=lambda n, quantiles: np.ceil(
131150
(n - 1) * quantiles).astype(np.intp),
132-
fix_gamma=lambda gamma, _: gamma,
133-
# should never be called, index dtype is int
134151
),
135152
midpoint=dict(
136153
get_virtual_index=lambda n, quantiles: 0.5 * (
137154
np.floor((n - 1) * quantiles)
138155
+ np.ceil((n - 1) * quantiles)),
139-
fix_gamma=lambda gamma, index: _get_gamma_mask(
140-
shape=gamma.shape,
156+
fix_gamma=lambda g, index: _get_gamma_mask(
157+
shape=g.shape,
141158
default_value=0.5,
142159
conditioned_value=0.,
143-
where=index % 1 == 0),
160+
where=(index % 1 == 0) & (g == 0)),
161+
discrete_shortcut=None,
144162
),
145163
nearest=dict(
146-
get_virtual_index=lambda n, quantiles: np.around(
164+
get_virtual_index=lambda n, quantiles: (n - 1) * quantiles,
165+
# fix_gamma here meant to match behavior of discrete_shortcut because
166+
# np.around rounds to the nearest even integer.
167+
fix_gamma=lambda g, index: _get_gamma_mask(
168+
shape=g.shape,
169+
default_value=np.around(g),
170+
conditioned_value=np.around(index) - np.around(index - g),
171+
where=g == 0.5),
172+
discrete_shortcut=lambda n, quantiles: np.around(
147173
(n - 1) * quantiles).astype(np.intp),
148-
fix_gamma=lambda gamma, _: gamma,
149-
# should never be called, index dtype is int
150174
))
151175

152176

@@ -4765,7 +4789,7 @@ def _compute_virtual_index(n, quantiles, alpha: float, beta: float):
47654789

47664790
def _get_gamma(virtual_indexes, previous_indexes, method):
47674791
"""
4768-
Compute gamma (a.k.a 'm' or 'weight') for the linear interpolation
4792+
Compute gamma (a.k.a 'm') for the linear interpolation
47694793
of quantiles.
47704794
47714795
virtual_indexes : array_like
@@ -4780,8 +4804,10 @@ def _get_gamma(virtual_indexes, previous_indexes, method):
47804804
gamma is usually the fractional part of virtual_indexes but can be modified
47814805
by the interpolation method.
47824806
"""
4783-
gamma = np.asanyarray(virtual_indexes - previous_indexes)
4784-
gamma = method["fix_gamma"](gamma, virtual_indexes)
4807+
# % 1 because index diff can be > 1 in weight space calculation
4808+
# when virtual index lies within a value of large weight.
4809+
g = np.asanyarray(virtual_indexes - previous_indexes) % 1
4810+
gamma = method["fix_gamma"](g, virtual_indexes)
47854811
return np.asanyarray(gamma)
47864812

47874813

@@ -4899,12 +4925,6 @@ def _get_indexes(arr, virtual_indexes, valid_values_count):
48994925
if indexes_below_bounds.any():
49004926
previous_indexes[indexes_below_bounds] = 0
49014927
next_indexes[indexes_below_bounds] = 0
4902-
if np.issubdtype(arr.dtype, np.inexact):
4903-
# After the sort, slices having NaNs will have for last element a NaN
4904-
virtual_indexes_nans = np.isnan(virtual_indexes) # indexes have nans?
4905-
if virtual_indexes_nans.any():
4906-
previous_indexes[virtual_indexes_nans] = -1
4907-
next_indexes[virtual_indexes_nans] = -1
49084928
previous_indexes = previous_indexes.astype(np.intp)
49094929
next_indexes = next_indexes.astype(np.intp)
49104930
return previous_indexes, next_indexes
@@ -5037,13 +5057,11 @@ def _get_weighted_quantile_values(arr1d, wgts1d):
50375057
np.interp(previous_w_indexes, w_index_bounds, real_indexes)
50385058
next_indexes =\
50395059
np.interp(next_w_indexes, w_index_bounds, real_indexes)
5040-
indexes =\
5041-
np.interp(weight_space_indexes, w_index_bounds, real_indexes)
50425060

50435061
# method-dependent gammas determine interpolation scheme between
50445062
# neighboring values, and are computed in weight space.
5045-
gamma = _get_gamma(indexes, previous_indexes, method)
5046-
5063+
gamma =\
5064+
_get_gamma(weight_space_indexes, previous_w_indexes, method)
50475065
previous = take(arr1d, previous_indexes.astype(int))
50485066
next = take(arr1d, next_indexes.astype(int))
50495067
return _lerp(previous, next, gamma, out=out)
@@ -5064,7 +5082,14 @@ def _get_weighted_quantile_values(arr1d, wgts1d):
50645082
if axis != 0: # moveaxis is slow, so only call it if necessary.
50655083
arr = np.moveaxis(arr, axis, destination=0)
50665084

5067-
virtual_indexes = method["get_virtual_index"](values_count, quantiles)
5085+
if method["discrete_shortcut"]: # lumps indexing + gamma interplation
5086+
# discrete methods result in dtype = np.intp
5087+
virtual_indexes =\
5088+
method["discrete_shortcut"](values_count, quantiles)
5089+
else:
5090+
virtual_indexes =\
5091+
method["get_virtual_index"](values_count, quantiles)
5092+
50685093
virtual_indexes = np.asanyarray(virtual_indexes)
50695094

50705095
result, slices_having_nans =\

numpy/lib/tests/test_function_base.py

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3785,12 +3785,7 @@ def test_quantile_add_and_multiply_constant(self, method, alpha):
37853785
# "median_unbiased", "normal_unbiased", "midpoint"
37863786
assert_allclose(q, np.quantile(y, alpha, method=method))
37873787

3788-
@pytest.mark.parametrize(
3789-
"method",
3790-
['inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
3791-
'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
3792-
'median_unbiased', 'normal_unbiased',
3793-
'nearest', 'lower', 'higher', 'midpoint'])
3788+
@pytest.mark.parametrize("method", quantile_methods)
37943789
def test_weights_all_ones(self, method):
37953790
ar = np.arange(24).reshape(2, 3, 4)
37963791
q = 0.5
@@ -3827,12 +3822,7 @@ def test_weights_all_ones(self, method):
38273822
actual = np.quantile(ar, q=q, weights=weights, method=method)
38283823
assert_almost_equal(actual, expected)
38293824

3830-
@pytest.mark.parametrize(
3831-
"method",
3832-
['inverted_cdf', 'closest_observation',
3833-
'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
3834-
'median_unbiased', 'normal_unbiased',
3835-
'nearest', 'lower', 'higher', 'midpoint'])
3825+
@pytest.mark.parametrize("method", quantile_methods)
38363826
def test_weights_on_multiple_axes(self, method):
38373827
"""Test supplying ND weights."""
38383828
ar = np.arange(12).reshape(3, 4).astype(float)
@@ -3844,12 +3834,7 @@ def test_weights_on_multiple_axes(self, method):
38443834
weights=weights, method=method)
38453835
assert_almost_equal(actual, expected)
38463836

3847-
@pytest.mark.parametrize(
3848-
"method",
3849-
['inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
3850-
'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
3851-
'median_unbiased', 'normal_unbiased',
3852-
'nearest', 'lower', 'higher', 'midpoint'])
3837+
@pytest.mark.parametrize("method", quantile_methods)
38533838
def test_various_weights(self, method):
38543839
"""Test various weights arg scenarios."""
38553840
ar = np.arange(12).reshape(3, 4)
@@ -3919,6 +3904,59 @@ def test_weights_flags(self):
39193904
with assert_raises_regex(ZeroDivisionError, "Weights sum to zero"):
39203905
np.quantile(ar, q=q, axis=axis, weights=[0, 0])
39213906

3907+
@pytest.mark.parametrize("method", ["linear", "lower", "higher",
3908+
"midpoint", "higher"])
3909+
def test_decimal_weights_for_linear_discrete_methods(self, method):
3910+
"""Test consistency when interpolating between weight bands.
3911+
3912+
For these five methods, the virtual index is (n - 1) * quantile.
3913+
Therefore, if sum of weights == 3, then q=0.5 points to index = 1.
3914+
An array of [3, 4] with weights=[2, 1] --> [3, 3, 4],
3915+
and the 0.5 qunatile sits at index=1, which is 3.
3916+
3917+
But if weights=[1.9, 1.1], then the index boundary for value 3
3918+
is [0, 0.9], and the index boundary for value 4 is [1.9, 2]
3919+
This means, the index=1 value is equivalent to interpolating
3920+
between 3 and 4, at q=0.1.
3921+
3922+
This test is designed to ensure our weighted quantile method is
3923+
faithful to this paradigm.
3924+
"""
3925+
actual = np.quantile([3, 4], q=0.5, weights=[1.9, 1.1], method=method)
3926+
expected = np.quantile([3, 4], q=0.1, method=method)
3927+
assert_almost_equal(actual, expected)
3928+
3929+
3930+
class TestQuantileMethods:
3931+
"""Test algorithmic consistency within _QuantileMethods."""
3932+
3933+
@pytest.mark.parametrize("method", quantile_methods)
3934+
def test_discrete_shortcut(self, method):
3935+
"""Test reproducibility of discrete shortcuts.
3936+
3937+
One should be able to reproduce discrete_shortcut with
3938+
get_virtual_index and fix_gamma.
3939+
"""
3940+
n = 10
3941+
ar = np.random.rand(10)
3942+
quantiles = np.array([0.0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0])
3943+
3944+
# only testing methods where discrete_shortcut exists
3945+
if nfb._QuantileMethods[method]["discrete_shortcut"]:
3946+
actual =\
3947+
nfb._QuantileMethods[method]["discrete_shortcut"](n, quantiles)
3948+
3949+
indexes =\
3950+
nfb._QuantileMethods[method]["get_virtual_index"](n, quantiles)
3951+
indexes[indexes < 0] = 0
3952+
previous_indexes, _ = nfb._get_indexes(ar, indexes, n)
3953+
previous_indexes[previous_indexes == -1] = n - 1
3954+
gs = indexes - previous_indexes
3955+
gammas = nfb._QuantileMethods[method]["fix_gamma"](gs, indexes)
3956+
expected = previous_indexes + gammas
3957+
3958+
assert_almost_equal(actual, expected)
3959+
39223960

39233961
class TestLerp:
39243962
@hypothesis.given(t0=st.floats(allow_nan=False, allow_infinity=False,

numpy/lib/tests/test_nanfunctions.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import numpy as np
77
from numpy.core.numeric import normalize_axis_tuple
8+
import numpy.lib.function_base as nfb
89
from numpy.lib.nanfunctions import _nan_mask, _replace_nan
910
from numpy.testing import (
1011
assert_, assert_equal, assert_almost_equal, assert_raises,
@@ -1276,12 +1277,7 @@ def test_allnans(self, axis, dtype, array):
12761277
assert np.isnan(out).all()
12771278
assert out.dtype == array.dtype
12781279

1279-
@pytest.mark.parametrize(
1280-
"method",
1281-
['inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
1282-
'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
1283-
'median_unbiased', 'normal_unbiased',
1284-
'nearest', 'lower', 'higher', 'midpoint'])
1280+
@pytest.mark.parametrize("method", list(nfb._QuantileMethods.keys()))
12851281
def test_weights_all_ones(self, method):
12861282
"""Test that all weights == 1 gives same results as no weights."""
12871283
ar = np.arange(24).reshape(2, 3, 4).astype(float)
@@ -1320,12 +1316,7 @@ def test_weights_all_ones(self, method):
13201316
actual = np.nanquantile(ar, q=q, weights=weights, method=method)
13211317
assert_almost_equal(actual, expected)
13221318

1323-
@pytest.mark.parametrize(
1324-
"method",
1325-
['inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
1326-
'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
1327-
'median_unbiased', 'normal_unbiased',
1328-
'nearest', 'lower', 'higher', 'midpoint'])
1319+
@pytest.mark.parametrize("method", list(nfb._QuantileMethods.keys()))
13291320
def test_multiple_axes(self, method):
13301321
"""Test that weights work on multiple axes."""
13311322
ar = np.arange(12).reshape(3, 4).astype(float)
@@ -1339,12 +1330,7 @@ def test_multiple_axes(self, method):
13391330
method=method)
13401331
assert_almost_equal(actual, expected)
13411332

1342-
@pytest.mark.parametrize(
1343-
"method",
1344-
['inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
1345-
'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
1346-
'median_unbiased', 'normal_unbiased',
1347-
'nearest', 'lower', 'higher', 'midpoint'])
1333+
@pytest.mark.parametrize("method", list(nfb._QuantileMethods.keys()))
13481334
def test_various_weights(self, method):
13491335
"""Test various weights arg scenarios."""
13501336
ar = np.arange(12).reshape(3, 4).astype(float)

0 commit comments

Comments
 (0)
0