ENH: Tests and documentation for weights arg to quantile/percentile i… · chunweiyuan/numpy@bc01d69 · GitHub
[go: up one dir, main page]

Skip to content

Commit bc01d69

Browse files
committed
ENH: Tests and documentation for weights arg to quantile/percentile in lib.function_base and nanquantile/nanpercentile in lib.nanfunctions (numpy#9211).
Move weights normalization to function_base._validate_and_ureduce_weights(), and speed improvement to conditional statement therein (numpy#9211). Additional docs to how weights are applied for weighted quantile calculation in lib.function_base (numpy#9211). Consolidate all np.vectorize() algo into one in lib.function_base._quantile() (numpy#9211). Better mapping from weight space previous/next indexes to real space in lib.function_base._quantile(), with additional comments. (numpy#9211)
1 parent a559a16 commit bc01d69

File tree

4 files changed

+507
-84
lines changed

4 files changed

+507
-84
lines changed

numpy/lib/_function_base_impl.py

Lines changed: 175 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -3941,6 +3941,39 @@ def percentile(a,
39413941
39423942
.. versionchanged:: 1.9.0
39433943
A tuple of axes is supported
3944+
weights : array_like, optional
3945+
An array of weights associated with the values in `a`.
3946+
3947+
Weights cannot
3948+
3949+
* be negative
3950+
* sum to 0
3951+
3952+
However, they can be
3953+
3954+
* 0, as long as they do not sum to 0
3955+
* less than 1. In this case, all weights are re-normalized by
3956+
the lowest non-zero weight prior to computation.
3957+
3958+
The renormalization ensures that no weight is less than 1, and that
3959+
the weights can have a frequency interpretation.
3960+
3961+
A weight of 1 means a repetition of one for the corresponding value in
3962+
the data array. That means the sum of weights along `axis` is the
3963+
total count of values along that axis. The evaluation of weighted
3964+
quantiles therefore amounts to computing normal quantiles in weight
3965+
space.
3966+
3967+
Weights will be broadcasted to the shape of a, then reduced as done
3968+
via _ureduce().
3969+
3970+
* if wgts.shape == a.shape, simply return ureduced wgts.
3971+
* if scalar, broadcast to shape of a, followed by _ureduce.
3972+
* for 1-D or N-D array, must make sure that its size matches that
3973+
of `a` along the axis dims, so can be reshaped. Hence if
3974+
a.shape == (2, 3, 4, 5, 6), axis == (1, 3), then weights must
3975+
be of size 15, and will be reshaped to (1, 3, 1, 5, 1) before
3976+
broadcasting to the shape of `a`, followed by _ureduce.
39443977
out : ndarray, optional
39453978
Alternative output array in which to place the result. It must
39463979
have the same shape and buffer length as the expected output,
@@ -4241,6 +4274,39 @@ def quantile(a,
42414274
axis : {int, tuple of int, None}, optional
42424275
Axis or axes along which the quantiles are computed. The default is
42434276
to compute the quantile(s) along a flattened version of the array.
4277+
weights : array_like, optional
4278+
An array of weights associated with the values in `a`.
4279+
4280+
Weights cannot
4281+
4282+
* be negative
4283+
* sum to 0
4284+
4285+
However, they can be
4286+
4287+
* 0, as long as they do not sum to 0
4288+
* less than 1. In this case, all weights are re-normalized by
4289+
the lowest non-zero weight prior to computation.
4290+
4291+
The renormalization ensures that no weight is less than 1, and that
4292+
the weights can have a frequency interpretation.
4293+
4294+
A weight of 1 means a repetition of one for the corresponding value in
4295+
the data array. That means the sum of weights along `axis` is the
4296+
total count of values along that axis. The evaluation of weighted
4297+
quantiles therefore amounts to computing normal quantiles in weight
4298+
space.
4299+
4300+
Weights will be broadcasted to the shape of a, then reduced as done
4301+
via _ureduce().
4302+
4303+
* if wgts.shape == a.shape, simply return ureduced wgts.
4304+
* if scalar, broadcast to shape of a, followed by _ureduce.
4305+
* for 1-D or N-D array, must make sure that its size matches that
4306+
of `a` along the axis dims, so can be reshaped. Hence if
4307+
a.shape == (2, 3, 4, 5, 6), axis == (1, 3), then weights must
4308+
be of size 15, and will be reshaped to (1, 3, 1, 5, 1) before
4309+
broadcasting to the shape of `a`, followed by _ureduce.
42444310
out : ndarray, optional
42454311
Alternative output array in which to place the result. It must have
42464312
the same shape and buffer length as the expected output, but the
@@ -4502,6 +4568,11 @@ def _quantile_unchecked(a,
45024568
def _validate_and_ureduce_weights(a, axis, wgts):
45034569
"""Ensure quantile weights are valid.
45044570
4571+
A weight of 1 means a repetition of one for the corresponding value in
4572+
the data array. That means the sum of weights along `axis` is the total
4573+
count of values along that axis, based on which the quantiles are
4574+
evaluated.
4575+
45054576
Weights cannot:
45064577
* be negative
45074578
* sum to 0
@@ -4544,10 +4615,16 @@ def _validate_and_ureduce_weights(a, axis, wgts):
45444615
# broadcast weights to the shape of array a.
45454616
if wgts.shape == a.shape: # no need to broadcast
45464617
pass
4547-
elif wgts.size == 1:
4618+
elif wgts.size == 1: # straight-forwardly broadcastable
4619+
wgts = np.broadcast_to(wgts, a.shape)
4620+
elif (wgts.ndim == a.ndim and
4621+
all(d1 == d2 or d1 == 1 for d1, d2 in zip(wgts.shape, a.shape))):
4622+
# Same # of dims, but some are length==1. Broadcastable.
45484623
wgts = np.broadcast_to(wgts, a.shape)
45494624
elif (wgts.size == functools.reduce(lambda x, y: x * y,
45504625
[a.shape[d] for d in dims])):
4626+
# wgts might be wrong shape, but has right size to match the data dim.
4627+
# Reshape to match shape of dims, with some length==1, before broadcast
45514628
to_shape = tuple(a.shape[d] if d in dims else 1 for d in range(a.ndim))
45524629
wgts = wgts.reshape(to_shape)
45534630
wgts = np.broadcast_to(wgts, a.shape)
@@ -4565,6 +4642,22 @@ def _validate_and_ureduce_weights(a, axis, wgts):
45654642
# Obtain a weights array of the same shape as ureduced a
45664643
wgts = _ureduce(wgts, func=lambda x, **kwargs: x, axis=dims)
45674644

4645+
# Now check/renormalize weights if any is (0, 1)
4646+
def _normalize(v):
4647+
inds = v > 0
4648+
if (v[inds] < 1).any():
4649+
vec = v.copy()
4650+
vec[inds] = vec[inds] / vec[inds].min() # renormalization
4651+
return vec
4652+
else:
4653+
return v
4654+
4655+
# perform normalization along reduced axis
4656+
if len(dims) > 1:
4657+
wgts = np.apply_along_axis(_normalize, -1, wgts)
4658+
else:
4659+
wgts = np.apply_along_axis(_normalize, dims[0], wgts)
4660+
45684661
return wgts
45694662

45704663

@@ -4824,91 +4917,99 @@ def _quantile(
48244917
supports_nans = (
48254918
np.issubdtype(arr.dtype, np.inexact) or arr.dtype.kind in 'Mm')
48264919

4827-
if weights is not None: # weights is the same shape as arr
4920+
if weights is not None: # weights should be the same shape as arr
48284921
# first move data to axis=-1 for np.vectorize, which runs on axis=-1
48294922
if axis != -1:
48304923
arr = np.moveaxis(arr, axis, destination=-1)
48314924
weights = np.moveaxis(weights, axis, destination=-1)
4832-
values_count = arr.shape[-1]
4833-
4834-
# Now check/renormalize weights if any is (0, 1)
4835-
def _normalize(v):
4836-
inds = v > 0
4837-
if (v[inds] < 1).any():
4838-
vec = v.copy()
4839-
vec[inds] = vec[inds] / vec[inds].min() # renormalization
4840-
return vec
4841-
else:
4842-
return v
4843-
weights = np.apply_along_axis(_normalize, -1, weights)
4844-
4845-
# values with weight=0 are made nan and later sorted to end of array
4846-
if (weights == 0).any():
4847-
weights[weights == 0] = np.nan
4848-
4849-
def _sort_by_index(vector, vec_indices):
4850-
return vector[vec_indices]
4851-
# this func vectorizes sort along axis
4852-
n = values_count
4853-
arraysort = np.vectorize(_sort_by_index,
4854-
signature=f"({n}),({n})->({n})")
4855-
# compute the sorted array. nans will be sorted to the end.
4856-
ind_sorted = np.argsort(arr, axis=-1)
4857-
arr = arraysort(arr, ind_sorted)
4858-
# need to align weights to now-sorted arr, hence np.vectorize is used
4859-
weights = arraysort(weights, ind_sorted)
48604925

4926+
# identical treatment to slices_having_nans as no-weight case
48614927
if supports_nans:
4862-
slices_having_nans = np.isnan(
4863-
take(arr, indices=-1, axis=-1)
4864-
)
4928+
slices_having_nans = np.isnan(arr.sum(axis=-1))
48654929
else:
48664930
slices_having_nans = None
48674931

4868-
# convert weights to virtual indexes.
4869-
def _map_weights_to_indexes(wgts, quantiles, hard_indexes):
4870-
# NOTE 1-D function used within apply_along_axis() might be slower,
4871-
# but probably incurs less memory footprint.
4872-
wgts_cumsum = wgts.cumsum()
4873-
n = wgts_cumsum[-1] # sum along axis replaces n in Hyndman paper.
4932+
# Vectorized function to compute weighted quantiles.
4933+
# Each array value occupies a range within weight space, bounded
4934+
# by its left/right_weight_bound. If a weight_space_index falls within
4935+
# those two bounds, the quantile will just take on that array value.
4936+
# If the weight_space_index falls between neighboring ranges, that
4937+
# means its quantile value must fall between neighboring array values
4938+
# as well, so the gamma (computed in weight space) is used to
4939+
# interpolate between those neighboring array values.
4940+
def _get_weighted_quantile_values(arr1d, wgts1d):
4941+
# first deal with sorting
4942+
# values with weight=0 are made nan and sorted to end of array
4943+
if (wgts1d == 0).any():
4944+
arr1d = arr1d.astype(float) # allow np.nan to weight=0 cells
4945+
arr1d[wgts1d == 0] = np.nan
4946+
sort_indexes = np.argsort(arr1d)
4947+
arr1d = arr1d[sort_indexes]
4948+
wgts1d = wgts1d[sort_indexes]
4949+
4950+
wgts1d = wgts1d[~np.isnan(arr1d)] # only deal with non-nan values
4951+
arr1d = arr1d[~np.isnan(arr1d)]
4952+
# weights are a proxy to values count
4953+
wgts1d_cumsum = wgts1d.cumsum()
4954+
n = wgts1d_cumsum[-1] # sum along axis replaces n in Hyndman.
48744955
weight_space_indexes = method["get_virtual_index"](n, quantiles)
4875-
left_weight_bound = np.roll(wgts_cumsum, 1)
4876-
left_weight_bound[0] = 0
4877-
right_weight_bound = wgts_cumsum - 1
4878-
indexes = np.zeros(2 * hard_indexes.size)
4879-
weights = indexes.copy()
4880-
indexes[0::2] = hard_indexes
4881-
indexes[1::2] = hard_indexes
4882-
weights[0::2] = left_weight_bound
4883-
weights[1::2] = right_weight_bound
4884-
return np.interp(weight_space_indexes, weights, indexes)
4885-
4886-
hard_indexes = np.arange(values_count)
4887-
4888-
virtual_indexes = np.apply_along_axis(_map_weights_to_indexes,
4889-
-1, weights,
4890-
quantiles=quantiles,
4891-
hard_indexes=hard_indexes)
4892-
# unlike the no-weights case, virtual_indexes here can be N-D,
4893-
# with different indexes along every DATA_AXIS slice.
4894-
# also the use of np.interp means virtual_indexes is of type float.
4895-
previous_indexes, next_indexes = _get_indexes(arr, virtual_indexes,
4896-
values_count)
48974956

4898-
# this function becomes vectorized
4899-
def _get_values_from_indexes(arr, virtual_indexes,
4900-
previous_indexes, next_indexes):
4901-
previous = take(arr, previous_indexes)
4902-
next = take(arr, next_indexes)
4903-
gamma = _get_gamma(virtual_indexes, previous_indexes, method)
4957+
# each weight occupies a range in weight space w/ left/right bounds
4958+
left_weight_bound = np.roll(wgts1d_cumsum, 1)
4959+
left_weight_bound[0] = 0 # left-most weight bound fixed at 0
4960+
right_weight_bound = wgts1d_cumsum - 1
4961+
4962+
# now construct a mapping from weight bounds to real indexes
4963+
# for example, arr1d=[1, 2] & wgts1d=[2, 3] ->
4964+
# -> real_indexes=[0, 0, 1, 1] & w_index_bounds=[0, 1, 2, 4]
4965+
indexes = np.arange(arr1d.size)
4966+
real_indexes = np.zeros(2 * indexes.size)
4967+
w_index_bounds = real_indexes.copy()
4968+
real_indexes[0::2] = indexes # real indexes
4969+
real_indexes[1::2] = indexes
4970+
w_index_bounds[0::2] = left_weight_bound # weight index bounds
4971+
w_index_bounds[1::2] = right_weight_bound
4972+
4973+
# first define previous_w_indexes/next_w_indexes as the indexes
4974+
# within w_index_bounds whose values sandwich weight_space_indexes.
4975+
# so if w_index_bounds=[0, 1, 2, 4] and weight_space_index=3.5,
4976+
# then previous_w_indexes = 2 and next_w_indexes = 3
4977+
previous_w_indexes = np.searchsorted(w_index_bounds,
4978+
weight_space_indexes,
4979+
side="right") - 1
4980+
# leverage _get_index() to deal with out-of-bound indices
4981+
previous_w_indexes, next_w_indexes =\
4982+
_get_indexes(w_index_bounds, previous_w_indexes,
4983+
len(w_index_bounds))
4984+
# now redefine previous_w_indexes/next_w_indexes as the weight
4985+
# space indexes that neighbor weight_space_indexes.
4986+
previous_w_indexes = w_index_bounds[previous_w_indexes]
4987+
next_w_indexes = w_index_bounds[next_w_indexes]
4988+
4989+
# map all weight space indexes to real indexes, then compute gamma
4990+
previous_indexes =\
4991+
np.interp(previous_w_indexes, w_index_bounds, real_indexes)
4992+
next_indexes =\
4993+
np.interp(next_w_indexes, w_index_bounds, real_indexes)
4994+
indexes =\
4995+
np.interp(weight_space_indexes, w_index_bounds, real_indexes)
4996+
4997+
# method-dependent gammas determine interpolation scheme between
4998+
# neighboring values, and are computed in weight space.
4999+
gamma = _get_gamma(indexes, previous_indexes, method)
5000+
5001+
previous = take(arr1d, previous_indexes.astype(int))
5002+
next = take(arr1d, next_indexes.astype(int))
49045003
return _lerp(previous, next, gamma, out=out)
49055004

4906-
m = virtual_indexes.shape[-1]
4907-
get_values_from_indexes =\
4908-
np.vectorize(_get_values_from_indexes,
4909-
signature=f"({n}),({m}),({m}),({m})->({m})")
4910-
result = get_values_from_indexes(arr, virtual_indexes,
4911-
previous_indexes, next_indexes)
5005+
n = arr.shape[-1] if arr.ndim else 1 # same as values_count
5006+
m = quantiles.shape[-1] if quantiles.ndim else ""
5007+
5008+
get_weighted_quantile_values =\
5009+
np.vectorize(_get_weighted_quantile_values,
5010+
signature=f"({n}),({n})->({m})")
5011+
result = get_weighted_quantile_values(arr, weights)
5012+
49125013
# now move data to DATA_AXIS to be consistent with no-weights case
49135014
result = np.moveaxis(result, -1, destination=0)
49145015

@@ -4935,7 +5036,7 @@ def _get_values_from_indexes(arr, virtual_indexes,
49355036

49365037
def _take_sorted_index_values(arr, method, virtual_indexes, values_count,
49375038
supports_nans, out):
4938-
5039+
"""Evaluate quantile values for all cases without weights."""
49395040
if np.issubdtype(virtual_indexes.dtype, np.integer):
49405041
# No interpolation needed, take the points along axis
49415042
if supports_nans:

numpy/lib/_nanfunctions_impl.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1570,8 +1570,12 @@ def _nanquantile_unchecked(
15701570
return np.nanmean(a, axis, out=out, keepdims=keepdims)
15711571

15721572
if weights is not None:
15731574
weights = fnb._validate_and_ureduce_weights(a, axis, weights)
15741575
weights[np.isnan(a)] = np.nan  # for _nanquantile_1d
15751579

15761580
return fnb._ureduce(a,
15771581
func=_nanquantile_ureduce_func,
@@ -1595,31 +1599,31 @@ def _nanquantile_ureduce_func(a, q, axis=None, weights=None, out=None,
15951599
part = a.ravel()
15961600
if weights is not None:
15971601
weights = weights.ravel()
1602+
weights[np.isnan(part)] = np.nan # dealt with in _nanquantile_1d
15981603
result = _nanquantile_1d(part, q, weights, overwrite_input, method)
15991604
else:
16001605
if weights is not None:
1601-
1606+
# weights could be made from np.broadcast_to as non-writeable view
1607+
weights = weights.copy() # this makes it writeable
1608+
weights[np.isnan(a)] = np.nan # dealt with in _nanquantile_1d
16021609
if axis != -1: # move data to axis=-1 for np.vectorize() below
16031610
a = np.moveaxis(a, axis, destination=-1)
16041611
weights = np.moveaxis(weights, axis, destination=-1)
16051612

16061613
def _vectorize_nanquantile_1d(arr1d, ws1d):
16071614
return _nanquantile_1d(arr1d, q, ws1d, overwrite_input, method)
16081615

1609-
n = a.shape[axis]
1610-
m = len(q)
1616+
n = a.shape[axis] if a.ndim else 1
1617+
m = q.shape[axis] if q.ndim else ""
16111618
vectorized_nanquantile_1d =\
16121619
np.vectorize(_vectorize_nanquantile_1d,
16131620
signature=f"({n}),({n})->({m})")
16141621

16151622
result = vectorized_nanquantile_1d(a, weights)
16161623

16171624
# now move data axis back for consistency with the no-weights case
1618-
if axis != -1:
1619-
if q.ndim == 0:
1620-
result = np.moveaxis(result, -1, axis)
1621-
else:
1622-
result = np.moveaxis(result, -1, 0)
1625+
if axis != -1 and q.ndim:
1626+
result = np.moveaxis(result, -1, 0)
16231627

16241628
else:
16251629
result = np.apply_along_axis(_nanquantile_1d, axis, a, q,

0 commit comments

Comments
 (0)
0