From 88a564e81ba3b4eba7e1cd864e2ef68a39b572d1 Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Mon, 26 Mar 2018 00:16:27 -0700 Subject: [PATCH 1/7] MAINT: Make np.histogramdd error messages consistent with np.histogram --- numpy/lib/histograms.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index aa067a4312e8..c17683c7c17b 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -884,18 +884,18 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): if np.isscalar(bins[i]): if bins[i] < 1: raise ValueError( - "Element at index %s in `bins` should be a positive " - "integer." % i) + '`bins[{}]` must be positive, when an integer'.format(i)) nbin[i] = bins[i] + 2 # +2 for outlier bins edges[i] = np.linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt) else: edges[i] = np.asarray(bins[i], edge_dt) nbin[i] = len(edges[i]) + 1 # +1 for outlier bins dedges[i] = np.diff(edges[i]) + # not just monotonic, due to the use of mindiff below if np.any(np.asarray(dedges[i]) <= 0): raise ValueError( - "Found bin edge of size <= 0. 
Did you specify `bins` with" - "non-monotonic sequence?") + '`bins[{}]` must be strictly increasing, when an array' + .format(i)) nbin = np.asarray(nbin) From 89b402a9dab57c45eca0f942bde699ee6961a1f5 Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Mon, 26 Mar 2018 00:18:37 -0700 Subject: [PATCH 2/7] MAINT: Unify computation of `nbin[i]` --- numpy/lib/histograms.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index c17683c7c17b..50bafec8c85a 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -885,11 +885,10 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): if bins[i] < 1: raise ValueError( '`bins[{}]` must be positive, when an integer'.format(i)) - nbin[i] = bins[i] + 2 # +2 for outlier bins - edges[i] = np.linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt) + edges[i] = np.linspace(smin[i], smax[i], bins[i] + 1, dtype=edge_dt) else: edges[i] = np.asarray(bins[i], edge_dt) - nbin[i] = len(edges[i]) + 1 # +1 for outlier bins + nbin[i] = len(edges[i]) + 1 # includes an outlier on each end dedges[i] = np.diff(edges[i]) # not just monotonic, due to the use of mindiff below if np.any(np.asarray(dedges[i]) <= 0): From 52a739e2c1172876fb444a73dafd23247c826d6e Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Mon, 26 Mar 2018 00:25:11 -0700 Subject: [PATCH 3/7] MAINT: Only check bin monotonicity if not using linspace This also switches to doing comparisons rather than subtractions, for consistency with np.histogram. That change is not strictly necessary here as the arguments are not unsigned integer types (unlike in np.histogram), but it would be nice to support integer bins in future. 
--- numpy/lib/histograms.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index 50bafec8c85a..097df2053e35 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -888,13 +888,14 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): edges[i] = np.linspace(smin[i], smax[i], bins[i] + 1, dtype=edge_dt) else: edges[i] = np.asarray(bins[i], edge_dt) + # not just monotonic, due to the use of mindiff below + if np.any(edges[i][:-1] >= edges[i][1:]): + raise ValueError( + '`bins[{}]` must be strictly increasing, when an array' + .format(i)) + nbin[i] = len(edges[i]) + 1 # includes an outlier on each end dedges[i] = np.diff(edges[i]) - # not just monotonic, due to the use of mindiff below - if np.any(np.asarray(dedges[i]) <= 0): - raise ValueError( - '`bins[{}]` must be strictly increasing, when an array' - .format(i)) nbin = np.asarray(nbin) From fd92d02152d50735b5ffafcce3a7a722e366b192 Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Mon, 26 Mar 2018 00:28:49 -0700 Subject: [PATCH 4/7] MAINT: Give a more useful error message for bins of an incorrect dimension Previously gave `ValueError: object too deep for desired array` from an internal call. This also adds support for 0d array bincounts --- numpy/lib/histograms.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index 097df2053e35..a8cd9acb8cbe 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -881,18 +881,21 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): edge_dt = float # Create edge arrays for i in np.arange(D): - if np.isscalar(bins[i]): + if np.ndim(bins[i]) == 0: if bins[i] < 1: raise ValueError( '`bins[{}]` must be positive, when an integer'.format(i)) edges[i] = np.linspace(smin[i], smax[i], bins[i] + 1, dtype=edge_dt) - else: + elif np.ndim(bins[i]) == 1: edges[i] = 
np.asarray(bins[i], edge_dt) # not just monotonic, due to the use of mindiff below if np.any(edges[i][:-1] >= edges[i][1:]): raise ValueError( '`bins[{}]` must be strictly increasing, when an array' .format(i)) + else: + raise ValueError( + '`bins[{}]` must be a scalar or 1d array'.format(i)) nbin[i] = len(edges[i]) + 1 # includes an outlier on each end dedges[i] = np.diff(edges[i]) From 992163e51d48d26df05430690052e42dbdd7fe9b Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Mon, 26 Mar 2018 00:36:56 -0700 Subject: [PATCH 5/7] MAINT/ENH: Reuse range computation code from np.histogram in np.histogramdd This also adds support for inferring the range along a subset of the axes, rather than an all or nothing approach. --- doc/release/1.15.0-notes.rst | 6 +++++ numpy/lib/histograms.py | 44 ++++++++++++------------------------ 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst index fadce76849bc..a0d42bfa41a7 100644 --- a/doc/release/1.15.0-notes.rst +++ b/doc/release/1.15.0-notes.rst @@ -100,6 +100,12 @@ as usual with `errstate`. Dates, times, and timedeltas can now be histogrammed. The bin edges must be passed explicitly, and are not yet computed automatically. +``histogramdd`` allows explicit ranges to be given in a subset of axes +---------------------------------------------------------------------- +The ``range`` argument of `histogramdd` can now contain ``None`` values to +indicate that the range for the corresponding axis should be computed from the +data. Previously, this could not be specified on a per-axis basis. 
+ ``np.r_`` works with 0d arrays, and ``np.ma.mr_` works with ``np.ma.masked`` ---------------------------------------------------------------------------- 0d arrays passed to the `r_` and `mr_` concatenation helpers are now treated as diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index a8cd9acb8cbe..c0cfa07c75fe 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -791,9 +791,12 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): * The number of bins for all dimensions (nx=ny=...=bins). range : sequence, optional - A sequence of lower and upper bin edges to be used if the edges are - not given explicitly in `bins`. Defaults to the minimum and maximum - values along each dimension. + A sequence of length D, each an optional (lower, upper) tuple giving + the outer bin edges to be used if the edges are not given explicitly in + `bins`. + An entry of None in the sequence results in the minimum and maximum + values being used for the corresponding dimension. + The default, None, is equivalent to passing a tuple of D None values. normed : bool, optional If False, returns the number of samples in each bin. If True, returns the bin density ``bin_count / sample_count / bin_volume``. @@ -849,43 +852,26 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): # bins is an integer bins = D*[bins] - # Select range for each dimension - # Used only if number of bins is given. - if range is None: - # Handle empty input. Range can't be determined in that case, use 0-1. - if N == 0: - smin = np.zeros(D) - smax = np.ones(D) - else: - smin = np.atleast_1d(np.array(sample.min(0), float)) - smax = np.atleast_1d(np.array(sample.max(0), float)) - else: - if not np.all(np.isfinite(range)): - raise ValueError( - 'range parameter must be finite.') - smin = np.zeros(D) - smax = np.zeros(D) - for i in np.arange(D): - smin[i], smax[i] = range[i] - - # Make sure the bins have a finite width. 
- for i in np.arange(len(smin)): - if smin[i] == smax[i]: - smin[i] = smin[i] - .5 - smax[i] = smax[i] + .5 - # avoid rounding issues for comparisons when dealing with inexact types if np.issubdtype(sample.dtype, np.inexact): edge_dt = sample.dtype else: edge_dt = float + + # normalize the range argument + if range is None: + range = (None,) * D + elif len(range) != D: + raise ValueError('range argument must have one entry per dimension') + # Create edge arrays for i in np.arange(D): if np.ndim(bins[i]) == 0: if bins[i] < 1: raise ValueError( '`bins[{}]` must be positive, when an integer'.format(i)) - edges[i] = np.linspace(smin[i], smax[i], bins[i] + 1, dtype=edge_dt) + smin, smax = _get_outer_edges(sample[:,i], range[i]) + edges[i] = np.linspace(smin, smax, bins[i] + 1, dtype=edge_dt) elif np.ndim(bins[i]) == 1: edges[i] = np.asarray(bins[i], edge_dt) # not just monotonic, due to the use of mindiff below From 79e482a410dfb3dcdc47f409c1ef76826f00a98d Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Mon, 26 Mar 2018 00:40:00 -0700 Subject: [PATCH 6/7] DOC: Document the surprising behaviour of np.histogramdd on array_likes --- numpy/lib/histograms.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index c0cfa07c75fe..bd2f801863ee 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -779,10 +779,18 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): Parameters ---------- - sample : array_like - The data to be histogrammed. It must be an (N,D) array or data - that can be converted to such. The rows of the resulting array - are the coordinates of points in a D dimensional polytope. + sample : (N, D) array, or (D, N) array_like + The data to be histogrammed. 
+ + Note the unusual interpretation of sample when an array_like: + + * When an array, each row is a coordinate in a D-dimensional space - + such as ``histogramdd(np.array([p1, p2, p3]))``. + * When an array_like, each element is the list of values for a single + coordinate - such as ``histogramdd((X, Y, Z))``. + + The first form should be preferred. + bins : sequence or int, optional The bin specification: From c8a5f560f1eedd510ca5656b06dd1ca2ba9322bd Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Mon, 26 Mar 2018 00:40:53 -0700 Subject: [PATCH 7/7] BUG: Use intp for indices Fixes gh-8531 --- numpy/lib/histograms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index bd2f801863ee..66e2ccda1ac7 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -927,7 +927,7 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): # Compute the sample indices in the flattened histogram matrix. ni = nbin.argsort() - xy = np.zeros(N, int) + xy = np.zeros(N, np.intp) for i in np.arange(0, D-1): xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod() xy += Ncount[ni[-1]]