-
-
Notifications
You must be signed in to change notification settings - Fork 10.9k
MAINT: Move histogram and histogramdd into their own module #10186
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Actual diff, ignoring the move:--- :\numpy\numpy\lib\function_base.py 2017-12-09 14:16:48.000000000 -0800
+++ :\numpy\lib\histograms.py 2017-12-09 14:14:36.000000000 -0800
@@ -641,15 +411,15 @@
... rng.normal(loc=5, scale=2, size=1000)))
>>> plt.hist(a, bins='auto') # arguments are passed to np.histogram
>>> plt.title("Histogram with 'auto' bins")
>>> plt.show()
"""
- a = asarray(a)
+ a = np.asarray(a)
if weights is not None:
- weights = asarray(weights)
+ weights = np.asarray(weights)
if weights.shape != a.shape:
raise ValueError(
'weights should have the same shape as a.')
weights = weights.ravel()
a = a.ravel()
@@ -730,13 +500,13 @@
raise ValueError('`bins` must be 1d, when an array')
del bins
# compute the bins if only the count was specified
if n_equal_bins is not None:
- bin_edges = linspace(
+ bin_edges = np.linspace(
first_edge, last_edge, n_equal_bins + 1, endpoint=True)
# Histogram is an integer or a float array depending on the weights.
if weights is None:
ntype = np.dtype(np.intp)
else:
@@ -766,13 +536,13 @@
norm = n_equal_bins / (last_edge - first_edge)
# We iterate over blocks here for two reasons: the first is that for
# large arrays, it is actually faster (for example for a 10^8 array it
# is 2x as fast) and it results in a memory footprint 3x lower in the
# limit of large arrays.
- for i in arange(0, len(a), BLOCK):
+ for i in np.arange(0, len(a), BLOCK):
tmp_a = a[i:i+BLOCK]
if weights is None:
tmp_w = None
else:
tmp_w = weights[i:i + BLOCK]
@@ -811,19 +581,19 @@
n += np.bincount(indices, weights=tmp_w,
minlength=n_equal_bins).astype(ntype)
else:
# Compute via cumulative histogram
cum_n = np.zeros(bin_edges.shape, ntype)
if weights is None:
- for i in arange(0, len(a), BLOCK):
- sa = sort(a[i:i+BLOCK])
+ for i in np.arange(0, len(a), BLOCK):
+ sa = np.sort(a[i:i+BLOCK])
cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
sa.searchsorted(bin_edges[-1], 'right')]
else:
- zero = array(0, dtype=ntype)
- for i in arange(0, len(a), BLOCK):
+ zero = np.array(0, dtype=ntype)
+ for i in np.arange(0, len(a), BLOCK):
tmp_a = a[i:i+BLOCK]
tmp_w = weights[i:i+BLOCK]
sorting_index = np.argsort(tmp_a)
sa = tmp_a[sorting_index]
sw = tmp_w[sorting_index]
cw = np.concatenate(([zero], sw.cumsum()))
@@ -831,17 +601,17 @@
sa.searchsorted(bin_edges[-1], 'right')]
cum_n += cw[bin_index]
n = np.diff(cum_n)
if density:
- db = array(np.diff(bin_edges), float)
+ db = np.array(np.diff(bin_edges), float)
return n/db/n.sum(), bin_edges
elif normed:
# deprecated, buggy behavior. Remove for NumPy 2.0.0
- db = array(np.diff(bin_edges), float)
+ db = np.array(np.diff(bin_edges), float)
return n/(n*db).sum(), bin_edges
else:
return n, bin_edges
def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
@@ -898,20 +668,20 @@
try:
# Sample is an ND-array.
N, D = sample.shape
except (AttributeError, ValueError):
# Sample is a sequence of 1D arrays.
- sample = atleast_2d(sample).T
+ sample = np.atleast_2d(sample).T
N, D = sample.shape
- nbin = empty(D, int)
+ nbin = np.empty(D, int)
edges = D*[None]
dedges = D*[None]
if weights is not None:
- weights = asarray(weights)
+ weights = np.asarray(weights)
try:
M = len(bins)
if M != D:
raise ValueError(
'The dimension of bins must be equal to the dimension of the '
@@ -922,4245 +692,120 @@
# Select range for each dimension
# Used only if number of bins is given.
if range is None:
# Handle empty input. Range can't be determined in that case, use 0-1.
if N == 0:
- smin = zeros(D)
- smax = ones(D)
+ smin = np.zeros(D)
+ smax = np.ones(D)
else:
- smin = atleast_1d(array(sample.min(0), float))
- smax = atleast_1d(array(sample.max(0), float))
+ smin = np.atleast_1d(np.array(sample.min(0), float))
+ smax = np.atleast_1d(np.array(sample.max(0), float))
else:
if not np.all(np.isfinite(range)):
raise ValueError(
'range parameter must be finite.')
- smin = zeros(D)
- smax = zeros(D)
- for i in arange(D):
+ smin = np.zeros(D)
+ smax = np.zeros(D)
+ for i in np.arange(D):
smin[i], smax[i] = range[i]
# Make sure the bins have a finite width.
- for i in arange(len(smin)):
+ for i in np.arange(len(smin)):
if smin[i] == smax[i]:
smin[i] = smin[i] - .5
smax[i] = smax[i] + .5
# avoid rounding issues for comparisons when dealing with inexact types
if np.issubdtype(sample.dtype, np.inexact):
edge_dt = sample.dtype
else:
edge_dt = float
# Create edge arrays
- for i in arange(D):
- if isscalar(bins[i]):
+ for i in np.arange(D):
+ if np.isscalar(bins[i]):
if bins[i] < 1:
raise ValueError(
"Element at index %s in `bins` should be a positive "
"integer." % i)
nbin[i] = bins[i] + 2 # +2 for outlier bins
- edges[i] = linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt)
+ edges[i] = np.linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt)
else:
- edges[i] = asarray(bins[i], edge_dt)
+ edges[i] = np.asarray(bins[i], edge_dt)
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
- dedges[i] = diff(edges[i])
+ dedges[i] = np.diff(edges[i])
if np.any(np.asarray(dedges[i]) <= 0):
raise ValueError(
"Found bin edge of size <= 0. Did you specify `bins` with"
"non-monotonic sequence?")
- nbin = asarray(nbin)
+ nbin = np.asarray(nbin)
# Handle empty input.
if N == 0:
return np.zeros(nbin-2), edges
# Compute the bin number each sample falls into.
Ncount = {}
- for i in arange(D):
- Ncount[i] = digitize(sample[:, i], edges[i])
+ for i in np.arange(D):
+ Ncount[i] = np.digitize(sample[:, i], edges[i])
# Using digitize, values that fall on an edge are put in the right bin.
# For the rightmost bin, we want values equal to the right edge to be
# counted in the last bin, and not as an outlier.
- for i in arange(D):
+ for i in np.arange(D):
# Rounding precision
mindiff = dedges[i].min()
if not np.isinf(mindiff):
- decimal = int(-log10(mindiff)) + 6
+ decimal = int(-np.log10(mindiff)) + 6
# Find which points are on the rightmost edge.
not_smaller_than_edge = (sample[:, i] >= edges[i][-1])
- on_edge = (around(sample[:, i], decimal) ==
- around(edges[i][-1], decimal))
+ on_edge = (np.around(sample[:, i], decimal) ==
+ np.around(edges[i][-1], decimal))
# Shift these points one bin to the left.
- Ncount[i][nonzero(on_edge & not_smaller_than_edge)[0]] -= 1
+ Ncount[i][np.nonzero(on_edge & not_smaller_than_edge)[0]] -= 1
# Flattened histogram matrix (1D)
# Reshape is used so that overlarge arrays
# will raise an error.
- hist = zeros(nbin, float).reshape(-1)
+ hist = np.zeros(nbin, float).reshape(-1)
# Compute the sample indices in the flattened histogram matrix.
ni = nbin.argsort()
- xy = zeros(N, int)
- for i in arange(0, D-1):
+ xy = np.zeros(N, int)
+ for i in np.arange(0, D-1):
xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod()
xy += Ncount[ni[-1]]
# Compute the number of repetitions in xy and assign it to the
# flattened histmat.
if len(xy) == 0:
- return zeros(nbin-2, int), edges
+ return np.zeros(nbin-2, int), edges
- flatcount = bincount(xy, weights)
- a = arange(len(flatcount))
+ flatcount = np.bincount(xy, weights)
+ a = np.arange(len(flatcount))
hist[a] = flatcount
# Shape into a proper matrix
- hist = hist.reshape(sort(nbin))
- for i in arange(nbin.size):
+ hist = hist.reshape(np.sort(nbin))
+ for i in np.arange(nbin.size):
j = ni.argsort()[i]
hist = hist.swapaxes(i, j)
ni[i], ni[j] = ni[j], ni[i]
# Remove outliers (indices 0 and -1 for each dimension).
core = D*[slice(1, -1)]
hist = hist[core]
# Normalize if normed is True
if normed:
s = hist.sum()
- for i in arange(D):
- shape = ones(D, int)
+ for i in np.arange(D):
+ shape = np.ones(D, int)
shape[i] = nbin[i] - 2
hist = hist / dedges[i].reshape(shape)
hist /= s
if (hist.shape != nbin - 2).any():
raise RuntimeError(
"Internal Shape Error")
return hist, edges
Seems sensible |
Deliberately leaving |
Yep, I almost did the same a bit ago. I think it would be good to change the imports in the new module as well. Needs a release note.
Why does this need a release note? Nothing has changed to the end user.
In case someone has
Was tempted, but I'm not sure that having |
It wasn't, but too late for that. |
It's not too late to stop it expanding any further though. On the other hand, the only way things get into the top level numpy namespace is |
Maybe it should at some point, having the relevant imports scattered about gets confusing after a while. Ideally, everyone would just import this from numpy as I expect most everyone does at this point. Anyway, it could go under improvements, the entries there generally don't involve new functionality or changes to old functionality, but just serve to let folks know something has been done. There is a bit of the newsletter in the release notes. |
Alright, I'll fix up the imports and add a note |
Pretty much everything in lib ends up at the top level, except for recfunctions, which were considered experimental. I think it is good to conform to expectations. For completely new stuff, e.g., the new polynomial package, explicit import can be required. |
800 self-contained lines are easily enough to go in their own file, as are the 500 lines of tests. For compatibility, the names are still available through `np.lib.function_base.histogram` and `from np.lib.function_base import *` For simplicity of imports, all of the unqualified `np.` names are now qualified
084b6fa
to
34c9950
Compare
to maintain compatibility, aliased at ``np.lib.function_base.histogram(dd)``.

Code that does ``from np.lib.function_base import *`` will need to be updated
with the new location, and should consider not using ``import *`` in future.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Presumably we can get away with this? The alternative is to have duplicate entries in __all__
, but we already do that (#10198)...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed with this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is all OK
Ready to go, @charris? |
Hmm, it seems the release notes have to be merged twice to make GitHub think there are no conflicts.
Thanks Eric. |
800 self-contained lines are easily enough to go in their own file, as are the 500 lines of tests.
For compatibility, the names are still available through
np.lib.function_base.histogram
and ``from np.lib.function_base import *``
For simplicity of imports, all of the unqualified
np.
names are now qualified. Inspired by thinking about #10183, and realizing that there's already way too much histogram stuff in function_base.