Fixed bug for regression test #1181 in scipy unit tests; ksdensity is… · solvents/matplotlib@079f4e7 · GitHub

Commit 079f4e7

khchan authored and solvents committed
Fixed bug for regression test matplotlib#1181 in scipy unit tests; ksdensity is now referred to as gaussian_kde and exists as a class in mlab.
Fixed list comp position bug and updated examples
1 parent 887d081 commit 079f4e7

File tree

3 files changed: +149 −122 lines changed
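
At a glance, the mlab API change this commit makes (a minimal sketch based on the diffs below; the sample data is made up):

    import numpy as np
    import matplotlib.mlab as mlab

    data = np.random.normal(size=100)

    # Before: ksdensity() returned a dict evaluated on a fixed 100-point grid.
    #   kde_dict = mlab.ksdensity(data, 'silverman')
    #   pdf = kde_dict['result']

    # After: gaussian_kde is a class; callers choose the grid and call
    # evaluate() (or the instance itself) to get the estimated pdf.
    kde = mlab.gaussian_kde(data, 'silverman')
    coords = np.linspace(data.min(), data.max(), 100)
    pdf = kde.evaluate(coords)   # equivalent: kde(coords)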

examples/statistics/violinplot_demo.py

Lines changed: 13 additions & 7 deletions
@@ -8,7 +8,7 @@
 
 # fake data
 fs = 10 # fontsize
-pos = range(5)
+pos = [1,2,4,5,7,8]
 data = [np.random.normal(size=100) for i in pos]
 
 # TODO: future customizability dicts go here
@@ -25,22 +25,28 @@
 
 fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(6,6))
 
-axes[0, 0].violinplot(data, pos, width=0.1)
+axes[0, 0].violinplot(data, pos, points=20, widths=0.1, showmeans=True,
+                      showextrema=True, showmedians=True)
 axes[0, 0].set_title('Custom violinplot 1', fontsize=fs)
 
-axes[0, 1].violinplot(data, pos, width=0.3)
+axes[0, 1].violinplot(data, pos, points=40, widths=0.3, showmeans=True,
+                      showextrema=True, showmedians=True)
 axes[0, 1].set_title('Custom violinplot 2', fontsize=fs)
 
-axes[0, 2].violinplot(data, pos, width=0.5)
+axes[0, 2].violinplot(data, pos, points=60, widths=0.5, showmeans=True,
+                      showextrema=True, showmedians=True)
 axes[0, 2].set_title('Custom violinplot 3', fontsize=fs)
 
-axes[1, 0].violinplot(data, pos, width=0.7)
+axes[1, 0].violinplot(data, pos, points=80, vert=False, widths=0.7,
+                      showmeans=True, showextrema=True, showmedians=True)
 axes[1, 0].set_title('Custom violinplot 4', fontsize=fs)
 
-axes[1, 1].violinplot(data, pos, width=0.9)
+axes[1, 1].violinplot(data, pos, points=100, vert=False, widths=0.9,
+                      showmeans=True, showextrema=True, showmedians=True)
 axes[1, 1].set_title('Custom violinplot 5', fontsize=fs)
 
-axes[1, 2].violinplot(data, pos, width=1.1)
+axes[1, 2].violinplot(data, pos, points=200, vert=False, widths=1.1,
+                      showmeans=True, showextrema=True, showmedians=True)
 axes[1, 2].set_title('Custom violinplot 6', fontsize=fs)
 
 for ax in axes.flatten():
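
The demo above exercises the updated Axes.violinplot signature (see the _axes.py diff below). A minimal standalone sketch, with made-up toy data:

    import numpy as np
    import matplotlib.pyplot as plt

    pos = [1, 2, 4, 5, 7, 8]
    data = [np.random.normal(size=100) for _ in pos]

    fig, ax = plt.subplots()
    # points sets the KDE evaluation grid size, widths the violin width;
    # the show* flags toggle the mean/extrema/median markers.
    ax.violinplot(data, pos, points=40, widths=0.5,
                  showmeans=True, showextrema=True, showmedians=True)
    plt.show()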

lib/matplotlib/axes/_axes.py

Lines changed: 19 additions & 16 deletions
@@ -6752,7 +6752,7 @@ def matshow(self, Z, **kwargs):
                                               integer=True))
         return im
 
-    def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=False,
+    def violinplot(self, dataset, positions=None, points=100, vert=True, widths=0.5, showmeans=False,
                    showextrema=True, showmedians=False):
         """
         Make a violin plot.
@@ -6775,6 +6775,9 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
         positions : array-like, default = [1, 2, ..., n]
           Sets the positions of the violins. The ticks and limits are
           automatically set to match the positions.
+
+        points: array-like, default = 100
+          Number of points to evaluate pdf estimation for Gaussian kernel
 
         vert : bool, default = True.
           If true, creates vertical violin plot
@@ -6833,6 +6836,9 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
        cbars = None
        cmedians = None
 
+        datashape_message = ("List of violinplot statistics and `{0}` "
+                             "values must have same the length")
+
        # Validate positions
        if positions == None:
            positions = range(1, len(dataset) + 1)
@@ -6857,13 +6863,14 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
        # Render violins
        for d,p,w in zip(dataset,positions,widths):
            # Calculate the kernel density
-            kde = mlab.ksdensity(d)
-            m = kde['xmin']
-            M = kde['xmax']
-            mean = kde['mean']
-            median = kde['median']
-            v = kde['result']
-            coords = np.arange(m,M,(M-m)/100.)
+            kde = mlab.gaussian_kde(d)
+            m = kde.dataset.min()
+            M = kde.dataset.max()
+            mean = np.mean(kde.dataset)
+            median = np.median(kde.dataset)
+            coords = np.arange(m,M,(M-m)/float(points))
+
+            v = kde.evaluate(coords)
 
            # Since each data point p is plotted from v-p to v+p,
            # we need to scale it by an additional 0.5 factor so that we get
@@ -6873,10 +6880,10 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
            # create vertical violin plot
            if vert:
                bodies += [self.fill_betweenx(coords,
-                                             -v+p,
-                                             v+p,
-                                             facecolor='y',
-                                             alpha=0.3)]
+                                              -v+p,
+                                              v+p,
+                                              facecolor='y',
+                                              alpha=0.3)]
            # create horizontal violin plot
            else:
                bodies += [self.fill_between(coords,
@@ -6922,10 +6929,6 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5, showmeans=F
        if showmedians:
            cmedians = self.vlines(medians, pmins, pmaxes, colors='r')
 
-
-
-
-
        # Reset hold
        self.hold(holdStatus)
 
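
The new `points` argument is the number of grid points on which the Gaussian KDE is evaluated per violin. A minimal sketch of the grid/evaluate pattern, mirroring the rendering loop in the diff above (gaussian_kde itself is defined in the mlab.py diff below; the data is made up):

    import numpy as np
    import matplotlib.mlab as mlab

    d = np.random.normal(size=100)   # one violin's samples
    points = 100                     # grid size, as in violinplot()

    kde = mlab.gaussian_kde(d)
    m, M = kde.dataset.min(), kde.dataset.max()
    coords = np.arange(m, M, (M - m) / float(points))  # evaluation grid
    v = kde.evaluate(coords)         # estimated pdf at each grid point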

lib/matplotlib/mlab.py

Lines changed: 117 additions & 99 deletions
@@ -3678,12 +3678,12 @@ def stineman_interp(xi,x,y,yp=None):
                         1/(dy1+dy2),))
     return yi
 
-def ksdensity(dataset, bw_method=None):
+class gaussian_kde(object):
     """
     Representation of a kernel-density estimate using Gaussian kernels.
 
    Call signature::
-        kde_dict = ksdensity(dataset, 'silverman')
+        kde = gaussian_kde(dataset, 'silverman')
 
    Parameters
    ----------
@@ -3698,10 +3698,10 @@ def ksdensity(dataset, bw_method=None):
    Attributes
    ----------
    dataset : ndarray
-        The dataset with which `ksdensity` was initialized.
-    d : int
+        The dataset with which `gaussian_kde` was initialized.
+    dim : int
        Number of dimensions.
-    n : int
+    num_dp : int
        Number of datapoints.
    factor : float
        The bandwidth factor, obtained from `kde.covariance_factor`, with which
@@ -3712,117 +3712,135 @@
    inv_cov : ndarray
        The inverse of `covariance`.
 
-    Returns
+    Methods
    -------
-    A dictionary mapping each various aspects of the computed KDE.
-    The dictionary has the following keys:
-
-        xmin : number
-            The min of the input dataset
-        xmax : number
-            The max of the input dataset
-        mean : number
-            The mean of the result
-        median: number
-            The median of the result
-        result: (# of points,)-array
-            The array of the evaluated PDF estimation
-
-    Raises
-    ------
-    ValueError : if the dimensionality of the input points is different than
-                 the dimensionality of the KDE.
+    kde.evaluate(points) : ndarray
+        Evaluate the estimated pdf on a provided set of points.
+    kde(points) : ndarray
+        Same as kde.evaluate(points)
+    kde.set_bandwidth(bw_method='scott') : None
+        Computes the bandwidth, i.e. the coefficient that multiplies the data
+        covariance matrix to obtain the kernel covariance matrix.
+        .. versionadded:: 0.11.0
+    kde.covariance_factor : float
+        Computes the coefficient (`kde.factor`) that multiplies the data
+        covariance matrix to obtain the kernel covariance matrix.
+        The default is `scotts_factor`. A subclass can overwrite this method
+        to provide a different method, or set it through a call to
+        `kde.set_bandwidth`.
 
    """
 
    # This implementation with minor modification was too good to pass up.
    # from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py
 
-    dataset = np.array(np.atleast_2d(dataset))
-    xmin = dataset.min()
-    xmax = dataset.max()
+    def __init__(self, dataset, bw_method=None):
+        self.dataset = np.atleast_2d(dataset)
+        if not self.dataset.size > 1:
+            raise ValueError("`dataset` input should have multiple elements.")
 
-    if not dataset.size > 1:
-        raise ValueError("`dataset` input should have multiple elements.")
+        self.dim, self.num_dp = self.dataset.shape
+        self.set_bandwidth(bw_method=bw_method)
 
-    dim, num_dp = dataset.shape
+    def scotts_factor(self):
+        return np.power(self.num_dp, -1./(self.dim+4))
 
-    # ----------------------------------------------
-    # Set Bandwidth, defaulted to Scott's Factor
-    # ----------------------------------------------
-    scotts_factor = lambda: np.power(num_dp, -1./(dim+4))
-    silverman_factor = lambda: np.power(num_dp*(dim+2.0)/4.0, -1./(dim+4))
+    def silverman_factor(self):
+        return np.power(self.num_dp*(self.dim+2.0)/4.0, -1./(self.dim+4))
 
-    # Default method to calculate bandwidth, can be overwritten by subclass
+    # Default method to calculate bandwidth, can be overwritten by subclass
    covariance_factor = scotts_factor
 
-    if bw_method is None:
-        pass
-    elif bw_method == 'scott':
-        covariance_factor = scotts_factor
-    elif bw_method == 'silverman':
-        covariance_factor = silverman_factor
-    elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types):
-        covariance_factor = lambda: bw_method
-    else:
-        msg = "`bw_method` should be 'scott', 'silverman', or a scalar"
-        raise ValueError(msg)
-
-    # ---------------------------------------------------------------
-    # Computes covariance matrix for each Gaussian kernel with factor
-    # ---------------------------------------------------------------
-    factor = covariance_factor()
-
-    # Cache covariance and inverse covariance of the data
-    data_covariance = np.atleast_2d(np.cov(dataset, rowvar=1, bias=False))
-    data_inv_cov = np.linalg.inv(data_covariance)
-
-    covariance = data_covariance * factor**2
-    inv_cov = data_inv_cov / factor**2
-    norm_factor = np.sqrt(np.linalg.det(2*np.pi*covariance)) * num_dp
-
-    # ----------------------------------------------
-    # Evaluate the estimated pdf on a set of points.
-    # ----------------------------------------------
-    points = np.atleast_2d(np.arange(xmin, xmax, (xmax-xmin)/100.))
-
-    dim_pts, num_dp_pts = np.array(points).shape
-    if dim_pts != dim:
-        if dim_pts == 1 and num_dp_pts == num_dp:
-            # points was passed in as a row vector
-            points = np.reshape(points, (dim, 1))
-            num_dp_pts = 1
+    def set_bandwidth(self, bw_method=None):
+        if bw_method is None:
+            pass
+        elif bw_method == 'scott':
+            self.covariance_factor = self.scotts_factor
+        elif bw_method == 'silverman':
+            self.covariance_factor = self.silverman_factor
+        elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types):
+            self._bw_method = 'use constant'
+            self.covariance_factor = lambda: bw_method
+        elif callable(bw_method):
+            self._bw_method = bw_method
+            self.covariance_factor = lambda: self._bw_method(self)
        else:
-            msg = "points have dimension %s,\
-                   dataset has dimension %s" % (dim_pts, dim)
+            msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
+                  "or a callable."
            raise ValueError(msg)
 
-    result = np.zeros((num_dp_pts,), dtype=np.float)
+        self._compute_covariance()
 
-    if num_dp_pts >= num_dp:
-        # there are more points than data, so loop over data
-        for i in range(num_dp):
-            diff = dataset[:, i, np.newaxis] - points
-            tdiff = np.dot(inv_cov, diff)
-            energy = np.sum(diff*tdiff, axis=0) / 2.0
-            result = result + np.exp(-energy)
-    else:
-        # loop over points
-        for i in range(num_dp_pts):
-            diff = dataset - points[:, i, np.newaxis]
-            tdiff = np.dot(inv_cov, diff)
-            energy = np.sum(diff * tdiff, axis=0) / 2.0
-            result[i] = np.sum(np.exp(-energy), axis=0)
-
-    result = result / norm_factor
-
-    return {
-        'xmin': xmin,
-        'xmax': xmax,
-        'mean': np.mean(dataset),
-        'median': np.median(dataset),
-        'result': result
-    }
+    def _compute_covariance(self):
+        """Computes the covariance matrix for each Gaussian kernel using
+        covariance_factor().
+        """
+        self.factor = self.covariance_factor()
+        # Cache covariance and inverse covariance of the data
+        if not hasattr(self, '_data_inv_cov'):
+            self._data_covariance = np.atleast_2d(np.cov(self.dataset, rowvar=1,
+                                                         bias=False))
+            self._data_inv_cov = np.linalg.inv(self._data_covariance)
+
+        self.covariance = self._data_covariance * self.factor**2
+        self.inv_cov = self._data_inv_cov / self.factor**2
+        self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.num_dp
+
+    def evaluate(self, points):
+        """Evaluate the estimated pdf on a set of points.
+
+        Parameters
+        ----------
+        points : (# of dimensions, # of points)-array
+            Alternatively, a (# of dimensions,) vector can be passed in and
+            treated as a single point.
+
+        Returns
+        -------
+        values : (# of points,)-array
+            The values at each point.
+
+        Raises
+        ------
+        ValueError : if the dimensionality of the input points is different than
+                     the dimensionality of the KDE.
+
+        """
+        points = np.atleast_2d(points)
+
+        d, m = points.shape
+        if d != self.dim:
+            if d == 1 and m == self.dim:
+                # points was passed in as a row vector
+                points = np.reshape(points, (self.dim, 1))
+                m = 1
+            else:
+                msg = "points have dimension %s, dataset has dimension %s" % (d,
+                    self.dim)
+                raise ValueError(msg)
+
+        result = np.zeros((m,), dtype=np.float)
+
+        if m >= self.num_dp:
+            # there are more points than data, so loop over data
+            for i in range(self.num_dp):
+                diff = self.dataset[:, i, np.newaxis] - points
+                tdiff = np.dot(self.inv_cov, diff)
+                energy = np.sum(diff*tdiff,axis=0) / 2.0
+                result = result + np.exp(-energy)
+        else:
+            # loop over points
+            for i in range(m):
+                diff = self.dataset - points[:, i, np.newaxis]
+                tdiff = np.dot(self.inv_cov, diff)
+                energy = np.sum(diff * tdiff, axis=0) / 2.0
+                result[i] = np.sum(np.exp(-energy), axis=0)
+
+        result = result / self._norm_factor
+
+        return result
+
+    __call__ = evaluate
 
 ##################################################
 # Code related to things in and around polygons
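
Since set_bandwidth now also accepts a callable in addition to 'scott', 'silverman', and a scalar, here is a short sketch of the bandwidth options shown in the diff above (the data and the callable are made up for illustration):

    import numpy as np
    import matplotlib.mlab as mlab

    x = np.random.normal(size=200)

    kde = mlab.gaussian_kde(x)                 # default: Scott's factor
    kde.set_bandwidth(bw_method='silverman')   # named rule
    kde.set_bandwidth(bw_method=0.2)           # fixed scalar factor
    # a callable receives the kde instance and returns a factor
    kde.set_bandwidth(bw_method=lambda k: k.silverman_factor() / 2.)
    pdf = kde(np.linspace(x.min(), x.max(), 50))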
