@@ -3656,12 +3656,12 @@ def stineman_interp(xi,x,y,yp=None):
3656
3656
1 / (dy1 + dy2 ),))
3657
3657
return yi
3658
3658
3659
- class gaussian_kde ( object ):
3659
+ def ksdensity ( dataset , bw_method = None ):
3660
3660
"""
3661
3661
Representation of a kernel-density estimate using Gaussian kernels.
3662
3662
3663
3663
Call signature::
3664
- kde = gaussian_kde (dataset, 'silverman')
3664
+ kde_dict = ksdensity (dataset, 'silverman')
3665
3665
3666
3666
Parameters
3667
3667
----------
@@ -3676,10 +3676,10 @@ class gaussian_kde(object):
3676
3676
Attributes
3677
3677
----------
3678
3678
dataset : ndarray
3679
- The dataset with which `gaussian_kde ` was initialized.
3680
- dim : int
3679
+ The dataset with which `ksdensity ` was initialized.
3680
+ d : int
3681
3681
Number of dimensions.
3682
- num_dp : int
3682
+ n : int
3683
3683
Number of datapoints.
3684
3684
factor : float
3685
3685
The bandwidth factor, obtained from `kde.covariance_factor`, with which
@@ -3690,135 +3690,117 @@ class gaussian_kde(object):
3690
3690
inv_cov : ndarray
3691
3691
The inverse of `covariance`.
3692
3692
3693
- Methods
3693
+ Returns
3694
3694
-------
3695
- kde.evaluate(points) : ndarray
3696
- Evaluate the estimated pdf on a provided set of points.
3697
- kde(points) : ndarray
3698
- Same as kde.evaluate(points)
3699
- kde.set_bandwidth(bw_method='scott') : None
3700
- Computes the bandwidth, i.e. the coefficient that multiplies the data
3701
- covariance matrix to obtain the kernel covariance matrix.
3702
- .. versionadded:: 0.11.0
3703
- kde.covariance_factor : float
3704
- Computes the coefficient (`kde.factor`) that multiplies the data
3705
- covariance matrix to obtain the kernel covariance matrix.
3706
- The default is `scotts_factor`. A subclass can overwrite this method
3707
- to provide a different method, or set it through a call to
3708
- `kde.set_bandwidth`.
3695
+ A dictionary mapping each various aspects of the computed KDE.
3696
+ The dictionary has the following keys:
3697
+
3698
+ xmin : number
3699
+ The min of the input dataset
3700
+ xmax : number
3701
+ The max of the input dataset
3702
+ mean : number
3703
+ The mean of the result
3704
+ median: number
3705
+ The median of the result
3706
+ result: (# of points,)-array
3707
+ The array of the evaluated PDF estimation
3708
+
3709
+ Raises
3710
+ ------
3711
+ ValueError : if the dimensionality of the input points is different than
3712
+ the dimensionality of the KDE.
3709
3713
3710
3714
"""
3711
3715
3712
3716
# This implementation with minor modification was too good to pass up.
3713
3717
# from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py
3714
3718
3715
- def __init__ (self , dataset , bw_method = None ):
3716
- self .dataset = np .atleast_2d (dataset )
3717
- if not self .dataset .size > 1 :
3718
- raise ValueError ("`dataset` input should have multiple elements." )
3719
+ dataset = np .array (np .atleast_2d (dataset ))
3720
+ xmin = dataset .min ()
3721
+ xmax = dataset .max ()
3719
3722
3720
- self . dim , self . num_dp = self . dataset . shape
3721
- self . set_bandwidth ( bw_method = bw_method )
3723
+ if not dataset . size > 1 :
3724
+ raise ValueError ( "`dataset` input should have multiple elements." )
3722
3725
3723
- def scotts_factor (self ):
3724
- return np .power (self .num_dp , - 1. / (self .dim + 4 ))
3726
+ dim , num_dp = dataset .shape
3725
3727
3726
- def silverman_factor (self ):
3727
- return np .power (self .num_dp * (self .dim + 2.0 )/ 4.0 , - 1. / (self .dim + 4 ))
3728
+ # ----------------------------------------------
3729
+ # Set Bandwidth, defaulted to Scott's Factor
3730
+ # ----------------------------------------------
3731
+ scotts_factor = lambda : np .power (num_dp , - 1. / (dim + 4 ))
3732
+ silverman_factor = lambda : np .power (num_dp * (dim + 2.0 )/ 4.0 , - 1. / (dim + 4 ))
3728
3733
3729
- # Default method to calculate bandwidth, can be overwritten by subclass
3734
+ # Default method to calculate bandwidth, can be overwritten by subclass
3730
3735
covariance_factor = scotts_factor
3731
3736
3732
- def set_bandwidth (self , bw_method = None ):
3733
- if bw_method is None :
3734
- pass
3735
- elif bw_method == 'scott' :
3736
- self .covariance_factor = self .scotts_factor
3737
- elif bw_method == 'silverman' :
3738
- self .covariance_factor = self .silverman_factor
3739
- elif np .isscalar (bw_method ) and not isinstance (bw_method , six .string_types ):
3740
- self ._bw_method = 'use constant'
3741
- self .covariance_factor = lambda : bw_method
3742
- elif callable (bw_method ):
3743
- self ._bw_method = bw_method
3744
- self .covariance_factor = lambda : self ._bw_method (self )
3737
+ if bw_method is None :
3738
+ pass
3739
+ elif bw_method == 'scott' :
3740
+ covariance_factor = scotts_factor
3741
+ elif bw_method == 'silverman' :
3742
+ covariance_factor = silverman_factor
3743
+ elif np .isscalar (bw_method ) and not isinstance (bw_method , six .string_types ):
3744
+ covariance_factor = lambda : bw_method
3745
+ else :
3746
+ msg = "`bw_method` should be 'scott', 'silverman', or a scalar"
3747
+ raise ValueError (msg )
3748
+
3749
+ # ---------------------------------------------------------------
3750
+ # Computes covariance matrix for each Gaussian kernel with factor
3751
+ # ---------------------------------------------------------------
3752
+ factor = covariance_factor ()
3753
+
3754
+ # Cache covariance and inverse covariance of the data
3755
+ data_covariance = np .atleast_2d (np .cov (dataset , rowvar = 1 , bias = False ))
3756
+ data_inv_cov = np .linalg .inv (data_covariance )
3757
+
3758
+ covariance = data_covariance * factor ** 2
3759
+ inv_cov = data_inv_cov / factor ** 2
3760
+ norm_factor = np .sqrt (np .linalg .det (2 * np .pi * covariance )) * num_dp
3761
+
3762
+ # ----------------------------------------------
3763
+ # Evaluate the estimated pdf on a set of points.
3764
+ # ----------------------------------------------
3765
+ points = np .atleast_2d (np .arange (xmin , xmax , (xmax - xmin )/ 100. ))
3766
+
3767
+ dim_pts , num_dp_pts = np .array (points ).shape
3768
+ if dim_pts != dim :
3769
+ if dim_pts == 1 and num_dp_pts == num_dp :
3770
+ # points was passed in as a row vector
3771
+ points = np .reshape (points , (dim , 1 ))
3772
+ num_dp_pts = 1
3745
3773
else :
3746
- msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
3747
- "or a callable."
3774
+ msg = "points have dimension %s, \
3775
+ dataset has dimension %s" % ( dim_pts , dim )
3748
3776
raise ValueError (msg )
3749
3777
3750
- self ._compute_covariance ()
3751
-
3752
- def _compute_covariance (self ):
3753
- """Computes the covariance matrix for each Gaussian kernel using
3754
- covariance_factor().
3755
- """
3756
- self .factor = self .covariance_factor ()
3757
- # Cache covariance and inverse covariance of the data
3758
- if not hasattr (self , '_data_inv_cov' ):
3759
- self ._data_covariance = np .atleast_2d (np .cov (self .dataset , rowvar = 1 ,
3760
- bias = False ))
3761
- self ._data_inv_cov = np .linalg .inv (self ._data_covariance )
3762
-
3763
- self .covariance = self ._data_covariance * self .factor ** 2
3764
- self .inv_cov = self ._data_inv_cov / self .factor ** 2
3765
- self ._norm_factor = np .sqrt (np .linalg .det (2 * np .pi * self .covariance )) * self .num_dp
3766
-
3767
- def evaluate (self , points ):
3768
- """Evaluate the estimated pdf on a set of points.
3769
-
3770
- Parameters
3771
- ----------
3772
- points : (# of dimensions, # of points)-array
3773
- Alternatively, a (# of dimensions,) vector can be passed in and
3774
- treated as a single point.
3775
-
3776
- Returns
3777
- -------
3778
- values : (# of points,)-array
3779
- The values at each point.
3780
-
3781
- Raises
3782
- ------
3783
- ValueError : if the dimensionality of the input points is different than
3784
- the dimensionality of the KDE.
3778
+ result = np .zeros ((num_dp_pts ,), dtype = np .float )
3785
3779
3786
- """
3787
- points = np .atleast_2d (points )
3788
-
3789
- d , m = points .shape
3790
- if d != self .dim :
3791
- if d == 1 and m == self .dim :
3792
- # points was passed in as a row vector
3793
- points = np .reshape (points , (self .dim , 1 ))
3794
- m = 1
3795
- else :
3796
- msg = "points have dimension %s, dataset has dimension %s" % (d ,
3797
- self .dim )
3798
- raise ValueError (msg )
3799
-
3800
- result = np .zeros ((m ,), dtype = np .float )
3801
-
3802
- if m >= self .num_dp :
3803
- # there are more points than data, so loop over data
3804
- for i in range (self .num_dp ):
3805
- diff = self .dataset [:, i , np .newaxis ] - points
3806
- tdiff = np .dot (self .inv_cov , diff )
3807
- energy = np .sum (diff * tdiff ,axis = 0 ) / 2.0
3808
- result = result + np .exp (- energy )
3809
- else :
3810
- # loop over points
3811
- for i in range (m ):
3812
- diff = self .dataset - points [:, i , np .newaxis ]
3813
- tdiff = np .dot (self .inv_cov , diff )
3814
- energy = np .sum (diff * tdiff , axis = 0 ) / 2.0
3815
- result [i ] = np .sum (np .exp (- energy ), axis = 0 )
3816
-
3817
- result = result / self ._norm_factor
3818
-
3819
- return result
3820
-
3821
- __call__ = evaluate
3780
+ if num_dp_pts >= num_dp :
3781
+ # there are more points than data, so loop over data
3782
+ for i in range (num_dp ):
3783
+ diff = dataset [:, i , np .newaxis ] - points
3784
+ tdiff = np .dot (inv_cov , diff )
3785
+ energy = np .sum (diff * tdiff , axis = 0 ) / 2.0
3786
+ result = result + np .exp (- energy )
3787
+ else :
3788
+ # loop over points
3789
+ for i in range (num_dp_pts ):
3790
+ diff = dataset - points [:, i , np .newaxis ]
3791
+ tdiff = np .dot (inv_cov , diff )
3792
+ energy = np .sum (diff * tdiff , axis = 0 ) / 2.0
3793
+ result [i ] = np .sum (np .exp (- energy ), axis = 0 )
3794
+
3795
+ result = result / norm_factor
3796
+
3797
+ return {
3798
+ 'xmin' : xmin ,
3799
+ 'xmax' : xmax ,
3800
+ 'mean' : np .mean (dataset ),
3801
+ 'median' : np .median (dataset ),
3802
+ 'result' : result
3803
+ }
3822
3804
3823
3805
##################################################
3824
3806
# Code related to things in and around polygons
0 commit comments