[WIP] Fix gradient boosting overflow by chenhe95 · Pull Request #7959 · scikit-learn/scikit-learn · GitHub
[WIP] Fix gradient boosting overflow #7959


Closed
README.rst (3 changes: 2 additions & 1 deletion)
@@ -40,6 +40,7 @@ It is currently maintained by a team of volunteers.

Website: http://scikit-learn.org


Installation
------------

@@ -136,7 +137,7 @@ Project History

The project was started in 2007 by David Cournapeau as a Google Summer
of Code project, and since then many volunteers have contributed. See
-the AUTHORS.rst file for a complete list of contributors.
+the `AUTHORS.rst <AUTHORS.rst>`_ file for a complete list of contributors.

The project is currently maintained by a team of volunteers.

doc/modules/neural_networks_supervised.rst (2 changes: 1 addition & 1 deletion)
@@ -290,7 +290,7 @@ For regression, MLP uses the Square Error loss function; written as,

.. math::

-    Loss(\hat{y},y,W) = \frac{1}{2}||\hat{y} - y ||_2^2 + \alpha ||W||_2^2
+    Loss(\hat{y},y,W) = \frac{1}{2}||\hat{y} - y ||_2^2 + \frac{\alpha}{2} ||W||_2^2


Starting from initial random weights, multi-layer perceptron (MLP) minimizes
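A side note on the corrected formula (my own sketch, not part of the diff): the added factor of 1/2 on the penalty follows the convention under which the regularization term contributes a plain alpha * W to the gradient, presumably matching what the MLP code actually minimizes:

```latex
\frac{\partial}{\partial W}\left(\frac{\alpha}{2}\,\lVert W \rVert_2^2\right) = \alpha W
\qquad\text{vs.}\qquad
\frac{\partial}{\partial W}\left(\alpha\,\lVert W \rVert_2^2\right) = 2\alpha W
```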
examples/linear_model/plot_ard.py (36 changes: 34 additions & 2 deletions)
@@ -15,6 +15,12 @@
The estimation of the model is done by iteratively maximizing the
marginal log-likelihood of the observations.
We also plot predictions and uncertainties for ARD
for one dimensional regression using polynomial feature expansion.
Note the uncertainty starts going up on the right side of the plot.
This is because these test samples are outside of the range of the training
samples.
"""
print(__doc__)

@@ -54,8 +60,8 @@
ols.fit(X, y)

###############################################################################
-# Plot the true weights, the estimated weights and the histogram of the
-# weights
+# Plot the true weights, the estimated weights, the histogram of the
+# weights, and predictions with standard deviations
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, color='darkblue', linestyle='-', linewidth=2,
@@ -81,4 +87,30 @@
plt.plot(clf.scores_, color='navy', linewidth=2)
plt.ylabel("Score")
plt.xlabel("Iterations")


# Plotting some predictions for polynomial regression
def f(x, noise_amount):
y = np.sqrt(x) * np.sin(x)
noise = np.random.normal(0, 1, len(x))
return y + noise_amount * noise


degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=1)
clf_poly = ARDRegression(threshold_lambda=1e5)
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color='navy',
label="Polynomial ARD", linewidth=2)
plt.plot(X_plot, y_plot, color='gold', linewidth=2,
label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
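The claim in the new docstring, that the predictive standard deviation grows once test points leave the training range, can be checked directly with the `return_std=True` support this PR adds to `ARDRegression.predict`. A minimal sketch (data and names are illustrative only):

```python
import numpy as np
from sklearn.linear_model import ARDRegression

rng = np.random.RandomState(0)
degree = 10
X = np.linspace(0, 10, 100)                     # training inputs live in [0, 10]
y = np.sqrt(X) * np.sin(X) + rng.normal(0, 1, len(X))

clf = ARDRegression(threshold_lambda=1e5)
clf.fit(np.vander(X, degree), y)

# Predictive std inside vs. outside the training range
_, std_inside = clf.predict(np.vander(np.array([5.0]), degree), return_std=True)
_, std_outside = clf.predict(np.vander(np.array([11.0]), degree), return_std=True)
print(std_inside, std_outside)   # std_outside should come out larger
```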
examples/linear_model/plot_bayesian_ridge.py (35 changes: 34 additions & 1 deletion)
@@ -15,6 +15,12 @@
The estimation of the model is done by iteratively maximizing the
marginal log-likelihood of the observations.
We also plot predictions and uncertainties for Bayesian Ridge Regression
for one dimensional regression using polynomial feature expansion.
Note the uncertainty starts going up on the right side of the plot.
This is because these test samples are outside of the range of the training
samples.
"""
print(__doc__)

@@ -51,7 +57,8 @@
ols.fit(X, y)

###############################################################################
-# Plot true weights, estimated weights and histogram of the weights
+# Plot true weights, estimated weights, histogram of the weights, and
+# predictions with standard deviations
lw = 2
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
@@ -77,4 +84,30 @@
plt.plot(clf.scores_, color='navy', linewidth=lw)
plt.ylabel("Score")
plt.xlabel("Iterations")


# Plotting some predictions for polynomial regression
def f(x, noise_amount):
y = np.sqrt(x) * np.sin(x)
noise = np.random.normal(0, 1, len(x))
return y + noise_amount * noise


degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=0.1)
clf_poly = BayesianRidge()
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color='navy',
label="Polynomial Bayesian Ridge Regression", linewidth=lw)
plt.plot(X_plot, y_plot, color='gold', linewidth=lw,
label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
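As a sanity check on the error bars above: per the implementation added to `sklearn/linear_model/bayes.py` below, the predictive variance returned by `predict(..., return_std=True)` is a weight-uncertainty term `x^T sigma_ x` plus the noise term `1 / alpha_`, and the two pieces can be reassembled by hand. A minimal sketch with default settings (illustrative data only):

```python
import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(0)
X = rng.randn(50, 5)
y = np.dot(X, np.array([1.0, 0.0, 1.0, -1.0, 0.0])) + 1.0 + 0.1 * rng.randn(50)

clf = BayesianRidge().fit(X, y)
X_new = rng.randn(3, 5)
y_mean, y_std = clf.predict(X_new, return_std=True)

# Recompute the predictive variance from the fitted attributes
weight_term = (np.dot(X_new, clf.sigma_) * X_new).sum(axis=1)   # x^T Sigma x
noise_term = 1.0 / clf.alpha_                                    # estimated noise variance
assert np.allclose(y_std ** 2, weight_term + noise_term)
```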
sklearn/ensemble/gradient_boosting.py (2 changes: 1 addition & 1 deletion)
@@ -511,7 +511,7 @@ def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
numerator = np.sum(sample_weight * residual)
denominator = np.sum(sample_weight * (y - residual) * (1 - y + residual))

-        if denominator == 0.0:
+        if np.isclose(denominator, 0.0):
tree.value[leaf, 0, 0] = 0.0
else:
tree.value[leaf, 0, 0] = numerator / denominator
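To see why the exact comparison is not enough, here is a standalone sketch of the failure mode (not part of the diff): for the binomial deviance, `y - residual` is the predicted probability `p`, so in a leaf where every label is 1 but `p` is around 1e-10 the denominator `sum(w * p * (1 - p))` is tiny yet not exactly zero. The old `== 0.0` test lets the division through and the leaf value explodes; `np.isclose` with its default `atol=1e-8` treats it as zero and the leaf value is set to 0.0 instead.

```python
import numpy as np

# Leaf where every sample has label 1 but the predicted probability p is ~1e-10
sample_weight = np.ones(3)
y = np.ones(3)
p = np.full(3, 1e-10)
residual = y - p                      # binomial-deviance residual, ~1 here

numerator = np.sum(sample_weight * residual)
denominator = np.sum(sample_weight * (y - residual) * (1 - y + residual))  # = sum(p * (1 - p))

print(denominator == 0.0)             # False: the exact test does not fire
print(np.isclose(denominator, 0.0))   # True: the near-zero case is caught
print(numerator / denominator)        # ~1e10, the leaf value the old code would keep
```

Note that with the default tolerances any denominator below roughly 1e-8 in absolute value is now treated as zero.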
sklearn/linear_model/bayes.py (109 changes: 103 additions & 6 deletions)
@@ -91,6 +91,9 @@ class BayesianRidge(LinearModel, RegressorMixin):
lambda_ : float
estimated precision of the weights.
sigma_ : array, shape = (n_features, n_features)
estimated variance-covariance matrix of the weights
scores_ : float
if computed, value of the objective function (to be maximized)
@@ -109,6 +112,16 @@ class BayesianRidge(LinearModel, RegressorMixin):
Notes
-----
See examples/linear_model/plot_bayesian_ridge.py for an example.
References
----------
D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
Vol. 4, No. 3, 1992.
R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
Their beta is our self.alpha_
Their alpha is our self.lambda_
"""

def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
@@ -142,8 +155,10 @@ def fit(self, X, y):
self : returns an instance of self.
"""
X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
-        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
+        X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
X, y, self.fit_intercept, self.normalize, self.copy_X)
self.X_offset_ = X_offset_
self.X_scale_ = X_scale_
n_samples, n_features = X.shape

# Initialization of the values of the parameters
@@ -171,7 +186,8 @@ def fit(self, X, y):
# coef_ = sigma_^-1 * XT * y
if n_samples > n_features:
coef_ = np.dot(Vh.T,
-                           Vh / (eigen_vals_ + lambda_ / alpha_)[:, None])
+                           Vh / (eigen_vals_ +
+                                 lambda_ / alpha_)[:, np.newaxis])
coef_ = np.dot(coef_, XT_y)
if self.compute_score:
logdet_sigma_ = - np.sum(
@@ -216,10 +232,45 @@ def fit(self, X, y):
self.alpha_ = alpha_
self.lambda_ = lambda_
self.coef_ = coef_
sigma_ = np.dot(Vh.T,
Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis])
self.sigma_ = (1. / alpha_) * sigma_

-        self._set_intercept(X_offset, y_offset, X_scale)
+        self._set_intercept(X_offset_, y_offset_, X_scale_)
return self

def predict(self, X, return_std=False):
"""Predict using the linear model.
In addition to the mean of the predictive distribution, also its
standard deviation can be returned.
Parameters
----------
X : {array-like, sparse matrix}, shape = (n_samples, n_features)
Samples.
return_std : boolean, optional
Whether to return the standard deviation of posterior prediction.
Returns
-------
y_mean : array, shape = (n_samples,)
Mean of predictive distribution of query points.
y_std : array, shape = (n_samples,)
Standard deviation of predictive distribution of query points.
"""
y_mean = self._decision_function(X)
if return_std is False:
return y_mean
else:
if self.normalize:
X = (X - self.X_offset_) / self.X_scale_
sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
return y_mean, y_std


###############################################################################
# ARD (Automatic Relevance Determination) regression
@@ -323,6 +374,19 @@ class ARDRegression(LinearModel, RegressorMixin):
Notes
--------
See examples/linear_model/plot_ard.py for an example.
References
----------
D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
competition, ASHRAE Transactions, 1994.
R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
Their beta is our self.alpha_
Their alpha is our self.lambda_
ARD is a little different than the slide: only dimensions/features for
which self.lambda_ < self.threshold_lambda are kept and the rest are
discarded.
"""

def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
@@ -365,7 +429,7 @@ def fit(self, X, y):
n_samples, n_features = X.shape
coef_ = np.zeros(n_features)

-        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
+        X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
X, y, self.fit_intercept, self.normalize, self.copy_X)

# Launch the convergence loop
@@ -417,7 +481,7 @@ def fit(self, X, y):
s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
s += alpha_1 * log(alpha_) - alpha_2 * alpha_
s += 0.5 * (fast_logdet(sigma_) + n_samples * log(alpha_) +
np.sum(np.log(lambda_)))
np.sum(np.log(lambda_)))
s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_ ** 2).sum())
self.scores_.append(s)

@@ -432,5 +496,38 @@ def fit(self, X, y):
self.alpha_ = alpha_
self.sigma_ = sigma_
self.lambda_ = lambda_
-        self._set_intercept(X_offset, y_offset, X_scale)
+        self._set_intercept(X_offset_, y_offset_, X_scale_)
return self

def predict(self, X, return_std=False):
"""Predict using the linear model.
In addition to the mean of the predictive distribution, also its
standard deviation can be returned.
Parameters
----------
X : {array-like, sparse matrix}, shape = (n_samples, n_features)
Samples.
return_std : boolean, optional
Whether to return the standard deviation of posterior prediction.
Returns
-------
y_mean : array, shape = (n_samples,)
Mean of predictive distribution of query points.
y_std : array, shape = (n_samples,)
Standard deviation of predictive distribution of query points.
"""
y_mean = self._decision_function(X)
if return_std is False:
return y_mean
else:
if self.normalize:
X = (X - self.X_offset_) / self.X_scale_
X = X[:, self.lambda_ < self.threshold_lambda]
sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
return y_mean, y_std
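For readers following the MacKay and Salakhutdinov references added to the docstrings, both new `predict` methods implement the standard Gaussian predictive distribution. In terms of the fitted attributes (a sketch of the textbook result, written with the code's names rather than the references' alpha/beta):

```latex
p(y_\ast \mid x_\ast) = \mathcal{N}\!\bigl(y_\ast \mid x_\ast^{\top}\hat{w},\ \sigma^2(x_\ast)\bigr),
\qquad
\sigma^2(x_\ast) = \frac{1}{\texttt{alpha\_}} + x_\ast^{\top}\,\texttt{sigma\_}\,x_\ast
```

which is exactly `sigmas_squared_data + (1. / self.alpha_)` above; for `ARDRegression`, `x_\ast` is first restricted to the features with `lambda_ < threshold_lambda`, matching the reduced dimensionality of `sigma_`.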
sklearn/linear_model/tests/test_bayes.py (32 changes: 32 additions & 0 deletions)
@@ -56,3 +56,35 @@ def test_toy_ard_object():
# Check that the model could approximately learn the identity function
test = [[1], [3], [4]]
assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)


def test_return_std():
# Test return_std option for both Bayesian regressors
def f(X):
return np.dot(X, w) + b

def f_noise(X, noise_mult):
return f(X) + np.random.randn(X.shape[0])*noise_mult

d = 5
n_train = 50
n_test = 10

w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
b = 1.0

X = np.random.random((n_train, d))
X_test = np.random.random((n_test, d))

for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
y = f_noise(X, noise_mult)

m1 = BayesianRidge()
m1.fit(X, y)
y_mean1, y_std1 = m1.predict(X_test, return_std=True)
assert_array_almost_equal(y_std1, noise_mult, decimal=decimal)

m2 = ARDRegression()
m2.fit(X, y)
y_mean2, y_std2 = m2.predict(X_test, return_std=True)
assert_array_almost_equal(y_std2, noise_mult, decimal=decimal)