Merge pull request #5008 from TomDLT/logistic_multiclass · glouppe/scikit-learn@caeefff · GitHub


Commit caeefff

amueller authored and glouppe committed
Merge pull request scikit-learn#5008 from TomDLT/logistic_multiclass
[MRG+1] fix logistic regression class weights
2 parents 861ac13 + d439dc4 commit caeefff

File tree

6 files changed: +245 -43 lines changed
Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+r"""
+==============================================================
+Comparison of predictive distributions of different regressors
+==============================================================
+
+A simple one-dimensional, noisy regression problem addressed by three different
+regressors:
+
+1. A Gaussian Process
+2. A Random Forest
+3. A Bagging-based Regressor
+
+The regressors are fitted based on noisy observations where the magnitude of
+the noise at the different training points is constant and known. Plotted are
+both the mean and the pointwise 95% confidence interval of the predictions.
+The mean predictions are evaluated on noise-less test data using the mean-
+squared-error. The mean log probabilities of the noise-less test data are used
+to evaluate the predictive distributions (a normal distribution with the
+predicted mean and standard deviation) of the three regressors.
+
+This example is based on the example gaussian_process/plot_gp_regression.py.
+"""
+print(__doc__)
+
+# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+# Licence: BSD 3 clause
+
+import numpy as np
+from scipy.stats import norm
+from sklearn.gaussian_process import GaussianProcess
+from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
+from sklearn.metrics import mean_squared_error
+from matplotlib import pyplot as pl
+
+np.random.seed(1)
+
+
+def f(x):
+    """The function to predict."""
+    return x * np.sin(x)
+
+X = np.linspace(0.1, 9.9, 20)
+X = np.atleast_2d(X).T
+
+# Observations and noise
+y = f(X).ravel()
+dy = np.ones_like(y)
+noise = np.random.normal(0, dy)
+y += noise
+
+# Mesh the input space for evaluations of the real function, the prediction and
+# its standard deviation
+x = np.atleast_2d(np.linspace(0, 10, 1000)).T
+
+regrs = {"Gaussian Process": GaussianProcess(corr='squared_exponential',
+                                             theta0=1e-1, thetaL=1e-3,
+                                             thetaU=1, nugget=(dy / y) ** 2,
+                                             random_start=100),
+         "Random Forest": RandomForestRegressor(n_estimators=250),
+         "Bagging": BaggingRegressor(n_estimators=250)}
+
+
+# Plot predictive distributions of different regressors
+fig = pl.figure()
+# Plot the function and the observations
+pl.plot(x, f(x), 'r', label=u'$f(x) = x\,\sin(x)$')
+pl.fill(np.concatenate([x, x[::-1]]),
+        np.concatenate([f(x) - 1.9600, (f(x) + 1.9600)[::-1]]),
+        alpha=.3, fc='r', ec='None')
+pl.plot(X.ravel(), y, 'ko', zorder=5, label=u'Observations')
+# Plot predictive distributions of GP and Bagging
+colors = {"Gaussian Process": 'b', "Bagging": 'g'}
+mse = {}
+log_pdf_loss = {}
+for name, regr in regrs.items():
+    regr.fit(X, y)
+
+    # Make the prediction on the meshed x-axis (ask for standard deviation
+    # as well)
+    y_pred, sigma = regr.predict(x, with_std=True)
+
+    # Compute mean-squared error and log predictive loss
+    mse[name] = mean_squared_error(f(x), y_pred)
+    log_pdf_loss[name] = \
+        norm(y_pred, sigma).logpdf(f(x)).mean()
+
+    if name == "Random Forest":  # Skip because RF is very similar to Bagging
+        continue
+
+    # Plot 95% confidence interval based on the predictive standard deviation
+    pl.plot(x, y_pred, colors[name], label=name)
+    pl.fill(np.concatenate([x, x[::-1]]),
+            np.concatenate([y_pred - 1.9600 * sigma,
+                            (y_pred + 1.9600 * sigma)[::-1]]),
+            alpha=.3, fc=colors[name], ec='None')
+
+
+pl.xlabel('$x$')
+pl.ylabel('$f(x)$')
+pl.ylim(-10, 20)
+pl.legend(loc='upper left')
+
+print("Mean-squared error of predictors on 1000 equidistant noise-less test "
+      "datapoints:\n\tRandom Forest: %.2f\n\tBagging: %.2f"
+      "\n\tGaussian Process: %.2f"
+      % (mse["Random Forest"], mse["Bagging"], mse["Gaussian Process"]))
+
+print("Mean log-probability of 1000 equidistant noise-less test datapoints "
+      "under the (normal) predictive distribution of the predictors, i.e., "
+      "log N(y_true| y_pred_mean, y_pred_std) [greater is better]:"
+      "\n\tRandom Forest: %.2f\n\tBagging: %.2f\n\tGaussian Process: %.2f"
+      % (log_pdf_loss["Random Forest"], log_pdf_loss["Bagging"],
+         log_pdf_loss["Gaussian Process"]))
+
+print("In summary, the mean predictions of the Gaussian Process are slightly "
+      "better than those of Random Forest and Bagging. The predictive "
+      "distributions (taking into account also the predictive variance) "
+      "of the Gaussian Process are considerably better.")
+
+pl.show()

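A side note on the example above: the 1.9600 factor that sizes the shaded bands is the 97.5% quantile of the standard normal distribution, so y_pred ± 1.96·sigma spans the central ~95% of a normal predictive distribution. A quick check with scipy (not part of the commit):

from scipy.stats import norm

# 97.5% quantile of the standard normal -- the source of the 1.9600 constant
print(norm.ppf(0.975))                    # 1.959963984540054
# probability mass contained within +/- 1.96 standard deviations
print(norm.cdf(1.96) - norm.cdf(-1.96))   # ~0.95
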
sklearn/ensemble/bagging.py

Lines changed: 20 additions & 9 deletions
@@ -194,9 +194,8 @@ def _parallel_decision_function(estimators, estimators_features, X):
 
 def _parallel_predict_regression(estimators, estimators_features, X):
     """Private function used to compute predictions within a job."""
-    return sum(estimator.predict(X[:, features])
-               for estimator, features in zip(estimators,
-                                              estimators_features))
+    return [estimator.predict(X[:, features])
+            for estimator, features in zip(estimators, estimators_features)]
 
 
 class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)):
@@ -856,22 +855,31 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose)
 
-    def predict(self, X):
+    def predict(self, X, with_std=False):
         """Predict regression target for X.
 
         The predicted regression target of an input sample is computed as the
         mean predicted regression targets of the estimators in the ensemble.
+        Optionally, the standard deviation of the predictions of the ensemble's
+        estimators is computed in addition.
 
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape = [n_samples, n_features]
             The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
 
+        with_std : boolean, optional, default=False
+            When True, the standard deviation of the predictions of the
+            ensemble's estimators is returned in addition to the mean.
+
         Returns
         -------
-        y : array of shape = [n_samples]
-            The predicted values.
+        y_mean : array of shape = [n_samples]
+            The mean of the predicted values.
+
+        y_std : array of shape = [n_samples], optional (if with_std == True)
+            The standard deviation of the ensemble's predicted values.
         """
         check_is_fitted(self, "estimators_features_")
         # Check data
@@ -889,9 +897,12 @@ def predict(self, X):
             for i in range(n_jobs))
 
         # Reduce
-        y_hat = sum(all_y_hat) / self.n_estimators
-
-        return y_hat
+        all_y_hat = np.array(all_y_hat).reshape(self.n_estimators, -1)
+        y_mean = np.mean(all_y_hat, axis=0)
+        if with_std:
+            return y_mean, np.std(all_y_hat, axis=0)
+        else:
+            return y_mean
 
     def _validate_estimator(self):
         """Check the estimator and set the base_estimator_ attribute."""

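A usage sketch for the patched BaggingRegressor.predict shown above. It only runs against a scikit-learn build that contains this change (with_std is not part of the released API), and the toy data is illustrative:

import numpy as np
from sklearn.ensemble import BaggingRegressor

rng = np.random.RandomState(1)
X = rng.uniform(0.1, 9.9, size=(20, 1))
y = X.ravel() * np.sin(X.ravel()) + rng.normal(0, 1, size=20)

regr = BaggingRegressor(n_estimators=25, random_state=0).fit(X, y)
X_test = np.linspace(0, 10, 5).reshape(-1, 1)

# One call now returns both the ensemble mean and the spread of the
# individual estimators' predictions.
y_mean, y_std = regr.predict(X_test, with_std=True)

# Equivalent manual reduction over the fitted base estimators
all_pred = np.array([est.predict(X_test[:, feats])
                     for est, feats in zip(regr.estimators_,
                                           regr.estimators_features_)])
assert np.allclose(y_mean, all_pred.mean(axis=0))
assert np.allclose(y_std, all_pred.std(axis=0))
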
sklearn/ensemble/forest.py

Lines changed: 20 additions & 7 deletions
@@ -72,13 +72,15 @@ class calls the ``fit`` method of each sub-estimator on random samples
 
 MAX_INT = np.iinfo(np.int32).max
 
+
 def _generate_sample_indices(random_state, n_samples):
     """Private function used to _parallel_build_trees function."""
     random_instance = check_random_state(random_state)
     sample_indices = random_instance.randint(0, n_samples, n_samples)
 
     return sample_indices
 
+
 def _generate_unsampled_indices(random_state, n_samples):
     """Private function used to forest._set_oob_score fuction."""
     sample_indices = _generate_sample_indices(random_state, n_samples)
@@ -89,6 +91,7 @@ def _generate_unsampled_indices(random_state, n_samples):
 
     return unsampled_indices
 
+
 def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
                           verbose=0, class_weight=None):
     """Private function used to fit a single tree in parallel."""
@@ -628,11 +631,13 @@ def __init__(self,
             verbose=verbose,
             warm_start=warm_start)
 
-    def predict(self, X):
+    def predict(self, X, with_std=False):
         """Predict regression target for X.
 
         The predicted regression target of an input sample is computed as the
         mean predicted regression targets of the trees in the forest.
+        Optionally, the standard deviation of the predictions of the ensemble's
+        estimators is computed in addition.
 
         Parameters
         ----------
@@ -641,10 +646,17 @@ def predict(self, X):
             ``dtype=np.float32`` and if a sparse matrix is provided
             to a sparse ``csr_matrix``.
 
+        with_std : boolean, optional, default=False
+            When True, the standard deviation of the predictions of the
+            ensemble's estimators is returned in addition to the mean.
+
         Returns
         -------
-        y : array of shape = [n_samples] or [n_samples, n_outputs]
-            The predicted values.
+        y_mean : array of shape = [n_samples] or [n_samples, n_outputs]
+            The mean of the predicted values.
+
+        y_std : array of shape = [n_samples], optional (if with_std == True)
+            The standard deviation of the predicted values.
         """
         # Check data
         X = self._validate_X_predict(X)
@@ -658,10 +670,11 @@ def predict(self, X):
             delayed(_parallel_helper)(e, 'predict', X, check_input=False)
             for e in self.estimators_)
 
-        # Reduce
-        y_hat = sum(all_y_hat) / len(self.estimators_)
-
-        return y_hat
+        y_mean = np.mean(all_y_hat, axis=0)
+        if with_std:
+            return y_mean, np.std(all_y_hat, axis=0)
+        else:
+            return y_mean
 
     def _set_oob_score(self, X, y):
         """Compute out-of-bag scores"""

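The forest change mirrors the bagging one: the old sum(all_y_hat) / len(self.estimators_) reduction becomes np.mean over the per-tree predictions, with np.std returned when with_std=True. A minimal sketch of the same quantities computed by hand from a fitted forest (this part works on stock scikit-learn, since it only uses estimators_; the toy data is illustrative):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(1)
X = rng.uniform(0.1, 9.9, size=(20, 1))
y = X.ravel() * np.sin(X.ravel()) + rng.normal(0, 1, size=20)

rf = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
X_test = np.linspace(0, 10, 5).reshape(-1, 1)

per_tree = np.array([tree.predict(X_test) for tree in rf.estimators_])
y_mean = per_tree.mean(axis=0)   # what predict(X_test) returns
y_std = per_tree.std(axis=0)     # what with_std=True additionally returns
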
sklearn/linear_model/logistic.py

Lines changed: 26 additions & 16 deletions
@@ -594,11 +594,11 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         sample_weight = np.ones(X.shape[0])
 
     # If class_weights is a dict (provided by the user), the weights
-    # are assigned to the original labels. If it is "auto", then
+    # are assigned to the original labels. If it is "balanced", then
     # the class_weights are assigned after masking the labels with a OvR.
     le = LabelEncoder()
 
-    if isinstance(class_weight, dict):
+    if isinstance(class_weight, dict) or multi_class == 'multinomial':
         if solver == "liblinear":
             if classes.size == 2:
                 # Reconstruct the weights with keys 1 and -1
@@ -610,7 +610,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
                                  "solver cannot handle multiclass with "
                                  "class_weight of type dict. Use the lbfgs, "
                                  "newton-cg or sag solvers or set "
-                                 "class_weight='auto'")
+                                 "class_weight='balanced'")
         else:
             class_weight_ = compute_class_weight(class_weight, classes, y)
             sample_weight *= class_weight_[le.fit_transform(y)]
@@ -623,20 +623,21 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         mask = (y == pos_class)
         y_bin = np.ones(y.shape, dtype=np.float64)
         y_bin[~mask] = -1.
+        # for compute_class_weight
+
+        # 'auto' is deprecated and will be removed in 0.19
+        if class_weight in ("auto", "balanced"):
+            class_weight_ = compute_class_weight(class_weight, mask_classes,
+                                                 y_bin)
+            sample_weight *= class_weight_[le.fit_transform(y_bin)]
 
     else:
         lbin = LabelBinarizer()
-        Y_bin = lbin.fit_transform(y)
-        if Y_bin.shape[1] == 1:
-            Y_bin = np.hstack([1 - Y_bin, Y_bin])
-        w0 = np.zeros((Y_bin.shape[1], n_features + int(fit_intercept)),
+        Y_binarized = lbin.fit_transform(y)
+        if Y_binarized.shape[1] == 1:
+            Y_binarized = np.hstack([1 - Y_binarized, Y_binarized])
+        w0 = np.zeros((Y_binarized.shape[1], n_features + int(fit_intercept)),
                       order='F')
-        mask_classes = classes
-
-        if class_weight == "auto":
-            class_weight_ = compute_class_weight(class_weight, mask_classes,
-                                                 y_bin)
-            sample_weight *= class_weight_[le.fit_transform(y_bin)]
 
         if coef is not None:
             # it must work both giving the bias term and not
@@ -665,7 +666,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
     if multi_class == 'multinomial':
         # fmin_l_bfgs_b and newton-cg accepts only ravelled parameters.
         w0 = w0.ravel()
-        target = Y_bin
+        target = Y_binarized
         if solver == 'lbfgs':
             func = lambda x, *args: _multinomial_loss_grad(x, *args)[0:2]
         elif solver == 'newton-cg':
@@ -1535,9 +1536,18 @@ def fit(self, X, y, sample_weight=None):
         if self.class_weight and not(isinstance(self.class_weight, dict) or
                                      self.class_weight in
                                      ['balanced', 'auto']):
+            # 'auto' is deprecated and will be removed in 0.19
             raise ValueError("class_weight provided should be a "
                              "dict or 'balanced'")
 
+        # compute the class weights for the entire dataset y
+        if self.class_weight in ("auto", "balanced"):
+            classes = np.unique(y)
+            class_weight = compute_class_weight(self.class_weight, classes, y)
+            class_weight = dict(zip(classes, class_weight))
+        else:
+            class_weight = self.class_weight
+
         path_func = delayed(_log_reg_scoring_path)
 
         # The SAG solver releases the GIL so it's more efficient to use
@@ -1549,7 +1559,7 @@ def fit(self, X, y, sample_weight=None):
                       fit_intercept=self.fit_intercept, penalty=self.penalty,
                       dual=self.dual, solver=self.solver, tol=self.tol,
                       max_iter=self.max_iter, verbose=self.verbose,
-                      class_weight=self.class_weight, scoring=self.scoring,
+                      class_weight=class_weight, scoring=self.scoring,
                       multi_class=self.multi_class,
                       intercept_scaling=self.intercept_scaling,
                       random_state=self.random_state,
@@ -1621,7 +1631,7 @@ def fit(self, X, y, sample_weight=None):
                 fit_intercept=self.fit_intercept, coef=coef_init,
                 max_iter=self.max_iter, tol=self.tol,
                 penalty=self.penalty, copy=False,
-                class_weight=self.class_weight,
+                class_weight=class_weight,
                 multi_class=self.multi_class,
                 verbose=max(0, self.verbose - 1),
                 random_state=self.random_state,

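The behavioral fix in logistic.py: class_weight='balanced' (and the deprecated 'auto') is now resolved once on the full target inside LogisticRegressionCV.fit and passed down as an explicit dict, so the per-class OvR masking inside logistic_regression_path no longer recomputes the weights on the masked binary labels. A small sketch of that up-front computation, with an illustrative imbalanced target (not taken from the PR):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0] * 80 + [1] * 15 + [2] * 5)   # imbalanced toy target
classes = np.unique(y)

# n_samples / (n_classes * bincount(y)) for each class
weights = compute_class_weight("balanced", classes=classes, y=y)
class_weight = dict(zip(classes, weights))
print(class_weight)   # {0: 0.4166..., 1: 2.2222..., 2: 6.6666...}

From the user's side the API is unchanged: LogisticRegressionCV(class_weight='balanced') is called exactly as before; only the weights actually applied in the multiclass path are corrected.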