Merge pull request #5008 from TomDLT/logistic_multiclass · glouppe/scikit-learn@caeefff · GitHub


Commit caeefff

amueller authored and glouppe committed
Merge pull request scikit-learn#5008 from TomDLT/logistic_multiclass
[MRG+1] fix logistic regression class weights
2 parents 861ac13 + d439dc4 commit caeefff

File tree

6 files changed: +245 -43 lines changed
Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+r"""
+==============================================================
+Comparison of predictive distributions of different regressors
+==============================================================
+
+A simple one-dimensional, noisy regression problem addressed by three different
+regressors:
+
+1. A Gaussian Process
+2. A Random Forest
+3. A Bagging-based Regressor
+
+The regressors are fitted based on noisy observations where the magnitude of
+the noise at the different training points is constant and known. Plotted are
+both the mean and the pointwise 95% confidence interval of the predictions.
+The mean predictions are evaluated on noise-less test data using the mean-
+squared-error. The mean log probabilities of the noise-less test data are used
+to evaluate the predictive distributions (a normal distribution with the
+predicted mean and standard deviation) of the three regressors.
+
+This example is based on the example gaussian_process/plot_gp_regression.py.
+"""
+print(__doc__)
+
+# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+# Licence: BSD 3 clause
+
+import numpy as np
+from scipy.stats import norm
+from sklearn.gaussian_process import GaussianProcess
+from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
+from sklearn.metrics import mean_squared_error
+from matplotlib import pyplot as pl
+
+np.random.seed(1)
+
+
+def f(x):
+    """The function to predict."""
+    return x * np.sin(x)
+
+X = np.linspace(0.1, 9.9, 20)
+X = np.atleast_2d(X).T
+
+# Observations and noise
+y = f(X).ravel()
+dy = np.ones_like(y)
+noise = np.random.normal(0, dy)
+y += noise
+
+# Mesh the input space for evaluations of the real function, the prediction and
+# its standard deviation
+x = np.atleast_2d(np.linspace(0, 10, 1000)).T
+
+regrs = {"Gaussian Process": GaussianProcess(corr='squared_exponential',
+                                             theta0=1e-1, thetaL=1e-3,
+                                             thetaU=1, nugget=(dy / y) ** 2,
+                                             random_start=100),
+         "Random Forest": RandomForestRegressor(n_estimators=250),
+         "Bagging": BaggingRegressor(n_estimators=250)}
+
+
+# Plot predictive distributions of different regressors
+fig = pl.figure()
+# Plot the function and the observations
+pl.plot(x, f(x), 'r', label=u'$f(x) = x\,\sin(x)$')
+pl.fill(np.concatenate([x, x[::-1]]),
+        np.concatenate([f(x) - 1.9600, (f(x) + 1.9600)[::-1]]),
+        alpha=.3, fc='r', ec='None')
+pl.plot(X.ravel(), y, 'ko', zorder=5, label=u'Observations')
+# Plot predictive distributions of GP and Bagging
+colors = {"Gaussian Process": 'b', "Bagging": 'g'}
+mse = {}
+log_pdf_loss = {}
+for name, regr in regrs.items():
+    regr.fit(X, y)
+
+    # Make the prediction on the meshed x-axis (ask for standard deviation
+    # as well)
+    y_pred, sigma = regr.predict(x, with_std=True)
+
+    # Compute mean-squared error and log predictive loss
+    mse[name] = mean_squared_error(f(x), y_pred)
+    log_pdf_loss[name] = \
+        norm(y_pred, sigma).logpdf(f(x)).mean()
+
+    if name == "Random Forest":  # Skip because RF is very similar to Bagging
+        continue
+
+    # Plot 95% confidence interval based on the predictive standard deviation
+    pl.plot(x, y_pred, colors[name], label=name)
+    pl.fill(np.concatenate([x, x[::-1]]),
+            np.concatenate([y_pred - 1.9600 * sigma,
+                            (y_pred + 1.9600 * sigma)[::-1]]),
+            alpha=.3, fc=colors[name], ec='None')
+
+
+pl.xlabel('$x$')
+pl.ylabel('$f(x)$')
+pl.ylim(-10, 20)
+pl.legend(loc='upper left')
+
+print("Mean-squared error of predictors on 1000 equidistant noise-less test "
+      "datapoints:\n\tRandom Forest: %.2f\n\tBagging: %.2f"
+      "\n\tGaussian Process: %.2f"
+      % (mse["Random Forest"], mse["Bagging"], mse["Gaussian Process"]))
+
+print("Mean log-probability of 1000 equidistant noise-less test datapoints "
+      "under the (normal) predictive distribution of the predictors, i.e., "
+      "log N(y_true| y_pred_mean, y_pred_std) [greater is better]:"
+      "\n\tRandom Forest: %.2f\n\tBagging: %.2f\n\tGaussian Process: %.2f"
+      % (log_pdf_loss["Random Forest"], log_pdf_loss["Bagging"],
+         log_pdf_loss["Gaussian Process"]))
+
+print("In summary, the mean predictions of the Gaussian Process are slightly "
+      "better than those of Random Forest and Bagging. The predictive "
+      "distributions (taking into account also the predictive variance) "
+      "of the Gaussian Process are considerably better.")
+
+pl.show()

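A side note on the example above: the 1.9600 factor that sizes the shaded bands is the 97.5% quantile of the standard normal distribution, so y_pred ± 1.96·sigma spans the central ~95% of a normal predictive distribution. A quick check with scipy (not part of the commit):

from scipy.stats import norm

# 97.5% quantile of the standard normal -- the source of the 1.9600 constant
print(norm.ppf(0.975))                    # 1.959963984540054
# probability mass contained within +/- 1.96 standard deviations
print(norm.cdf(1.96) - norm.cdf(-1.96))   # ~0.95
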
sklearn/ensemble/bagging.py

Lines changed: 20 additions & 9 deletions
@@ -194,9 +194,8 @@ def _parallel_decision_function(estimators, estimators_features, X):
 
 def _parallel_predict_regression(estimators, estimators_features, X):
     """Private function used to compute predictions within a job."""
-    return sum(estimator.predict(X[:, features])
-               for estimator, features in zip(estimators,
-                                              estimators_features))
+    return [estimator.predict(X[:, features])
+            for estimator, features in zip(estimators, estimators_features)]
 
 
 class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)):
@@ -856,22 +855,31 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose)
 
-    def predict(self, X):
+    def predict(self, X, with_std=False):
         """Predict regression target for X.
 
         The predicted regression target of an input sample is computed as the
         mean predicted regression targets of the estimators in the ensemble.
+        Optionally, the standard deviation of the predictions of the ensemble's
+        estimators is computed in addition.
 
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape = [n_samples, n_features]
             The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
 
+        with_std : boolean, optional, default=False
+            When True, the standard deviation of the predictions of the
+            ensemble's estimators is returned in addition to the mean.
+
         Returns
         -------
-        y : array of shape = [n_samples]
-            The predicted values.
+        y_mean : array of shape = [n_samples]
+            The mean of the predicted values.
+
+        y_std : array of shape = [n_samples], optional (if with_std == True)
+            The standard deviation of the ensemble's predicted values.
         """
         check_is_fitted(self, "estimators_features_")
         # Check data
@@ -889,9 +897,12 @@ def predict(self, X):
             for i in range(n_jobs))
 
         # Reduce
-        y_hat = sum(all_y_hat) / self.n_estimators
-
-        return y_hat
+        all_y_hat = np.array(all_y_hat).reshape(self.n_estimators, -1)
+        y_mean = np.mean(all_y_hat, axis=0)
+        if with_std:
+            return y_mean, np.std(all_y_hat, axis=0)
+        else:
+            return y_mean
 
     def _validate_estimator(self):
         """Check the estimator and set the base_estimator_ attribute."""

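A usage sketch for the patched BaggingRegressor.predict shown above. It only runs against a scikit-learn build that contains this change (with_std is not part of the released API), and the toy data is illustrative:

import numpy as np
from sklearn.ensemble import BaggingRegressor

rng = np.random.RandomState(1)
X = rng.uniform(0.1, 9.9, size=(20, 1))
y = X.ravel() * np.sin(X.ravel()) + rng.normal(0, 1, size=20)

regr = BaggingRegressor(n_estimators=25, random_state=0).fit(X, y)
X_test = np.linspace(0, 10, 5).reshape(-1, 1)

# One call now returns both the ensemble mean and the spread of the
# individual estimators' predictions.
y_mean, y_std = regr.predict(X_test, with_std=True)

# Equivalent manual reduction over the fitted base estimators
all_pred = np.array([est.predict(X_test[:, feats])
                     for est, feats in zip(regr.estimators_,
                                           regr.estimators_features_)])
assert np.allclose(y_mean, all_pred.mean(axis=0))
assert np.allclose(y_std, all_pred.std(axis=0))
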
sklearn/ensemble/forest.py

Lines changed: 20 additions & 7 deletions
@@ -72,13 +72,15 @@ class calls the ``fit`` method of each sub-estimator on random samples
 
 MAX_INT = np.iinfo(np.int32).max
 
+
 def _generate_sample_indices(random_state, n_samples):
     """Private function used to _parallel_build_trees function."""
     random_instance = check_random_state(random_state)
     sample_indices = random_instance.randint(0, n_samples, n_samples)
 
     return sample_indices
 
+
 def _generate_unsampled_indices(random_state, n_samples):
     """Private function used to forest._set_oob_score fuction."""
     sample_indices = _generate_sample_indices(random_state, n_samples)
@@ -89,6 +91,7 @@ def _generate_unsampled_indices(random_state, n_samples):
 
     return unsampled_indices
 
+
 def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
                           verbose=0, class_weight=None):
     """Private function used to fit a single tree in parallel."""
@@ -628,11 +631,13 @@ def __init__(self,
             verbose=verbose,
             warm_start=warm_start)
 
-    def predict(self, X):
+    def predict(self, X, with_std=False):
         """Predict regression target for X.
 
         The predicted regression target of an input sample is computed as the
         mean predicted regression targets of the trees in the forest.
+        Optionally, the standard deviation of the predictions of the ensemble's
+        estimators is computed in addition.
 
         Parameters
         ----------
@@ -641,10 +646,17 @@ def predict(self, X):
             ``dtype=np.float32`` and if a sparse matrix is provided
             to a sparse ``csr_matrix``.
 
+        with_std : boolean, optional, default=False
+            When True, the standard deviation of the predictions of the
+            ensemble's estimators is returned in addition to the mean.
+
         Returns
         -------
-        y : array of shape = [n_samples] or [n_samples, n_outputs]
-            The predicted values.
+        y_mean : array of shape = [n_samples] or [n_samples, n_outputs]
+            The mean of the predicted values.
+
+        y_std : array of shape = [n_samples], optional (if with_std == True)
+            The standard deviation of the predicted values.
         """
         # Check data
         X = self._validate_X_predict(X)
@@ -658,10 +670,11 @@ def predict(self, X):
             delayed(_parallel_helper)(e, 'predict', X, check_input=False)
             for e in self.estimators_)
 
-        # Reduce
-        y_hat = sum(all_y_hat) / len(self.estimators_)
-
-        return y_hat
+        y_mean = np.mean(all_y_hat, axis=0)
+        if with_std:
+            return y_mean, np.std(all_y_hat, axis=0)
+        else:
+            return y_mean
 
     def _set_oob_score(self, X, y):
         """Compute out-of-bag scores"""

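The forest change mirrors the bagging one: the old sum(all_y_hat) / len(self.estimators_) reduction becomes np.mean over the per-tree predictions, with np.std returned when with_std=True. A minimal sketch of the same quantities computed by hand from a fitted forest (this part works on stock scikit-learn, since it only uses estimators_; the toy data is illustrative):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(1)
X = rng.uniform(0.1, 9.9, size=(20, 1))
y = X.ravel() * np.sin(X.ravel()) + rng.normal(0, 1, size=20)

rf = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
X_test = np.linspace(0, 10, 5).reshape(-1, 1)

per_tree = np.array([tree.predict(X_test) for tree in rf.estimators_])
y_mean = per_tree.mean(axis=0)   # what predict(X_test) returns
y_std = per_tree.std(axis=0)     # what with_std=True additionally returns
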
sklearn/linear_model/logistic.py

Lines changed: 26 additions & 16 deletions
@@ -594,11 +594,11 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         sample_weight = np.ones(X.shape[0])
 
     # If class_weights is a dict (provided by the user), the weights
-    # are assigned to the original labels. If it is "auto", then
+    # are assigned to the original labels. If it is "balanced", then
     # the class_weights are assigned after masking the labels with a OvR.
     le = LabelEncoder()
 
-    if isinstance(class_weight, dict):
+    if isinstance(class_weight, dict) or multi_class == 'multinomial':
         if solver == "liblinear":
             if classes.size == 2:
                 # Reconstruct the weights with keys 1 and -1
@@ -610,7 +610,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
                                  "solver cannot handle multiclass with "
                                  "class_weight of type dict. Use the lbfgs, "
                                  "newton-cg or sag solvers or set "
-                                 "class_weight='auto'")
+                                 "class_weight='balanced'")
         else:
             class_weight_ = compute_class_weight(class_weight, classes, y)
             sample_weight *= class_weight_[le.fit_transform(y)]
@@ -623,20 +623,21 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         mask = (y == pos_class)
         y_bin = np.ones(y.shape, dtype=np.float64)
         y_bin[~mask] = -1.
+        # for compute_class_weight
+
+        # 'auto' is deprecated and will be removed in 0.19
+        if class_weight in ("auto", "balanced"):
+            class_weight_ = compute_class_weight(class_weight, mask_classes,
+                                                 y_bin)
+            sample_weight *= class_weight_[le.fit_transform(y_bin)]
 
     else:
         lbin = LabelBinarizer()
-        Y_bin = lbin.fit_transform(y)
-        if Y_bin.shape[1] == 1:
-            Y_bin = np.hstack([1 - Y_bin, Y_bin])
-        w0 = np.zeros((Y_bin.shape[1], n_features + int(fit_intercept)),
+        Y_binarized = lbin.fit_transform(y)
+        if Y_binarized.shape[1] == 1:
+            Y_binarized = np.hstack([1 - Y_binarized, Y_binarized])
+        w0 = np.zeros((Y_binarized.shape[1], n_features + int(fit_intercept)),
                       order='F')
-        mask_classes = classes
-
-        if class_weight == "auto":
-            class_weight_ = compute_class_weight(class_weight, mask_classes,
-                                                 y_bin)
-            sample_weight *= class_weight_[le.fit_transform(y_bin)]
 
         if coef is not None:
             # it must work both giving the bias term and not
@@ -665,7 +666,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
     if multi_class == 'multinomial':
         # fmin_l_bfgs_b and newton-cg accepts only ravelled parameters.
         w0 = w0.ravel()
-        target = Y_bin
+        target = Y_binarized
         if solver == 'lbfgs':
             func = lambda x, *args: _multinomial_loss_grad(x, *args)[0:2]
         elif solver == 'newton-cg':
@@ -1535,9 +1536,18 @@ def fit(self, X, y, sample_weight=None):
         if self.class_weight and not(isinstance(self.class_weight, dict) or
                                      self.class_weight in
                                      ['balanced', 'auto']):
+            # 'auto' is deprecated and will be removed in 0.19
             raise ValueError("class_weight provided should be a "
                              "dict or 'balanced'")
 
+        # compute the class weights for the entire dataset y
+        if self.class_weight in ("auto", "balanced"):
+            classes = np.unique(y)
+            class_weight = compute_class_weight(self.class_weight, classes, y)
+            class_weight = dict(zip(classes, class_weight))
+        else:
+            class_weight = self.class_weight
+
         path_func = delayed(_log_reg_scoring_path)
 
         # The SAG solver releases the GIL so it's more efficient to use
@@ -1549,7 +1559,7 @@ def fit(self, X, y, sample_weight=None):
                       fit_intercept=self.fit_intercept, penalty=self.penalty,
                       dual=self.dual, solver=self.solver, tol=self.tol,
                       max_iter=self.max_iter, verbose=self.verbose,
-                      class_weight=self.class_weight, scoring=self.scoring,
+                      class_weight=class_weight, scoring=self.scoring,
                       multi_class=self.multi_class,
                       intercept_scaling=self.intercept_scaling,
                       random_state=self.random_state,
@@ -1621,7 +1631,7 @@ def fit(self, X, y, sample_weight=None):
                 fit_intercept=self.fit_intercept, coef=coef_init,
                 max_iter=self.max_iter, tol=self.tol,
                 penalty=self.penalty, copy=False,
-                class_weight=self.class_weight,
+                class_weight=class_weight,
                 multi_class=self.multi_class,
                 verbose=max(0, self.verbose - 1),
                 random_state=self.random_state,

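The behavioral fix in logistic.py: class_weight='balanced' (and the deprecated 'auto') is now resolved once on the full target inside LogisticRegressionCV.fit and passed down as an explicit dict, so the per-class OvR masking inside logistic_regression_path no longer recomputes the weights on the masked binary labels. A small sketch of that up-front computation, with an illustrative imbalanced target (not taken from the PR):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0] * 80 + [1] * 15 + [2] * 5)   # imbalanced toy target
classes = np.unique(y)

# n_samples / (n_classes * bincount(y)) for each class
weights = compute_class_weight("balanced", classes=classes, y=y)
class_weight = dict(zip(classes, weights))
print(class_weight)   # {0: 0.4166..., 1: 2.2222..., 2: 6.6666...}

From the user's side the API is unchanged: LogisticRegressionCV(class_weight='balanced') is called exactly as before; only the weights actually applied in the multiclass path are corrected.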