[MRG+1] label binarizer not used consistently in CalibratedClassifier… · maskani-moh/scikit-learn@dfe65de · GitHub
[go: up one dir, main page]

Skip to content

Commit dfe65de

Browse files
srivatsan-ramesh authored and maskani-moh committed
[MRG+1] label binarizer not used consistently in CalibratedClassifierCV (scikit-learn#7799)
* label binarizer not used consistently in CalibratedClassifierCV
* changed position of classes argument to make old tests run
* moved parameter to constructor and added test
* added test where train set doesnt have all classes
* CalibratedClassifierCV can now handle cases where train set doesnt contain all labels
* fixing flake error
* fixing line lengths
* removing np.full()
* from __future__ import division for py2.7
* change is test file
* added an extra test and removed a test with Ridge
* stronger test
* whats new entry
1 parent 1d1b5a1 commit dfe65de

File tree

3 files changed

+72
-20
lines changed
  • doc
  • sklearn
    • tests
      • test_calibration.py

3 files changed

+72
-20
lines changed

doc/whats_new.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,12 @@ Bug fixes
144144
``partial_fit`` was less than the total number of classes in the
145145
data. :issue:`7786` by `Srivatsan Ramesh`_
146146

147+
- Fixes issue in :class:`calibration.CalibratedClassifierCV` where
148+
the sum of probabilities of each class for a data point was not 1, and
149+
``CalibratedClassifierCV`` now handles the case where the training set
150+
has fewer classes than the total data. :issue:`7799` by
151+
`Srivatsan Ramesh`_
152+
147153

148154
API changes summary
149155
-------------------

sklearn/calibration.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,10 @@
1414
import numpy as np
1515

1616
from scipy.optimize import fmin_bfgs
17+
from sklearn.preprocessing import LabelEncoder
1718

1819
from .base import BaseEstimator, ClassifierMixin, RegressorMixin, clone
19-
from .preprocessing import LabelBinarizer
20+
from .preprocessing import label_binarize, LabelBinarizer
2021
from .utils import check_X_y, check_array, indexable, column_or_1d
2122
from .utils.validation import check_is_fitted
2223
from .utils.fixes import signature
@@ -50,7 +51,8 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
5051
The method to use for calibration. Can be 'sigmoid' which
5152
corresponds to Platt's method or 'isotonic' which is a
5253
non-parametric approach. It is not advised to use isotonic calibration
53-
with too few calibration samples ``(<<1000)`` since it tends to overfit.
54+
with too few calibration samples ``(<<1000)`` since it tends to
55+
overfit.
5456
Use sigmoids (Platt's calibration) in this case.
5557
5658
cv : integer, cross-validation generator, iterable or "prefit", optional
@@ -63,8 +65,8 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
6365
- An iterable yielding train/test splits.
6466
6567
For integer/None inputs, if ``y`` is binary or multiclass,
66-
:class:`sklearn.model_selection.StratifiedKFold` is used. If ``y``
67-
is neither binary nor multiclass, :class:`sklearn.model_selection.KFold`
68+
:class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is
69+
neither binary nor multiclass, :class:`sklearn.model_selection.KFold`
6870
is used.
6971
7072
Refer :ref:`User Guide <cross_validation>` for the various
@@ -124,15 +126,16 @@ def fit(self, X, y, sample_weight=None):
124126
X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'],
125127
force_all_finite=False)
126128
X, y = indexable(X, y)
127-
lb = LabelBinarizer().fit(y)
128-
self.classes_ = lb.classes_
129+
le = LabelBinarizer().fit(y)
130+
self.classes_ = le.classes_
129131

130132
# Check that each cross-validation fold can have at least one
131133
# example per class
132134
n_folds = self.cv if isinstance(self.cv, int) \
133135
else self.cv.n_folds if hasattr(self.cv, "n_folds") else None
134136
if n_folds and \
135-
np.any([np.sum(y == class_) < n_folds for class_ in self.classes_]):
137+
np.any([np.sum(y == class_) < n_folds for class_ in
138+
self.classes_]):
136139
raise ValueError("Requesting %d-fold cross-validation but provided"
137140
" less than %d examples for at least one class."
138141
% (n_folds, n_folds))
@@ -175,7 +178,8 @@ def fit(self, X, y, sample_weight=None):
175178
this_estimator.fit(X[train], y[train])
176179

177180
calibrated_classifier = _CalibratedClassifier(
178-
this_estimator, method=self.method)
181+
this_estimator, method=self.method,
182+
classes=self.classes_)
179183
if sample_weight is not None:
180184
calibrated_classifier.fit(X[test], y[test],
181185
sample_weight[test])
@@ -253,6 +257,11 @@ class _CalibratedClassifier(object):
253257
corresponds to Platt's method or 'isotonic' which is a
254258
non-parametric approach based on isotonic regression.
255259
260+
classes : array-like, shape (n_classes,), optional
261+
Contains unique classes used to fit the base estimator.
262+
If None, then classes are extracted from the given target values
263+
in fit().
264+
256265
References
257266
----------
258267
.. [1] Obtaining calibrated probability estimates from decision trees
@@ -267,9 +276,10 @@ class _CalibratedClassifier(object):
267276
.. [4] Predicting Good Probabilities with Supervised Learning,
268277
A. Niculescu-Mizil & R. Caruana, ICML 2005
269278
"""
270-
def __init__(self, base_estimator, method='sigmoid'):
279+
def __init__(self, base_estimator, method='sigmoid', classes=None):
271280
self.base_estimator = base_estimator
272281
self.method = method
282+
self.classes = classes
273283

274284
def _preproc(self, X):
275285
n_classes = len(self.classes_)
@@ -285,7 +295,8 @@ def _preproc(self, X):
285295
raise RuntimeError('classifier has no decision_function or '
286296
'predict_proba method.')
287297

288-
idx_pos_class = np.arange(df.shape[1])
298+
idx_pos_class = self.label_encoder_.\
299+
transform(self.base_estimator.classes_)
289300

290301
return df, idx_pos_class
291302

@@ -308,9 +319,15 @@ def fit(self, X, y, sample_weight=None):
308319
self : object
309320
Returns an instance of self.
310321
"""
311-
lb = LabelBinarizer()
312-
Y = lb.fit_transform(y)
313-
self.classes_ = lb.classes_
322+
323+
self.label_encoder_ = LabelEncoder()
324+
if self.classes is None:
325+
self.label_encoder_.fit(y)
326+
else:
327+
self.label_encoder_.fit(self.classes)
328+
329+
self.classes_ = self.label_encoder_.classes_
330+
Y = label_binarize(y, self.classes_)
314331

315332
df, idx_pos_class = self._preproc(X)
316333
self.calibrators_ = []

sklearn/tests/test_calibration.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
# Authors: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
22
# License: BSD 3 clause
33

4+
from __future__ import division
45
import numpy as np
56
from scipy import sparse
7+
from sklearn.model_selection import LeaveOneOut
68

79
from sklearn.utils.testing import (assert_array_almost_equal, assert_equal,
810
assert_greater, assert_almost_equal,
@@ -14,7 +16,6 @@
1416
from sklearn.naive_bayes import MultinomialNB
1517
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
1618
from sklearn.svm import LinearSVC
17-
from sklearn.linear_model import Ridge
1819
from sklearn.pipeline import Pipeline
1920
from sklearn.preprocessing import Imputer
2021
from sklearn.metrics import brier_score_loss, log_loss
@@ -87,12 +88,6 @@ def test_calibration():
8788
brier_score_loss((y_test + 1) % 2,
8889
prob_pos_pc_clf_relabeled))
8990

90-
# check that calibration can also deal with regressors that have
91-
# a decision_function
92-
clf_base_regressor = CalibratedClassifierCV(Ridge())
93-
clf_base_regressor.fit(X_train, y_train)
94-
clf_base_regressor.predict(X_test)
95-
9691
# Check failure cases:
9792
# only "isotonic" and "sigmoid" should be accepted as methods
9893
clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
@@ -159,6 +154,7 @@ def test_calibration_multiclass():
159154
def softmax(y_pred):
160155
e = np.exp(-y_pred)
161156
return e / e.sum(axis=1).reshape(-1, 1)
157+
162158
uncalibrated_log_loss = \
163159
log_loss(y_test, softmax(clf.decision_function(X_test)))
164160
calibrated_log_loss = log_loss(y_test, probas)
@@ -275,3 +271,36 @@ def test_calibration_nan_imputer():
275271
clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic')
276272
clf_c.fit(X, y)
277273
clf_c.predict(X)
274+
275+
276+
def test_calibration_prob_sum():
277+
# Test that sum of probabilities is 1. A non-regression test for
278+
# issue #7796
279+
num_classes = 2
280+
X, y = make_classification(n_samples=10, n_features=5,
281+
n_classes=num_classes)
282+
clf = LinearSVC(C=1.0)
283+
clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
284+
clf_prob.fit(X, y)
285+
286+
probs = clf_prob.predict_proba(X)
287+
assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))
288+
289+
290+
def test_calibration_less_classes():
291+
# Test to check calibration works fine when train set in a test-train
292+
# split does not contain all classes
293+
# Since this test uses LOO, at each iteration train set will not contain a
294+
# class label
295+
X = np.random.randn(10, 5)
296+
y = np.arange(10)
297+
clf = LinearSVC(C=1.0)
298+
cal_clf = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
299+
cal_clf.fit(X, y)
300+
301+
for i, calibrated_classifier in \
302+
enumerate(cal_clf.calibrated_classifiers_):
303+
proba = calibrated_classifier.predict_proba(X)
304+
assert_array_equal(proba[:, i], np.zeros(len(y)))
305+
assert_equal(np.all(np.hstack([proba[:, :i],
306+
proba[:, i + 1:]])), True)

0 commit comments

Comments
 (0)
0