Added checks and tests for init estimators output shape · scikit-learn/scikit-learn@e2e1a5c · GitHub
[go: up one dir, main page]

Skip to content

Commit e2e1a5c

Browse files
committed
Added checks and tests for init estimators output shape
1 parent 2c06512 commit e2e1a5c

File tree

2 files changed

+82
-24
lines changed

2 files changed

+82
-24
lines changed

sklearn/ensemble/_gb_losses.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,16 @@ def check_init_estimator(self, estimator):
174174
)
175175

176176
def get_init_raw_predictions(self, X, estimator):
177-
return estimator.predict(X).reshape(-1, 1).astype(np.float64)
177+
predictions = estimator.predict(X)
178+
n_samples = X.shape[0]
179+
if predictions.shape != (n_samples,):
180+
# if the init estimator was trained for e.g. multioutput
181+
# regression, raise error
182+
raise ValueError(
183+
'The init estimator predicted output with shape={}, '
184+
'expected shape=({},).'.format(predictions.shape, n_samples)
185+
)
186+
return predictions.reshape(-1, 1).astype(np.float64)
178187

179188

180189
class LeastSquaresError(RegressionLossFunction):
@@ -658,7 +667,14 @@ def _raw_prediction_to_decision(self, raw_predictions):
658667
return np.argmax(proba, axis=1)
659668

660669
def get_init_raw_predictions(self, X, estimator):
661-
proba_pos_class = estimator.predict_proba(X)[:, 1]
670+
probas = estimator.predict_proba(X)
671+
n_samples = X.shape[0]
672+
if probas.shape != (n_samples, 2):
673+
raise ValueError(
674+
'The init estimator predicted probabilities with shape={}, '
675+
'expected shape=({},)'.format(probas.shape, n_samples)
676+
)
677+
proba_pos_class = probas[:, 1]
662678
eps = np.finfo(np.float32).eps
663679
proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)
664680
# log(x / (1 - x)) is the inverse of the sigmoid (expit) function
@@ -766,9 +782,15 @@ def _raw_prediction_to_decision(self, raw_predictions):
766782

767783
def get_init_raw_predictions(self, X, estimator):
768784
probas = estimator.predict_proba(X)
785+
n_samples = X.shape[0]
786+
if probas.shape != (n_samples, self.K):
787+
raise ValueError(
788+
'The init estimator predicted probabilities with shape={}, '
789+
'expected shape={}'.format(probas.shape, (n_samples, self.K))
790+
)
791+
769792
eps = np.finfo(np.float32).eps
770793
probas = np.clip(probas, eps, 1 - eps)
771-
772794
raw_predictions = np.log(probas).astype(np.float64)
773795
return raw_predictions
774796

@@ -862,7 +884,14 @@ def _raw_prediction_to_decision(self, raw_predictions):
862884
return (raw_predictions.ravel() >= 0).astype(np.int)
863885

864886
def get_init_raw_predictions(self, X, estimator):
865-
proba_pos_class = estimator.predict_proba(X)[:, 1]
887+
probas = estimator.predict_proba(X)
888+
n_samples = X.shape[0]
889+
if probas.shape != (n_samples, 2):
890+
raise ValueError(
891+
'The init estimator predicted probabilities with shape={}, '
892+
'expected shape=({},)'.format(probas.shape, n_samples)
893+
)
894+
proba_pos_class = probas[:, 1]
866895
eps = np.finfo(np.float32).eps
867896
proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)
868897
# according to The Elements of Statistical Learning sec. 10.5, the

sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from numpy.testing import assert_almost_equal
77
from numpy.testing import assert_allclose
88
from numpy.testing import assert_equal
9+
import pytest
910

1011
from sklearn.utils import check_random_state
1112
from sklearn.utils.stats import _weighted_percentile
@@ -18,6 +19,8 @@
1819
from sklearn.ensemble._gb_losses import MultinomialDeviance
1920
from sklearn.ensemble._gb_losses import ExponentialLoss
2021
from sklearn.ensemble._gb_losses import LOSS_FUNCTIONS
22+
from sklearn.linear_model import LinearRegression
23+
from sklearn.linear_model import LogisticRegression
2124

2225

2326
def test_binomial_deviance():
@@ -257,27 +260,13 @@ def test_init_raw_predictions_values():
257260
p = y.mean()
258261
assert_almost_equal(raw_predictions, np.log(p / (1 - p)))
259262

260-
# FIXME: uncomment this and fix
261-
# for y_unstable in (np.zeros(shape=n_samples), np.ones(shape=n_samples)):
262-
# init_estimator = loss.init_estimator().fit(X, y_unstable)
263-
# raw_predictions = loss.get_init_raw_predictions(y_unstable,
264-
# init_estimator)
265-
# assert_all_finite(raw_predictions)
266-
267263
# Exponential loss
268264
loss = ExponentialLoss(n_classes=2)
269265
init_estimator = loss.init_estimator().fit(X, y)
270266
raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
271267
p = y.mean()
272268
assert_almost_equal(raw_predictions, .5 * np.log(p / (1 - p)))
273269

274-
# FIXME: uncomment this and fix
275-
# for y_unstable in (np.zeros(shape=n_samples), np.ones(shape=n_samples)):
276-
# init_estimator = loss.init_estimator().fit(X, y_unstable)
277-
# raw_predictions = loss.get_init_raw_predictions(y_unstable,
278-
# init_estimator)
279-
# assert_all_finite(raw_predictions)
280-
281270
# Multinomial deviance loss
282271
for n_classes in range(3, 5):
283272
y = rng.randint(0, n_classes, size=n_samples)
@@ -288,9 +277,49 @@ def test_init_raw_predictions_values():
288277
p = (y == k).mean()
289278
assert_almost_equal(raw_predictions[:, k], np.log(p))
290279

291-
# FIXME: uncomment this and fix
292-
# for y_unstable in (np.zeros(shape=n_samples), np.ones(shape=n_samples)):
293-
# init_estimator = loss.init_estimator().fit(X, y_unstable)
294-
# raw_predictions = loss.get_init_raw_predictions(y_unstable,
295-
# init_estimator)
296-
# assert_all_finite(raw_predictions)
280+
281+
def test_bad_init_estimator():
282+
# check that the init estimator predict() or predict_proba() methods output
283+
# expected shape
284+
285+
rng = np.random.RandomState(0)
286+
n_samples = 100
287+
288+
X = rng.normal(size=(n_samples, 10))
289+
290+
# Regression losses
291+
# train init estimator on multioutput regression target
292+
y_init_est = rng.normal(size=(n_samples, 2))
293+
lr = LinearRegression().fit(X, y_init_est)
294+
for loss in (LeastSquaresError(n_classes=1),
295+
LeastAbsoluteError(n_classes=1),
296+
QuantileLossFunction(n_classes=1),
297+
HuberLossFunction(n_classes=1)):
298+
with pytest.raises(
299+
ValueError,
300+
match='The init estimator predicted output with shape'
301+
):
302+
loss.get_init_raw_predictions(X, estimator=lr)
303+
304+
# Binomial deviance and exponential loss
305+
# train init estimator on 3 classes instead of 2
306+
y_init_est = rng.randint(0, 3, size=(n_samples))
307+
lr = LogisticRegression().fit(X, y_init_est)
308+
for loss in (BinomialDeviance(n_classes=2),
309+
ExponentialLoss(n_classes=2)):
310+
with pytest.raises(
311+
ValueError,
312+
match='The init estimator predicted probabilities with shape'
313+
):
314+
loss.get_init_raw_predictions(X, estimator=lr)
315+
316+
# Multinomial deviance
317+
# train init estimator on 4 classes instead of 3
318+
y_init_est = rng.randint(0, 4, size=(n_samples))
319+
lr = LogisticRegression().fit(X, y_init_est)
320+
loss = MultinomialDeviance(n_classes=3)
321+
with pytest.raises(
322+
ValueError,
323+
match='The init estimator predicted probabilities with shape'
324+
):
325+
loss.get_init_raw_predictions(X, estimator=lr)

0 commit comments

Comments
 (0)
0