DEP Deprecate n_classes_ in GradientBoostingRegressor (#17702) · jayzed82/scikit-learn@c6e0627

Commit c6e0627

simonamaggio authored and glemaitre committed
DEP Deprecate n_classes_ in GradientBoostingRegressor (scikit-learn#17702)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent 374ac20 commit c6e0627

File tree: 5 files changed (+99, −57 lines)
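
In user-facing terms, this commit turns ``n_classes_`` on GradientBoostingRegressor into a deprecated property: reading it on a fitted estimator emits a FutureWarning and returns 1, while reading it on an unfitted estimator raises AttributeError. A minimal sketch of the new behavior (the toy data is illustrative, not taken from the commit):

    import warnings
    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor

    rng = np.random.RandomState(0)
    X, y = rng.rand(20, 3), rng.rand(20)
    gbr = GradientBoostingRegressor(n_estimators=5).fit(X, y)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        n = gbr.n_classes_        # deprecated access path
    print(n)                      # 1
    print(caught[0].category)     # <class 'FutureWarning'>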

doc/whats_new/v0.24.rst (+4)
@@ -147,6 +147,10 @@ Changelog
   which allows monitoring of each stage.
   :pr:`16985` by :user:`Hao Chun Chang <haochunchang>`.
 
+- |API|: The attribute ``n_classes_`` is now deprecated in
+  :class:`ensemble.GradientBoostingRegressor` and returns `1`.
+  :pr:`17702` by :user:`Simona Maggio <simonamaggio>`.
+
 :mod:`sklearn.exceptions`
 .........................
 
sklearn/ensemble/_gb.py (+46, −20)
@@ -29,6 +29,7 @@
 from ..base import RegressorMixin
 from ..base import BaseEstimator
 from ..base import is_classifier
+from ..utils import deprecated
 
 from ._gradient_boosting import predict_stages
 from ._gradient_boosting import predict_stage
@@ -165,9 +166,13 @@ def __init__(self, *, loss, learning_rate, n_estimators, criterion,
         self.n_iter_no_change = n_iter_no_change
         self.tol = tol
 
+    @abstractmethod
+    def _validate_y(self, y, sample_weight=None):
+        """Called by fit to validate y"""
+
     def _fit_stage(self, i, X, y, raw_predictions, sample_weight, sample_mask,
                    random_state, X_csc=None, X_csr=None):
-        """Fit another stage of ``n_classes_`` trees to the boosting model. """
+        """Fit another stage of ``_n_classes`` trees to the boosting model. """
 
         assert sample_mask.dtype == bool
         loss = self.loss_
@@ -240,10 +245,12 @@ def _check_params(self):
         else:
             loss_class = _gb_losses.LOSS_FUNCTIONS[self.loss]
 
-        if self.loss in ('huber', 'quantile'):
-            self.loss_ = loss_class(self.n_classes_, self.alpha)
-        else:
+        if is_classifier(self):
             self.loss_ = loss_class(self.n_classes_)
+        elif self.loss in ("huber", "quantile"):
+            self.loss_ = loss_class(self.alpha)
+        else:
+            self.loss_ = loss_class()
 
         if not (0.0 < self.subsample <= 1.0):
             raise ValueError("subsample must be in (0,1] but "
@@ -265,11 +272,9 @@ def _check_params(self):
 
         if isinstance(self.max_features, str):
             if self.max_features == "auto":
-                # if is_classification
-                if self.n_classes_ > 1:
+                if is_classifier(self):
                     max_features = max(1, int(np.sqrt(self.n_features_)))
                 else:
-                    # is regression
                     max_features = self.n_features_
             elif self.max_features == "sqrt":
                 max_features = max(1, int(np.sqrt(self.n_features_)))
@@ -405,7 +410,11 @@ def fit(self, X, y, sample_weight=None, monitor=None):
         sample_weight = _check_sample_weight(sample_weight, X)
 
         y = column_or_1d(y, warn=True)
-        y = self._validate_y(y, sample_weight)
+
+        if is_classifier(self):
+            y = self._validate_y(y, sample_weight)
+        else:
+            y = self._validate_y(y)
 
         if self.n_iter_no_change is not None:
             stratify = y if is_classifier(self) else None
@@ -415,7 +424,7 @@
                                  test_size=self.validation_fraction,
                                  stratify=stratify))
             if is_classifier(self):
-                if self.n_classes_ != np.unique(y).shape[0]:
+                if self._n_classes != np.unique(y).shape[0]:
                     # We choose to error here. The problem is that the init
                     # estimator would be trained on y, which has some missing
                     # classes now, so its predictions would not have the
@@ -711,15 +720,6 @@ def _compute_partial_dependence_recursion(self, grid, target_features):
 
         return averaged_predictions
 
-    def _validate_y(self, y, sample_weight):
-        # 'sample_weight' is not utilised but is used for
-        # consistency with similar method _validate_y of GBC
-        self.n_classes_ = 1
-        if y.dtype.kind == 'O':
-            y = y.astype(DOUBLE)
-        # Default implementation
-        return y
-
     def apply(self, X):
         """Apply trees in the ensemble to X, return leaf indices.
 
@@ -1096,7 +1096,9 @@ def _validate_y(self, y, sample_weight):
                              "trimmed classes with zero weights, while a "
                              "minimum of 2 classes are required."
                              % n_trim_classes)
-        self.n_classes_ = len(self.classes_)
+        self._n_classes = len(self.classes_)
+        # expose n_classes_ attribute
+        self.n_classes_ = self._n_classes
         return y
 
     def decision_function(self, X):
@@ -1507,7 +1509,11 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):
         The collection of fitted sub-estimators.
 
     n_classes_ : int
-        The number of classes, set to 1 in regression tasks.
+        The number of classes, set to 1 for regressors.
+
+        .. deprecated:: 0.24
+            Attribute ``n_classes_`` was deprecated in version 0.24 and
+            will be removed in 0.26.
 
     n_estimators_ : int
         The number of estimators as selected by early stopping (if
@@ -1589,6 +1595,11 @@ def __init__(self, *, loss='ls', learning_rate=0.1, n_estimators=100,
             validation_fraction=validation_fraction,
             n_iter_no_change=n_iter_no_change, tol=tol, ccp_alpha=ccp_alpha)
 
+    def _validate_y(self, y, sample_weight=None):
+        if y.dtype.kind == 'O':
+            y = y.astype(DOUBLE)
+        return y
+
     def predict(self, X):
         """Predict regression target for X.
 
@@ -1651,3 +1662,18 @@ def apply(self, X):
         leaves = super().apply(X)
         leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0])
         return leaves
+
+    # FIXME: to be removed in 0.26
+    # mypy error: Decorated property not supported
+    @deprecated("Attribute n_classes_ was deprecated "  # type: ignore
+                "in version 0.24 and will be removed in 0.26.")
+    @property
+    def n_classes_(self):
+        try:
+            check_is_fitted(self)
+        except NotFittedError as nfe:
+            raise AttributeError(
+                "{} object has no n_classes_ attribute."
+                .format(self.__class__.__name__)
+            ) from nfe
+        return 1
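
For reference, the ``@deprecated`` / ``@property`` stacking above can be exercised in isolation. A minimal sketch, assuming a hypothetical ``Toy`` class; only ``sklearn.utils.deprecated`` is taken from the commit:

    from sklearn.utils import deprecated

    class Toy:
        # hypothetical class; same decorator stacking as n_classes_ above
        @deprecated("Attribute n_classes_ was deprecated "
                    "in version 0.24 and will be removed in 0.26.")
        @property
        def n_classes_(self):
            return 1

    Toy().n_classes_  # emits a FutureWarning, then returns 1

Putting ``@deprecated`` outermost matters: it wraps the property object itself, so the warning fires on attribute access rather than at class definition time.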

sklearn/ensemble/_gb_losses.py (+7, −22)
@@ -145,18 +145,9 @@ def get_init_raw_predictions(self, X, estimator):
 
 
 class RegressionLossFunction(LossFunction, metaclass=ABCMeta):
-    """Base class for regression loss functions.
-
-    Parameters
-    ----------
-    n_classes : int
-        Number of classes.
-    """
-    def __init__(self, n_classes):
-        if n_classes != 1:
-            raise ValueError("``n_classes`` must be 1 for regression but "
-                             "was %r" % n_classes)
-        super().__init__(n_classes)
+    """Base class for regression loss functions."""
+    def __init__(self):
+        super().__init__(n_classes=1)
 
     def check_init_estimator(self, estimator):
         """Make sure estimator has the required fit and predict methods.
@@ -328,9 +319,6 @@ class HuberLossFunction(RegressionLossFunction):
 
     Parameters
     ----------
-    n_classes : int
-        Number of classes.
-
     alpha : float, default=0.9
         Percentile at which to extract score.
@@ -340,8 +328,8 @@ class HuberLossFunction(RegressionLossFunction):
     Machine, The Annals of Statistics, Vol. 29, No. 5, 2001.
     """
 
-    def __init__(self, n_classes, alpha=0.9):
-        super().__init__(n_classes)
+    def __init__(self, alpha=0.9):
+        super().__init__()
         self.alpha = alpha
         self.gamma = None
 
@@ -439,14 +427,11 @@ class QuantileLossFunction(RegressionLossFunction):
 
     Parameters
    ----------
-    n_classes : int
-        Number of classes.
-
     alpha : float, default=0.9
         The percentile.
     """
-    def __init__(self, n_classes, alpha=0.9):
-        super().__init__(n_classes)
+    def __init__(self, alpha=0.9):
+        super().__init__()
         self.alpha = alpha
         self.percentile = alpha * 100

sklearn/ensemble/tests/test_gradient_boosting.py (+27)
@@ -1306,3 +1306,30 @@ def test_gbr_degenerate_feature_importances():
     gbr = GradientBoostingRegressor().fit(X, y)
     assert_array_equal(gbr.feature_importances_,
                        np.zeros(10, dtype=np.float64))
+
+
+# TODO: Remove in 0.26 when `n_classes_` is deprecated
+def test_gbr_deprecated_attr():
+    # check that accessing n_classes_ in GradientBoostingRegressor raises
+    # a deprecation warning
+    X = np.zeros((10, 10))
+    y = np.ones((10,))
+    gbr = GradientBoostingRegressor().fit(X, y)
+    msg = "Attribute n_classes_ was deprecated"
+    with pytest.warns(FutureWarning, match=msg):
+        gbr.n_classes_
+
+
+# TODO: Remove in 0.26 when `n_classes_` is deprecated
+@pytest.mark.filterwarnings("ignore:Attribute n_classes_ was deprecated")
+def test_attr_error_raised_if_not_fitted():
+    # check that accessing n_classes_ in not fitted GradientBoostingRegressor
+    # raises an AttributeError
+    gbr = GradientBoostingRegressor()
+    # test raise AttributeError if not fitted
+    msg = (
+        f"{GradientBoostingRegressor.__name__} object has no n_classes_ "
+        f"attribute."
+    )
+    with pytest.raises(AttributeError, match=msg):
+        gbr.n_classes_
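
A design point the second test pins down: raising AttributeError from the property keeps ``hasattr`` honest, so an unfitted regressor still duck-types as not having the attribute. A hedged sketch of the expected behavior:

    import warnings
    from sklearn.ensemble import GradientBoostingRegressor

    with warnings.catch_warnings():
        # the deprecation warning still fires before the AttributeError
        warnings.simplefilter("ignore", FutureWarning)
        print(hasattr(GradientBoostingRegressor(), "n_classes_"))  # False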

sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py (+15, −15)
@@ -62,7 +62,7 @@ def test_sample_weight_smoke():
     pred = rng.rand(100)
 
     # least squares
-    loss = LeastSquaresError(1)
+    loss = LeastSquaresError()
     loss_wo_sw = loss(y, pred)
     loss_w_sw = loss(y, pred, np.ones(pred.shape[0], dtype=np.float32))
     assert_almost_equal(loss_wo_sw, loss_w_sw)
@@ -81,16 +81,16 @@ def test_sample_weight_init_estimators():
         if Loss is None:
             continue
         if issubclass(Loss, RegressionLossFunction):
-            k = 1
             y = reg_y
+            loss = Loss()
         else:
             k = 2
             y = clf_y
             if Loss.is_multi_class:
                 # skip multiclass
                 continue
+            loss = Loss(k)
 
-        loss = Loss(k)
         init_est = loss.init_estimator()
         init_est.fit(X, y)
         out = loss.get_init_raw_predictions(X, init_est)
@@ -110,7 +110,7 @@ def test_quantile_loss_function():
     # There was a sign problem when evaluating the function
     # for negative values of 'ytrue - ypred'
     x = np.asarray([-1.0, 0.0, 1.0])
-    y_found = QuantileLossFunction(1, 0.9)(x, np.zeros_like(x))
+    y_found = QuantileLossFunction(0.9)(x, np.zeros_like(x))
     y_expected = np.asarray([0.1, 0.0, 0.9]).mean()
     np.testing.assert_allclose(y_found, y_expected)
 
@@ -127,9 +127,9 @@ def test_sample_weight_deviance():
         if Loss is None:
             continue
         if issubclass(Loss, RegressionLossFunction):
-            k = 1
             y = reg_y
             p = reg_y
+            loss = Loss()
         else:
             k = 2
             y = clf_y
@@ -141,8 +141,8 @@ def test_sample_weight_deviance():
                 p = np.zeros((y.shape[0], k), dtype=np.float64)
                 for i in range(k):
                     p[:, i] = y == i
+            loss = Loss(k)
 
-        loss = Loss(k)
         deviance_w_w = loss(y, p, sample_weight)
         deviance_wo_w = loss(y, p)
         assert deviance_wo_w == deviance_w_w
@@ -201,10 +201,10 @@ def test_init_raw_predictions_shapes():
     n_samples = 100
     X = rng.normal(size=(n_samples, 5))
     y = rng.normal(size=n_samples)
-    for loss in (LeastSquaresError(n_classes=1),
-                 LeastAbsoluteError(n_classes=1),
-                 QuantileLossFunction(n_classes=1),
-                 HuberLossFunction(n_classes=1)):
+    for loss in (LeastSquaresError(),
+                 LeastAbsoluteError(),
+                 QuantileLossFunction(),
+                 HuberLossFunction()):
         init_estimator = loss.init_estimator().fit(X, y)
         raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
         assert raw_predictions.shape == (n_samples, 1)
@@ -237,23 +237,23 @@ def test_init_raw_predictions_values():
     y = rng.normal(size=n_samples)
 
     # Least squares loss
-    loss = LeastSquaresError(n_classes=1)
+    loss = LeastSquaresError()
     init_estimator = loss.init_estimator().fit(X, y)
     raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
     # Make sure baseline prediction is the mean of all targets
     assert_almost_equal(raw_predictions, y.mean())
 
     # Least absolute and huber loss
     for Loss in (LeastAbsoluteError, HuberLossFunction):
-        loss = Loss(n_classes=1)
+        loss = Loss()
         init_estimator = loss.init_estimator().fit(X, y)
         raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
         # Make sure baseline prediction is the median of all targets
         assert_almost_equal(raw_predictions, np.median(y))
 
     # Quantile loss
     for alpha in (.1, .5, .9):
-        loss = QuantileLossFunction(n_classes=1, alpha=alpha)
+        loss = QuantileLossFunction(alpha=alpha)
         init_estimator = loss.init_estimator().fit(X, y)
         raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
         # Make sure baseline prediction is the alpha-quantile of all targets
@@ -294,8 +294,8 @@ def test_init_raw_predictions_values():
 @pytest.mark.parametrize('seed', range(5))
 def test_lad_equals_quantile_50(seed):
     # Make sure quantile loss with alpha = .5 is equivalent to LAD
-    lad = LeastAbsoluteError(n_classes=1)
-    ql = QuantileLossFunction(n_classes=1, alpha=0.5)
+    lad = LeastAbsoluteError()
+    ql = QuantileLossFunction(alpha=0.5)
 
     n_samples = 50
     rng = np.random.RandomState(seed)
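
Finally, a sanity check that the signature change leaves the internals alone: the base LossFunction should still record one tree per stage for regression losses. This assumes, as in the 0.24 sources, that the count is stored on the ``K`` attribute:

    from sklearn.ensemble._gb_losses import LeastSquaresError

    # RegressionLossFunction.__init__ now pins n_classes=1 (assumption: stored as K)
    assert LeastSquaresError().K == 1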
