Merge branch 'scikit-learn:main' into update-scikit-learn · scikit-learn/scikit-learn@c37cf78 · GitHub

Commit c37cf78

Merge branch 'scikit-learn:main' into update-scikit-learn

2 parents 5e76ebd + de968ed

36 files changed: +597 / -364 lines

doc/modules/classes.rst

Lines changed: 1 addition & 1 deletion
@@ -1122,7 +1122,7 @@ See the :ref:`visualizations` section of the user guide for further details.

 .. autosummary::
    :toctree: generated/
-   :template: display.rst
+   :template: display_all_class_methods.rst

    metrics.ConfusionMatrixDisplay
    metrics.DetCurveDisplay

doc/whats_new/v1.3.rst

Lines changed: 8 additions & 2 deletions
@@ -240,6 +240,12 @@ Changelog
   dataframe.
   :pr:`25931` by :user:`Yao Xiao <Charlie-XIAO>`.

+- |Fix| :class:`ensemble.HistGradientBoostingRegressor` and
+  :class:`ensemble.HistGradientBoostingClassifier` treats negative values for
+  categorical features consistently as missing values, following LightGBM's and
+  pandas' conventions.
+  :pr:`25629` by `Thomas Fan`_.
+
 :mod:`sklearn.exception`
 ........................
 - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised
@@ -284,8 +290,8 @@ Changelog
   estimators consistent with the rest of estimators.
   :pr:`25697` by :user:`John Pangas <jpangas>`.

-- |Enhancement| The `n_iter_` attribute has been included in
-  :class:`linear_model.ARDRegression` to expose the actual number of iterations
+- |Enhancement| The `n_iter_` attribute has been included in
+  :class:`linear_model.ARDRegression` to expose the actual number of iterations
   required to reach the stopping criterion.
   :pr:`25697` by :user:`John Pangas <jpangas>`.
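A quick illustration of the new behavior described in the |Fix| entry above (a hypothetical snippet, not part of the commit): once a column is declared categorical, a negative entry is routed to the same missing-values bin as NaN.

import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.RandomState(0)
X = rng.randint(0, 4, size=(200, 1)).astype(float)  # one categorical feature
y = (X[:, 0] >= 2).astype(int)
clf = HistGradientBoostingClassifier(categorical_features=[0]).fit(X, y)

# After this fix, -1 and NaN follow the same code path (missing-values bin),
# so they yield identical predictions.
X_test = np.array([[-1.0], [np.nan]])
assert clf.predict(X_test)[0] == clf.predict(X_test)[1]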

sklearn/calibration.py

Lines changed: 23 additions & 36 deletions
@@ -30,16 +30,16 @@
 from .utils import (
     column_or_1d,
     indexable,
-    check_matplotlib_support,
     _safe_indexing,
 )
-from .utils._response import _get_response_values_binary

-from .utils.multiclass import check_classification_targets, type_of_target
+from .utils.multiclass import check_classification_targets
 from .utils.parallel import delayed, Parallel
 from .utils._param_validation import StrOptions, HasMethods, Hidden
+from .utils._plotting import _BinaryClassifierCurveDisplayMixin
 from .utils.validation import (
     _check_fit_params,
+    _check_pos_label_consistency,
     _check_sample_weight,
     _num_samples,
     check_consistent_length,
@@ -48,7 +48,6 @@
 from .isotonic import IsotonicRegression
 from .svm import LinearSVC
 from .model_selection import check_cv, cross_val_predict
-from .metrics._base import _check_pos_label_consistency


 class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
@@ -1013,7 +1012,7 @@ def calibration_curve(
     return prob_true, prob_pred


-class CalibrationDisplay:
+class CalibrationDisplay(_BinaryClassifierCurveDisplayMixin):
     """Calibration curve (also known as reliability diagram) visualization.

     It is recommended to use
@@ -1124,13 +1123,8 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):
         display : :class:`~sklearn.calibration.CalibrationDisplay`
             Object that stores computed values.
         """
-        check_matplotlib_support("CalibrationDisplay.plot")
-        import matplotlib.pyplot as plt
+        self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name)

-        if ax is None:
-            fig, ax = plt.subplots()
-
-        name = self.estimator_name if name is None else name
         info_pos_label = (
             f"(Positive class: {self.pos_label})" if self.pos_label is not None else ""
         )
@@ -1141,20 +1135,20 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):
         line_kwargs.update(**kwargs)

         ref_line_label = "Perfectly calibrated"
-        existing_ref_line = ref_line_label in ax.get_legend_handles_labels()[1]
+        existing_ref_line = ref_line_label in self.ax_.get_legend_handles_labels()[1]
         if ref_line and not existing_ref_line:
-            ax.plot([0, 1], [0, 1], "k:", label=ref_line_label)
-        self.line_ = ax.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[0]
+            self.ax_.plot([0, 1], [0, 1], "k:", label=ref_line_label)
+        self.line_ = self.ax_.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[
+            0
+        ]

         # We always have to show the legend for at least the reference line
-        ax.legend(loc="lower right")
+        self.ax_.legend(loc="lower right")

         xlabel = f"Mean predicted probability {info_pos_label}"
         ylabel = f"Fraction of positives {info_pos_label}"
-        ax.set(xlabel=xlabel, ylabel=ylabel)
+        self.ax_.set(xlabel=xlabel, ylabel=ylabel)

-        self.ax_ = ax
-        self.figure_ = ax.figure
         return self

     @classmethod
@@ -1260,15 +1254,15 @@ def from_estimator(
         >>> disp = CalibrationDisplay.from_estimator(clf, X_test, y_test)
         >>> plt.show()
         """
-        method_name = f"{cls.__name__}.from_estimator"
-        check_matplotlib_support(method_name)
-
-        check_is_fitted(estimator)
-        y_prob, pos_label = _get_response_values_binary(
-            estimator, X, response_method="predict_proba", pos_label=pos_label
+        y_prob, pos_label, name = cls._validate_and_get_response_values(
+            estimator,
+            X,
+            y,
+            response_method="predict_proba",
+            pos_label=pos_label,
+            name=name,
         )

-        name = name if name is not None else estimator.__class__.__name__
         return cls.from_predictions(
             y,
             y_prob,
@@ -1378,26 +1372,19 @@ def from_predictions(
         >>> disp = CalibrationDisplay.from_predictions(y_test, y_prob)
         >>> plt.show()
         """
-        method_name = f"{cls.__name__}.from_predictions"
-        check_matplotlib_support(method_name)
-
-        target_type = type_of_target(y_true)
-        if target_type != "binary":
-            raise ValueError(
-                f"The target y is not binary. Got {target_type} type of target."
-            )
+        pos_label_validated, name = cls._validate_from_predictions_params(
+            y_true, y_prob, sample_weight=None, pos_label=pos_label, name=name
+        )

         prob_true, prob_pred = calibration_curve(
             y_true, y_prob, n_bins=n_bins, strategy=strategy, pos_label=pos_label
         )
-        name = "Classifier" if name is None else name
-        pos_label = _check_pos_label_consistency(pos_label, y_true)

         disp = cls(
             prob_true=prob_true,
             prob_pred=prob_pred,
             y_prob=y_prob,
             estimator_name=name,
-            pos_label=pos_label,
+            pos_label=pos_label_validated,
         )
         return disp.plot(ax=ax, ref_line=ref_line, **kwargs)
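Note on the refactor: the matplotlib availability check, the binary-target and pos_label validation, and the default display name all move into the shared _BinaryClassifierCurveDisplayMixin; the public API is unchanged. A minimal usage sketch, mirroring the docstring examples in the diff:

import matplotlib.pyplot as plt
from sklearn.calibration import CalibrationDisplay
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression().fit(X_train, y_train)

# Fitted-estimator check, predict_proba lookup, and name defaulting now all
# happen inside the mixin before anything is drawn.
disp = CalibrationDisplay.from_estimator(clf, X_test, y_test, n_bins=5)
plt.show()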

sklearn/compose/_column_transformer.py

Lines changed: 2 additions & 0 deletions
@@ -936,6 +936,8 @@ def _get_transformer_list(estimators):
     return transformer_list


+# This function is not validated using validate_params because
+# it's just a factory for ColumnTransformer.
 def make_column_transformer(
     *transformers,
     remainder="drop",
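For context on why the comment is enough here: the factory only forwards its arguments to ColumnTransformer, whose own constructor and fit do the validation, so decorating the factory with validate_params would duplicate that work. A typical call:

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

ct = make_column_transformer(
    (StandardScaler(), [0, 1]),  # scale two numeric columns
    (OneHotEncoder(), [2]),      # one-hot encode a categorical column
    remainder="drop",            # validated by ColumnTransformer itself
)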

sklearn/discriminant_analysis.py

Lines changed: 1 addition & 1 deletion
@@ -640,7 +640,7 @@ def fit(self, X, y):
                 intercept_ = xp.asarray(
                     self.intercept_[1] - self.intercept_[0], dtype=X.dtype
                 )
-                self.intercept_ = xp.reshape(intercept_, 1)
+                self.intercept_ = xp.reshape(intercept_, (1,))
         self._n_features_out = self._max_components
         return self
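The tuple matters for array API compatibility: the standard specifies reshape's shape argument as a tuple of ints, so the bare 1 only worked because NumPy is lenient; strict array API namespaces reject the scalar form. In NumPy terms:

import numpy as np

x = np.asarray([3.0])
np.reshape(x, (1,))  # portable: shape given as a tuple, per the array API
np.reshape(x, 1)     # NumPy-only convenience; strict namespaces reject it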

sklearn/ensemble/_hist_gradient_boosting/_binning.pyx

Lines changed: 11 additions & 1 deletion
@@ -8,6 +8,7 @@ from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C

 def _map_to_bins(const X_DTYPE_C [:, :] data,
                  list binning_thresholds,
+                 const unsigned char[::1] is_categorical,
                  const unsigned char missing_values_bin_idx,
                  int n_threads,
                  X_BINNED_DTYPE_C [::1, :] binned):
@@ -23,6 +24,8 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
     binning_thresholds : list of arrays
         For each feature, stores the increasing numeric values that are
         used to separate the bins.
+    is_categorical : ndarray of unsigned char of shape (n_features,)
+        Indicates categorical features.
     n_threads : int
         Number of OpenMP threads to use.
     binned : ndarray, shape (n_samples, n_features)
@@ -34,13 +37,15 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
     for feature_idx in range(data.shape[1]):
         _map_col_to_bins(data[:, feature_idx],
                          binning_thresholds[feature_idx],
+                         is_categorical[feature_idx],
                          missing_values_bin_idx,
                          n_threads,
                          binned[:, feature_idx])


 cdef void _map_col_to_bins(const X_DTYPE_C [:] data,
                            const X_DTYPE_C [:] binning_thresholds,
+                           const unsigned char is_categorical,
                            const unsigned char missing_values_bin_idx,
                            int n_threads,
                            X_BINNED_DTYPE_C [:] binned):
@@ -53,7 +58,12 @@ cdef void _map_col_to_bins(const X_DTYPE_C [:] data,

     for i in prange(data.shape[0], schedule='static', nogil=True,
                     num_threads=n_threads):
-        if isnan(data[i]):
+        if (
+            isnan(data[i]) or
+            # To follow LightGBM's conventions, negative values for
+            # categorical features are considered as missing values.
+            (is_categorical and data[i] < 0)
+        ):
             binned[i] = missing_values_bin_idx
         else:
             # for known values, use binary search
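A rough NumPy sketch of the per-column logic above (not the compiled code path; the function name is ours): the new is_categorical flag folds negative values into the missing-values bin, while everything else keeps going through binary search.

import numpy as np

def map_col_to_bins_py(col, binning_thresholds, is_categorical,
                       missing_values_bin_idx):
    # Binary search: index of the first threshold >= value, matching the
    # Cython loop after the isnan check.
    binned = np.searchsorted(binning_thresholds, col, side="left")
    missing = np.isnan(col)
    if is_categorical:
        # LightGBM convention: negative categorical values are missing.
        missing |= col < 0
    binned[missing] = missing_values_bin_idx
    return binned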

sklearn/ensemble/_hist_gradient_boosting/binning.py

Lines changed: 6 additions & 1 deletion
@@ -275,7 +275,12 @@ def transform(self, X):
         n_threads = _openmp_effective_n_threads(self.n_threads)
         binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
         _map_to_bins(
-            X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned
+            X,
+            self.bin_thresholds_,
+            self.is_categorical_,
+            self.missing_values_bin_idx_,
+            n_threads,
+            binned,
         )
         return binned

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 9 additions & 6 deletions
@@ -269,6 +269,11 @@ def _check_categories(self, X):
             if missing.any():
                 categories = categories[~missing]

+            # Treat negative values for categorical features as missing values.
+            negative_categories = categories < 0
+            if negative_categories.any():
+                categories = categories[~negative_categories]
+
             if hasattr(self, "feature_names_in_"):
                 feature_name = f"'{self.feature_names_in_[f_idx]}'"
             else:
@@ -1265,9 +1270,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
         data has feature names).

         For each categorical feature, there must be at most `max_bins` unique
-        categories, and each categorical value must be in [0, max_bins -1].
-        During prediction, categories encoded as a negative value are treated as
-        missing values.
+        categories, and each categorical value must be less then `max_bins - 1`.
+        Negative values for categorical features are treated as missing values.

         Read more in the :ref:`User Guide <categorical_support_gbdt>`.
@@ -1623,9 +1627,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
         data has feature names).

         For each categorical feature, there must be at most `max_bins` unique
-        categories, and each categorical value must be in [0, max_bins -1].
-        During prediction, categories encoded as a negative value are treated as
-        missing values.
+        categories, and each categorical value must be less then `max_bins - 1`.
+        Negative values for categorical features are treated as missing values.

         Read more in the :ref:`User Guide <categorical_support_gbdt>`.
16311634

sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py

Lines changed: 31 additions & 5 deletions
@@ -95,8 +95,9 @@ def test_map_to_bins(max_bins):
         _find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2)
     ]
     binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F")
+    is_categorical = np.zeros(2, dtype=np.uint8)
     last_bin_idx = max_bins
-    _map_to_bins(DATA, bin_thresholds, last_bin_idx, n_threads, binned)
+    _map_to_bins(DATA, bin_thresholds, is_categorical, last_bin_idx, n_threads, binned)
     assert binned.shape == DATA.shape
     assert binned.dtype == np.uint8
     assert binned.flags.f_contiguous
@@ -357,10 +358,35 @@ def test_categorical_feature(n_bins):
     expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T
     assert_array_equal(bin_mapper.transform(X), expected_trans)

-    # For unknown categories, the mapping is incorrect / undefined. This never
-    # happens in practice. This check is only for illustration purpose.
-    X = np.array([[-1, 100]], dtype=X_DTYPE).T
-    expected_trans = np.array([[0, 6]]).T
+    # Negative categories are mapped to the missing values' bin
+    # (i.e. the bin of index `missing_values_bin_idx_ == n_bins - 1).
+    # Unknown positive categories does not happen in practice and tested
+    # for illustration purpose.
+    X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T
+    expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T
+    assert_array_equal(bin_mapper.transform(X), expected_trans)
+
+
+def test_categorical_feature_negative_missing():
+    """Make sure bin mapper treats negative categories as missing values."""
+    X = np.array(
+        [[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE
+    ).T
+    bin_mapper = _BinMapper(
+        n_bins=4,
+        is_categorical=np.array([True]),
+        known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)],
+    ).fit(X)
+
+    assert bin_mapper.n_bins_non_missing_ == [3]
+
+    X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T
+
+    # Negative values for categorical features are considered as missing values.
+    # They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`,
+    # which is 3 here.
+    assert bin_mapper.missing_values_bin_idx_ == 3
+    expected_trans = np.array([[3, 0, 1, 2, 3]]).T
     assert_array_equal(bin_mapper.transform(X), expected_trans)

sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 6 additions & 3 deletions
@@ -950,7 +950,10 @@ def test_staged_predict(HistGradientBoosting, X, y):
     "Est", (HistGradientBoostingRegressor, HistGradientBoostingClassifier)
 )
 @pytest.mark.parametrize("bool_categorical_parameter", [True, False])
-def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter):
+@pytest.mark.parametrize("missing_value", [np.nan, -1])
+def test_unknown_categories_nan(
+    insert_missing, Est, bool_categorical_parameter, missing_value
+):
     # Make sure no error is raised at predict if a category wasn't seen during
     # fit. We also make sure they're treated as nans.

@@ -970,7 +973,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
     if insert_missing:
         mask = rng.binomial(1, 0.01, size=X.shape).astype(bool)
         assert mask.sum() > 0
-        X[mask] = np.nan
+        X[mask] = missing_value

     est = Est(max_iter=20, categorical_features=categorical_features).fit(X, y)
     assert_array_equal(est.is_categorical_, [False, True])
@@ -979,7 +982,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
     # unknown categories will be treated as nans
     X_test = np.zeros((10, X.shape[1]), dtype=float)
     X_test[:5, 1] = 30
-    X_test[5:, 1] = np.nan
+    X_test[5:, 1] = missing_value
     assert len(np.unique(est.predict(X_test))) == 1
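Stacked parametrize decorators take the cross product, so the added missing_value parameter doubles the test matrix; assuming insert_missing is also a two-value parametrize, the test now runs 2 x 2 x 2 x 2 = 16 times. A stripped-down illustration:

import numpy as np
import pytest

@pytest.mark.parametrize("missing_value", [np.nan, -1])
@pytest.mark.parametrize("flag", [True, False])
def test_cross_product(flag, missing_value):
    # pytest generates 2 x 2 = 4 cases from the stacked decorators.
    assert flag in (True, False)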

sklearn/impute/_base.py

Lines changed: 3 additions & 3 deletions
@@ -11,7 +11,7 @@
 from scipy import sparse as sp

 from ..base import BaseEstimator, TransformerMixin
-from ..utils._param_validation import StrOptions, Hidden
+from ..utils._param_validation import StrOptions, Hidden, MissingValues
 from ..utils.fixes import _mode
 from ..utils.sparsefuncs import _get_median
 from ..utils.validation import check_is_fitted
@@ -78,7 +78,7 @@ class _BaseImputer(TransformerMixin, BaseEstimator):
     """

     _parameter_constraints: dict = {
-        "missing_values": ["missing_values"],
+        "missing_values": [MissingValues()],
         "add_indicator": ["boolean"],
         "keep_empty_features": ["boolean"],
     }
@@ -800,7 +800,7 @@ class MissingIndicator(TransformerMixin, BaseEstimator):
     """

     _parameter_constraints: dict = {
-        "missing_values": [numbers.Real, numbers.Integral, str, None],
+        "missing_values": [MissingValues()],
         "features": [StrOptions({"missing-only", "all"})],
         "sparse": ["boolean", StrOptions({"auto"})],
         "error_on_new": ["boolean"],

sklearn/linear_model/_base.py

Lines changed: 1 addition & 1 deletion
@@ -399,7 +399,7 @@ def decision_function(self, X):

         X = self._validate_data(X, accept_sparse="csr", reset=False)
         scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_
-        return xp.reshape(scores, -1) if scores.shape[1] == 1 else scores
+        return xp.reshape(scores, (-1,)) if scores.shape[1] == 1 else scores

     def predict(self, X):
         """
