FIX Adds support for negative values in categorical features in gradi… · scikit-learn/scikit-learn@de968ed · GitHub
[go: up one dir, main page]

Skip to content

Commit de968ed

Browse files
thomasjpfan, jjerphan, betatim
authored
FIX Adds support for negative values in categorical features in gradient boosting (#25629)
Co-authored-by: Julien Jerphanion <git@jjerphan.xyz> Co-authored-by: Tim Head <betatim@gmail.com>
1 parent ba46b65 commit de968ed

File tree

6 files changed

+71
-18
lines changed

6 files changed

+71
-18
lines changed

doc/whats_new/v1.3.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,12 @@ Changelog
240240
dataframe.
241241
:pr:`25931` by :user:`Yao Xiao <Charlie-XIAO>`.
242242

243+
- |Fix| :class:`ensemble.HistGradientBoostingRegressor` and
244+
:class:`ensemble.HistGradientBoostingClassifier` treat negative values for
245+
categorical features consistently as missing values, following LightGBM's and
246+
pandas' conventions.
247+
:pr:`25629` by `Thomas Fan`_.
248+
243249
:mod:`sklearn.exception`
244250
........................
245251
- |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised
@@ -284,8 +290,8 @@ Changelog
284290
estimators consistent with the rest of estimators.
285291
:pr:`25697` by :user:`John Pangas <jpangas>`.
286292

287-
- |Enhancement| The `n_iter_` attribute has been included in
288-
:class:`linear_model.ARDRegression` to expose the actual number of iterations
293+
- |Enhancement| The `n_iter_` attribute has been included in
294+
:class:`linear_model.ARDRegression` to expose the actual number of iterations
289295
required to reach the stopping criterion.
290296
:pr:`25697` by :user:`John Pangas <jpangas>`.
291297

sklearn/ensemble/_hist_gradient_boosting/_binning.pyx

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C
88

99
def _map_to_bins(const X_DTYPE_C [:, :] data,
1010
list binning_thresholds,
11+
const unsigned char[::1] is_categorical,
1112
const unsigned char missing_values_bin_idx,
1213
int n_threads,
1314
X_BINNED_DTYPE_C [::1, :] binned):
@@ -23,6 +24,8 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
2324
binning_thresholds : list of arrays
2425
For each feature, stores the increasing numeric values that are
2526
used to separate the bins.
27+
is_categorical : ndarray of unsigned char of shape (n_features,)
28+
Indicates categorical features.
2629
n_threads : int
2730
Number of OpenMP threads to use.
2831
binned : ndarray, shape (n_samples, n_features)
@@ -34,13 +37,15 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
3437
for feature_idx in range(data.shape[1]):
3538
_map_col_to_bins(data[:, feature_idx],
3639
binning_thresholds[feature_idx],
40+
is_categorical[feature_idx],
3741
missing_values_bin_idx,
3842
n_threads,
3943
binned[:, feature_idx])
4044

4145

4246
cdef void _map_col_to_bins(const X_DTYPE_C [:] data,
4347
const X_DTYPE_C [:] binning_thresholds,
48+
const unsigned char is_categorical,
4449
const unsigned char missing_values_bin_idx,
4550
int n_threads,
4651
X_BINNED_DTYPE_C [:] binned):
@@ -53,7 +58,12 @@ cdef void _map_col_to_bins(const X_DTYPE_C [:] data,
5358

5459
for i in prange(data.shape[0], schedule='static', nogil=True,
5560
num_threads=n_threads):
56-
if isnan(data[i]):
61+
if (
62+
isnan(data[i]) or
63+
# To follow LightGBM's conventions, negative values for
64+
# categorical features are considered as missing values.
65+
(is_categorical and data[i] < 0)
66+
):
5767
binned[i] = missing_values_bin_idx
5868
else:
5969
# for known values, use binary search

sklearn/ensemble/_hist_gradient_boosting/binning.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,12 @@ def transform(self, X):
275275
n_threads = _openmp_effective_n_threads(self.n_threads)
276276
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
277277
_map_to_bins(
278-
X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned
278+
X,
279+
self.bin_thresholds_,
280+
self.is_categorical_,
281+
self.missing_values_bin_idx_,
282+
n_threads,
283+
binned,
279284
)
280285
return binned
281286

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,11 @@ def _check_categories(self, X):
269269
if missing.any():
270270
categories = categories[~missing]
271271

272+
# Treat negative values for categorical features as missing values.
273+
negative_categories = categories < 0
274+
if negative_categories.any():
275+
categories = categories[~negative_categories]
276+
272277
if hasattr(self, "feature_names_in_"):
273278
feature_name = f"'{self.feature_names_in_[f_idx]}'"
274279
else:
@@ -1265,9 +1270,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
12651270
data has feature names).
12661271
12671272
For each categorical feature, there must be at most `max_bins` unique
1268-
categories, and each categorical value must be in [0, max_bins -1].
1269-
During prediction, categories encoded as a negative value are treated as
1270-
missing values.
1273+
categories, and each categorical value must be at most `max_bins - 1`.
1274+
Negative values for categorical features are treated as missing values.
12711275
12721276
Read more in the :ref:`User Guide <categorical_support_gbdt>`.
12731277
@@ -1623,9 +1627,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
16231627
data has feature names).
16241628
16251629
For each categorical feature, there must be at most `max_bins` unique
1626-
categories, and each categorical value must be in [0, max_bins -1].
1627-
During prediction, categories encoded as a negative value are treated as
1628-
missing values.
1630+
categories, and each categorical value must be at most `max_bins - 1`.
1631+
Negative values for categorical features are treated as missing values.
16291632
16301633
Read more in the :ref:`User Guide <categorical_support_gbdt>`.
16311634

sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,9 @@ def test_map_to_bins(max_bins):
9595
_find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2)
9696
]
9797
binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F")
98+
is_categorical = np.zeros(2, dtype=np.uint8)
9899
last_bin_idx = max_bins
99-
_map_to_bins(DATA, bin_thresholds, last_bin_idx, n_threads, binned)
100+
_map_to_bins(DATA, bin_thresholds, is_categorical, last_bin_idx, n_threads, binned)
100101
assert binned.shape == DATA.shape
101102
assert binned.dtype == np.uint8
102103
assert binned.flags.f_contiguous
@@ -357,10 +358,35 @@ def test_categorical_feature(n_bins):
357358
expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T
358359
assert_array_equal(bin_mapper.transform(X), expected_trans)
359360

360-
# For unknown categories, the mapping is incorrect / undefined. This never
361-
# happens in practice. This check is only for illustration purpose.
362-
X = np.array([[-1, 100]], dtype=X_DTYPE).T
363-
expected_trans = np.array([[0, 6]]).T
361+
# Negative categories are mapped to the missing values' bin
362+
# (i.e. the bin of index `missing_values_bin_idx_ == n_bins - 1`).
363+
# Unknown positive categories do not happen in practice and are tested
364+
# for illustration purposes.
365+
X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T
366+
expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T
367+
assert_array_equal(bin_mapper.transform(X), expected_trans)
368+
369+
370+
def test_categorical_feature_negative_missing():
371+
"""Make sure bin mapper treats negative categories as missing values."""
372+
X = np.array(
373+
[[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE
374+
).T
375+
bin_mapper = _BinMapper(
376+
n_bins=4,
377+
is_categorical=np.array([True]),
378+
known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)],
379+
).fit(X)
380+
381+
assert bin_mapper.n_bins_non_missing_ == [3]
382+
383+
X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T
384+
385+
# Negative values for categorical features are considered as missing values.
386+
# They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`,
387+
# which is 3 here.
388+
assert bin_mapper.missing_values_bin_idx_ == 3
389+
expected_trans = np.array([[3, 0, 1, 2, 3]]).T
364390
assert_array_equal(bin_mapper.transform(X), expected_trans)
365391

366392

sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -950,7 +950,10 @@ def test_staged_predict(HistGradientBoosting, X, y):
950950
"Est", (HistGradientBoostingRegressor, HistGradientBoostingClassifier)
951951
)
952952
@pytest.mark.parametrize("bool_categorical_parameter", [True, False])
953-
def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter):
953+
@pytest.mark.parametrize("missing_value", [np.nan, -1])
954+
def test_unknown_categories_nan(
955+
insert_missing, Est, bool_categorical_parameter, missing_value
956+
):
954957
# Make sure no error is raised at predict if a category wasn't seen during
955958
# fit. We also make sure they're treated as nans.
956959

@@ -970,7 +973,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
970973
if insert_missing:
971974
mask = rng.binomial(1, 0.01, size=X.shape).astype(bool)
972975
assert mask.sum() > 0
973-
X[mask] = np.nan
976+
X[mask] = missing_value
974977

975978
est = Est(max_iter=20, categorical_features=categorical_features).fit(X, y)
976979
assert_array_equal(est.is_categorical_, [False, True])
@@ -979,7 +982,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
979982
# unknown categories will be treated as nans
980983
X_test = np.zeros((10, X.shape[1]), dtype=float)
981984
X_test[:5, 1] = 30
982-
X_test[5:, 1] = np.nan
985+
X_test[5:, 1] = missing_value
983986
assert len(np.unique(est.predict(X_test))) == 1
984987

985988

0 commit comments

Comments
 (0)
0