FIX Adds support for negative values in categorical features in gradient boosting by thomasjpfan · Pull Request #25629 · scikit-learn/scikit-learn

FIX Adds support for negative values in categorical features in gradient boosting #25629

Merged
10 changes: 8 additions & 2 deletions doc/whats_new/v1.3.rst
@@ -240,6 +240,12 @@ Changelog
dataframe.
:pr:`25931` by :user:`Yao Xiao <Charlie-XIAO>`.

- |Fix| :class:`ensemble.HistGradientBoostingRegressor` and
:class:`ensemble.HistGradientBoostingClassifier` now treat negative values for
categorical features consistently as missing values, following LightGBM's and
pandas' conventions.
:pr:`25629` by `Thomas Fan`_.

:mod:`sklearn.exceptions`
..........................
- |Feature| Added :class:`exceptions.InconsistentVersionWarning` which is raised
@@ -284,8 +290,8 @@ Changelog
estimators consistent with the rest of estimators.
:pr:`25697` by :user:`John Pangas <jpangas>`.

- |Enhancement| The `n_iter_` attribute has been included in
:class:`linear_model.ARDRegression` to expose the actual number of iterations
required to reach the stopping criterion.
:pr:`25697` by :user:`John Pangas <jpangas>`.

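In user-facing terms, the fix entry above means that negative codes in a categorical column no longer raise an error at fit time and are routed through the missing-value path instead. A minimal sketch (the toy data and parameters are illustrative, not taken from the PR):

import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.RandomState(0)
X = rng.randint(0, 4, size=(200, 1)).astype(float)
y = (X[:, 0] >= 2).astype(int)
X[::10, 0] = -1  # negative codes are now binned together with np.nan

clf = HistGradientBoostingClassifier(categorical_features=[0], max_iter=10)
clf.fit(X, y)
clf.predict(np.array([[-1.0], [0.0], [3.0]]))  # -1 follows the missing-value path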
12 changes: 11 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
@@ -8,6 +8,7 @@ from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C

def _map_to_bins(const X_DTYPE_C [:, :] data,
list binning_thresholds,
const unsigned char[::1] is_categorical,
const unsigned char missing_values_bin_idx,
int n_threads,
X_BINNED_DTYPE_C [::1, :] binned):
@@ -23,6 +24,8 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
binning_thresholds : list of arrays
For each feature, stores the increasing numeric values that are
used to separate the bins.
is_categorical : ndarray of unsigned char of shape (n_features,)
Indicates categorical features.
n_threads : int
Number of OpenMP threads to use.
binned : ndarray, shape (n_samples, n_features)
@@ -34,13 +37,15 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
for feature_idx in range(data.shape[1]):
_map_col_to_bins(data[:, feature_idx],
binning_thresholds[feature_idx],
is_categorical[feature_idx],
missing_values_bin_idx,
n_threads,
binned[:, feature_idx])


cdef void _map_col_to_bins(const X_DTYPE_C [:] data,
const X_DTYPE_C [:] binning_thresholds,
const unsigned char is_categorical,
const unsigned char missing_values_bin_idx,
int n_threads,
X_BINNED_DTYPE_C [:] binned):
@@ -53,7 +58,12 @@ cdef void _map_col_to_bins(const X_DTYPE_C [:] data,

for i in prange(data.shape[0], schedule='static', nogil=True,
num_threads=n_threads):
if isnan(data[i]):
if (
isnan(data[i]) or
# To follow LightGBM's conventions, negative values for
# categorical features are considered as missing values.
(is_categorical and data[i] < 0)
):
binned[i] = missing_values_bin_idx
else:
# for known values, use binary search
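For readers who do not speak Cython, the branch above boils down to the following per-column NumPy logic (a rough sketch of `_map_col_to_bins`, not the actual parallel kernel; the helper name is made up):

import numpy as np

def map_col_to_bins_sketch(col, thresholds, is_categorical, missing_values_bin_idx):
    # Known values: index of the first threshold >= value (binary search).
    binned = np.searchsorted(thresholds, col, side="left").astype(np.uint8)
    missing = np.isnan(col)
    if is_categorical:
        # New in this PR: negative categorical values are routed to the
        # missing-values bin, following LightGBM's and pandas' conventions.
        missing |= col < 0
    binned[missing] = missing_values_bin_idx
    return binned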
7 changes: 6 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -275,7 +275,12 @@ def transform(self, X):
n_threads = _openmp_effective_n_threads(self.n_threads)
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
_map_to_bins(
X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned
X,
self.bin_thresholds_,
self.is_categorical_,
self.missing_values_bin_idx_,
n_threads,
binned,
)
return binned

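Note the updated call: `is_categorical_` now sits between the bin thresholds and the missing-values bin index. At the `_BinMapper` level the new behavior looks as follows (private API, mirroring the new regression test further below; for illustration only):

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE

X = np.array([[1, 4, 5, -1, np.nan]], dtype=X_DTYPE).T
mapper = _BinMapper(
    n_bins=4,
    is_categorical=np.array([True]),
    known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)],
).fit(X)
# Both -1 and np.nan land in the missing-values bin (index 3 here).
mapper.transform(X)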
15 changes: 9 additions & 6 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -269,6 +269,11 @@ def _check_categories(self, X):
if missing.any():
categories = categories[~missing]

# Treat negative values for categorical features as missing values.
negative_categories = categories < 0
if negative_categories.any():
categories = categories[~negative_categories]

if hasattr(self, "feature_names_in_"):
feature_name = f"'{self.feature_names_in_[f_idx]}'"
else:
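The added filtering is a small NumPy idiom; conceptually (a simplified sketch of this `_check_categories` step, with a made-up `categories` array):

import numpy as np

categories = np.array([-2.0, -1.0, 0.0, 1.0, 3.0, np.nan])

# NaN categories were already dropped before this PR ...
categories = categories[~np.isnan(categories)]

# ... and negative categories are now dropped as well: they are treated
# as missing values, not as categories in their own right.
negative_categories = categories < 0
categories = categories[~negative_categories]

# categories is now array([0., 1., 3.])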
@@ -1265,9 +1270,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
data has feature names).

For each categorical feature, there must be at most `max_bins` unique
categories, and each categorical value must be in [0, max_bins -1].
During prediction, categories encoded as a negative value are treated as
missing values.
categories, and each categorical value must be less than `max_bins - 1`.
Negative values for categorical features are treated as missing values.

Read more in the :ref:`User Guide <categorical_support_gbdt>`.
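A practical consequence of the documented range: raw categories can be ordinal-encoded into `[0, max_bins - 1)` before fitting, and encoding unknowns as -1 then degrades gracefully to the missing-value path. A sketch of that pattern (the pipeline and toy data are illustrative):

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

X = np.array([["a"], ["b"], ["c"]] * 20, dtype=object)
y = np.tile([1.0, 2.0, 3.0], 20)

model = make_pipeline(
    # Categories seen at fit time become 0, 1, 2, ...; categories unseen
    # at predict time become -1, which the estimator treats as missing.
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    HistGradientBoostingRegressor(categorical_features=[0], max_iter=20),
)
model.fit(X, y)
model.predict(np.array([["d"]], dtype=object))  # unseen "d" -> -1 -> missing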

@@ -1623,9 +1627,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
data has feature names).

For each categorical feature, there must be at most `max_bins` unique
categories, and each categorical value must be in [0, max_bins -1].
During prediction, categories encoded as a negative value are treated as
missing values.
categories, and each categorical value must be less than `max_bins - 1`.
Negative values for categorical features are treated as missing values.

Read more in the :ref:`User Guide <categorical_support_gbdt>`.

36 changes: 31 additions & 5 deletions sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py
@@ -95,8 +95,9 @@ def test_map_to_bins(max_bins):
_find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2)
]
binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F")
is_categorical = np.zeros(2, dtype=np.uint8)
last_bin_idx = max_bins
_map_to_bins(DATA, bin_thresholds, last_bin_idx, n_threads, binned)
_map_to_bins(DATA, bin_thresholds, is_categorical, last_bin_idx, n_threads, binned)
assert binned.shape == DATA.shape
assert binned.dtype == np.uint8
assert binned.flags.f_contiguous
@@ -357,10 +358,35 @@ def test_categorical_feature(n_bins):
expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T
assert_array_equal(bin_mapper.transform(X), expected_trans)

# For unknown categories, the mapping is incorrect / undefined. This never
# happens in practice. This check is only for illustration purpose.
X = np.array([[-1, 100]], dtype=X_DTYPE).T
expected_trans = np.array([[0, 6]]).T
# Negative categories are mapped to the missing-values bin
# (i.e. the bin of index `missing_values_bin_idx_`, which is `n_bins - 1`).
# Unknown positive categories do not happen in practice and are only
# tested here for illustration purposes.
X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T
expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T
assert_array_equal(bin_mapper.transform(X), expected_trans)


def test_categorical_feature_negative_missing():
"""Make sure bin mapper treats negative categories as missing values."""
X = np.array(
[[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE
).T
bin_mapper = _BinMapper(
n_bins=4,
is_categorical=np.array([True]),
known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)],
).fit(X)

assert bin_mapper.n_bins_non_missing_ == [3]

X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T

# Negative values for categorical features are considered as missing values.
# They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`,
# which is 3 here.
assert bin_mapper.missing_values_bin_idx_ == 3
expected_trans = np.array([[3, 0, 1, 2, 3]]).T
assert_array_equal(bin_mapper.transform(X), expected_trans)


sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -950,7 +950,10 @@ def test_staged_predict(HistGradientBoosting, X, y):
"Est", (HistGradientBoostingRegressor, HistGradientBoostingClassifier)
)
@pytest.mark.parametrize("bool_categorical_parameter", [True, False])
def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter):
@pytest.mark.parametrize("missing_value", [np.nan, -1])
def test_unknown_categories_nan(
insert_missing, Est, bool_categorical_parameter, missing_value
):
# Make sure no error is raised at predict if a category wasn't seen during
# fit. We also make sure they're treated as nans.

@@ -970,7 +973,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
if insert_missing:
mask = rng.binomial(1, 0.01, size=X.shape).astype(bool)
assert mask.sum() > 0
X[mask] = np.nan
X[mask] = missing_value

est = Est(max_iter=20, categorical_features=categorical_features).fit(X, y)
assert_array_equal(est.is_categorical_, [False, True])
@@ -979,7 +982,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
# unknown categories will be treated as nans
X_test = np.zeros((10, X.shape[1]), dtype=float)
X_test[:5, 1] = 30
X_test[5:, 1] = np.nan
X_test[5:, 1] = missing_value
assert len(np.unique(est.predict(X_test))) == 1
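At the public API level, the parametrization above pins down the following behavior: unknown, negative, and NaN categories all take the same missing-value path at predict time and therefore receive the same prediction. A sketch with made-up data:

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.randint(0, 5, size=(200, 1)).astype(float)
y = X[:, 0] * 2 + rng.normal(size=200)

est = HistGradientBoostingRegressor(categorical_features=[0], max_iter=20).fit(X, y)

X_test = np.array([[30.0], [-1.0], [np.nan]])  # unknown, negative, NaN
preds = est.predict(X_test)
assert len(np.unique(preds)) == 1  # all three are treated as missing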

