diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 4fede62e61b34..466d786869fc8 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -240,6 +240,12 @@ Changelog dataframe. :pr:`25931` by :user:`Yao Xiao `. +- |Fix| :class:`ensemble.HistGradientBoostingRegressor` and + :class:`ensemble.HistGradientBoostingClassifier` treat negative values for + categorical features consistently as missing values, following LightGBM's and + pandas' conventions. + :pr:`25629` by `Thomas Fan`_. + :mod:`sklearn.exception` ........................ - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised @@ -284,8 +290,8 @@ Changelog estimators consistent with the rest of estimators. :pr:`25697` by :user:`John Pangas `. -- |Enhancement| The `n_iter_` attribute has been included in - :class:`linear_model.ARDRegression` to expose the actual number of iterations +- |Enhancement| The `n_iter_` attribute has been included in + :class:`linear_model.ARDRegression` to expose the actual number of iterations required to reach the stopping criterion. :pr:`25697` by :user:`John Pangas `. diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 5ba1527378d87..90241509cf96b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -8,6 +8,7 @@ from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C def _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, + const unsigned char[::1] is_categorical, const unsigned char missing_values_bin_idx, int n_threads, X_BINNED_DTYPE_C [::1, :] binned): @@ -23,6 +24,8 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, binning_thresholds : list of arrays For each feature, stores the increasing numeric values that are used to separate the bins. + is_categorical : ndarray of unsigned char of shape (n_features,) + Indicates categorical features. 
n_threads : int Number of OpenMP threads to use. binned : ndarray, shape (n_samples, n_features) @@ -34,6 +37,7 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, for feature_idx in range(data.shape[1]): _map_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], + is_categorical[feature_idx], missing_values_bin_idx, n_threads, binned[:, feature_idx]) @@ -41,6 +45,7 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, cdef void _map_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, + const unsigned char is_categorical, const unsigned char missing_values_bin_idx, int n_threads, X_BINNED_DTYPE_C [:] binned): @@ -53,7 +58,12 @@ cdef void _map_col_to_bins(const X_DTYPE_C [:] data, for i in prange(data.shape[0], schedule='static', nogil=True, num_threads=n_threads): - if isnan(data[i]): + if ( + isnan(data[i]) or + # To follow LightGBM's conventions, negative values for + # categorical features are considered as missing values. + (is_categorical and data[i] < 0) + ): binned[i] = missing_values_bin_idx else: # for known values, use binary search diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index a553a307d262b..805a13b2d361b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -275,7 +275,12 @@ def transform(self, X): n_threads = _openmp_effective_n_threads(self.n_threads) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F") _map_to_bins( - X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned + X, + self.bin_thresholds_, + self.is_categorical_, + self.missing_values_bin_idx_, + n_threads, + binned, ) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 29cae2411807e..b35f37d4d7252 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ 
b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -269,6 +269,11 @@ def _check_categories(self, X): if missing.any(): categories = categories[~missing] + # Treat negative values for categorical features as missing values. + negative_categories = categories < 0 + if negative_categories.any(): + categories = categories[~negative_categories] + if hasattr(self, "feature_names_in_"): feature_name = f"'{self.feature_names_in_[f_idx]}'" else: @@ -1265,9 +1270,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): data has feature names). For each categorical feature, there must be at most `max_bins` unique - categories, and each categorical value must be in [0, max_bins -1]. - During prediction, categories encoded as a negative value are treated as - missing values. + categories, and each categorical value must be less than `max_bins - 1`. + Negative values for categorical features are treated as missing values. Read more in the :ref:`User Guide `. @@ -1623,9 +1627,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): data has feature names). For each categorical feature, there must be at most `max_bins` unique - categories, and each categorical value must be in [0, max_bins -1]. - During prediction, categories encoded as a negative value are treated as - missing values. + categories, and each categorical value must be less than `max_bins - 1`. + Negative values for categorical features are treated as missing values. Read more in the :ref:`User Guide `. 
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index c60b43c25d937..08bfebfcbf6c9 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -95,8 +95,9 @@ def test_map_to_bins(max_bins): _find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2) ] binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F") + is_categorical = np.zeros(2, dtype=np.uint8) last_bin_idx = max_bins - _map_to_bins(DATA, bin_thresholds, last_bin_idx, n_threads, binned) + _map_to_bins(DATA, bin_thresholds, is_categorical, last_bin_idx, n_threads, binned) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous @@ -357,10 +358,35 @@ def test_categorical_feature(n_bins): expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T assert_array_equal(bin_mapper.transform(X), expected_trans) - # For unknown categories, the mapping is incorrect / undefined. This never - # happens in practice. This check is only for illustration purpose. - X = np.array([[-1, 100]], dtype=X_DTYPE).T - expected_trans = np.array([[0, 6]]).T + # Negative categories are mapped to the missing values' bin + # (i.e. the bin of index `missing_values_bin_idx_ == n_bins - 1`). + # Unknown positive categories do not happen in practice and are tested + # for illustration purposes. 
+ X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T + expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T + assert_array_equal(bin_mapper.transform(X), expected_trans) + + +def test_categorical_feature_negative_missing(): + """Make sure bin mapper treats negative categories as missing values.""" + X = np.array( + [[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE + ).T + bin_mapper = _BinMapper( + n_bins=4, + is_categorical=np.array([True]), + known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)], + ).fit(X) + + assert bin_mapper.n_bins_non_missing_ == [3] + + X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T + + # Negative values for categorical features are considered as missing values. + # They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`, + # which is 3 here. + assert bin_mapper.missing_values_bin_idx_ == 3 + expected_trans = np.array([[3, 0, 1, 2, 3]]).T assert_array_equal(bin_mapper.transform(X), expected_trans) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index e5972eaf351f6..33f39f3dbf584 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -950,7 +950,10 @@ def test_staged_predict(HistGradientBoosting, X, y): "Est", (HistGradientBoostingRegressor, HistGradientBoostingClassifier) ) @pytest.mark.parametrize("bool_categorical_parameter", [True, False]) -def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter): +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_unknown_categories_nan( + insert_missing, Est, bool_categorical_parameter, missing_value +): # Make sure no error is raised at predict if a category wasn't seen during # fit. We also make sure they're treated as nans. 
@@ -970,7 +973,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter) if insert_missing: mask = rng.binomial(1, 0.01, size=X.shape).astype(bool) assert mask.sum() > 0 - X[mask] = np.nan + X[mask] = missing_value est = Est(max_iter=20, categorical_features=categorical_features).fit(X, y) assert_array_equal(est.is_categorical_, [False, True]) @@ -979,7 +982,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter) # unknown categories will be treated as nans X_test = np.zeros((10, X.shape[1]), dtype=float) X_test[:5, 1] = 30 - X_test[5:, 1] = np.nan + X_test[5:, 1] = missing_value assert len(np.unique(est.predict(X_test))) == 1