From acd51fae71b839a527e8ef204a71aa6eb818dd4b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 16 Feb 2023 17:13:44 -0500 Subject: [PATCH 1/9] ENH Adds support for negative values in categorical features in gradient boosting --- doc/whats_new/v1.3.rst | 4 ++++ .../_hist_gradient_boosting/gradient_boosting.py | 14 ++++++++------ .../tests/test_gradient_boosting.py | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 08ebf4abc92c3..86495a9b7f797 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -129,6 +129,10 @@ Changelog out-of-bag scores via the `oob_scores_` or `oob_score_` attributes. :pr:`24882` by :user:`Ashwin Mathur `. +- |Feature| :class:`ensemble.HistGradientBoostingRegressor` and + :class:`ensemble.HistGradientBoostingClassifier` treats negative values + as missing values for categorical features. :pr:`xxxxx` by `Thomas Fan`_. + - |Efficiency| :class:`ensemble.IsolationForest` predict time is now faster (typically by a factor of 8 or more). Internally, the estimator now precomputes decision path lengths per tree at `fit` time. It is therefore not possible diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 31069fe14ee41..bcbb9aed4482d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -272,6 +272,10 @@ def _check_categories(self, X): if missing.any(): categories = categories[~missing] + negative_categories = categories < 0 + if categories.any(): + categories = categories[~negative_categories] + if hasattr(self, "feature_names_in_"): feature_name = f"'{self.feature_names_in_[f_idx]}'" else: @@ -1268,9 +1272,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): data has feature names). For each categorical feature, there must be at most `max_bins` unique - categories, and each categorical value must be in [0, max_bins -1]. - During prediction, categories encoded as a negative value are treated as - missing values. + categories, and each categorical value must be less then `max_bins - 1`. + Categories encoded as a negative value are treated as missing values. Read more in the :ref:`User Guide `. @@ -1632,9 +1635,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): data has feature names). For each categorical feature, there must be at most `max_bins` unique - categories, and each categorical value must be in [0, max_bins -1]. - During prediction, categories encoded as a negative value are treated as - missing values. + categories, and each categorical value must be less then `max_bins - 1`. + Categories encoded as a negative value are treated as missing values. Read more in the :ref:`User Guide `. 
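The docstring change above can be illustrated end to end. Below is a minimal sketch of the intended behavior, assuming a scikit-learn build that includes this patch (the data, parameter values, and variable names are invented for illustration): a negative code in a categorical column is handled like `np.nan`, not as a regular category.

```python
# Sketch only: assumes a scikit-learn build that includes this patch.
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.RandomState(0)
# Column 0 is numerical; column 1 holds categorical codes in {0, ..., 4}.
X = np.column_stack([rng.rand(200), rng.randint(0, 5, size=200).astype(float)])
y = (X[:, 1] >= 2).astype(int)
X[::10, 1] = -1  # negative code: treated as a missing value, not a category

clf = HistGradientBoostingClassifier(
    max_iter=20, categorical_features=[False, True]
).fit(X, y)
clf.predict(X[:5])
```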
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 7e774d9f09f45..c272c68ec8568 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -977,7 +977,10 @@ def test_staged_predict(HistGradientBoosting, X, y): "Est", (HistGradientBoostingRegressor, HistGradientBoostingClassifier) ) @pytest.mark.parametrize("bool_categorical_parameter", [True, False]) -def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter): +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_unknown_categories_nan( + insert_missing, Est, bool_categorical_parameter, missing_value +): # Make sure no error is raised at predict if a category wasn't seen during # fit. We also make sure they're treated as nans. @@ -997,7 +1000,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter) if insert_missing: mask = rng.binomial(1, 0.01, size=X.shape).astype(bool) assert mask.sum() > 0 - X[mask] = np.nan + X[mask] = missing_value est = Est(max_iter=20, categorical_features=categorical_features).fit(X, y) assert_array_equal(est.is_categorical_, [False, True]) @@ -1006,7 +1009,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter) # unknown categories will be treated as nans X_test = np.zeros((10, X.shape[1]), dtype=float) X_test[:5, 1] = 30 - X_test[5:, 1] = np.nan + X_test[5:, 1] = missing_value assert len(np.unique(est.predict(X_test))) == 1 From 729630ada53a6c3a2af757a817fa2eb92d75a075 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 16 Feb 2023 17:19:15 -0500 Subject: [PATCH 2/9] ENH Adds PR number --- doc/whats_new/v1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 86495a9b7f797..fd709065a2d38 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -131,7 +131,7 @@ Changelog - |Feature| :class:`ensemble.HistGradientBoostingRegressor` and :class:`ensemble.HistGradientBoostingClassifier` treats negative values - as missing values for categorical features. :pr:`xxxxx` by `Thomas Fan`_. + as missing values for categorical features. :pr:`25629` by `Thomas Fan`_. - |Efficiency| :class:`ensemble.IsolationForest` predict time is now faster (typically by a factor of 8 or more). Internally, the estimator now precomputes From 4e272a3f8108efcbebb03c1ccc916554aa738559 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 16 Feb 2023 17:24:01 -0500 Subject: [PATCH 3/9] DOC Adds comment --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index bcbb9aed4482d..d3b80191fea7f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -272,6 +272,7 @@ def _check_categories(self, X): if missing.any(): categories = categories[~missing] + # Treat negative categories as missing values negative_categories = categories < 0 if categories.any(): categories = categories[~negative_categories] From 81ac1b12b6eae87307cc03f2e20302883a7771e5 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Fri, 17 Feb 2023 13:07:07 -0500 Subject: [PATCH 4/9] FIX Makes sure negative values are actual the same as missing --- .../_hist_gradient_boosting/_binning.pyx | 11 ++++++++- .../_hist_gradient_boosting/binning.py | 7 +++++- .../tests/test_binning.py | 24 ++++++++++++++++++- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 5ba1527378d87..a778d90062b50 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -8,6 +8,7 @@ from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C def _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, + const unsigned char[::1] is_categorical, const unsigned char missing_values_bin_idx, int n_threads, X_BINNED_DTYPE_C [::1, :] binned): @@ -23,6 +24,8 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, binning_thresholds : list of arrays For each feature, stores the increasing numeric values that are used to separate the bins. + is_categorical : ndarray of unsigned char of shape (n_features,) + Indicates categorical features. n_threads : int Number of OpenMP threads to use. binned : ndarray, shape (n_samples, n_features) @@ -34,6 +37,7 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, for feature_idx in range(data.shape[1]): _map_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], + is_categorical[feature_idx], missing_values_bin_idx, n_threads, binned[:, feature_idx]) @@ -41,6 +45,7 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, cdef void _map_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, + const unsigned char is_categorical, const unsigned char missing_values_bin_idx, int n_threads, X_BINNED_DTYPE_C [:] binned): @@ -53,7 +58,11 @@ cdef void _map_col_to_bins(const X_DTYPE_C [:] data, for i in prange(data.shape[0], schedule='static', nogil=True, num_threads=n_threads): - if isnan(data[i]): + if ( + isnan(data[i]) or + # categorical features consider negative values as missing + (is_categorical and data[i] < 0) + ): binned[i] = missing_values_bin_idx else: # for known values, use binary search diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index a553a307d262b..805a13b2d361b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -275,7 +275,12 @@ def transform(self, X): n_threads = _openmp_effective_n_threads(self.n_threads) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F") _map_to_bins( - X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned + X, + self.bin_thresholds_, + self.is_categorical_, + self.missing_values_bin_idx_, + n_threads, + binned, ) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 4581173fefe67..32ada01d81fcc 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -95,8 +95,9 @@ def test_map_to_bins(max_bins): _find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2) ] binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F") + is_categorical = np.zeros(2, dtype=np.uint8) last_bin_idx = max_bins - _map_to_bins(DATA, bin_thresholds, last_bin_idx, n_threads, binned) + _map_to_bins(DATA, 
bin_thresholds, is_categorical, last_bin_idx, n_threads, binned) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous @@ -364,6 +365,27 @@ def test_categorical_feature(n_bins): assert_array_equal(bin_mapper.transform(X), expected_trans) +def test_categorical_feature_negative_missing(): + """Make sure bin mapper treats negative categories as missing values.""" + X = np.array( + [[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE + ).T + bin_mapper = _BinMapper( + n_bins=4, + is_categorical=np.array([True]), + known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)], + ).fit(X) + + assert bin_mapper.n_bins_non_missing_ == [3] + + X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T + + # Negative categories are considered missing + # All missing values are mapped to n_bins_non_missing_ which is 3. + expected_trans = np.array([[3, 0, 1, 2, 3]]).T + assert_array_equal(bin_mapper.transform(X), expected_trans) + + @pytest.mark.parametrize("n_bins", (128, 256)) def test_categorical_with_numerical_features(n_bins): # basic check for binmapper with mixed data From 997bad2892c5a3269c9d95ef9f987e74afcc8078 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 20 Feb 2023 09:13:07 -0500 Subject: [PATCH 5/9] TST Fixes test --- .../_hist_gradient_boosting/tests/test_binning.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 32ada01d81fcc..413e636775939 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -358,10 +358,11 @@ def test_categorical_feature(n_bins): expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T assert_array_equal(bin_mapper.transform(X), expected_trans) - # For unknown categories, the mapping is incorrect / undefined. This never - # happens in practice. This check is only for illustration purpose. - X = np.array([[-1, 100]], dtype=X_DTYPE).T - expected_trans = np.array([[0, 6]]).T + # For negative categories, the mapping goes to the missing bin (n_bins -1) + # Unknown positive categories does not happen in practice and tested + # for illustration purpose. + X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T + expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T assert_array_equal(bin_mapper.transform(X), expected_trans) From 85f8747f5e5fc605e2fec1c28b0f0ecd69ccf78f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 20 Feb 2023 09:13:46 -0500 Subject: [PATCH 6/9] DOC Better wording --- sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 413e636775939..0c96cfb5af765 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -358,7 +358,7 @@ def test_categorical_feature(n_bins): expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T assert_array_equal(bin_mapper.transform(X), expected_trans) - # For negative categories, the mapping goes to the missing bin (n_bins -1) + # Negative categories are mapped to the missing bin (n_bins -1). # Unknown positive categories does not happen in practice and tested # for illustration purpose. 
X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T From 2ba2a33efcb38f9615b839189b285d34361a686d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 31 Mar 2023 11:30:31 -0400 Subject: [PATCH 7/9] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- doc/whats_new/v1.3.rst | 3 ++- sklearn/ensemble/_hist_gradient_boosting/_binning.pyx | 3 ++- .../_hist_gradient_boosting/gradient_boosting.py | 6 +++--- .../_hist_gradient_boosting/tests/test_binning.py | 9 ++++++--- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 043f298435ee6..265aaec1d03dc 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -137,7 +137,8 @@ Changelog - |Feature| :class:`ensemble.HistGradientBoostingRegressor` and :class:`ensemble.HistGradientBoostingClassifier` treats negative values - as missing values for categorical features. :pr:`25629` by `Thomas Fan`_. + as missing values for categorical features, following LightGBM's convention. + :pr:`25629` by `Thomas Fan`_. - |Efficiency| :class:`ensemble.IsolationForest` predict time is now faster (typically by a factor of 8 or more). Internally, the estimator now precomputes diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index a778d90062b50..90241509cf96b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -60,7 +60,8 @@ cdef void _map_col_to_bins(const X_DTYPE_C [:] data, num_threads=n_threads): if ( isnan(data[i]) or - # categorical features consider negative values as missing + # To follow LightGBM's conventions, negative values for + # categorical features are considered as missing values. (is_categorical and data[i] < 0) ): binned[i] = missing_values_bin_idx diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 324c032cecdc6..32d92e9190eb6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -272,7 +272,7 @@ def _check_categories(self, X): if missing.any(): categories = categories[~missing] - # Treat negative categories as missing values + # Treat negative values for categorical features as missing values. negative_categories = categories < 0 if categories.any(): categories = categories[~negative_categories] @@ -1282,7 +1282,7 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): For each categorical feature, there must be at most `max_bins` unique categories, and each categorical value must be less then `max_bins - 1`. - Categories encoded as a negative value are treated as missing values. + Negative values for categorical features are treated as missing values. Read more in the :ref:`User Guide `. @@ -1645,7 +1645,7 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): For each categorical feature, there must be at most `max_bins` unique categories, and each categorical value must be less then `max_bins - 1`. - Categories encoded as a negative value are treated as missing values. + Negative values for categorical features are treated as missing values. Read more in the :ref:`User Guide `. 
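As a quick check of the reworded docstring and the LightGBM-style binning comment above, the following sketch (again assuming a build that includes this series; names and values are illustrative) shows that a negative code and `np.nan` in a categorical column are routed through the same missing-values bin and therefore yield identical predictions:

```python
# Sketch only: assumes a scikit-learn build that includes this series.
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = np.column_stack([rng.randint(0, 4, size=200).astype(float), rng.rand(200)])
y = X[:, 0] + X[:, 1]
reg = HistGradientBoostingRegressor(
    max_iter=20, categorical_features=[True, False]
).fit(X, y)

# Both rows fall into the missing-values bin for the categorical column.
assert reg.predict([[-1.0, 0.5]]) == reg.predict([[np.nan, 0.5]])
```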
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 0c96cfb5af765..a95690ddb3572 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -358,7 +358,8 @@ def test_categorical_feature(n_bins): expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T assert_array_equal(bin_mapper.transform(X), expected_trans) - # Negative categories are mapped to the missing bin (n_bins -1). + # Negative categories are mapped to the missing values' bin + # (i.e. the bin of index `missing_values_bin_idx_ == n_bins - 1). # Unknown positive categories does not happen in practice and tested # for illustration purpose. X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T @@ -381,8 +382,10 @@ def test_categorical_feature_negative_missing(): X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T - # Negative categories are considered missing - # All missing values are mapped to n_bins_non_missing_ which is 3. + # Negative values for categorical features are considered as missing values. + # They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`, + # which is 3 here. + assert bin_mapper.missing_values_bin_idx_ == 3 expected_trans = np.array([[3, 0, 1, 2, 3]]).T assert_array_equal(bin_mapper.transform(X), expected_trans) From 25e3a057860bb48e63251afabe08eb890dbacb20 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 3 Apr 2023 12:53:34 -0400 Subject: [PATCH 8/9] Apply suggestions from code review Co-authored-by: Tim Head --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 32d92e9190eb6..e325fa65a8668 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -274,7 +274,7 @@ def _check_categories(self, X): # Treat negative values for categorical features as missing values. negative_categories = categories < 0 - if categories.any(): + if negative_categories.any(): categories = categories[~negative_categories] if hasattr(self, "feature_names_in_"): From 0062234dd2c555e1d534f1803d8d41653c64ee19 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 4 Apr 2023 13:41:04 -0400 Subject: [PATCH 9/9] DOC Update to bug fix --- doc/whats_new/v1.3.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 265aaec1d03dc..8b44ae8b43824 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -135,11 +135,6 @@ Changelog out-of-bag scores via the `oob_scores_` or `oob_score_` attributes. :pr:`24882` by :user:`Ashwin Mathur `. -- |Feature| :class:`ensemble.HistGradientBoostingRegressor` and - :class:`ensemble.HistGradientBoostingClassifier` treats negative values - as missing values for categorical features, following LightGBM's convention. - :pr:`25629` by `Thomas Fan`_. - - |Efficiency| :class:`ensemble.IsolationForest` predict time is now faster (typically by a factor of 8 or more). Internally, the estimator now precomputes decision path lengths per tree at `fit` time. It is therefore not possible @@ -151,6 +146,12 @@ Changelog :class:`ensemble.BaggingRegressor` expose the `allow_nan` tag from the underlying estimator. 
:pr:`25506` by `Thomas Fan`_. +- |Fix| :class:`ensemble.HistGradientBoostingRegressor` and + :class:`ensemble.HistGradientBoostingClassifier` treat negative values for + categorical features consistently as missing values, following LightGBM's and + pandas' conventions. + :pr:`25629` by `Thomas Fan`_. + :mod:`sklearn.exception` ........................ - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised
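The "pandas' conventions" mentioned in the final changelog entry refer to pandas encoding missing categories as `-1` in `Categorical.codes`. A minimal sketch of that workflow, assuming a build that includes this series (the DataFrame contents and column names are invented for illustration):

```python
# Sketch only: assumes a scikit-learn build that includes this series.
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

df = pd.DataFrame(
    {
        "color": pd.Categorical(["red", "blue", None, "green", "red"] * 40),
        "size": np.linspace(0.0, 1.0, 200),
    }
)
# pandas uses -1 in .cat.codes for missing entries; with this change those
# codes can be passed through directly and are handled as missing values.
X = np.column_stack([df["color"].cat.codes.to_numpy(dtype=float), df["size"]])
y = 2.0 * df["size"].to_numpy()
reg = HistGradientBoostingRegressor(
    max_iter=20, categorical_features=[True, False]
).fit(X, y)
reg.predict(X[:3])
```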