FIX Adds support for negative values in categorical features in gradient boosting by thomasjpfan · Pull Request #25629 · scikit-learn/scikit-learn

FIX Adds support for negative values in categorical features in gradient boosting #25629

Merged
10 changes: 8 additions & 2 deletions doc/whats_new/v1.3.rst
@@ -240,6 +240,12 @@ Changelog
dataframe.
:pr:`25931` by :user:`Yao Xiao <Charlie-XIAO>`.

- |Fix| :class:`ensemble.HistGradientBoostingRegressor` and
:class:`ensemble.HistGradientBoostingClassifier` now treat negative values for
categorical features consistently as missing values, following LightGBM's and
pandas' conventions.
:pr:`25629` by `Thomas Fan`_.

:mod:`sklearn.exceptions`
..........................
- |Feature| Added :class:`exceptions.InconsistentVersionWarning` which is raised
@@ -284,8 +290,8 @@ Changelog
estimators consistent with the rest of estimators.
:pr:`25697` by :user:`John Pangas <jpangas>`.

- |Enhancement| The `n_iter_` attribute has been included in
:class:`linear_model.ARDRegression` to expose the actual number of iterations
required to reach the stopping criterion.
:pr:`25697` by :user:`John Pangas <jpangas>`.

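In user-facing terms, the fix entry above means that negative codes in a categorical column no longer raise an error at fit time and are routed through the missing-value path instead. A minimal sketch (the toy data and parameters are illustrative, not taken from the PR):

import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.RandomState(0)
X = rng.randint(0, 4, size=(200, 1)).astype(float)
y = (X[:, 0] >= 2).astype(int)
X[::10, 0] = -1  # negative codes are now binned together with np.nan

clf = HistGradientBoostingClassifier(categorical_features=[0], max_iter=10)
clf.fit(X, y)
clf.predict(np.array([[-1.0], [0.0], [3.0]]))  # -1 follows the missing-value path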
12 changes: 11 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
@@ -8,6 +8,7 @@ from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C

def _map_to_bins(const X_DTYPE_C [:, :] data,
list binning_thresholds,
const unsigned char[::1] is_categorical,
const unsigned char missing_values_bin_idx,
int n_threads,
X_BINNED_DTYPE_C [::1, :] binned):
@@ -23,6 +24,8 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
binning_thresholds : list of arrays
For each feature, stores the increasing numeric values that are
used to separate the bins.
is_categorical : ndarray of unsigned char of shape (n_features,)
Indicates categorical features.
n_threads : int
Number of OpenMP threads to use.
binned : ndarray, shape (n_samples, n_features)
@@ -34,13 +37,15 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
for feature_idx in range(data.shape[1]):
_map_col_to_bins(data[:, feature_idx],
binning_thresholds[feature_idx],
is_categorical[feature_idx],
missing_values_bin_idx,
n_threads,
binned[:, feature_idx])


cdef void _map_col_to_bins(const X_DTYPE_C [:] data,
const X_DTYPE_C [:] binning_thresholds,
const unsigned char is_categorical,
const unsigned char missing_values_bin_idx,
int n_threads,
X_BINNED_DTYPE_C [:] binned):
@@ -53,7 +58,12 @@ cdef void _map_col_to_bins(const X_DTYPE_C [:] data,

for i in prange(data.shape[0], schedule='static', nogil=True,
num_threads=n_threads):
if isnan(data[i]):
if (
isnan(data[i]) or
# To follow LightGBM's conventions, negative values for
# categorical features are considered as missing values.
(is_categorical and data[i] < 0)
):
binned[i] = missing_values_bin_idx
else:
# for known values, use binary search
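For readers who do not speak Cython, the branch above boils down to the following per-column NumPy logic (a rough sketch of `_map_col_to_bins`, not the actual parallel kernel; the helper name is made up):

import numpy as np

def map_col_to_bins_sketch(col, thresholds, is_categorical, missing_values_bin_idx):
    # Known values: index of the first threshold >= value (binary search).
    binned = np.searchsorted(thresholds, col, side="left").astype(np.uint8)
    missing = np.isnan(col)
    if is_categorical:
        # New in this PR: negative categorical values are routed to the
        # missing-values bin, following LightGBM's and pandas' conventions.
        missing |= col < 0
    binned[missing] = missing_values_bin_idx
    return binned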
7 changes: 6 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -275,7 +275,12 @@ def transform(self, X):
n_threads = _openmp_effective_n_threads(self.n_threads)
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
_map_to_bins(
X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned
X,
self.bin_thresholds_,
self.is_categorical_,
self.missing_values_bin_idx_,
n_threads,
binned,
)
return binned

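Note the updated call: `is_categorical_` now sits between the bin thresholds and the missing-values bin index. At the `_BinMapper` level the new behavior looks as follows (private API, mirroring the new regression test further below; for illustration only):

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE

X = np.array([[1, 4, 5, -1, np.nan]], dtype=X_DTYPE).T
mapper = _BinMapper(
    n_bins=4,
    is_categorical=np.array([True]),
    known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)],
).fit(X)
# Both -1 and np.nan land in the missing-values bin (index 3 here).
mapper.transform(X)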
15 changes: 9 additions & 6 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -269,6 +269,11 @@ def _check_categories(self, X):
if missing.any():
categories = categories[~missing]

# Treat negative values for categorical features as missing values.
negative_categories = categories < 0
if negative_categories.any():
categories = categories[~negative_categories]

if hasattr(self, "feature_names_in_"):
feature_name = f"'{self.feature_names_in_[f_idx]}'"
else:
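The added filtering is a small NumPy idiom; conceptually (a simplified sketch of this `_check_categories` step, with a made-up `categories` array):

import numpy as np

categories = np.array([-2.0, -1.0, 0.0, 1.0, 3.0, np.nan])

# NaN categories were already dropped before this PR ...
categories = categories[~np.isnan(categories)]

# ... and negative categories are now dropped as well: they are treated
# as missing values, not as categories in their own right.
negative_categories = categories < 0
categories = categories[~negative_categories]

# categories is now array([0., 1., 3.])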
@@ -1265,9 +1270,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
data has feature names).

For each categorical feature, there must be at most `max_bins` unique
categories, and each categorical value must be in [0, max_bins -1].
During prediction, categories encoded as a negative value are treated as
missing values.
categories, and each categorical value must be less than `max_bins - 1`.
Negative values for categorical features are treated as missing values.

Read more in the :ref:`User Guide <categorical_support_gbdt>`.
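A practical consequence of the documented range: raw categories can be ordinal-encoded into `[0, max_bins - 1)` before fitting, and encoding unknowns as -1 then degrades gracefully to the missing-value path. A sketch of that pattern (the pipeline and toy data are illustrative):

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

X = np.array([["a"], ["b"], ["c"]] * 20, dtype=object)
y = np.tile([1.0, 2.0, 3.0], 20)

model = make_pipeline(
    # Categories seen at fit time become 0, 1, 2, ...; categories unseen
    # at predict time become -1, which the estimator treats as missing.
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    HistGradientBoostingRegressor(categorical_features=[0], max_iter=20),
)
model.fit(X, y)
model.predict(np.array([["d"]], dtype=object))  # unseen "d" -> -1 -> missing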

@@ -1623,9 +1627,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
data has feature names).

For each categorical feature, there must be at most `max_bins` unique
categories, and each categorical value must be in [0, max_bins -1].
During prediction, categories encoded as a negative value are treated as
missing values.
categories, and each categorical value must be less than `max_bins - 1`.
Negative values for categorical features are treated as missing values.

Read more in the :ref:`User Guide <categorical_support_gbdt>`.

36 changes: 31 additions & 5 deletions sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py
@@ -95,8 +95,9 @@ def test_map_to_bins(max_bins):
_find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2)
]
binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F")
is_categorical = np.zeros(2, dtype=np.uint8)
last_bin_idx = max_bins
_map_to_bins(DATA, bin_thresholds, last_bin_idx, n_threads, binned)
_map_to_bins(DATA, bin_thresholds, is_categorical, last_bin_idx, n_threads, binned)
assert binned.shape == DATA.shape
assert binned.dtype == np.uint8
assert binned.flags.f_contiguous
@@ -357,10 +358,35 @@ def test_categorical_feature(n_bins):
expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T
assert_array_equal(bin_mapper.transform(X), expected_trans)

# For unknown categories, the mapping is incorrect / undefined. This never
# happens in practice. This check is only for illustration purpose.
X = np.array([[-1, 100]], dtype=X_DTYPE).T
expected_trans = np.array([[0, 6]]).T
# Negative categories are mapped to the missing-values bin
# (i.e. the bin of index `missing_values_bin_idx_`, which is `n_bins - 1`).
# Unknown positive categories do not happen in practice and are only
# tested here for illustration purposes.
X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T
expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T
assert_array_equal(bin_mapper.transform(X), expected_trans)


def test_categorical_feature_negative_missing():
"""Make sure bin mapper treats negative categories as missing values."""
X = np.array(
[[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE
).T
bin_mapper = _BinMapper(
n_bins=4,
is_categorical=np.array([True]),
known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)],
).fit(X)

assert bin_mapper.n_bins_non_missing_ == [3]

X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T

# Negative values for categorical features are considered as missing values.
# They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`,
# which is 3 here.
assert bin_mapper.missing_values_bin_idx_ == 3
expected_trans = np.array([[3, 0, 1, 2, 3]]).T
assert_array_equal(bin_mapper.transform(X), expected_trans)


sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -950,7 +950,10 @@ def test_staged_predict(HistGradientBoosting, X, y):
"Est", (HistGradientBoostingRegressor, HistGradientBoostingClassifier)
)
@pytest.mark.parametrize("bool_categorical_parameter", [True, False])
def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter):
@pytest.mark.parametrize("missing_value", [np.nan, -1])
def test_unknown_categories_nan(
insert_missing, Est, bool_categorical_parameter, missing_value
):
# Make sure no error is raised at predict if a category wasn't seen during
# fit. We also make sure they're treated as nans.

@@ -970,7 +973,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
if insert_missing:
mask = rng.binomial(1, 0.01, size=X.shape).astype(bool)
assert mask.sum() > 0
X[mask] = np.nan
X[mask] = missing_value

est = Est(max_iter=20, categorical_features=categorical_features).fit(X, y)
assert_array_equal(est.is_categorical_, [False, True])
@@ -979,7 +982,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
# unknown categories will be treated as nans
X_test = np.zeros((10, X.shape[1]), dtype=float)
X_test[:5, 1] = 30
X_test[5:, 1] = np.nan
X_test[5:, 1] = missing_value
assert len(np.unique(est.predict(X_test))) == 1
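At the public API level, the parametrization above pins down the following behavior: unknown, negative, and NaN categories all take the same missing-value path at predict time and therefore receive the same prediction. A sketch with made-up data:

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.randint(0, 5, size=(200, 1)).astype(float)
y = X[:, 0] * 2 + rng.normal(size=200)

est = HistGradientBoostingRegressor(categorical_features=[0], max_iter=20).fit(X, y)

X_test = np.array([[30.0], [-1.0], [np.nan]])  # unknown, negative, NaN
preds = est.predict(X_test)
assert len(np.unique(preds)) == 1  # all three are treated as missing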

