From 244c409cdec9169872f0efe21dafc70bcdd3d371 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 12 Sep 2021 18:31:10 +0200 Subject: [PATCH 01/61] DOC add attribues to TreeGrower --- .../_hist_gradient_boosting/grower.py | 22 +++++++++++++++++++ .../_hist_gradient_boosting/splitting.pyx | 6 ++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 1733b5745f8a2..835b92bab915e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -182,6 +182,28 @@ class TreeGrower: to determine the effective number of threads use, which takes cgroups CPU quotes into account. See the docstring of `_openmp_effective_n_threads` for details. + + Attributes + ---------- + histogram_builder : HistogramBuilder + splitter : Splitter + root : TreeNode + finalized_leaves : list of TreeNode + splittable_nodes : list of TreeNode + missing_values_bin_idx : int + equals n_bins - 1 + n_categorical_splits : int + n_features : int + n_nodes : int + total_find_split_time : float + time spent finding the best splits + total_compute_hist_time : float + time spent computing histograms + total_apply_split_time : float + time spent splitting nodes + with_monotonic_cst : bool + Whether there are monotonic constraints that apply. False iff monotonic_cst + is None. """ def __init__( diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 5ddba5cd02678..92f9f7e3813a2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -144,6 +144,10 @@ cdef class Splitter: feature. is_categorical : ndarray of bool of shape (n_features,) Indicates categorical features. 
+ monotonic_cst : ndarray of shape (n_features,), dtype=int + Indicates the monotonic constraint to enforce on each feature. -1, 1 + and 0 respectively correspond to a positive constraint, negative + constraint and no constraint. l2_regularization : float The L2 regularization parameter. min_hessian_to_split : float, default=1e-3 @@ -839,7 +843,7 @@ cdef class Splitter: # other category. The low-support categories will always be mapped to # the right child. We scan the sorted categories array from left to # right and from right to left, and we stop at the middle. - + # Considering ordered categories A B C D, with E being a low-support # category: A B C D # ^ From b31eea0c390806a9fa86d5921147560654dbcea6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 12 Sep 2021 21:20:22 +0200 Subject: [PATCH 02/61] ENH add interaction_cst --- .../gradient_boosting.py | 35 +++++++++++++++++++ .../_hist_gradient_boosting/grower.py | 26 ++++++++++++-- .../_hist_gradient_boosting/splitting.pyx | 16 +++++++-- 3 files changed, 72 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 35dcb1d7acd8b..e5435b47736cf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -2,6 +2,7 @@ # Author: Nicolas Hug from abc import ABC, abstractmethod +from collections.abc import Sequence from functools import partial import warnings @@ -45,6 +46,7 @@ def __init__( max_bins, categorical_features, monotonic_cst, + interaction_cst, warm_start, early_stopping, scoring, @@ -63,6 +65,7 @@ def __init__( self.l2_regularization = l2_regularization self.max_bins = max_bins self.monotonic_cst = monotonic_cst + self.interaction_cst = interaction_cst self.categorical_features = categorical_features self.warm_start = warm_start self.early_stopping = early_stopping @@ -118,6 +121,17 @@ def 
_validate_parameters(self): "monotonic constraints are not supported for multiclass classification." ) + if ( + self.interaction_cst is not None + and not isinstance(self.interaction_cst, Sequence) + and not all(isinstance(x, (Sequence, set)) for x in self.interaction_cst) + ): + # TODO: better validation + # lets start with list or set of {list, tuple, set} + raise ValueError( + "interaction constraints must be None or a Sequence of {Sequence, set}" + ) + def _check_categories(self, X): """Check and validate categorical features in X @@ -255,6 +269,14 @@ def fit(self, X, y, sample_weight=None): self.is_categorical_, known_categories = self._check_categories(X) + # convert to list of sets and convert to integers + if self.interaction_cst is None: + self._interaction_cst = None + else: + self._interaction_cst = [ + set([int(x) for x in group]) for group in self.interaction_cst + ] + # we need this stateful variable to tell raw_predict() that it was # called from fit() (this current method), and that the data it has # received is pre-binned. @@ -516,6 +538,7 @@ def fit(self, X, y, sample_weight=None): has_missing_values=has_missing_values, is_categorical=self.is_categorical_, monotonic_cst=self.monotonic_cst, + interaction_cst=self._interaction_cst, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, @@ -1093,6 +1116,10 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): .. versionadded:: 0.23 + interaction_cst : TODO + + .. versionadded:: 1.1 + warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. 
For results to be valid, the @@ -1215,6 +1242,7 @@ def __init__( max_bins=255, categorical_features=None, monotonic_cst=None, + interaction_cst=None, warm_start=False, early_stopping="auto", scoring="loss", @@ -1234,6 +1262,7 @@ def __init__( l2_regularization=l2_regularization, max_bins=max_bins, monotonic_cst=monotonic_cst, + interaction_cst=interaction_cst, categorical_features=categorical_features, early_stopping=early_stopping, warm_start=warm_start, @@ -1405,6 +1434,10 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): .. versionadded:: 0.23 + interaction_cst : TODO + + .. versionadded:: 1.1 + warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -1523,6 +1556,7 @@ def __init__( max_bins=255, categorical_features=None, monotonic_cst=None, + interaction_cst=None, warm_start=False, early_stopping="auto", scoring="loss", @@ -1543,6 +1577,7 @@ def __init__( max_bins=max_bins, categorical_features=categorical_features, monotonic_cst=monotonic_cst, + interaction_cst=interaction_cst, warm_start=warm_start, early_stopping=early_stopping, scoring=scoring, diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 835b92bab915e..001d34d6d9b19 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -168,6 +168,7 @@ class TreeGrower: and 0 respectively correspond to a positive constraint, negative constraint and no constraint. Read more in the :ref:`User Guide `. + interaction_cst : list of sets of integers l2_regularization : float, default=0. The L2 regularization parameter. 
min_hessian_to_split : float, default=1e-3 @@ -220,6 +221,7 @@ def __init__( has_missing_values=False, is_categorical=None, monotonic_cst=None, + interaction_cst=None, l2_regularization=0.0, min_hessian_to_split=1e-3, shrinkage=1.0, @@ -310,6 +312,7 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.has_missing_values = has_missing_values self.monotonic_cst = monotonic_cst + self.interaction_cst = interaction_cst self.is_categorical = is_categorical self.l2_regularization = l2_regularization self.n_features = X_binned.shape[1] @@ -428,7 +431,7 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): ) self._compute_best_split_and_push(self.root) - def _compute_best_split_and_push(self, node): + def _compute_best_split_and_push(self, node, parent_feature_idx=None): """Compute the best possible split (SplitInfo) of a given node. Also push it in the heap of splittable nodes if gain isn't zero. @@ -436,6 +439,10 @@ def _compute_best_split_and_push(self, node): (best gain = 0), or if no split would satisfy the constraints, (min_hessians_to_split, min_gain_to_split, min_samples_leaf) """ + if (self.interaction_cst is None) or (parent_feature_idx is None): + allowed_features = None + else: + allowed_features = self._get_allowed_features(parent_feature_idx) node.split_info = self.splitter.find_node_split( node.n_samples, @@ -445,6 +452,7 @@ def _compute_best_split_and_push(self, node): node.value, node.children_lower_bound, node.children_upper_bound, + allowed_features, ) if node.split_info.gain <= 0: # no valid split @@ -585,9 +593,13 @@ def split_next(self): tic = time() if should_split_left: - self._compute_best_split_and_push(left_child_node) + self._compute_best_split_and_push( + left_child_node, parent_feature_idx=node.split_info.feature_idx + ) if should_split_right: - self._compute_best_split_and_push(right_child_node) + self._compute_best_split_and_push( + right_child_node, parent_feature_idx=node.split_info.feature_idx + ) 
self.total_find_split_time += time() - tic # Release memory used by histograms as they are no longer needed @@ -602,6 +614,14 @@ def split_next(self): return left_child_node, right_child_node + def _get_allowed_features(self, parent_feature_idx): + """Return all feature indices allowed to be split by interaction_cst.""" + allowed_features = [] + for group in self.interaction_cst: + if parent_feature_idx in group: + allowed_features.extend(group) + return np.array(sorted(allowed_features), dtype=int) + def _finalize_leaf(self, node): """Make node a leaf of the tree being grown.""" diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 92f9f7e3813a2..686fb628804a6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -411,6 +411,7 @@ cdef class Splitter: const Y_DTYPE_C value, const Y_DTYPE_C lower_bound=-INFINITY, const Y_DTYPE_C upper_bound=INFINITY, + const int [:] allowed_features=None, ): """For each feature, find the best bin to split on at a given node. @@ -443,6 +444,9 @@ cdef class Splitter: upper_bound : float Upper bound for the children values for respecting the monotonic constraints. + allowed_features : ndarray, dtype=int + Indices of the features that are allowed by interaction constraints to be + split. 
Returns ------- @@ -459,14 +463,22 @@ cdef class Splitter: const unsigned char [::1] is_categorical = self.is_categorical const signed char [::1] monotonic_cst = self.monotonic_cst int n_threads = self.n_threads + bint has_interaction_cst = False + int n_allowed_features = self.n_features + + if allowed_features is not None: + has_interaction_cst = True + n_allowed_features = allowed_features.shape[0] with nogil: split_infos = malloc( - self.n_features * sizeof(split_info_struct)) + n_features * sizeof(split_info_struct)) - for feature_idx in prange(n_features, schedule='static', + for feature_idx in prange(n_allowed_features, schedule='static', num_threads=n_threads): + if has_interaction_cst: + feature_idx = allowed_features[feature_idx] split_infos[feature_idx].feature_idx = feature_idx # For each feature, find best bin to split on From d9b273abf6371d18b40fd519c233ba03251df800 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 12 Sep 2021 23:26:23 +0200 Subject: [PATCH 03/61] use a set in _get_allowed_features --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 001d34d6d9b19..ce052f5e275b6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -616,11 +616,11 @@ def split_next(self): def _get_allowed_features(self, parent_feature_idx): """Return all feature indices allowed to be split by interaction_cst.""" - allowed_features = [] + allowed_features = set() for group in self.interaction_cst: if parent_feature_idx in group: - allowed_features.extend(group) - return np.array(sorted(allowed_features), dtype=int) + allowed_features.update(group) + return np.array(list(allowed_features), dtype=int) def _finalize_leaf(self, node): """Make node a leaf of the tree being grown.""" From 
1cc1cb5ad716def29a513ca235e4807029d9fd31 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Sep 2021 00:08:59 +0200 Subject: [PATCH 04/61] complete overhaul --- .../_hist_gradient_boosting/grower.py | 73 ++++++++++++++----- .../_hist_gradient_boosting/splitting.pyx | 4 +- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index ce052f5e275b6..b2ed89a9a07c3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -66,12 +66,20 @@ class TreeNode: start position of the node's sample_indices in splitter.partition. partition_stop : int stop position of the node's sample_indices in splitter.partition. + allowed_features : None or ndarray, dtype=int + Indices of features allowed to split for children. + interaction_cst_idx : None or list of ints + Indices of the interaction sets/groups that have to be applied on + splits of child nodes. The less sets the harder the constraint as + less sets contain less features. """ split_info = None left_child = None right_child = None histograms = None + allowed_features = None + interaction_cst_idx = None # start and stop indices of the node in the splitter.partition # array. Concretely, @@ -429,9 +437,16 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): self.root.histograms = self.histogram_builder.compute_histograms_brute( self.root.sample_indices ) + + if self.interaction_cst is not None: + allowed_features = set().union(*self.interaction_cst) + self.root.allowed_features = np.array( + list(allowed_features), dtype=np.uint32 + ) + self._compute_best_split_and_push(self.root) - def _compute_best_split_and_push(self, node, parent_feature_idx=None): + def _compute_best_split_and_push(self, node): """Compute the best possible split (SplitInfo) of a given node. 
Also push it in the heap of splittable nodes if gain isn't zero. @@ -439,10 +454,6 @@ def _compute_best_split_and_push(self, node, parent_feature_idx=None): (best gain = 0), or if no split would satisfy the constraints, (min_hessians_to_split, min_gain_to_split, min_samples_leaf) """ - if (self.interaction_cst is None) or (parent_feature_idx is None): - allowed_features = None - else: - allowed_features = self._get_allowed_features(parent_feature_idx) node.split_info = self.splitter.find_node_split( node.n_samples, @@ -452,12 +463,17 @@ def _compute_best_split_and_push(self, node, parent_feature_idx=None): node.value, node.children_lower_bound, node.children_upper_bound, - allowed_features, + node.allowed_features, ) if node.split_info.gain <= 0: # no valid split self._finalize_leaf(node) else: + # Update node.allowed_features, node.interaction_cst_idx to be inherited by + # child nodes. + if self.interaction_cst is not None: + self._update_interactions(node) + heappush(self.splittable_nodes, node) def split_next(self): @@ -593,13 +609,9 @@ def split_next(self): tic = time() if should_split_left: - self._compute_best_split_and_push( - left_child_node, parent_feature_idx=node.split_info.feature_idx - ) + self._compute_best_split_and_push(left_child_node) if should_split_right: - self._compute_best_split_and_push( - right_child_node, parent_feature_idx=node.split_info.feature_idx - ) + self._compute_best_split_and_push(right_child_node) self.total_find_split_time += time() - tic # Release memory used by histograms as they are no longer needed @@ -614,13 +626,38 @@ def split_next(self): return left_child_node, right_child_node - def _get_allowed_features(self, parent_feature_idx): - """Return all feature indices allowed to be split by interaction_cst.""" + def _update_interactions(self, node): + r"""Update features allowed by interactions for child nods. + + Update node.allowed_features and node.interaction_cst_idx. + + Example: Assume constraints [{0, 1}, {1, 2}]. 
+ 1 <- Both constraint groups could be applied from now on + / \ + 1 2 <- Left split still fulfills both constraint groups. + / \ / \ Right split at feature 2 has only group {1, 2} from now on. + + Parameters: + ---------- + node : TreeNode + A node that might have children and whose interaction info is updated + inplace. + """ + # Case of no interactions is already captured before function call. + if node.interaction_cst_idx is None: + # Already split root node + node.interaction_cst_idx = range(len(self.interaction_cst)) + + # Note: This is for nodes that are already split and have a + # node.split_info.feature_idx. allowed_features = set() - for group in self.interaction_cst: - if parent_feature_idx in group: - allowed_features.update(group) - return np.array(list(allowed_features), dtype=int) + interaction_cst_idx = list() + for i in node.interaction_cst_idx: + if node.split_info.feature_idx in self.interaction_cst[i]: + interaction_cst_idx.append(i) + allowed_features.update(self.interaction_cst[i]) + node.allowed_features = np.array(list(allowed_features), dtype=np.uint32) + node.interaction_cst_idx = interaction_cst_idx def _finalize_leaf(self, node): """Make node a leaf of the tree being grown.""" diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 686fb628804a6..0da8a59d58099 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -411,7 +411,7 @@ cdef class Splitter: const Y_DTYPE_C value, const Y_DTYPE_C lower_bound=-INFINITY, const Y_DTYPE_C upper_bound=INFINITY, - const int [:] allowed_features=None, + const unsigned int [:] allowed_features=None, ): """For each feature, find the best bin to split on at a given node. @@ -444,7 +444,7 @@ cdef class Splitter: upper_bound : float Upper bound for the children values for respecting the monotonic constraints. 
- allowed_features : ndarray, dtype=int + allowed_features : None or ndarray, dtype=np.uint32 Indices of the features that are allowed by interaction constraints to be split. From 7baf6955d91f454cb7ac48af3d7e801deccba808 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Sep 2021 21:36:16 +0200 Subject: [PATCH 05/61] TST test_split_interaction_constraints --- .../_hist_gradient_boosting/splitting.pyx | 2 +- .../tests/test_splitting.py | 67 +++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 0da8a59d58099..ab63c888ce88e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -454,7 +454,7 @@ cdef class Splitter: The info about the best possible split among all features. """ cdef: - int feature_idx + unsigned int feature_idx int best_feature_idx int n_features = self.n_features split_info_struct split_info diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 0d19bdc6df72b..e52dbabd029ee 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -856,3 +856,70 @@ def test_splitting_categorical_sanity( left_mask = np.isin(X_binned.ravel(), expected_categories_left) assert_array_equal(sample_indices[left_mask], samples_left) assert_array_equal(sample_indices[~left_mask], samples_right) + + +def test_split_interaction_constraints(): + """Check that allowed_features are respected.""" + rng = np.random.RandomState(919) + + n_features = 4 + # features 1 and 2 are not allowed to be split on + allowed_features = np.array([0, 3], dtype=np.uint32) + n_bins = 5 + n_samples = 10 + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + 
min_gain_to_split = 0.0 + X_binned = np.asfortranarray( + rng.randint(0, n_bins - 1, size=(n_samples, n_features)), dtype=X_BINNED_DTYPE + ) + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = 1 * n_samples + hessians_are_constant = True + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + assert np.all(sample_indices == splitter.partition) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + si_root = splitter.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=allowed_features, + ) + + # only features 0 and 1 are allowed to be split on + assert si_root.feature_idx in {0, 3} From 8aced528d0caeb9a4a3fd4f304d13fc7a5d9ea41 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Sep 2021 22:12:54 +0200 Subject: [PATCH 06/61] DOC add is_leaf to Attributes section --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index b2ed89a9a07c3..0e9bbcf1dc4dc 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -55,6 +55,8 @@ class TreeNode: The sum of the hessians of the samples at the node. split_info : SplitInfo or None The result of the split evaluation. + is_leaf : bool + True if node is a leaf left_child : TreeNode or None The left child of the node. None for leaves. right_child : TreeNode or None From 9a9862c7fccca06d432569485d1854a303b3300d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Sep 2021 22:45:47 +0200 Subject: [PATCH 07/61] DOC improve interaction_cst_idx --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 0e9bbcf1dc4dc..92799d625119d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -72,8 +72,8 @@ class TreeNode: Indices of features allowed to split for children. interaction_cst_idx : None or list of ints Indices of the interaction sets/groups that have to be applied on - splits of child nodes. The less sets the harder the constraint as - less sets contain less features. + splits of child nodes. The fewer sets the harder the constraint as + fewer sets contain fewer features. 
""" split_info = None From ed31a7e8a6e1aa53fa16e35239f8016879510ad2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Sep 2021 23:41:56 +0200 Subject: [PATCH 08/61] FIX fix logic --- .../_hist_gradient_boosting/grower.py | 39 +++++++----- .../_hist_gradient_boosting/splitting.pyx | 63 ++++++++++--------- 2 files changed, 57 insertions(+), 45 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 92799d625119d..d111a0fc2fb63 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -441,6 +441,7 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): ) if self.interaction_cst is not None: + self.root.interaction_cst_idx = list(range(len(self.interaction_cst))) allowed_features = set().union(*self.interaction_cst) self.root.allowed_features = np.array( list(allowed_features), dtype=np.uint32 @@ -458,21 +459,21 @@ def _compute_best_split_and_push(self, node): """ node.split_info = self.splitter.find_node_split( - node.n_samples, - node.histograms, - node.sum_gradients, - node.sum_hessians, - node.value, - node.children_lower_bound, - node.children_upper_bound, - node.allowed_features, + n_samples=node.n_samples, + histograms=node.histograms, + sum_gradients=node.sum_gradients, + sum_hessians=node.sum_hessians, + value=node.value, + lower_bound=node.children_lower_bound, + upper_bound=node.children_upper_bound, + allowed_features=node.allowed_features, ) if node.split_info.gain <= 0: # no valid split self._finalize_leaf(node) else: - # Update node.allowed_features, node.interaction_cst_idx to be inherited by - # child nodes. + # Update node.allowed_features and node.interaction_cst_idx to be inherited + # by child nodes. 
if self.interaction_cst is not None: self._update_interactions(node) @@ -527,6 +528,13 @@ def split_next(self): right_child_node.partition_start = left_child_node.partition_stop right_child_node.partition_stop = node.partition_stop + # set interaction constraints (the indices of the constraints sets) + if self.interaction_cst is not None: + left_child_node.interaction_cst_idx = node.interaction_cst_idx + left_child_node.allowed_features = node.allowed_features + right_child_node.interaction_cst_idx = node.interaction_cst_idx + right_child_node.allowed_features = node.allowed_features + if not self.has_missing_values[node.split_info.feature_idx]: # If no missing values are encountered at fit time, then samples # with missing values during predict() will go to whichever child @@ -645,13 +653,10 @@ def _update_interactions(self, node): A node that might have children and whose interaction info is updated inplace. """ - # Case of no interactions is already captured before function call. - if node.interaction_cst_idx is None: - # Already split root node - node.interaction_cst_idx = range(len(self.interaction_cst)) - - # Note: This is for nodes that are already split and have a - # node.split_info.feature_idx. + # Note: + # - Case of no interactions is already captured before function call. + # - This is for nodes that are already split and have a + # node.split_info.feature_idx. allowed_features = set() interaction_cst_idx = list() for i in node.interaction_cst_idx: diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index ab63c888ce88e..13af50f75a60f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -454,9 +454,10 @@ cdef class Splitter: The info about the best possible split among all features. 
""" cdef: - unsigned int feature_idx - int best_feature_idx - int n_features = self.n_features + int feature_idx + int split_info_idx + int best_split_info_idx + int n_allowed_features = self.n_features split_info_struct split_info split_info_struct * split_infos const unsigned char [::1] has_missing_values = self.has_missing_values @@ -464,7 +465,6 @@ cdef class Splitter: const signed char [::1] monotonic_cst = self.monotonic_cst int n_threads = self.n_threads bint has_interaction_cst = False - int n_allowed_features = self.n_features if allowed_features is not None: has_interaction_cst = True @@ -473,28 +473,33 @@ cdef class Splitter: with nogil: split_infos = malloc( - n_features * sizeof(split_info_struct)) + n_allowed_features * sizeof(split_info_struct)) - for feature_idx in prange(n_allowed_features, schedule='static', - num_threads=n_threads): + # split_info_idx is index of split_infos of size n_features_allowed + # features_idx is the index of the feature column in X + for split_info_idx in prange(n_allowed_features, schedule='static', + num_threads=n_threads): if has_interaction_cst: - feature_idx = allowed_features[feature_idx] - split_infos[feature_idx].feature_idx = feature_idx + feature_idx = allowed_features[split_info_idx] + else: + feature_idx = split_info_idx + + split_infos[split_info_idx].feature_idx = feature_idx # For each feature, find best bin to split on # Start with a gain of -1 (if no better split is found, that # means one of the constraints isn't respected # (min_samples_leaf, etc) and the grower will later turn the # node into a leaf. 
- split_infos[feature_idx].gain = -1 - split_infos[feature_idx].is_categorical = is_categorical[feature_idx] + split_infos[split_info_idx].gain = -1 + split_infos[split_info_idx].is_categorical = is_categorical[feature_idx] if is_categorical[feature_idx]: self._find_best_bin_to_split_category( feature_idx, has_missing_values[feature_idx], histograms, n_samples, sum_gradients, sum_hessians, value, monotonic_cst[feature_idx], lower_bound, - upper_bound, &split_infos[feature_idx]) + upper_bound, &split_infos[split_info_idx]) else: # We will scan bins from left to right (in all cases), and # if there are any missing values, we will also scan bins @@ -510,7 +515,7 @@ cdef class Splitter: feature_idx, has_missing_values[feature_idx], histograms, n_samples, sum_gradients, sum_hessians, value, monotonic_cst[feature_idx], - lower_bound, upper_bound, &split_infos[feature_idx]) + lower_bound, upper_bound, &split_infos[split_info_idx]) if has_missing_values[feature_idx]: # We need to explore both directions to check whether @@ -520,12 +525,13 @@ cdef class Splitter: feature_idx, histograms, n_samples, sum_gradients, sum_hessians, value, monotonic_cst[feature_idx], - lower_bound, upper_bound, &split_infos[feature_idx]) + lower_bound, upper_bound, &split_infos[split_info_idx]) # then compute best possible split among all features - best_feature_idx = self._find_best_feature_to_split_helper( - split_infos) - split_info = split_infos[best_feature_idx] + # split_info is the index of the best split_info + best_split_info_idx = self._find_best_feature_to_split_helper( + split_infos, n_allowed_features) + split_info = split_infos[best_split_info_idx] out = SplitInfo( split_info.gain, @@ -551,18 +557,19 @@ cdef class Splitter: return out cdef unsigned int _find_best_feature_to_split_helper( - self, - split_info_struct * split_infos) nogil: # IN - """Returns the best feature among those in splits_infos.""" + self, + split_info_struct * split_infos, # IN + int n_allowed_features, + ) 
nogil: + """Return the index of split_infos with the best feature split.""" cdef: - unsigned int feature_idx - unsigned int best_feature_idx = 0 - - for feature_idx in range(1, self.n_features): - if (split_infos[feature_idx].gain > - split_infos[best_feature_idx].gain): - best_feature_idx = feature_idx - return best_feature_idx + unsigned int split_info_idx + unsigned int best_split_info_idx = 0 + + for split_info_idx in range(1, n_allowed_features): + if (split_infos[split_info_idx].gain > split_infos[best_split_info_idx].gain): + best_split_info_idx = split_info_idx + return best_split_info_idx cdef void _find_best_bin_to_split_left_to_right( Splitter self, From f2a067914dfa98a59cfb7de2c5f54af689f71440 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 15 Sep 2021 22:58:06 +0200 Subject: [PATCH 09/61] TST add test_grower_interaction_constraints --- .../tests/test_grower.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 6ff30a5888fe3..7586293db3ec3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -567,3 +567,53 @@ def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target): assert predictor.get_max_depth() < predictor_ohe.get_max_depth() np.testing.assert_allclose(preds, preds_ohe) + + +@pytest.mark.parametrize("seed", range(20)) +def test_grower_interaction_constraints(seed): + """Check that grower respects interaction constraints.""" + rng = np.random.RandomState(seed) + n_features = 6 + interaction_cst = [{0, 1}, {1, 2}, {3, 4, 5}] + n_samples = 10 + n_bins = 10 + X_binned = rng.randint( + 0, n_bins - 1, size=(n_samples, n_features), dtype=X_BINNED_DTYPE + ) + X_binned = np.asfortranarray(X_binned) + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, 
dtype=G_H_DTYPE) + + grower = TreeGrower( + X_binned, + gradients, + hessians, + max_depth=3, + n_bins=n_bins, + shrinkage=1.0, + max_leaf_nodes=None, + min_samples_leaf=1, + interaction_cst=interaction_cst, + ) + grower.grow() + + def get_all_children(node): + res = [] + if node.is_leaf: + return res + for n in [node.left_child, node.right_child]: + res.append(n) + if not n.is_leaf: + res.extend(get_all_children(n)) + return res + + map = {0: {0, 1}, 1: {0, 1, 2}, 2: {1, 2}} + if grower.root.split_info.feature_idx in {0, 1, 2}: + constraint_set = map[grower.root.split_info.feature_idx] + for node in get_all_children(grower.root): + if not node.is_leaf: + assert node.split_info.feature_idx in constraint_set + elif grower.root.split_info.feature_idx in {3, 4, 5}: + for node in get_all_children(grower.root): + if not node.is_leaf: + assert node.split_info.feature_idx in {3, 4, 5} From eb1e2558670277219bd2957154484e13f5e816cd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 16 Sep 2021 21:24:05 +0200 Subject: [PATCH 10/61] CLN make allowed_features an instance variable --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index d111a0fc2fb63..577ce39516ea7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -74,14 +74,14 @@ class TreeNode: Indices of the interaction sets/groups that have to be applied on splits of child nodes. The fewer sets the harder the constraint as fewer sets contain fewer features. + children_lower_bound : float + children_upper_bound : float """ split_info = None left_child = None right_child = None histograms = None - allowed_features = None - interaction_cst_idx = None # start and stop indices of the node in the splitter.partition # array. 
Concretely, @@ -102,6 +102,8 @@ def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, value=Non self.sum_hessians = sum_hessians self.value = value self.is_leaf = False + self.allowed_features = None + self.interaction_cst_idx = None self.set_children_bounds(float("-inf"), float("+inf")) def set_children_bounds(self, lower, upper): From ec489454b917bf9ab4f0e62a9898972ffd4236e6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Sep 2021 13:28:27 +0200 Subject: [PATCH 11/61] TST restructure test_grower_interaction_constraints --- .../tests/test_grower.py | 77 +++++++++++-------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 7586293db3ec3..60e99c6d32a60 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -569,33 +569,13 @@ def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target): np.testing.assert_allclose(preds, preds_ohe) -@pytest.mark.parametrize("seed", range(20)) -def test_grower_interaction_constraints(seed): +def test_grower_interaction_constraints(): """Check that grower respects interaction constraints.""" - rng = np.random.RandomState(seed) n_features = 6 interaction_cst = [{0, 1}, {1, 2}, {3, 4, 5}] - n_samples = 10 - n_bins = 10 - X_binned = rng.randint( - 0, n_bins - 1, size=(n_samples, n_features), dtype=X_BINNED_DTYPE - ) - X_binned = np.asfortranarray(X_binned) - gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) - hessians = np.ones(shape=1, dtype=G_H_DTYPE) - - grower = TreeGrower( - X_binned, - gradients, - hessians, - max_depth=3, - n_bins=n_bins, - shrinkage=1.0, - max_leaf_nodes=None, - min_samples_leaf=1, - interaction_cst=interaction_cst, - ) - grower.grow() + n_samples = 5 + n_bins = 6 + root_feature_splits = [] def get_all_children(node): res = [] @@ 
-607,13 +587,42 @@ def get_all_children(node): res.extend(get_all_children(n)) return res - map = {0: {0, 1}, 1: {0, 1, 2}, 2: {1, 2}} - if grower.root.split_info.feature_idx in {0, 1, 2}: - constraint_set = map[grower.root.split_info.feature_idx] - for node in get_all_children(grower.root): - if not node.is_leaf: - assert node.split_info.feature_idx in constraint_set - elif grower.root.split_info.feature_idx in {3, 4, 5}: - for node in get_all_children(grower.root): - if not node.is_leaf: - assert node.split_info.feature_idx in {3, 4, 5} + for seed in range(20): + rng = np.random.RandomState(seed) + + X_binned = rng.randint( + 0, n_bins - 1, size=(n_samples, n_features), dtype=X_BINNED_DTYPE + ) + X_binned = np.asfortranarray(X_binned) + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower( + X_binned, + gradients, + hessians, + max_depth=3, + n_bins=n_bins, + shrinkage=1.0, + max_leaf_nodes=None, + min_samples_leaf=1, + interaction_cst=interaction_cst, + n_threads=2, + ) + grower.grow() + + root_feature_splits.append(grower.root.split_info.feature_idx) + + map = {0: {0, 1}, 1: {0, 1, 2}, 2: {1, 2}} + if grower.root.split_info.feature_idx in {0, 1, 2}: + constraint_set = map[grower.root.split_info.feature_idx] + for node in get_all_children(grower.root): + if not node.is_leaf: + assert node.split_info.feature_idx in constraint_set + elif grower.root.split_info.feature_idx in {3, 4, 5}: + for node in get_all_children(grower.root): + if not node.is_leaf: + assert node.split_info.feature_idx in {3, 4, 5} + + # Make sure that every feature is used at least once as split for the root node. 
+ assert len(set(root_feature_splits)) == n_features From 1ed28d2b953ddab0cee50889385c94b139c65b00 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Sep 2021 15:06:58 +0200 Subject: [PATCH 12/61] CLN improve logic --- .../_hist_gradient_boosting/grower.py | 39 +++++++++++-------- .../_hist_gradient_boosting/splitting.pyx | 13 ++++--- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 577ce39516ea7..716deb895d38b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -474,11 +474,6 @@ def _compute_best_split_and_push(self, node): if node.split_info.gain <= 0: # no valid split self._finalize_leaf(node) else: - # Update node.allowed_features and node.interaction_cst_idx to be inherited - # by child nodes. - if self.interaction_cst is not None: - self._update_interactions(node) - heappush(self.splittable_nodes, node) def split_next(self): @@ -532,10 +527,14 @@ def split_next(self): # set interaction constraints (the indices of the constraints sets) if self.interaction_cst is not None: - left_child_node.interaction_cst_idx = node.interaction_cst_idx - left_child_node.allowed_features = node.allowed_features - right_child_node.interaction_cst_idx = node.interaction_cst_idx - right_child_node.allowed_features = node.allowed_features + # Calculate allowed_features and interaction_cst_idx only once and inherit + # them by child nodes. 
+ ( + left_child_node.allowed_features, + left_child_node.interaction_cst_idx, + ) = self._compute_interactions(node) + right_child_node.interaction_cst_idx = left_child_node.interaction_cst_idx + right_child_node.allowed_features = left_child_node.allowed_features if not self.has_missing_values[node.split_info.feature_idx]: # If no missing values are encountered at fit time, then samples @@ -638,10 +637,8 @@ def split_next(self): return left_child_node, right_child_node - def _update_interactions(self, node): - r"""Update features allowed by interactions for child nods. - - Update node.allowed_features and node.interaction_cst_idx. + def _compute_interactions(self, node): + r"""Compute features allowed by interactions to be inherited by child nodes. Example: Assume constraints [{0, 1}, {1, 2}]. 1 <- Both constraint groups could be applied from now on @@ -652,8 +649,17 @@ def _update_interactions(self, node): Parameters: ---------- node : TreeNode - A node that might have children and whose interaction info is updated - inplace. + A node that might have children. Based on its feature_idx, the interaction + constraints for possible child nodes are computed. + + Returns + ------- + allowed_features : None or ndarray, dtype=int + Indices of features allowed to split for children. + interaction_cst_idx : None or list of ints + Indices of the interaction sets/groups that have to be applied on + splits of child nodes. The fewer sets the harder the constraint as + fewer sets contain fewer features. """ # Note: # - Case of no interactions is already captured before function call. 
@@ -665,8 +671,7 @@ def _update_interactions(self, node): if node.split_info.feature_idx in self.interaction_cst[i]: interaction_cst_idx.append(i) allowed_features.update(self.interaction_cst[i]) - node.allowed_features = np.array(list(allowed_features), dtype=np.uint32) - node.interaction_cst_idx = interaction_cst_idx + return np.array(list(allowed_features), dtype=np.uint32), interaction_cst_idx def _finalize_leaf(self, node): """Make node a leaf of the tree being grown.""" diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 13af50f75a60f..0219bcf627b01 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -161,6 +161,8 @@ cdef class Splitter: be ignored. hessians_are_constant: bool, default is False Whether hessians are constant. + n_threads : int, default=1 + Number of OpenMP threads to use. """ cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned @@ -392,11 +394,12 @@ cdef class Splitter: &left_indices_buffer[offset_in_buffers[thread_idx]], sizeof(unsigned int) * left_counts[thread_idx] ) - memcpy( - &sample_indices[right_offset[thread_idx]], - &right_indices_buffer[offset_in_buffers[thread_idx]], - sizeof(unsigned int) * right_counts[thread_idx] - ) + if right_counts[thread_idx] > 0: + memcpy( + &sample_indices[right_offset[thread_idx]], + &right_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * right_counts[thread_idx] + ) return (sample_indices[:right_child_position], sample_indices[right_child_position:], From c7c8c3f1c0c183e9b3cee8a0245d0e1c2d638cef Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Sep 2021 15:07:54 +0200 Subject: [PATCH 13/61] TST improve test --- .../tests/test_grower.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py 
b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 60e99c6d32a60..358059e4df25b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -607,15 +607,26 @@ def get_all_children(node): max_leaf_nodes=None, min_samples_leaf=1, interaction_cst=interaction_cst, - n_threads=2, + n_threads=n_threads, ) grower.grow() root_feature_splits.append(grower.root.split_info.feature_idx) - map = {0: {0, 1}, 1: {0, 1, 2}, 2: {1, 2}} if grower.root.split_info.feature_idx in {0, 1, 2}: - constraint_set = map[grower.root.split_info.feature_idx] + root_feature_idx = grower.root.split_info.feature_idx + constraint_set = {0: {0, 1}, 1: {0, 1, 2}, 2: {1, 2}}[root_feature_idx] + for node in (grower.root.left_child, grower.root.right_child): + # test allowed_features of children of root node + assert_array_equal(node.allowed_features, list(constraint_set)) + if root_feature_idx in {0, 2}: + # test that {0, 1} and {1, 2} don't interact with each other + for node in get_all_children(grower.root): + if not node.is_leaf: + assert ( + node.split_info.feature_idx + in {0: {0, 1}, 2: {1, 2}}[root_feature_idx] + ) for node in get_all_children(grower.root): if not node.is_leaf: assert node.split_info.feature_idx in constraint_set From 764cdf5b96132265eb4073fec4ce7079addb8bfb Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Sep 2021 12:11:47 +0200 Subject: [PATCH 14/61] DOC add docstring for interaction_cst --- .../_hist_gradient_boosting/gradient_boosting.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e5435b47736cf..b0ee2575d0bc0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1116,7 +1116,13 @@ class 
HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): .. versionadded:: 0.23 - interaction_cst : TODO + interaction_cst : list of lists or sets of int, default=None + Each inner list or set specifies the feature indices that are allowed + to interact with each other, meaning splits in child nodes are only + allowed in those sets. For instance, `[{0, 1}, {2, 3, 4}]` says that + a single tree is either only split on features 0 and 1 or only on + features 2, 3 and 4. If there are more featres, e.g. 5 and 6, those + are not allowed to be split on as they are not listed. .. versionadded:: 1.1 @@ -1434,7 +1440,13 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): .. versionadded:: 0.23 - interaction_cst : TODO + interaction_cst : list of lists or sets of int, default=None + Each inner list or set specifies the feature indices that are allowed + to interact with each other, meaning splits in child nodes are only + allowed in those sets. For instance, `[{0, 1}, {2, 3, 4}]` says that + a single tree is either only split on features 0 and 1 or only on + features 2, 3 and 4. If there are more featres, e.g. 5 and 6, those + are not allowed to be split on as they are not listed. .. 
versionadded:: 1.1 From 5a26f6e18ef6e41ed4039c2e54ce6e26f0362c4a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Sep 2021 12:57:43 +0200 Subject: [PATCH 15/61] ENH add validation of interaction_cst --- .../gradient_boosting.py | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index b0ee2575d0bc0..6351b8c0d5e71 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -121,17 +121,6 @@ def _validate_parameters(self): "monotonic constraints are not supported for multiclass classification." ) - if ( - self.interaction_cst is not None - and not isinstance(self.interaction_cst, Sequence) - and not all(isinstance(x, (Sequence, set)) for x in self.interaction_cst) - ): - # TODO: better validation - # lets start with list or set of {list, tuple, set} - raise ValueError( - "interaction constraints must be None or a Sequence of {Sequence, set}" - ) - def _check_categories(self, X): """Check and validate categorical features in X @@ -217,6 +206,32 @@ def _check_categories(self, X): return is_categorical, known_categories + def _check_interaction_cst(self, n_features): + """Check and validation interaction constraints.""" + if self.interaction_cst is None: + return None + + if self.interaction_cst is not None: + if not ( + isinstance(self.interaction_cst, Sequence) + and all(isinstance(x, (Sequence, set)) for x in self.interaction_cst) + ): + raise ValueError( + "Interaction constraints must be None or a Sequence of {Sequence," + " set}" + ) + if not all( + (x == int(x) and 0 <= x and x < n_features) + for cst_set in self.interaction_cst + for x in cst_set + ): + raise ValueError( + "Interaction constraints must consist of integers indices in [0," + " n_features - 1], specifying the position of features." 
+ ) + + return [set([int(x) for x in group]) for group in self.interaction_cst] + def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. @@ -270,12 +285,7 @@ def fit(self, X, y, sample_weight=None): self.is_categorical_, known_categories = self._check_categories(X) # convert to list of sets and convert to integers - if self.interaction_cst is None: - self._interaction_cst = None - else: - self._interaction_cst = [ - set([int(x) for x in group]) for group in self.interaction_cst - ] + interaction_cst = self._check_interaction_cst(self._n_features) # we need this stateful variable to tell raw_predict() that it was # called from fit() (this current method), and that the data it has @@ -538,7 +548,7 @@ def fit(self, X, y, sample_weight=None): has_missing_values=has_missing_values, is_categorical=self.is_categorical_, monotonic_cst=self.monotonic_cst, - interaction_cst=self._interaction_cst, + interaction_cst=interaction_cst, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, From eb75a3044aaadbe4c214b435388bf2e74ad23646 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Sep 2021 12:58:34 +0200 Subject: [PATCH 16/61] TST test input validation --- .../tests/test_gradient_boosting.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 79581525b50bb..7f58591f59f4f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -66,6 +66,29 @@ def _make_dumb_dataset(n_samples): ({"validation_fraction": -1}, "validation_fraction=-1 must be strictly"), ({"validation_fraction": 0}, "validation_fraction=0 must be strictly"), ({"tol": -1}, "tol=-1 must not be smaller than 0"), + ( + {"interaction_cst": np.array([[0, 
1]])}, + "Interaction constraints must be None or a Sequence of {Sequence, set}", + ), + ( + {"interaction_cst": [0, 1]}, + "Interaction constraints must be None or a Sequence of {Sequence, set}", + ), + ( + {"interaction_cst": [{0, 9999}]}, + r"Interaction constraints must consist of integers indices in \[0," + r" n_features - 1\], specifying the position of features.", + ), + ( + {"interaction_cst": [{-1, 0}]}, + r"Interaction constraints must consist of integers indices in \[0," + r" n_features - 1\], specifying the position of features.", + ), + ( + {"interaction_cst": [{0.5}]}, + r"Interaction constraints must consist of integers indices in \[0," + r" n_features - 1\], specifying the position of features.", + ), ], ) def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg): From 3ea38292497d5072f346f0248244c4c2ec1fc985 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Sep 2021 18:33:37 +0200 Subject: [PATCH 17/61] DEBUG uncomment if condition --- .../ensemble/_hist_gradient_boosting/splitting.pyx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 0219bcf627b01..067c933165bf5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -1,5 +1,5 @@ # cython: cdivision=True -# cython: boundscheck=False +# cython: boundscheck=True # cython: wraparound=False # cython: language_level=3 @@ -394,12 +394,12 @@ cdef class Splitter: &left_indices_buffer[offset_in_buffers[thread_idx]], sizeof(unsigned int) * left_counts[thread_idx] ) - if right_counts[thread_idx] > 0: - memcpy( - &sample_indices[right_offset[thread_idx]], - &right_indices_buffer[offset_in_buffers[thread_idx]], - sizeof(unsigned int) * right_counts[thread_idx] - ) + #if right_counts[thread_idx] > 0: + memcpy( + &sample_indices[right_offset[thread_idx]], + 
&right_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * right_counts[thread_idx] + ) return (sample_indices[:right_child_position], sample_indices[right_child_position:], From 0570f61f9ce399df5f71832b1b41325c8b17135a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 21 Sep 2021 19:48:08 +0200 Subject: [PATCH 18/61] address review comments --- .../gradient_boosting.py | 16 +++++----- .../_hist_gradient_boosting/grower.py | 29 +++++++++++-------- .../_hist_gradient_boosting/splitting.pyx | 2 +- .../tests/test_splitting.py | 2 +- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 6351b8c0d5e71..037be74a73d0d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1129,10 +1129,10 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): interaction_cst : list of lists or sets of int, default=None Each inner list or set specifies the feature indices that are allowed to interact with each other, meaning splits in child nodes are only - allowed in those sets. For instance, `[{0, 1}, {2, 3, 4}]` says that - a single tree is either only split on features 0 and 1 or only on - features 2, 3 and 4. If there are more featres, e.g. 5 and 6, those - are not allowed to be split on as they are not listed. + allowed in those sets. For instance, `[{0, 1}, {2, 3, 4}]` says that a + single branch of a tree is either only split on features 0 and 1 or + only on features 2, 3 and 4. If there are more featres, e.g. 5 and 6, + those are not allowed to be split on as they are not listed. .. 
versionadded:: 1.1 @@ -1453,10 +1453,10 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): interaction_cst : list of lists or sets of int, default=None Each inner list or set specifies the feature indices that are allowed to interact with each other, meaning splits in child nodes are only - allowed in those sets. For instance, `[{0, 1}, {2, 3, 4}]` says that - a single tree is either only split on features 0 and 1 or only on - features 2, 3 and 4. If there are more featres, e.g. 5 and 6, those - are not allowed to be split on as they are not listed. + allowed in those sets. For instance, `[{0, 1}, {2, 3, 4}]` says that a + single branch of a tree is either only split on features 0 and 1 or + only on features 2, 3 and 4. If there are more featres, e.g. 5 and 6, + those are not allowed to be split on as they are not listed. .. versionadded:: 1.1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 716deb895d38b..e2df952b2c892 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -70,7 +70,7 @@ class TreeNode: stop position of the node's sample_indices in splitter.partition. allowed_features : None or ndarray, dtype=int Indices of features allowed to split for children. - interaction_cst_idx : None or list of ints + interaction_cst_indices : None or list of ints Indices of the interaction sets/groups that have to be applied on splits of child nodes. The fewer sets the harder the constraint as fewer sets contain fewer features. 
@@ -103,7 +103,7 @@ def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, value=Non self.value = value self.is_leaf = False self.allowed_features = None - self.interaction_cst_idx = None + self.interaction_cst_indices = None self.set_children_bounds(float("-inf"), float("+inf")) def set_children_bounds(self, lower, upper): @@ -443,7 +443,7 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): ) if self.interaction_cst is not None: - self.root.interaction_cst_idx = list(range(len(self.interaction_cst))) + self.root.interaction_cst_indices = list(range(len(self.interaction_cst))) allowed_features = set().union(*self.interaction_cst) self.root.allowed_features = np.array( list(allowed_features), dtype=np.uint32 @@ -527,13 +527,15 @@ def split_next(self): # set interaction constraints (the indices of the constraints sets) if self.interaction_cst is not None: - # Calculate allowed_features and interaction_cst_idx only once and inherit - # them by child nodes. + # Calculate allowed_features and interaction_cst_indices only once and + # inherit them by child nodes. ( left_child_node.allowed_features, - left_child_node.interaction_cst_idx, + left_child_node.interaction_cst_indices, ) = self._compute_interactions(node) - right_child_node.interaction_cst_idx = left_child_node.interaction_cst_idx + right_child_node.interaction_cst_indices = ( + left_child_node.interaction_cst_indices + ) right_child_node.allowed_features = left_child_node.allowed_features if not self.has_missing_values[node.split_info.feature_idx]: @@ -656,7 +658,7 @@ def _compute_interactions(self, node): ------- allowed_features : None or ndarray, dtype=int Indices of features allowed to split for children. - interaction_cst_idx : None or list of ints + interaction_cst_indices : None or list of ints Indices of the interaction sets/groups that have to be applied on splits of child nodes. The fewer sets the harder the constraint as fewer sets contain fewer features. 
@@ -666,12 +668,15 @@ def _compute_interactions(self, node): # - This is for nodes that are already split and have a # node.split_info.feature_idx. allowed_features = set() - interaction_cst_idx = list() - for i in node.interaction_cst_idx: + interaction_cst_indices = list() + for i in node.interaction_cst_indices: if node.split_info.feature_idx in self.interaction_cst[i]: - interaction_cst_idx.append(i) + interaction_cst_indices.append(i) allowed_features.update(self.interaction_cst[i]) - return np.array(list(allowed_features), dtype=np.uint32), interaction_cst_idx + return ( + np.array(list(allowed_features), dtype=np.uint32), + interaction_cst_indices, + ) def _finalize_leaf(self, node): """Make node a leaf of the tree being grown.""" diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 067c933165bf5..87ca01925ae08 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -531,7 +531,7 @@ cdef class Splitter: lower_bound, upper_bound, &split_infos[split_info_idx]) # then compute best possible split among all features - # split_info is the index of the best split_info + # split_info is set to the best of split_infos best_split_info_idx = self._find_best_feature_to_split_helper( split_infos, n_allowed_features) split_info = split_infos[best_split_info_idx] diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index e52dbabd029ee..8d9119af7311f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -921,5 +921,5 @@ def test_split_interaction_constraints(): allowed_features=allowed_features, ) - # only features 0 and 1 are allowed to be split on + # only features 0 and 3 are allowed to be split on assert si_root.feature_idx in {0, 
3} From 16fc0b8be46245acd65fb242d0744727f274d9d0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 21 Sep 2021 19:49:01 +0200 Subject: [PATCH 19/61] Revert "DEBUG uncomment if condition" This reverts commit 3ea38292497d5072f346f0248244c4c2ec1fc985. --- .../ensemble/_hist_gradient_boosting/splitting.pyx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 87ca01925ae08..bee5b0f988924 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -1,5 +1,5 @@ # cython: cdivision=True -# cython: boundscheck=True +# cython: boundscheck=False # cython: wraparound=False # cython: language_level=3 @@ -394,12 +394,12 @@ cdef class Splitter: &left_indices_buffer[offset_in_buffers[thread_idx]], sizeof(unsigned int) * left_counts[thread_idx] ) - #if right_counts[thread_idx] > 0: - memcpy( - &sample_indices[right_offset[thread_idx]], - &right_indices_buffer[offset_in_buffers[thread_idx]], - sizeof(unsigned int) * right_counts[thread_idx] - ) + if right_counts[thread_idx] > 0: + memcpy( + &sample_indices[right_offset[thread_idx]], + &right_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * right_counts[thread_idx] + ) return (sample_indices[:right_child_position], sample_indices[right_child_position:], From 2b7e1e25394f9a6c665a9ad5f32c99303c33eb1b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 21 Sep 2021 19:54:10 +0200 Subject: [PATCH 20/61] TST increase max_depth and n_samples in test_grower_interaction_constraints --- sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 358059e4df25b..0d817d9df134c 100644 --- 
a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -573,7 +573,7 @@ def test_grower_interaction_constraints(): """Check that grower respects interaction constraints.""" n_features = 6 interaction_cst = [{0, 1}, {1, 2}, {3, 4, 5}] - n_samples = 5 + n_samples = 10 n_bins = 6 root_feature_splits = [] @@ -601,7 +601,7 @@ def get_all_children(node): X_binned, gradients, hessians, - max_depth=3, + max_depth=5, n_bins=n_bins, shrinkage=1.0, max_leaf_nodes=None, From ead3b0c767bdc673ada0cddd34bcafdec2e08945 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 22 Sep 2021 00:16:44 +0200 Subject: [PATCH 21/61] ENH add default group to interaction constraints --- .../gradient_boosting.py | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 037be74a73d0d..686feab17738a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -2,7 +2,7 @@ # Author: Nicolas Hug from abc import ABC, abstractmethod -from collections.abc import Sequence +from collections.abc import Iterable from functools import partial import warnings @@ -211,26 +211,31 @@ def _check_interaction_cst(self, n_features): if self.interaction_cst is None: return None - if self.interaction_cst is not None: - if not ( - isinstance(self.interaction_cst, Sequence) - and all(isinstance(x, (Sequence, set)) for x in self.interaction_cst) - ): - raise ValueError( - "Interaction constraints must be None or a Sequence of {Sequence," - " set}" - ) - if not all( - (x == int(x) and 0 <= x and x < n_features) - for cst_set in self.interaction_cst - for x in cst_set - ): - raise ValueError( - "Interaction constraints must consist of integers indices in [0," - " n_features - 1], specifying 
the position of features." - ) + if not ( + isinstance(self.interaction_cst, Iterable) + and all(isinstance(x, Iterable) for x in self.interaction_cst) + ): + raise ValueError( + "Interaction constraints must be None or an iterable of iterables" + ) + if not all( + (x == int(x) and 0 <= x and x < n_features) + for cst_set in self.interaction_cst + for x in cst_set + ): + raise ValueError( + "Interaction constraints must consist of integer indices in [0," + " n_features - 1], specifying the position of features." + ) + + constraints = [set([int(x) for x in group]) for group in self.interaction_cst] + + # Add all not listed features as own group by default. + rest = set(range(n_features)).difference(set().union(*constraints)) + if len(rest) > 0: + constraints.append(rest) - return [set([int(x) for x in group]) for group in self.interaction_cst] + return constraints def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. From 5a35ab7b468a056d1d003d39817106c6535b2a7b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 22 Sep 2021 00:18:24 +0200 Subject: [PATCH 22/61] TST test_check_interaction_cst --- .../tests/test_gradient_boosting.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 7f58591f59f4f..bc83cf4f24e9c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -67,26 +67,26 @@ def _make_dumb_dataset(n_samples): ({"validation_fraction": 0}, "validation_fraction=0 must be strictly"), ({"tol": -1}, "tol=-1 must not be smaller than 0"), ( - {"interaction_cst": np.array([[0, 1]])}, - "Interaction constraints must be None or a Sequence of {Sequence, set}", + {"interaction_cst": "string"}, + "", ), ( {"interaction_cst": [0, 1]}, - 
"Interaction constraints must be None or a Sequence of {Sequence, set}", + "Interaction constraints must be None or an iterable of iterables", ), ( {"interaction_cst": [{0, 9999}]}, - r"Interaction constraints must consist of integers indices in \[0," + r"Interaction constraints must consist of integer indices in \[0," r" n_features - 1\], specifying the position of features.", ), ( {"interaction_cst": [{-1, 0}]}, - r"Interaction constraints must consist of integers indices in \[0," + r"Interaction constraints must consist of integer indices in \[0," r" n_features - 1\], specifying the position of features.", ), ( {"interaction_cst": [{0.5}]}, - r"Interaction constraints must consist of integers indices in \[0," + r"Interaction constraints must consist of integer indices in \[0," r" n_features - 1\], specifying the position of features.", ), ], @@ -1083,6 +1083,21 @@ def test_uint8_predict(Est): est.predict(X) +@pytest.mark.parametrize( + "interaction_cst, n_features, result", + [ + (None, 931, None), + ([{0, 1}], 2, [{0, 1}]), + ([(1, 0), [5, 1]], 6, [{0, 1}, {1, 5}, {2, 3, 4}]), + ], +) +def test_check_interaction_cst(interaction_cst, n_features, result): + """Checkt that _check_interaction_cst returns the expected list of sets""" + est = HistGradientBoostingRegressor() + est.set_params(interaction_cst=interaction_cst) + assert est._check_interaction_cst(n_features) == result + + # TODO: Remove in v1.2 @pytest.mark.parametrize( "old_loss, new_loss", From c93d3f061b814eb3d760daa579ca1ea01630fff4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 22 Sep 2021 00:19:00 +0200 Subject: [PATCH 23/61] DOC udpate docstring of interaction_cst with default group --- .../gradient_boosting.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 686feab17738a..39d680d8af10b 100644 --- 
a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1131,13 +1131,15 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): .. versionadded:: 0.23 - interaction_cst : list of lists or sets of int, default=None + interaction_cst : iterable of iterables of int, default=None Each inner list or set specifies the feature indices that are allowed to interact with each other, meaning splits in child nodes are only - allowed in those sets. For instance, `[{0, 1}, {2, 3, 4}]` says that a - single branch of a tree is either only split on features 0 and 1 or - only on features 2, 3 and 4. If there are more featres, e.g. 5 and 6, - those are not allowed to be split on as they are not listed. + allowed in those sets. If there are more features than specified in + these constraints, they are treated as if they were specified as an + additional set. + For instance, with 5 features in total, `[{0, 1}]` is equivalent to + `[{0, 1}, {2, 3, 4}]`, and says that a single branch of a tree is + either only split on features 0 and 1 or only on features 2, 3 and 4. .. versionadded:: 1.1 @@ -1455,13 +1457,15 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): .. versionadded:: 0.23 - interaction_cst : list of lists or sets of int, default=None + interaction_cst : iterable of iterables of int, default=None Each inner list or set specifies the feature indices that are allowed to interact with each other, meaning splits in child nodes are only - allowed in those sets. For instance, `[{0, 1}, {2, 3, 4}]` says that a - single branch of a tree is either only split on features 0 and 1 or - only on features 2, 3 and 4. If there are more featres, e.g. 5 and 6, - those are not allowed to be split on as they are not listed. + allowed in those sets. 
If there are more features than specified in + these constraints, they are treated as if they were specified as an + additional set. + For instance, with 5 features in total, `[{0, 1}]` is equivalent to + `[{0, 1}, {2, 3, 4}]`, and says that a single branch of a tree is + either only split on features 0 and 1 or only on features 2, 3 and 4. .. versionadded:: 1.1 From 5092f6b2ffa5180c04a218e900905b81bf32a927 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 22 Sep 2021 21:34:51 +0200 Subject: [PATCH 24/61] DOC add whatsnew --- doc/whats_new/v1.1.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index fba40e25a9e7e..c0939a4f18018 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -38,6 +38,16 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. + +:mod:`sklearn.ensemble` +....................... + +- |Feature| Added interaction constraints for + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` via the new + argument `interaction_cst`. + :pr:`21020` by :user:`Christian Lorentzen `. + :mod:`sklearn.utils` .................... 
From a18b5ee5151ac528a600af594d5080fff3be1810 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 23 Sep 2021 23:15:59 +0200 Subject: [PATCH 25/61] DEBUG --- .../_hist_gradient_boosting/grower.py | 3 +++ .../_hist_gradient_boosting/splitting.pyx | 23 +++++++++++++++++-- .../tests/test_grower.py | 6 ++--- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index e2df952b2c892..dc359b5f2355a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -489,6 +489,9 @@ def split_next(self): # Consider the node with the highest loss reduction (a.k.a. gain) node = heappop(self.splittable_nodes) + print("\nGROWER:") + print(f"\npartition.shape = {self.splitter.partition.shape}", flush=True) + tic = time() ( sample_indices_left, diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index bee5b0f988924..741154adb59ef 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -1,5 +1,5 @@ # cython: cdivision=True -# cython: boundscheck=False +# cython: boundscheck=True # cython: wraparound=False # cython: language_level=3 @@ -332,6 +332,7 @@ cdef class Splitter: int [:] left_offset = np.zeros(n_threads, dtype=np.int32) int [:] right_offset = np.zeros(n_threads, dtype=np.int32) + # only set left_cat_bitset when is_categorical is True if is_categorical: left_cat_bitset = &cat_bitset_tmp[0] @@ -387,14 +388,32 @@ cdef class Splitter: # map indices in left/right_indices_buffer back into # sample_indices. This also updates self.partition since # sample_indices is a view. 
+ with gil: + print("\nsplit_indices") + print(f"n_threads = {n_threads}") + print(f"len(sample_indices) = {len(sample_indices)}") + print(f"sample_indices = {np.array(sample_indices)}") for thread_idx in prange(n_threads, schedule='static', chunksize=1): + with gil: + print( + f"\nthread_idx = {np.array(thread_idx)}" + f"\noffset_in_buffers = {np.array(offset_in_buffers)}" + f"\nleft_counts = {np.array(left_counts)}" + f"\nleft_offset = {np.array(left_offset)}" + f"\nleft_indices_buffer = {np.array(left_indices_buffer)}" + f"\nright_counts = {np.array(right_counts)}" + f"\nright_offset = {np.array(right_offset)}" + f"\nright_indices_buffer = {np.array(right_indices_buffer)}", + flush=True, + ) + memcpy( &sample_indices[left_offset[thread_idx]], &left_indices_buffer[offset_in_buffers[thread_idx]], sizeof(unsigned int) * left_counts[thread_idx] ) - if right_counts[thread_idx] > 0: + if right_counts[thread_idx] > -1: memcpy( &sample_indices[right_offset[thread_idx]], &right_indices_buffer[offset_in_buffers[thread_idx]], diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 0d817d9df134c..ac2895c8e4f57 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -573,7 +573,7 @@ def test_grower_interaction_constraints(): """Check that grower respects interaction constraints.""" n_features = 6 interaction_cst = [{0, 1}, {1, 2}, {3, 4, 5}] - n_samples = 10 + n_samples = 3 n_bins = 6 root_feature_splits = [] @@ -587,7 +587,7 @@ def get_all_children(node): res.extend(get_all_children(n)) return res - for seed in range(20): + for seed in range(1): rng = np.random.RandomState(seed) X_binned = rng.randint( @@ -607,7 +607,7 @@ def get_all_children(node): max_leaf_nodes=None, min_samples_leaf=1, interaction_cst=interaction_cst, - n_threads=n_threads, + n_threads=2, ) grower.grow() From 
6a0205840816525fa89bfe1a6b19527c1316a7f8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 28 Sep 2021 19:40:02 +0200 Subject: [PATCH 26/61] TST make test_split_interaction_constraints more tighter --- .../tests/test_splitting.py | 120 +++++++++++------- 1 file changed, 74 insertions(+), 46 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 8d9119af7311f..e3316aa64e6c4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -860,8 +860,6 @@ def test_splitting_categorical_sanity( def test_split_interaction_constraints(): """Check that allowed_features are respected.""" - rng = np.random.RandomState(919) - n_features = 4 # features 1 and 2 are not allowed to be split on allowed_features = np.array([0, 3], dtype=np.uint32) @@ -871,55 +869,85 @@ def test_split_interaction_constraints(): min_hessian_to_split = 1e-3 min_samples_leaf = 1 min_gain_to_split = 0.0 - X_binned = np.asfortranarray( - rng.randint(0, n_bins - 1, size=(n_samples, n_features)), dtype=X_BINNED_DTYPE - ) - X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) + sample_indices = np.arange(n_samples, dtype=np.uint32) - all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) all_hessians = np.ones(1, dtype=G_H_DTYPE) - sum_gradients = all_gradients.sum() sum_hessians = 1 * n_samples hessians_are_constant = True - builder = HistogramBuilder( - X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads - ) - n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) - monotonic_cst = np.array( - [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 - ) - is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) - missing_values_bin_idx = n_bins - 1 - 
splitter = Splitter( - X_binned, - n_bins_non_missing, - missing_values_bin_idx, - has_missing_values, - is_categorical, - monotonic_cst, - l2_regularization, - min_hessian_to_split, - min_samples_leaf, - min_gain_to_split, - hessians_are_constant, - ) + split_features = [] - assert np.all(sample_indices == splitter.partition) + for i in range(10): + rng = np.random.RandomState(919 + i) + X_binned = np.asfortranarray( + rng.randint(0, n_bins - 1, size=(n_samples, n_features)), + dtype=X_BINNED_DTYPE, + ) + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) + + # Make feature 1 very important + all_gradients = (10 * X_binned[:, 1] + rng.randn(n_samples)).astype(G_H_DTYPE) + sum_gradients = all_gradients.sum() + + builder = HistogramBuilder( + X_binned, + n_bins, + all_gradients, + all_hessians, + hessians_are_constant, + n_threads, + ) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) - histograms = builder.compute_histograms_brute(sample_indices) - value = compute_node_value( - sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization - ) - si_root = splitter.find_node_split( - n_samples, - histograms, - sum_gradients, - sum_hessians, - value, - allowed_features=allowed_features, - ) + assert np.all(sample_indices == splitter.partition) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) 
+ + # with all features allowed, feature 1 should be split on as it is the most + # important one by construction of the gradients + si_root = splitter.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=None, + ) + assert si_root.feature_idx == 1 + + # only features 0 and 3 are allowed to be split on + si_root = splitter.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=allowed_features, + ) + split_features.append(si_root.feature_idx) + assert si_root.feature_idx in {0, 3} - # only features 0 and 3 are allowed to be split on - assert si_root.feature_idx in {0, 3} + # make sure feature 0 and feature 3 are split on in the constraint setting + assert {0, 3} == set(split_features) From aa21d1604455a10dfd8818522cff488c46f62c32 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 28 Sep 2021 19:48:37 +0200 Subject: [PATCH 27/61] better comments and less typos --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 4 ++-- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 39d680d8af10b..6ef1b76b1846d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -207,7 +207,7 @@ def _check_categories(self, X): return is_categorical, known_categories def _check_interaction_cst(self, n_features): - """Check and validation interaction constraints.""" + """Check and validation for interaction constraints.""" if self.interaction_cst is None: return None @@ -289,7 +289,7 @@ def fit(self, X, y, sample_weight=None): self.is_categorical_, known_categories = self._check_categories(X) - # convert to list of sets and convert to integers + # Encode constraints into a 
list of sets of features indices (integers). interaction_cst = self._check_interaction_cst(self._n_features) # we need this stateful variable to tell raw_predict() that it was diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index bc83cf4f24e9c..5b533480d294f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1092,7 +1092,7 @@ def test_uint8_predict(Est): ], ) def test_check_interaction_cst(interaction_cst, n_features, result): - """Checkt that _check_interaction_cst returns the expected list of sets""" + """Check that _check_interaction_cst returns the expected list of sets""" est = HistGradientBoostingRegressor() est.set_params(interaction_cst=interaction_cst) assert est._check_interaction_cst(n_features) == result From 299f31b0ff3a476602e37d0045c41ea71450ce07 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 28 Sep 2021 23:02:03 +0200 Subject: [PATCH 28/61] Revert "DEBUG" This reverts commit a18b5ee5151ac528a600af594d5080fff3be1810. --- .../_hist_gradient_boosting/grower.py | 3 --- .../_hist_gradient_boosting/splitting.pyx | 23 ++----------------- .../tests/test_grower.py | 6 ++--- 3 files changed, 5 insertions(+), 27 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index dc359b5f2355a..e2df952b2c892 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -489,9 +489,6 @@ def split_next(self): # Consider the node with the highest loss reduction (a.k.a. 
gain) node = heappop(self.splittable_nodes) - print("\nGROWER:") - print(f"\npartition.shape = {self.splitter.partition.shape}", flush=True) - tic = time() ( sample_indices_left, diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index be733995d37bd..ed57511fa57ca 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -1,5 +1,5 @@ # cython: cdivision=True -# cython: boundscheck=True +# cython: boundscheck=False # cython: wraparound=False # cython: language_level=3 @@ -332,7 +332,6 @@ cdef class Splitter: int [:] left_offset = np.zeros(n_threads, dtype=np.int32) int [:] right_offset = np.zeros(n_threads, dtype=np.int32) - # only set left_cat_bitset when is_categorical is True if is_categorical: left_cat_bitset = &cat_bitset_tmp[0] @@ -388,32 +387,14 @@ cdef class Splitter: # map indices in left/right_indices_buffer back into # sample_indices. This also updates self.partition since # sample_indices is a view. 
- with gil: - print("\nsplit_indices") - print(f"n_threads = {n_threads}") - print(f"len(sample_indices) = {len(sample_indices)}") - print(f"sample_indices = {np.array(sample_indices)}") for thread_idx in prange(n_threads, schedule='static', chunksize=1): - with gil: - print( - f"\nthread_idx = {np.array(thread_idx)}" - f"\noffset_in_buffers = {np.array(offset_in_buffers)}" - f"\nleft_counts = {np.array(left_counts)}" - f"\nleft_offset = {np.array(left_offset)}" - f"\nleft_indices_buffer = {np.array(left_indices_buffer)}" - f"\nright_counts = {np.array(right_counts)}" - f"\nright_offset = {np.array(right_offset)}" - f"\nright_indices_buffer = {np.array(right_indices_buffer)}", - flush=True, - ) - memcpy( &sample_indices[left_offset[thread_idx]], &left_indices_buffer[offset_in_buffers[thread_idx]], sizeof(unsigned int) * left_counts[thread_idx] ) - if right_counts[thread_idx] > -1: + if right_counts[thread_idx] > 0: memcpy( &sample_indices[right_offset[thread_idx]], &right_indices_buffer[offset_in_buffers[thread_idx]], diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 3055c87e2716f..5182480106555 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -573,7 +573,7 @@ def test_grower_interaction_constraints(): """Check that grower respects interaction constraints.""" n_features = 6 interaction_cst = [{0, 1}, {1, 2}, {3, 4, 5}] - n_samples = 3 + n_samples = 10 n_bins = 6 root_feature_splits = [] @@ -587,7 +587,7 @@ def get_all_children(node): res.extend(get_all_children(n)) return res - for seed in range(1): + for seed in range(20): rng = np.random.RandomState(seed) X_binned = rng.randint( @@ -607,7 +607,7 @@ def get_all_children(node): max_leaf_nodes=None, min_samples_leaf=1, interaction_cst=interaction_cst, - n_threads=2, + n_threads=n_threads, ) grower.grow() From 
4c9e1a34b3ba8dfe0db3811a4a0007b176666521 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 22 Oct 2021 17:55:51 +0200 Subject: [PATCH 29/61] DOC address review comments for docstrings --- .../gradient_boosting.py | 40 +++++++++++-------- .../_hist_gradient_boosting/grower.py | 12 +++--- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 77cef66c9b743..4af5f81c0d898 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1132,14 +1132,18 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): .. versionadded:: 0.23 interaction_cst : iterable of iterables of int, default=None - Each inner list or set specifies the feature indices that are allowed - to interact with each other, meaning splits in child nodes are only - allowed in those sets. If there are more features than specified in - these constraints, they are treated as if they were specified as an - additional set. - For instance, with 5 features in total, `[{0, 1}]` is equivalent to - `[{0, 1}, {2, 3, 4}]`, and says that a single branch of a tree is - either only split on features 0 and 1 or only on features 2, 3 and 4. + Specify interaction constraints, i.e. sets of features which can + only interact with each other in child nodes splits. + + Each iterable materializes a constraint by the set of indices of + the features that are allowed to interact with each other. + If there are more features than specified in these constraints, + they are treated as if they were specified as an additional set. 
+ + For instance, with 5 features in total, `interaction_cst=[{0, 1}]` + is equivalent to `interaction_cst=[{0, 1}, {2, 3, 4}]`, + and specifies that a single branch of a tree will either only split + on features 0 and 1 or only split on features 2, 3 and 4. .. versionadded:: 1.1 @@ -1458,14 +1462,18 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): .. versionadded:: 0.23 interaction_cst : iterable of iterables of int, default=None - Each inner list or set specifies the feature indices that are allowed - to interact with each other, meaning splits in child nodes are only - allowed in those sets. If there are more features than specified in - these constraints, they are treated as if they were specified as an - additional set. - For instance, with 5 features in total, `[{0, 1}]` is equivalent to - `[{0, 1}, {2, 3, 4}]`, and says that a single branch of a tree is - either only split on features 0 and 1 or only on features 2, 3 and 4. + Specify interaction constraints, i.e. sets of features which can + only interact with each other in child nodes splits. + + Each iterable materializes a constraint by the set of indices of + the features that are allowed to interact with each other. + If there are more features than specified in these constraints, + they are treated as if they were specified as an additional set. + + For instance, with 5 features in total, `interaction_cst=[{0, 1}]` + is equivalent to `interaction_cst=[{0, 1}, {2, 3, 4}]`, + and specifies that a single branch of a tree will either only split + on features 0 and 1 or only split on features 2, 3 and 4. .. 
versionadded:: 1.1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index e2df952b2c892..6d82588d15664 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -71,9 +71,9 @@ class TreeNode: allowed_features : None or ndarray, dtype=int Indices of features allowed to split for children. interaction_cst_indices : None or list of ints - Indices of the interaction sets/groups that have to be applied on - splits of child nodes. The fewer sets the harder the constraint as - fewer sets contain fewer features. + Indices of the interaction sets that have to be applied on splits of + child nodes. The fewer sets the stronger the constraint as fewer sets + contain fewer features. children_lower_bound : float children_upper_bound : float """ @@ -659,9 +659,9 @@ def _compute_interactions(self, node): allowed_features : None or ndarray, dtype=int Indices of features allowed to split for children. interaction_cst_indices : None or list of ints - Indices of the interaction sets/groups that have to be applied on - splits of child nodes. The fewer sets the harder the constraint as - fewer sets contain fewer features. + Indices of the interaction sets that have to be applied on splits of + child nodes. The fewer sets the stronger the constraint as fewer sets + contain fewer features. """ # Note: # - Case of no interactions is already captured before function call. 
From c09ba918cb580ee00014ad5ad1fae0694cf0445d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 22 Oct 2021 17:57:56 +0200 Subject: [PATCH 30/61] TST reviewer suggestion for improved grower test --- .../tests/test_grower.py | 45 +++++++++---------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 5182480106555..a5f2ac78e1969 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -611,29 +611,28 @@ def get_all_children(node): ) grower.grow() - root_feature_splits.append(grower.root.split_info.feature_idx) - - if grower.root.split_info.feature_idx in {0, 1, 2}: - root_feature_idx = grower.root.split_info.feature_idx - constraint_set = {0: {0, 1}, 1: {0, 1, 2}, 2: {1, 2}}[root_feature_idx] - for node in (grower.root.left_child, grower.root.right_child): - # test allowed_features of children of root node - assert_array_equal(node.allowed_features, list(constraint_set)) - if root_feature_idx in {0, 2}: - # test that {0, 1} and {1, 2} don't interact with each other - for node in get_all_children(grower.root): - if not node.is_leaf: - assert ( - node.split_info.feature_idx - in {0: {0, 1}, 2: {1, 2}}[root_feature_idx] - ) - for node in get_all_children(grower.root): - if not node.is_leaf: - assert node.split_info.feature_idx in constraint_set - elif grower.root.split_info.feature_idx in {3, 4, 5}: - for node in get_all_children(grower.root): - if not node.is_leaf: - assert node.split_info.feature_idx in {3, 4, 5} + root_feature_idx = grower.root.split_info.feature_idx + root_feature_splits.append(root_feature_idx) + + feature_idx_to_constraint_set = { + 0: {0, 1}, + 1: {0, 1, 2}, + 2: {1, 2}, + 3: {3, 4, 5}, + 4: {3, 4, 5}, + 5: {3, 4, 5}, + } + + root_constraint_set = feature_idx_to_constraint_set[root_feature_idx] + for node 
in (grower.root.left_child, grower.root.right_child): + # Root's children's allowed_features must be the root's constraints set. + assert_array_equal(node.allowed_features, list(root_constraint_set)) + for node in get_all_children(grower.root): + # Nodes accessible from the root must have their index in the root's + # constraints set. For example, sets {0, 1} and {1, 2} must not interact + # with each other. + if not node.is_leaf: + assert node.split_info.feature_idx in root_constraint_set # Make sure that every feature is used at least once as split for the root node. assert len(set(root_feature_splits)) == n_features From ba78cb9db2439e147accdadb702b7bc73354a775 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 22 Oct 2021 20:15:42 +0200 Subject: [PATCH 31/61] TST check interaction constraints numerically --- .../tests/test_gradient_boosting.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 5b533480d294f..bb6c7374cc921 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1098,6 +1098,52 @@ def test_check_interaction_cst(interaction_cst, n_features, result): assert est._check_interaction_cst(n_features) == result +def test_interaction_cst_numerically(): + """Check that interaction constraints have no forbidden interactions.""" + rng = np.random.RandomState(42) + n_samples = 1000 + X = rng.uniform(size=(n_samples, 2)) + # Construct y with a strong interaction term + # y = x0 + x1 + 5 * x0 * x1 + y = np.c_[X, 5 * X[:, 0] * X[:, 1]].sum(axis=1) + + est = HistGradientBoostingRegressor() + est.fit(X, y) + est_no_interactions = HistGradientBoostingRegressor(interaction_cst=[{0}, {1}]) + est_no_interactions.fit(X, y) + + delta = 0.25 + # Make sure we do not extrapolate 
out of the training set as tree-based estimators + # are very bad in doing so. + X_test = X[(X[:, 0] < 1 - delta) & (X[:, 1] < 1 - delta)] + X_delta_0 = X_test + [delta, 0] + X_delta_1 = X_test + [0, delta] + X_delta_0_1 = X_test + [delta, delta] + + # Note: For true y, we have + # y(x0+d, x1+d) = y(x0, x1) + 5 * d * (2/5 + x0 + x1) + 5 * d**2 + # y(x0+d, x1) = y(x0, x1) + 5 * d * (1/5 + x1) + # y(x0, x1+d) = y(x0, x1) + 5 * d * (1/5 + x0) + assert_allclose( + est_no_interactions.predict(X_delta_0_1) + + est_no_interactions.predict(X_test) + - est_no_interactions.predict(X_delta_0) + - est_no_interactions.predict(X_delta_1), + 0, + atol=1e-12, + ) + + # Correct result of the expressions is 5 * delta**2. But this is hard to achieve by + # a fitted tree-based model. The expression should, however, at least be positive! + assert np.all( + est.predict(X_delta_0_1) + + est.predict(X_test) + - est.predict(X_delta_0) + - est.predict(X_delta_1) + > 0.01 + ) + + # TODO: Remove in v1.2 @pytest.mark.parametrize( "old_loss, new_loss", From c8a3a3047542812c3c85b8132c10adadc2e75f85 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Oct 2021 15:28:14 +0200 Subject: [PATCH 32/61] EXA add interaction constraints to partial dependence --- .../inspection/plot_partial_dependence.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index 4d22077fdbc09..5e157c6d105c1 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -252,6 +252,44 @@ # house age, whereas for values less than two there is a strong dependence on # age. # +# The histogram gradient boosters do have the interesting option to constrain +# possible interactions among features. We do so by not allowing any +# interaction and thus render the model as a version of a tree-based boosted +# generalized additive model (GAM). 
This makes the model more interpretable +# as the effect of each feature can be investaged independently of all others: + +print("Training interaction constraint HistGradientBoostingRegressor...") +tic = time() +est_no_interactions = HistGradientBoostingRegressor( + interaction_cst=list(zip(range(X_train.shape[1]))) +) +est_no_interactions.fit(X_train, y_train) +print(f"done in {time() - tic:.3f}s") +print("Computing partial dependence plots...") +tic = time() +_, ax = plt.subplots(ncols=3, figsize=(9, 4)) +display = PartialDependenceDisplay.from_estimator( + est_no_interactions, + X_train, + features, + kind="average", + n_jobs=3, + grid_resolution=20, + ax=ax, +) +print(f"done in {time() - tic:.3f}s") +display.figure_.suptitle( + "Partial dependence of house value with Gradient Boosting\n" + "and not interactions allowed" +) +display.figure_.subplots_adjust(wspace=0.4, hspace=0.3) + +# %% +# In the contour plot, we clearly see that the is hardly any interaction left. +# The remaing one might be a result of numerically precision of partial +# dependence. We also see that the univariate dependence plots are slightly +# different as the model tries to compensate the forbidden interactions. +# # 3D interaction plots # -------------------- # From 3b6703aa7994fc5191f4e7bce39966615f54ff16 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Oct 2021 16:11:12 +0200 Subject: [PATCH 33/61] CLN colon in example --- examples/inspection/plot_partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index 5e157c6d105c1..d605d1b45e776 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -256,7 +256,7 @@ # possible interactions among features. 
We do so by not allowing any # interaction and thus render the model as a version of a tree-based boosted # generalized additive model (GAM). This makes the model more interpretable -# as the effect of each feature can be investaged independently of all others: +# as the effect of each feature can be investaged independently of all others. print("Training interaction constraint HistGradientBoostingRegressor...") tic = time() From 255646a57d2f05e7cd4a5b74f76b9ff31a3cd42f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Oct 2021 20:35:37 +0200 Subject: [PATCH 34/61] CLN fix whatsnew --- doc/whats_new/v1.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 49c5335f9c219..5ff8c1319303e 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -64,6 +64,7 @@ Changelog :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor`. :pr:`21130` :user:`Christian Lorentzen `. + :mod:`sklearn.decomposition` ............................ 
From 7100600d5a02f1a3f269550ae15c5cf5678bc88f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 25 Oct 2021 16:39:23 +0200 Subject: [PATCH 35/61] TST better error messages --- .../_hist_gradient_boosting/gradient_boosting.py | 15 ++++++++++----- .../tests/test_gradient_boosting.py | 6 +++--- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 4af5f81c0d898..8e4b0efd6241e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -216,16 +216,21 @@ def _check_interaction_cst(self, n_features): and all(isinstance(x, Iterable) for x in self.interaction_cst) ): raise ValueError( - "Interaction constraints must be None or an iterable of iterables" + "Interaction constraints must be None or an iterable of iterables, " + f"got: {self.interaction_cst!r}." ) - if not all( - (x == int(x) and 0 <= x and x < n_features) + + invalid_indices = [ + x for cst_set in self.interaction_cst for x in cst_set - ): + if not (x == int(x) and 0 <= x and x < n_features) + ] + if invalid_indices: raise ValueError( "Interaction constraints must consist of integer indices in [0," - " n_features - 1], specifying the position of features." 
+ f" n_features - 1] = [0, {n_features - 1}], specifying the position of" + f" features, got invalid indices: {invalid_indices!r}" ) constraints = [set([int(x) for x in group]) for group in self.interaction_cst] diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index bb6c7374cc921..367f1c4e475d3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -77,17 +77,17 @@ def _make_dumb_dataset(n_samples): ( {"interaction_cst": [{0, 9999}]}, r"Interaction constraints must consist of integer indices in \[0," - r" n_features - 1\], specifying the position of features.", + r" n_features - 1\] = \[.*\], specifying the position of features,", ), ( {"interaction_cst": [{-1, 0}]}, r"Interaction constraints must consist of integer indices in \[0," - r" n_features - 1\], specifying the position of features.", + r" n_features - 1\] = \[.*\], specifying the position of features,", ), ( {"interaction_cst": [{0.5}]}, r"Interaction constraints must consist of integer indices in \[0," - r" n_features - 1\], specifying the position of features.", + r" n_features - 1\] = \[.*\], specifying the position of features,", ), ], ) From 31c6c3eb78b4d149b79ead4ec760c8d31b3febfb Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 25 Oct 2021 18:58:50 +0200 Subject: [PATCH 36/61] EXA add 1D ice plots to see parallel lines --- .../inspection/plot_partial_dependence.py | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index d605d1b45e776..93604bea82ef8 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -252,11 +252,18 @@ # house age, whereas for values less than two there is 
a strong dependence on # age. # +# Interaction constraints +# ....................... +# # The histogram gradient boosters do have the interesting option to constrain # possible interactions among features. We do so by not allowing any # interaction and thus render the model as a version of a tree-based boosted # generalized additive model (GAM). This makes the model more interpretable # as the effect of each feature can be investaged independently of all others. +# +# We first train the :class:`~sklearn.ensemble.HistGradientBoostingRegressor` +# with `interaction_cst`, where we pass each feature index in its own +# list, e.g. `[[0], [1], [2], ..]`. print("Training interaction constraint HistGradientBoostingRegressor...") tic = time() @@ -265,13 +272,45 @@ ) est_no_interactions.fit(X_train, y_train) print(f"done in {time() - tic:.3f}s") + +# %% +# The easiest way to show the effect of forbidden interactions is again the +# ICE plots. + +print("Computing partial dependence plots...") +tic = time() +display = PartialDependenceDisplay.from_estimator( + est_no_interactions, + X_train, + ["MedInc", "AveOccup", "HouseAge", "AveRooms"], + kind="both", + subsample=50, + n_jobs=3, + grid_resolution=20, + random_state=0, + ice_lines_kw={"color": "tab:blue", "alpha": 0.2, "linewidth": 0.5}, + pd_line_kw={"color": "tab:orange", "linestyle": "--"}, +) +display.figure_.suptitle( + "Partial dependence of house value with Gradient Boosting\n" + "and no interactions allowed" +) +display.figure_.subplots_adjust(wspace=0.4, hspace=0.3) + +# %% +# All 4 plots show parallel lines meaning there is no interaction in the model. +# (Note that to see the same with a +# :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, we would need to +# plot in the link space, i.e. "logit(predict_proba(X))"). +# Let us also have a look at the corresponding 2D-plot. 
+ print("Computing partial dependence plots...") tic = time() _, ax = plt.subplots(ncols=3, figsize=(9, 4)) display = PartialDependenceDisplay.from_estimator( est_no_interactions, X_train, - features, + ["AveOccup", "HouseAge", ("AveOccup", "HouseAge")], kind="average", n_jobs=3, grid_resolution=20, @@ -280,15 +319,17 @@ print(f"done in {time() - tic:.3f}s") display.figure_.suptitle( "Partial dependence of house value with Gradient Boosting\n" - "and not interactions allowed" + "and no interactions allowed" ) display.figure_.subplots_adjust(wspace=0.4, hspace=0.3) # %% -# In the contour plot, we clearly see that the is hardly any interaction left. -# The remaing one might be a result of numerically precision of partial -# dependence. We also see that the univariate dependence plots are slightly -# different as the model tries to compensate the forbidden interactions. +# Although the 2D-plot shows much less interaction compared with the 2D-plot +# from above, it is much harder to come to the conclusion that there is no +# interaction at all. This might be a cause of the discrete predictions of +# trees in combination with numerically precision of partial dependence. +# We also observe that the univariate dependence plots have slightly changed +# as the model tries to compensate for the forbidden interactions. 
# # 3D interaction plots # -------------------- From bd62aeab46325ea018c749c6216caf55979fe66b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 25 Oct 2021 19:08:55 +0200 Subject: [PATCH 37/61] TST rely more on default values --- sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index a5f2ac78e1969..70efa32a6db31 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -601,10 +601,7 @@ def get_all_children(node): X_binned, gradients, hessians, - max_depth=5, n_bins=n_bins, - shrinkage=1.0, - max_leaf_nodes=None, min_samples_leaf=1, interaction_cst=interaction_cst, n_threads=n_threads, From eed05ac01189ce2311dce1462c0ef89f33a2c427 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Nov 2021 22:29:01 +0100 Subject: [PATCH 38/61] DOC add blank lines in whats_new --- doc/whats_new/v1.1.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index dfaab57434525..990fbe86499bf 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -82,10 +82,12 @@ Changelog :class:`~sklearn.ensemble.HistGradientBoostingRegressor` via the new argument `interaction_cst`. :pr:`21020` by :user:`Christian Lorentzen `. + - |Fix| Fixed a bug that could produce a segfault in rare cases for :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor`. :pr:`21130` :user:`Christian Lorentzen `. + :mod:`sklearn.datasets` ....................... 
From d66f40af84a5cb524c8291f5d1dff485646bc3fc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Nov 2021 22:31:38 +0100 Subject: [PATCH 39/61] DOC remove 1.0.1 entry in whats_new 1.1 --- doc/whats_new/v1.1.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 990fbe86499bf..44ebc8bba76d8 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -83,11 +83,6 @@ Changelog argument `interaction_cst`. :pr:`21020` by :user:`Christian Lorentzen `. -- |Fix| Fixed a bug that could produce a segfault in rare cases for - :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor`. - :pr:`21130` :user:`Christian Lorentzen `. - :mod:`sklearn.datasets` ....................... From ee86a779bc00c1d0f4b91be7f7f1899ab4d1248c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 28 Jun 2022 20:16:51 +0200 Subject: [PATCH 40/61] CLN fix merge with parameter validation --- .../_hist_gradient_boosting/gradient_boosting.py | 1 + .../tests/test_gradient_boosting.py | 15 --------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index ece5032686f00..f73db55fdccc2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -97,6 +97,7 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): "min_samples_leaf": [Interval(Integral, 1, None, closed="left")], "l2_regularization": [Interval(Real, 0, None, closed="left")], "monotonic_cst": ["array-like", None], + "interaction_cst": [Iterable, None], "n_iter_no_change": [Interval(Integral, 1, None, closed="left")], "validation_fraction": [ Interval(Real, 0, 1, closed="neither"), diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py 
b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 9682a5676ed02..28d012e8542e0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -57,21 +57,6 @@ def _make_dumb_dataset(n_samples): @pytest.mark.parametrize( "params, err_msg", [ - ({"loss": "blah"}, "Loss blah is not supported for"), - ({"learning_rate": 0}, "learning_rate=0 must be strictly positive"), - ({"learning_rate": -1}, "learning_rate=-1 must be strictly positive"), - ({"max_iter": 0}, "max_iter=0 must not be smaller than 1"), - ({"max_leaf_nodes": 0}, "max_leaf_nodes=0 should not be smaller than 2"), - ({"max_leaf_nodes": 1}, "max_leaf_nodes=1 should not be smaller than 2"), - ({"max_depth": 0}, "max_depth=0 should not be smaller than 1"), - ({"min_samples_leaf": 0}, "min_samples_leaf=0 should not be smaller"), - ({"l2_regularization": -1}, "l2_regularization=-1 must be positive"), - ({"max_bins": 1}, "max_bins=1 should be no smaller than 2 and no larger"), - ({"max_bins": 256}, "max_bins=256 should be no smaller than 2 and no"), - ({"n_iter_no_change": -1}, "n_iter_no_change=-1 must be positive"), - ({"validation_fraction": -1}, "validation_fraction=-1 must be strictly"), - ({"validation_fraction": 0}, "validation_fraction=0 must be strictly"), - ({"tol": -1}, "tol=-1 must not be smaller than 0"), ( {"interaction_cst": "string"}, "", From 13b0aaf451ac09cc40743c61a1c21e3fd2f5ae0a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 29 Jun 2022 10:55:56 +0200 Subject: [PATCH 41/61] DOC move whatsnew to v1.2 --- doc/whats_new/v1.1.rst | 6 ------ doc/whats_new/v1.2.rst | 8 +++++++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 717828fa471d9..4c46c0d631f76 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -626,12 +626,6 @@ Changelog - |Efficiency| Improve runtime 
performance of :class:`ensemble.IsolationForest` by skipping repetitive input checks. :pr:`23149` by :user:`Zhehao Liu `. -- |Feature| Added interaction constraints for - :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and - :class:`~sklearn.ensemble.HistGradientBoostingRegressor` via the new - argument `interaction_cst`. - :pr:`21020` by :user:`Christian Lorentzen `. - :mod:`sklearn.feature_extraction` ................................. diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 9c4a70eb20f1b..4c22cf16d8f15 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -123,6 +123,12 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |Feature| Added interaction constraints for + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` via the new + argument `interaction_cst`. + :pr:`21020` by :user:`Christian Lorentzen `. + - |Efficiency| Improve runtime performance of :class:`ensemble.IsolationForest` by avoiding data copies. :pr:`23252` by :user:`Zhehao Liu `. @@ -168,7 +174,7 @@ Changelog negative likelihood ratios derived from the confusion matrix of a binary classification problem. :pr:`22518` by :user:`Arturo Amor `. - + - |Fix| :func:`metrics.ndcg_score` will now trigger a warning when the `y_true` value contains a negative value. Users may still use negative values, but the result may not be between 0 and 1. 
Starting in v1.4, passing in negative From 1c7563094e36c68bf2b193d9c6884fd0987e4cdf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 16 Aug 2022 22:57:15 +0200 Subject: [PATCH 42/61] CLN move missing docstring additions to other PR --- .../_hist_gradient_boosting/grower.py | 22 ------------------- .../_hist_gradient_boosting/splitting.pyx | 6 ----- 2 files changed, 28 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 5f1ecb850c33e..152253e93ca8f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -195,28 +195,6 @@ class TreeGrower: to determine the effective number of threads use, which takes cgroups CPU quotes into account. See the docstring of `_openmp_effective_n_threads` for details. - - Attributes - ---------- - histogram_builder : HistogramBuilder - splitter : Splitter - root : TreeNode - finalized_leaves : list of TreeNode - splittable_nodes : list of TreeNode - missing_values_bin_idx : int - equals n_bins - 1 - n_categorical_splits : int - n_features : int - n_nodes : int - total_find_split_time : float - time spent finding the best splits - total_compute_hist_time : float - time spent computing histograms - total_apply_split_time : float - time spent splitting nodes - with_monotonic_cst : bool - Whether there are monotonic constraints that apply. False iff monotonic_cst - is None. """ def __init__( diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index b97c0d29aa6aa..9164bc89e2faa 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -136,10 +136,6 @@ cdef class Splitter: feature. is_categorical : ndarray of bool of shape (n_features,) Indicates categorical features. 
- monotonic_cst : ndarray of shape (n_features,), dtype=int - Indicates the monotonic constraint to enforce on each feature. -1, 1 - and 0 respectively correspond to a positive constraint, negative - constraint and no constraint. l2_regularization : float The L2 regularization parameter. min_hessian_to_split : float, default=1e-3 @@ -153,8 +149,6 @@ cdef class Splitter: be ignored. hessians_are_constant: bool, default is False Whether hessians are constant. - n_threads : int, default=1 - Number of OpenMP threads to use. """ cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned From b9d880bf105f454b1a17789b39e6df3d3dcf802e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 17 Aug 2022 15:26:07 +0200 Subject: [PATCH 43/61] DOC add user guide entry --- doc/modules/ensemble.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 7d64a0e91181c..cd5ef2fa3e93d 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1169,6 +1169,30 @@ supported for multiclass context. * :ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py` +.. _interaction_cst_hgbt: + +Interaction constraints +----------------------- + +A priori, the histogram gradient boosting trees are allowed to use any feature +to split a node into child nodes. This creates so called interactions between +features. Sometimes, one wants to restrict the possible interactions. This can +be done by the parameter ``interaction_cst``, where one can specify the +indices of features that are allowed to interact +For instance, with 3 features in total, ``interaction_cst=[{0}, {1}, {2}]`` +forbids all interactions. +The constraints ``[{0, 1}, {1, 2}]`` specifies two groups of possibly +interacting features. The first and second features may interact, as well as +the second and third features. But note that the first and third are forbidden +to interact. 
The following depicts a tree and the the possible +splits of the tree:: + + 1 <- Both constraint groups could be applied from now on + / \ + 1 2 <- Left split still fulfills both constraint groups. + / \ / \ Right split at feature 2 has only group {1, 2} from now on. + + Low-level parallelism --------------------- From 10023c00a5a5c17a3a1bd04bf1d3edd11b72e4d7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 17 Aug 2022 15:33:49 +0200 Subject: [PATCH 44/61] MNT change versionadded to 1.2 --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index f73db55fdccc2..cfe11060b18b6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1251,7 +1251,7 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): and specifies that a single branch of a tree will either only split on features 0 and 1 or only split on features 2, 3 and 4. - .. versionadded:: 1.1 + .. versionadded:: 1.2 warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit @@ -1608,7 +1608,7 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): and specifies that a single branch of a tree will either only split on features 0 and 1 or only split on features 2, 3 and 4. - .. versionadded:: 1.1 + .. 
versionadded:: 1.2 warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit From 4265d238e34c9609eaa33914850547b2fe0dbe46 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 17 Aug 2022 16:02:15 +0200 Subject: [PATCH 45/61] DOC use code-block:: text --- doc/modules/ensemble.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index cd5ef2fa3e93d..05d8a082961fb 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1185,7 +1185,9 @@ The constraints ``[{0, 1}, {1, 2}]`` specifies two groups of possibly interacting features. The first and second features may interact, as well as the second and third features. But note that the first and third are forbidden to interact. The following depicts a tree and the the possible -splits of the tree:: +splits of the tree: + +.. code-block:: text 1 <- Both constraint groups could be applied from now on / \ From a7559b19257b2ef2d16beb75651fb5340c06b758 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 17 Aug 2022 16:07:04 +0200 Subject: [PATCH 46/61] DOC allowed_features has dtype uint32 --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 152253e93ca8f..bcb73c42e1ed1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -608,7 +608,7 @@ def _compute_interactions(self, node): Returns ------- - allowed_features : None or ndarray, dtype=int + allowed_features : None or ndarray, dtype=uint32 Indices of features allowed to split for children. 
interaction_cst_indices : None or list of ints Indices of the interaction sets that have to be applied on splits of From 3bacb790402d9f3d2d24c23645137ba9bcd13a4d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 22 Aug 2022 20:39:22 +0200 Subject: [PATCH 47/61] DOC remove None from interaction_cst_indices --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index bcb73c42e1ed1..f9bfb18c7a0b7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -608,9 +608,9 @@ def _compute_interactions(self, node): Returns ------- - allowed_features : None or ndarray, dtype=uint32 + allowed_features : ndarray, dtype=uint32 Indices of features allowed to split for children. - interaction_cst_indices : None or list of ints + interaction_cst_indices : list of ints Indices of the interaction sets that have to be applied on splits of child nodes. The fewer sets the stronger the constraint as fewer sets contain fewer features. From 6653a4e228e0bed5466bf242c9d5e8aa4392f980 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Aug 2022 18:12:32 +0200 Subject: [PATCH 48/61] DOC fix typo --- doc/modules/ensemble.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index d099963d7ded4..61e356f05e72e 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1184,8 +1184,7 @@ forbids all interactions. The constraints ``[{0, 1}, {1, 2}]`` specifies two groups of possibly interacting features. The first and second features may interact, as well as the second and third features. But note that the first and third are forbidden -to interact. The following depicts a tree and the the possible -splits of the tree: +to interact. 
The following depicts a tree and the possible splits of the tree: .. code-block:: text From 8d02553d7ec7b5b10ed32098accf3d93929815a6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Aug 2022 18:20:17 +0200 Subject: [PATCH 49/61] DOC try none to switch off language highlighting --- doc/modules/ensemble.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 61e356f05e72e..0d6fabca29d98 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1186,7 +1186,7 @@ interacting features. The first and second features may interact, as well as the second and third features. But note that the first and third are forbidden to interact. The following depicts a tree and the possible splits of the tree: -.. code-block:: text +.. code-block:: none 1 <- Both constraint groups could be applied from now on / \ From 4989a26b9d86fab40b343c4575174eafec2a4aba Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Aug 2022 18:54:35 +0200 Subject: [PATCH 50/61] DOC address features by numbers --- doc/modules/ensemble.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 0d6fabca29d98..0e01baf03a59a 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1182,9 +1182,9 @@ indices of features that are allowed to interact For instance, with 3 features in total, ``interaction_cst=[{0}, {1}, {2}]`` forbids all interactions. The constraints ``[{0, 1}, {1, 2}]`` specifies two groups of possibly -interacting features. The first and second features may interact, as well as -the second and third features. But note that the first and third are forbidden -to interact. The following depicts a tree and the possible splits of the tree: +interacting features. Features 0 and 1 may interact with each other, as well +as features 1 and 2. But note that features 0 and 2 are forbidden to interact.
+The following depicts a tree and the possible splits of the tree: .. code-block:: none From 61b1e068a96f542f2bf934587f7c536c67a48b68 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 6 Sep 2022 20:14:50 +0200 Subject: [PATCH 51/61] address reviewer comments --- examples/inspection/plot_partial_dependence.py | 2 +- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/grower.py | 7 ++++--- .../tests/test_gradient_boosting.py | 1 - 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index ea9f842e8c1bc..937f456eafbbb 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -271,7 +271,7 @@ print("Training interaction constraint HistGradientBoostingRegressor...") tic = time() est_no_interactions = HistGradientBoostingRegressor( - interaction_cst=list(zip(range(X_train.shape[1]))) + interaction_cst=[[i] for i in range(X_train.shape[1])] ) est_no_interactions.fit(X_train, y_train) print(f"done in {time() - tic:.3f}s") diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 9a7fa3dff8e57..7a184826d0970 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -271,7 +271,7 @@ def _check_interaction_cst(self, n_features): x for cst_set in self.interaction_cst for x in cst_set - if not (x == int(x) and 0 <= x and x < n_features) + if not (isinstance(x, int) and 0 <= x and x < n_features) ] if invalid_indices: raise ValueError( @@ -280,7 +280,7 @@ def _check_interaction_cst(self, n_features): f" features, got invalid indices: {invalid_indices!r}" ) - constraints = [set([int(x) for x in group]) for group in self.interaction_cst] + constraints = [set(group) for group in 
self.interaction_cst] # Add all not listed features as own group by default. rest = set(range(n_features)).difference(set().union(*constraints)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 105c78ea184ae..1b4e5426a8408 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -180,7 +180,8 @@ class TreeGrower: and 0 respectively correspond to a positive constraint, negative constraint and no constraint. Read more in the :ref:`User Guide `. - interaction_cst : list of sets of integers + interaction_cst : list of sets of integers, default=None + List of interaction constraints. l2_regularization : float, default=0. The L2 regularization parameter. min_hessian_to_split : float, default=1e-3 @@ -417,7 +418,7 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): ) if self.interaction_cst is not None: - self.root.interaction_cst_indices = list(range(len(self.interaction_cst))) + self.root.interaction_cst_indices = range(len(self.interaction_cst)) allowed_features = set().union(*self.interaction_cst) self.root.allowed_features = np.array( list(allowed_features), dtype=np.uint32 @@ -642,7 +643,7 @@ def _compute_interactions(self, node): # - This is for nodes that are already split and have a # node.split_info.feature_idx. 
allowed_features = set() - interaction_cst_indices = list() + interaction_cst_indices = [] for i in node.interaction_cst_indices: if node.split_info.feature_idx in self.interaction_cst[i]: interaction_cst_indices.append(i) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 28d012e8542e0..d70c86d4bd1c0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1184,7 +1184,6 @@ def test_interaction_cst_numerically(): ) -# TODO: Remove in v1.2 @pytest.mark.parametrize( "old_loss, new_loss, Estimator", [ From 38caedbade49b0c0615023673a6ede48fe9e442f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 6 Sep 2022 20:43:52 +0200 Subject: [PATCH 52/61] DOC add note about LightGBM logic --- doc/modules/ensemble.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 0e01baf03a59a..8c11ec6f8c2b6 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1176,9 +1176,10 @@ Interaction constraints A priori, the histogram gradient boosting trees are allowed to use any feature to split a node into child nodes. This creates so called interactions between -features. Sometimes, one wants to restrict the possible interactions. This can -be done by the parameter ``interaction_cst``, where one can specify the -indices of features that are allowed to interact +features, i.e. usage of different features as split along a brang. Sometimes, +one wants to restrict the possible interactions. This can be done by the +parameter ``interaction_cst``, where one can specify the indices of features +that are allowed to interact. For instance, with 3 features in total, ``interaction_cst=[{0}, {1}, {2}]`` forbids all interactions. 
The constraints ``[{0, 1}, {1, 2}]`` specifies two groups of possibly @@ -1193,6 +1194,11 @@ The following depicts a tree and the possible splits of the tree: 1 2 <- Left split still fulfills both constraint groups. / \ / \ Right split at feature 2 has only group {1, 2} from now on. +LightGBM uses the same logic for overlapping groups. + +Note that features not listed in ``interaction_cst`` are automatically +assigned an interaction group for themselves. With again 3 features, this +means that ``[{0}]`` is equivalent to ``[{0}, {1, 2}]``. Low-level parallelism --------------------- From 966793713bd901ac63a6d1b0d14b963d25066074 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 7 Sep 2022 16:03:21 +0200 Subject: [PATCH 53/61] DOC fix typo --- doc/modules/ensemble.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 8c11ec6f8c2b6..45903ea1a674f 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1176,7 +1176,7 @@ Interaction constraints A priori, the histogram gradient boosting trees are allowed to use any feature to split a node into child nodes. This creates so called interactions between -features, i.e. usage of different features as split along a brang. Sometimes, +features, i.e. usage of different features as split along a branch. Sometimes, one wants to restrict the possible interactions. This can be done by the parameter ``interaction_cst``, where one can specify the indices of features that are allowed to interact. 
From 9fb3e55f40479830c92053b2608e964e3b945be4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 26 Sep 2022 23:42:24 +0200 Subject: [PATCH 54/61] CLN better comment on test construction --- .../tests/test_gradient_boosting.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 40137ededfe57..8548a56347b81 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1161,10 +1161,12 @@ def test_interaction_cst_numerically(): X_delta_1 = X_test + [0, delta] X_delta_0_1 = X_test + [delta, delta] - # Note: For true y, we have + # Note: For the y from above as a function of x0 and x1, we have # y(x0+d, x1+d) = y(x0, x1) + 5 * d * (2/5 + x0 + x1) + 5 * d**2 - # y(x0+d, x1) = y(x0, x1) + 5 * d * (1/5 + x1) - # y(x0, x1+d) = y(x0, x1) + 5 * d * (1/5 + x0) + # y(x0+d, x1) = y(x0, x1) + 5 * d * (1/5 + x1) + # y(x0, x1+d) = y(x0, x1) + 5 * d * (1/5 + x0) + # Without interaction constraints, we would expect a result of 5 * d**2 for the + # following expression, but zero with constraints in place. 
assert_allclose( est_no_interactions.predict(X_delta_0_1) + est_no_interactions.predict(X_test) From 5240d9fc682e823586da349347bcfc32a197c398 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 Oct 2022 22:40:43 +0200 Subject: [PATCH 55/61] EXA review comments --- examples/inspection/plot_partial_dependence.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index 937f456eafbbb..6ccfab8ec3454 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -258,9 +258,9 @@ # Interaction constraints # ....................... # -# The histogram gradient boosters do have the interesting option to constrain -# possible interactions among features. We do so by not allowing any -# interaction and thus render the model as a version of a tree-based boosted +# The histogram gradient boosters have an interesting option to constrain +# possible interactions among features. In the following, we do not allow any +# interactions and thus render the model as a version of a tree-based boosted # generalized additive model (GAM). This makes the model more interpretable # as the effect of each feature can be investaged independently of all others. # @@ -294,6 +294,8 @@ ice_lines_kw={"color": "tab:blue", "alpha": 0.2, "linewidth": 0.5}, pd_line_kw={"color": "tab:orange", "linestyle": "--"}, ) + +print(f"done in {time() - tic:.3f}s") display.figure_.suptitle( "Partial dependence of house value with Gradient Boosting\n" "and no interactions allowed" @@ -301,10 +303,8 @@ display.figure_.subplots_adjust(wspace=0.4, hspace=0.3) # %% -# All 4 plots show parallel lines meaning there is no interaction in the model. -# (Note that to see the same with a -# :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, we would need to -# plot in the link space, i.e. "logit(predict_proba(X))"). 
+# All 4 plots have parallel ICE lines meaning there is no interaction in the +# model. # Let us also have a look at the corresponding 2D-plot. print("Computing partial dependence plots...") From 295aeee2134a6116dbef4d3cca1c5438be57b910 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 Oct 2022 23:26:31 +0200 Subject: [PATCH 56/61] ENH improvements from Thomas review comments --- .../gradient_boosting.py | 2 +- .../_hist_gradient_boosting/grower.py | 9 ++++-- .../_hist_gradient_boosting/splitting.pyx | 8 ++++-- .../tests/test_gradient_boosting.py | 28 ++++++++++--------- 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 2066211366aed..9e098c571a113 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -286,7 +286,7 @@ def _check_interaction_cst(self, n_features): constraints = [set(group) for group in self.interaction_cst] # Add all not listed features as own group by default. 
- rest = set(range(n_features)).difference(set().union(*constraints)) + rest = set(range(n_features)) - set().union(*constraints) if len(rest) > 0: constraints.append(rest) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 81ee447233205..83293010ad857 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -422,8 +422,8 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): if self.interaction_cst is not None: self.root.interaction_cst_indices = range(len(self.interaction_cst)) allowed_features = set().union(*self.interaction_cst) - self.root.allowed_features = np.array( - list(allowed_features), dtype=np.uint32 + self.root.allowed_features = np.fromiter( + allowed_features, dtype=np.uint32, count=len(allowed_features) ) self._compute_best_split_and_push(self.root) @@ -625,6 +625,9 @@ def _compute_interactions(self, node): 1 2 <- Left split still fulfills both constraint groups. / \ / \ Right split at feature 2 has only group {1, 2} from now on. + LightGBM uses the same logic for overlapping groups. See + https://github.com/microsoft/LightGBM/issues/4481 for details. 
+ Parameters: ---------- node : TreeNode @@ -651,7 +654,7 @@ def _compute_interactions(self, node): interaction_cst_indices.append(i) allowed_features.update(self.interaction_cst[i]) return ( - np.array(list(allowed_features), dtype=np.uint32), + np.fromiter(allowed_features, dtype=np.uint32, count=len(allowed_features)), interaction_cst_indices, ) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index f53e276e7e303..23d0e0a798c72 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -467,7 +467,7 @@ cdef class Splitter: int feature_idx int split_info_idx int best_split_info_idx - int n_allowed_features = self.n_features + int n_allowed_features split_info_struct split_info split_info_struct * split_infos const unsigned char [::1] has_missing_values = self.has_missing_values @@ -476,9 +476,11 @@ cdef class Splitter: int n_threads = self.n_threads bint has_interaction_cst = False - if allowed_features is not None: - has_interaction_cst = True + has_interaction_cst = allowed_features is not None + if has_interaction_cst: n_allowed_features = allowed_features.shape[0] + else: + n_allowed_features = self.n_features with nogil: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 8548a56347b81..561e47e388bb7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1123,7 +1123,6 @@ def test_uint8_predict(Est): est.predict(X) -# TODO(1.3): Remove @pytest.mark.parametrize( "interaction_cst, n_features, result", [ @@ -1146,20 +1145,22 @@ def test_interaction_cst_numerically(): X = rng.uniform(size=(n_samples, 2)) # Construct y with a strong interaction term # y = x0 + x1 + 5 * x0 * x1 - y = np.c_[X, 
5 * X[:, 0] * X[:, 1]].sum(axis=1) + y = np.hstack((X, 5 * X[:, [0]] * X[:, [1]])).sum(axis=1) - est = HistGradientBoostingRegressor() + est = HistGradientBoostingRegressor(random_state=42) est.fit(X, y) - est_no_interactions = HistGradientBoostingRegressor(interaction_cst=[{0}, {1}]) + est_no_interactions = HistGradientBoostingRegressor( + interaction_cst=[{0}, {1}], random_state=42 + ) est_no_interactions.fit(X, y) delta = 0.25 # Make sure we do not extrapolate out of the training set as tree-based estimators # are very bad in doing so. X_test = X[(X[:, 0] < 1 - delta) & (X[:, 1] < 1 - delta)] - X_delta_0 = X_test + [delta, 0] - X_delta_1 = X_test + [0, delta] - X_delta_0_1 = X_test + [delta, delta] + X_delta_d_0 = X_test + [delta, 0] + X_delta_0_d = X_test + [0, delta] + X_delta_d_d = X_test + [delta, delta] # Note: For the y from above as a function of x0 and x1, we have # y(x0+d, x1+d) = y(x0, x1) + 5 * d * (2/5 + x0 + x1) + 5 * d**2 @@ -1168,10 +1169,10 @@ def test_interaction_cst_numerically(): # Without interaction constraints, we would expect a result of 5 * d**2 for the # following expression, but zero with constraints in place. assert_allclose( - est_no_interactions.predict(X_delta_0_1) + est_no_interactions.predict(X_delta_d_d) + est_no_interactions.predict(X_test) - - est_no_interactions.predict(X_delta_0) - - est_no_interactions.predict(X_delta_1), + - est_no_interactions.predict(X_delta_d_0) + - est_no_interactions.predict(X_delta_0_d), 0, atol=1e-12, ) @@ -1179,14 +1180,15 @@ def test_interaction_cst_numerically(): # Correct result of the expressions is 5 * delta**2. But this is hard to achieve by # a fitted tree-based model. The expression should, however, at least be positive! 
assert np.all( - est.predict(X_delta_0_1) + est.predict(X_delta_d_d) + est.predict(X_test) - - est.predict(X_delta_0) - - est.predict(X_delta_1) + - est.predict(X_delta_d_0) + - est.predict(X_delta_0_d) > 0.01 ) +# TODO(1.3): Remove @pytest.mark.parametrize( "old_loss, new_loss, Estimator", [ From 9560ea77f707eca3b6324685b66b000d7b6349f9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 Oct 2022 20:45:35 +0200 Subject: [PATCH 57/61] CLN Julien's review comments --- doc/whats_new/v1.2.rst | 8 ++++---- examples/inspection/plot_partial_dependence.py | 8 ++++---- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 6 +++--- sklearn/ensemble/_hist_gradient_boosting/grower.py | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 3 ++- .../tests/test_gradient_boosting.py | 3 ++- .../_hist_gradient_boosting/tests/test_splitting.py | 8 +++++--- 7 files changed, 22 insertions(+), 18 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 76dca9daa5fe7..0de25ff7f45e6 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -242,10 +242,10 @@ Changelog :mod:`sklearn.ensemble` ....................... -- |Feature| Added interaction constraints for - :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and - :class:`~sklearn.ensemble.HistGradientBoostingRegressor` via the new - argument `interaction_cst`. +- |Feature| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` now support + interaction constraints via the argument `interaction_cst` of their + constructors. :pr:`21020` by :user:`Christian Lorentzen `. - |Feature| Adds `class_weight` to :class:`ensemble.HistGradientBoostingClassifier`. 
diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index 6ccfab8ec3454..a7ef29edef183 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -262,11 +262,11 @@ # possible interactions among features. In the following, we do not allow any # interactions and thus render the model as a version of a tree-based boosted # generalized additive model (GAM). This makes the model more interpretable -# as the effect of each feature can be investaged independently of all others. +# as the effect of each feature can be investigated independently of all others. # -# We first train the :class:`~sklearn.ensemble.HistGradientBoostingRegressor` -# with `interaction_cst`, where we pass each feature index in its own -# list, e.g. `[[0], [1], [2], ..]`. +# We train the :class:`~sklearn.ensemble.HistGradientBoostingRegressor` again, +# now with `interaction_cst`, where we pass for each feature a list containing +# only its own index, e.g. `[[0], [1], [2], ..]`. 
print("Training interaction constraint HistGradientBoostingRegressor...") tic = time() diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 9e098c571a113..c5fe46496a577 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -274,7 +274,7 @@ def _check_interaction_cst(self, n_features): x for cst_set in self.interaction_cst for x in cst_set - if not (isinstance(x, int) and 0 <= x and x < n_features) + if not (isinstance(x, Integral) and 0 <= x < n_features) ] if invalid_indices: raise ValueError( @@ -1246,7 +1246,7 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): For instance, with 5 features in total, `interaction_cst=[{0, 1}]` is equivalent to `interaction_cst=[{0, 1}, {2, 3, 4}]`, - and specifies that a single branch of a tree will either only split + and specifies that each branch of a tree will either only split on features 0 and 1 or only split on features 2, 3 and 4. .. versionadded:: 1.2 @@ -1578,7 +1578,7 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): For instance, with 5 features in total, `interaction_cst=[{0, 1}]` is equivalent to `interaction_cst=[{0, 1}, {2, 3, 4}]`, - and specifies that a single branch of a tree will either only split + and specifies that each branch of a tree will either only split on features 0 and 1 or only split on features 2, 3 and 4. .. 
versionadded:: 1.2 diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 83293010ad857..5e3010fa4a509 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -504,8 +504,8 @@ def split_next(self): # set interaction constraints (the indices of the constraints sets) if self.interaction_cst is not None: - # Calculate allowed_features and interaction_cst_indices only once and - # inherit them by child nodes. + # Calculate allowed_features and interaction_cst_indices only once. Child + # nodes inherit them before they get split. ( left_child_node.allowed_features, left_child_node.interaction_cst_indices, diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 23d0e0a798c72..f6630efd28a0f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -542,7 +542,8 @@ cdef class Splitter: # then compute best possible split among all features # split_info is set to the best of split_infos best_split_info_idx = self._find_best_feature_to_split_helper( - split_infos, n_allowed_features) + split_infos, n_allowed_features + ) split_info = split_infos[best_split_info_idx] out = SplitInfo( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 561e47e388bb7..dcdd01c4f28ec 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1178,7 +1178,8 @@ def test_interaction_cst_numerically(): ) # Correct result of the expressions is 5 * delta**2. But this is hard to achieve by - # a fitted tree-based model. The expression should, however, at least be positive! + # a fitted tree-based model. 
However, with 100 iterations the expression should + # at least be positive! assert np.all( est.predict(X_delta_d_d) + est.predict(X_test) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index e3316aa64e6c4..d1da34015a2a4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -872,11 +872,13 @@ def test_split_interaction_constraints(): sample_indices = np.arange(n_samples, dtype=np.uint32) all_hessians = np.ones(1, dtype=G_H_DTYPE) - sum_hessians = 1 * n_samples + sum_hessians = n_samples hessians_are_constant = True split_features = [] + # The loop is to ensure that we split at least once on each allowed feature (0, 3). + # This is tracked by split_features and checked at the end. for i in range(10): rng = np.random.RandomState(919 + i) X_binned = np.asfortranarray( @@ -947,7 +949,7 @@ def test_split_interaction_constraints(): allowed_features=allowed_features, ) split_features.append(si_root.feature_idx) - assert si_root.feature_idx in {0, 3} + assert si_root.feature_idx in allowed_features # make sure feature 0 and feature 3 are split on in the constraint setting - assert {0, 3} == set(split_features) + assert set(allowed_features) == set(split_features) From 28c45787006fb6803fa62c61601f6ea8948c194f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 Oct 2022 20:46:01 +0200 Subject: [PATCH 58/61] TST fix test_grower_interaction_constraints --- .../tests/test_grower.py | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 38813c0c5ae7c..233d2ebd2a141 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ 
-625,11 +625,28 @@ def get_all_children(node): # Root's children's allowed_features must be the root's constraints set. assert_array_equal(node.allowed_features, list(root_constraint_set)) for node in get_all_children(grower.root): - # Nodes accessible from the root must have their index in the root's - # constraints set. For example, sets {0, 1} and {1, 2} must not interact - # with each other. - if not node.is_leaf: - assert node.split_info.feature_idx in root_constraint_set + if node.is_leaf: + continue + # Ensure that each node uses a subset of features of its parent node. + parent_interaction_cst_indices = set(node.interaction_cst_indices) + right_interactions_cst_indices = set( + node.right_child.interaction_cst_indices + ) + left_interactions_cst_indices = set(node.left_child.interaction_cst_indices) + + assert right_interactions_cst_indices.issubset( + parent_interaction_cst_indices + ) + assert left_interactions_cst_indices.issubset( + parent_interaction_cst_indices + ) + # The features used for split must have been present in the root's + # constraint set. + assert node.split_info.feature_idx in root_constraint_set # Make sure that every feature is used at least once as split for the root node. - assert len(set(root_feature_splits)) == n_features + assert ( + len(set(root_feature_splits)) + == len(set().union(*interaction_cst)) + == n_features + ) From 4d4b80aff3d4ede07e96fc6729b348e752b2d194 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 11 Oct 2022 21:09:39 +0200 Subject: [PATCH 59/61] DOC add reference Mayer 2022 - add reference Machine Learning Applications to Land and Structure Valuation - add arxiv qualifier for G. 
Louppe --- doc/modules/ensemble.rst | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index b7bf7fd63585b..0c4159165e181 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -317,9 +317,9 @@ to the prediction function. .. topic:: References - .. [L2014] G. Louppe, - "Understanding Random Forests: From Theory to Practice", - PhD Thesis, U. of Liege, 2014. + .. [L2014] G. Louppe, :arxiv:`"Understanding Random Forests: From Theory to + Practice" <1407.7502>`, + PhD Thesis, U. of Liege, 2014. .. _random_trees_embedding: @@ -1182,9 +1182,9 @@ Interaction constraints A priori, the histogram gradient boosting trees are allowed to use any feature to split a node into child nodes. This creates so called interactions between features, i.e. usage of different features as split along a branch. Sometimes, -one wants to restrict the possible interactions. This can be done by the -parameter ``interaction_cst``, where one can specify the indices of features -that are allowed to interact. +one wants to restrict the possible interactions, see [Mayer2022]_. This can be +done by the parameter ``interaction_cst``, where one can specify the indices +of features that are allowed to interact. For instance, with 3 features in total, ``interaction_cst=[{0}, {1}, {2}]`` forbids all interactions. The constraints ``[{0, 1}, {1, 2}]`` specifies two groups of possibly @@ -1205,6 +1205,13 @@ Note that features not listed in ``interaction_cst`` are automatically assigned an interaction group for themselves. With again 3 features, this means that ``[{0}]`` is equivalent to ``[{0}, {1, 2}]``. +.. topic:: References + + .. [Mayer2022] M. Mayer, S.C. Bourassa, M. Hoesli, and D.F. Scognamiglio. + 2022. :doi:`Machine Learning Applications to Land and Structure Valuation + <10.3390/jrfm15050193>`. + Journal of Risk and Financial Management 15, no. 
5: 193 + Low-level parallelism --------------------- From 461cd6ace07b321fa26efc6e7a6be486eda5be8e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 11 Oct 2022 21:12:13 +0200 Subject: [PATCH 60/61] CLN remove if node.is_leaf in for loop --- sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 233d2ebd2a141..de694f6d690de 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -625,8 +625,6 @@ def get_all_children(node): # Root's children's allowed_features must be the root's constraints set. assert_array_equal(node.allowed_features, list(root_constraint_set)) for node in get_all_children(grower.root): - if node.is_leaf: - continue # Ensure that each node uses a subset of features of its parent node. 
parent_interaction_cst_indices = set(node.interaction_cst_indices) right_interactions_cst_indices = set( From e0e822061f245ea4d720c7a318f8fa41dd0a6c9d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 11 Oct 2022 22:49:42 +0200 Subject: [PATCH 61/61] CLN fix test_grower_interaction_constraints --- .../ensemble/_hist_gradient_boosting/tests/test_grower.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index de694f6d690de..c4ae90b7e7d96 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -583,8 +583,7 @@ def get_all_children(node): return res for n in [node.left_child, node.right_child]: res.append(n) - if not n.is_leaf: - res.extend(get_all_children(n)) + res.extend(get_all_children(n)) return res for seed in range(20): @@ -625,6 +624,8 @@ def get_all_children(node): # Root's children's allowed_features must be the root's constraints set. assert_array_equal(node.allowed_features, list(root_constraint_set)) for node in get_all_children(grower.root): + if node.is_leaf: + continue # Ensure that each node uses a subset of features of its parent node. parent_interaction_cst_indices = set(node.interaction_cst_indices) right_interactions_cst_indices = set(