From 4827570c722b5f0df5f2661aa165d7569e27bc7c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 3 Sep 2020 18:49:53 +0200 Subject: [PATCH 1/5] Remove useless TreeNode attributes to break cyclic references --- .../_hist_gradient_boosting/grower.py | 13 +---------- .../tests/test_monotonic_contraints.py | 22 +++++++++---------- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 58b0c3020e548..7d864da4c59c3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -39,8 +39,6 @@ class TreeNode: The sum of the gradients of the samples at the node. sum_hessians : float The sum of the hessians of the samples at the node. - parent : TreeNode, default=None - The parent of the node. None for root. Attributes ---------- @@ -52,8 +50,6 @@ class TreeNode: The sum of the gradients of the samples at the node. sum_hessians : float The sum of the hessians of the samples at the node. - parent : TreeNode or None - The parent of the node. None for root. split_info : SplitInfo or None The result of the split evaluation. left_child : TreeNode or None @@ -73,8 +69,6 @@ class TreeNode: left_child = None right_child = None histograms = None - sibling = None - parent = None # start and stop indices of the node in the splitter.partition # array. 
Concretely, @@ -88,13 +82,12 @@ class TreeNode: partition_stop = 0 def __init__(self, depth, sample_indices, sum_gradients, - sum_hessians, parent=None, value=None): + sum_hessians, value=None): self.depth = depth self.sample_indices = sample_indices self.n_samples = sample_indices.shape[0] self.sum_gradients = sum_gradients self.sum_hessians = sum_hessians - self.parent = parent self.value = value self.is_leaf = False self.set_children_bounds(float('-inf'), float('+inf')) @@ -388,19 +381,15 @@ def split_next(self): sample_indices_left, node.split_info.sum_gradient_left, node.split_info.sum_hessian_left, - parent=node, value=node.split_info.value_left, ) right_child_node = TreeNode(depth, sample_indices_right, node.split_info.sum_gradient_right, node.split_info.sum_hessian_right, - parent=node, value=node.split_info.value_right, ) - left_child_node.sibling = right_child_node - right_child_node.sibling = left_child_node node.right_child = right_child_node node.left_child = left_child_node diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py index 827f588540d9f..c642bdd59335a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py @@ -100,30 +100,30 @@ def assert_children_values_bounded(grower, monotonic_cst): if monotonic_cst == MonotonicConstraint.NO_CST: return - def recursively_check_children_node_values(node): + def recursively_check_children_node_values(node, right_sibling=None): if node.is_leaf: return - if node is not grower.root and node is node.parent.left_child: - sibling = node.sibling # on the right - middle = (node.value + sibling.value) / 2 + if right_sibling is not None: + middle = (node.value + right_sibling.value) / 2 if monotonic_cst == MonotonicConstraint.POS: assert (node.left_child.value <= node.right_child.value <= 
middle) - if not sibling.is_leaf: + if not right_sibling.is_leaf: assert (middle <= - sibling.left_child.value <= - sibling.right_child.value) + right_sibling.left_child.value <= + right_sibling.right_child.value) else: # NEG assert (node.left_child.value >= node.right_child.value >= middle) - if not sibling.is_leaf: + if not right_sibling.is_leaf: assert (middle >= - sibling.left_child.value >= - sibling.right_child.value) + right_sibling.left_child.value >= + right_sibling.right_child.value) - recursively_check_children_node_values(node.left_child) + recursively_check_children_node_values(node.left_child, + right_sibling=node.right_child) recursively_check_children_node_values(node.right_child) recursively_check_children_node_values(grower.root) From aecda185e6435bbfa4ce7d8c9629e99caa5c10ad Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 3 Sep 2020 18:50:22 +0200 Subject: [PATCH 2/5] Free histograms memory as soon as possible --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 7d864da4c59c3..858b1fac50169 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -481,6 +481,16 @@ def split_next(self): self._compute_best_split_and_push(right_child_node) self.total_find_split_time += time() - tic + # Free memory used by histograms as they are no longer needed for + # leaf nodes once the optimal split has been found. + for child in (left_child_node, right_child_node): + if child.is_leaf: + del child.histograms + + # Free memory used by histograms as they are no longer needed for + # internal nodes once children histograms have been computed. 
+ del node.histograms + return left_child_node, right_child_node def _finalize_leaf(self, node): From 46c398ad76f33c49d23411088608660984fc7065 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 3 Sep 2020 20:52:40 +0200 Subject: [PATCH 3/5] Wording in comment --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 858b1fac50169..6f07b24172f05 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -481,13 +481,13 @@ def split_next(self): self._compute_best_split_and_push(right_child_node) self.total_find_split_time += time() - tic - # Free memory used by histograms as they are no longer needed for - # leaf nodes once the optimal split has been found. + # Release memory used by histograms as they are no longer needed + # for leaf nodes once the optimal split has been found. for child in (left_child_node, right_child_node): if child.is_leaf: del child.histograms - # Free memory used by histograms as they are no longer needed for + # Release memory used by histograms as they are no longer needed for # internal nodes once children histograms have been computed. del node.histograms From b6aec726fe9cb68f6bca1ad874aa1e70cae93880 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 3 Sep 2020 21:01:19 +0200 Subject: [PATCH 4/5] Add whats new entry --- doc/whats_new/v0.24.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index c3f3f993a2f57..b76a0a658b387 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -188,6 +188,13 @@ Changelog method `staged_predict`, which allows monitoring of each stage. :pr:`16985` by :user:`Hao Chun Chang `. 
+- |Efficiency| Break cyclic references in the tree nodes used internally in + :class:`ensemble.HistGradientBoostingRegressor` and + :class:`ensemble.HistGradientBoostingClassifier` to allow for the timely + garbage collection of large intermediate data structures and to improve + memory usage in `fit`. :pr:`18334` by `Olivier Grisel`_, `Nicolas Hug`_, + `Thomas Fan`_ and `Andreas Müller`_. + - |API|: The parameter ``n_classes_`` is now deprecated in :class:`ensemble.GradientBoostingRegressor` and returns `1`. :pr:`17702` by :user:`Simona Maggio `. From 80add09f5911631d27005b9d75ae9dd8fdf1aaa9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 3 Sep 2020 21:15:54 +0200 Subject: [PATCH 5/5] Update sklearn/ensemble/_hist_gradient_boosting/grower.py Co-authored-by: Nicolas Hug --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 6f07b24172f05..473dc37674684 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -482,7 +482,7 @@ def split_next(self): self.total_find_split_time += time() - tic # Release memory used by histograms as they are no longer needed - # for leaf nodes once the optimal split has been found. + # for leaf nodes since they won't be split. for child in (left_child_node, right_child_node): if child.is_leaf: del child.histograms