ENH Greatly reduces memory usage of histogram gradient boosting by thomasjpfan · Pull Request #18242 · scikit-learn/scikit-learn · GitHub
ENH Greatly reduces memory usage of histogram gradient boosting #18242


Closed
Changes from all commits
30 commits
4c750e7
ENH Greatly reduces memory usage of histogram gradient boosting
thomasjpfan Aug 23, 2020
eacbd1b
CLN Fill histograms when reseting
thomasjpfan Aug 23, 2020
a03d8e8
CLN Rename to pool
thomasjpfan Aug 23, 2020
b1e71f3
DOC Adds whats new
thomasjpfan Aug 23, 2020
9e43c6b
STY Linting
thomasjpfan Aug 23, 2020
a6724f4
ENH Only reset histograms that were used to zero
thomasjpfan Aug 23, 2020
7b3d195
CLN Address comments
thomasjpfan Aug 23, 2020
359d917
CLN Address comments
thomasjpfan Aug 23, 2020
f29258f
DOC Adds docstring
thomasjpfan Aug 23, 2020
ed8cb9b
TST Adds tests for histograms pool
thomasjpfan Aug 23, 2020
d2c06a7
DOC Changes tag in whats_new
thomasjpfan Aug 23, 2020
6e99b44
CLN Removes weakref
thomasjpfan Aug 23, 2020
030febe
Merge remote-tracking branch 'upstream/master' into memory_hist_gradi…
thomasjpfan Aug 23, 2020
b059bb4
CLN Lowers diff
thomasjpfan Aug 23, 2020
527d12d
ENH Speeds up algorithm more
thomasjpfan Aug 24, 2020
e6275d8
REV Revert diff
thomasjpfan Aug 24, 2020
8645ec9
CLN Address comments
thomasjpfan Aug 24, 2020
5423850
Merge branch 'master' into memory_hist_gradient_boosting
ogrisel Sep 2, 2020
537d734
CLN Adds comments and renames the pool
thomasjpfan Sep 2, 2020
dd4be64
ENH: release histograms earlier
ogrisel Sep 3, 2020
a1f34e3
PEP8
ogrisel Sep 3, 2020
59829c5
Update sklearn/ensemble/_hist_gradient_boosting/_histogram_pool.py
ogrisel Sep 3, 2020
0094559
Update doc/whats_new/v0.24.rst
ogrisel Sep 3, 2020
7067e9c
CLN Only need to release
thomasjpfan Sep 3, 2020
a940c15
ENH free leaf histograms as soon as possible
ogrisel Sep 3, 2020
e327c08
Style: avoid for / else
ogrisel Sep 3, 2020
c2e9e10
Remove useless TreeNode attributes to break cyclic references
ogrisel Sep 3, 2020
bc40625
Merge remote-tracking branch 'origin/master' into memory_hist_gradien…
ogrisel Sep 4, 2020
8ce38d1
Update motivation for HistogramPool
ogrisel Sep 4, 2020
be4788e
Merge branch 'master' of github.com:scikit-learn/scikit-learn into me…
NicolasHug Sep 4, 2020
8 changes: 7 additions & 1 deletion doc/whats_new/v0.24.rst
@@ -192,9 +192,15 @@ Changelog
:class:`ensemble.HistGradientBoostingRegressor` and
:class:`ensemble.HistGradientBoostingClassifier` to allow for the timely
garbage collection of large intermediate datastructures and to improve memory
usage in `fit`. :pr:`18334` by `Olivier Grisel`_ `Nicolas Hug`_, `Thomas
usage in `fit`. :pr:`18334` by `Olivier Grisel`_, `Nicolas Hug`_, `Thomas
Fan`_ and `Andreas Müller`_.

- |Efficiency| :class:`ensemble.HistGradientBoostingRegressor` and
:class:`ensemble.HistGradientBoostingClassifier` have been further improved
to have a low and stable memory usage (on macOS in particular) by recycling
previously allocated histogram datastructures instead of relying on the
Python GC. :pr:`18242` by `Thomas Fan`_, `Olivier Grisel`_ and `Nicolas Hug`_.

- |Efficiency| Histogram initialization is now done in parallel in
:class:`ensemble.HistGradientBoostingRegressor` and
:class:`ensemble.HistGradientBoostingClassifier` which results in speed
50 changes: 50 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/_histogram_pool.py
@@ -0,0 +1,50 @@
import numpy as np
from .common import HISTOGRAM_DTYPE


class HistogramPool:
"""Histogram pool to be used by the growers.

The pool allocates and returns histograms to the caller. When the `reset`
method is called, all the previously allocated histograms will be available
for the next grower to use. New histograms will be allocated when there are
no histograms left in the available_pool. HistogramPool is used for memory
allocation/management only. The computation of the histograms is done in
HistogramBuilder.
Member

Maybe make it explicit that empirically, this strategy has proven much more memory efficient than allocating new histograms each time and relying on the Python GC to keep the overall Python process memory low when fitting GBRT on datasets with many features and target classes.

@NicolasHug NicolasHug Sep 4, 2020

I believe this comment (and the one in the code) might be outdated, considering that the memory boost mainly came from removing cycles.


Empirically, this strategy has proven to be more memory efficient (on macOS
in particular) than allocating new histograms each time and relying on the
Python GC to keep the overall python process memory low when fitting GBRT
on datasets with many features and target classes.
"""
def __init__(self, n_features, n_bins):
self.n_features = n_features
self.n_bins = n_bins
self.available_pool = []
self.used_pool = []

def reset(self):
"""Reset the pool."""
self.available_pool.extend(self.used_pool)
self.used_pool = []

def get(self):
"""Return a non-initialized array of histogram for one grower node."""
if self.available_pool:
histograms = self.available_pool.pop()
else:
histograms = np.empty(
shape=(self.n_features, self.n_bins), dtype=HISTOGRAM_DTYPE
)
self.used_pool.append(histograms)
return histograms

def release(self, histograms):
"""Move a specific histogram array to the available pool"""
try:
idx = next(idx for idx, h in enumerate(self.used_pool)
if h is histograms)
except StopIteration as e:
raise ValueError("Could not find histograms in used_pool") from e
self.used_pool.pop(idx)
self.available_pool.append(histograms)
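
As an informal usage illustration (not part of the diff — a minimal sketch based on the class above, with made-up shapes):

from sklearn.ensemble._hist_gradient_boosting._histogram_pool import HistogramPool

pool = HistogramPool(n_features=10, n_bins=256)

# First grower: acquire one histogram array per node that needs one.
root_hist = pool.get()      # freshly allocated, shape (10, 256)
child_hist = pool.get()

# A leaf no longer needs its histograms: hand them back explicitly.
pool.release(child_hist)    # moves the array from used_pool to available_pool

# Next grower: reset() makes every previously used array available again.
pool.reset()
reused = pool.get()         # recycles an existing array, no new allocation
assert reused is root_hist or reused is child_hist
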
4 changes: 4 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -18,6 +18,7 @@
from ...model_selection import train_test_split
from ...preprocessing import LabelEncoder
from ._gradient_boosting import _update_raw_predictions
from ._histogram_pool import HistogramPool
from .common import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE

from .binning import _BinMapper
@@ -340,6 +341,8 @@ def fit(self, X, y, sample_weight=None):
prediction_dim=self.n_trees_per_iteration_,
sample_weight=sample_weight_train
)
histogram_pool = HistogramPool(n_features=self._n_features,
n_bins=n_bins)

for iteration in range(begin_at_stage, self.max_iter):

@@ -360,6 +363,7 @@
for k in range(self.n_trees_per_iteration_):
grower = TreeGrower(
X_binned_train, gradients[k, :], hessians[k, :],
histogram_pool=histogram_pool,
n_bins=n_bins,
n_bins_non_missing=self._bin_mapper.n_bins_non_missing_,
has_missing_values=has_missing_values,
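
The pattern above — one pool created before the boosting loop and handed to every grower — is what keeps the number of live histogram arrays bounded by the largest tree rather than growing with the number of trees. A small runnable sketch of that reuse (node counts and shapes below are invented for illustration):

from sklearn.ensemble._hist_gradient_boosting._histogram_pool import HistogramPool

n_features, n_bins, n_trees, nodes_per_tree = 8, 256, 5, 10
pool = HistogramPool(n_features=n_features, n_bins=n_bins)  # created once, before the loop

for tree in range(n_trees):
    pool.reset()  # what TreeGrower.__init__ does when given a shared pool
    tree_histograms = [pool.get() for _ in range(nodes_per_tree)]

# Only the first "tree" triggered allocations; later ones recycled the same arrays.
assert len(pool.used_pool) + len(pool.available_pool) == nodes_per_tree
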
40 changes: 29 additions & 11 deletions sklearn/ensemble/_hist_gradient_boosting/grower.py
@@ -18,6 +18,7 @@
from .common import PREDICTOR_RECORD_DTYPE
from .common import Y_DTYPE
from .common import MonotonicConstraint
from ._histogram_pool import HistogramPool


EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors
@@ -135,6 +136,9 @@ class TreeGrower:
hessians : ndarray of shape (n_samples,)
The hessians of each training sample. Those are the hessians of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
histogram_pool : HistogramPool, default=None
A cache to hold the created histograms between growers. If None, a
new pool cache is created.
max_leaf_nodes : int, default=None
The maximum number of leaves for each tree. If None, there is no
maximum limit.
@@ -175,7 +179,8 @@ class TreeGrower:
learning rate.
"""

def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
def __init__(self, X_binned, gradients, hessians, histogram_pool=None,
max_leaf_nodes=None,
max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
n_bins=256, n_bins_non_missing=None, has_missing_values=False,
monotonic_cst=None, l2_regularization=0.,
@@ -247,6 +252,14 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
self.total_find_split_time = 0. # time spent finding the best splits
self.total_compute_hist_time = 0. # time spent computing histograms
self.total_apply_split_time = 0. # time spent splitting nodes

if histogram_pool is None:
self.histogram_pool = HistogramPool(n_features=self.n_features,
n_bins=n_bins)
else:
self.histogram_pool = histogram_pool

self.histogram_pool.reset()
self._intilialize_root(gradients, hessians, hessians_are_constant)
self.n_nodes = 1

@@ -330,8 +343,9 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant):
self._finalize_leaf(self.root)
return

self.root.histograms = self.histogram_builder.compute_histograms_brute(
self.root.sample_indices)
self.root.histograms = self.histogram_pool.get()
self.histogram_builder.compute_histograms_brute(
self.root.sample_indices, self.root.histograms)
self._compute_best_split_and_push(self.root)

def _compute_best_split_and_push(self, node):
@@ -466,12 +480,14 @@ def split_next(self):
# smallest number of samples, and the subtraction trick O(n_bins)
# on the other one.
tic = time()
smallest_child.histograms = \
self.histogram_builder.compute_histograms_brute(
smallest_child.sample_indices)
largest_child.histograms = \
self.histogram_builder.compute_histograms_subtraction(
node.histograms, smallest_child.histograms)
smallest_child.histograms = self.histogram_pool.get()
self.histogram_builder.compute_histograms_brute(
smallest_child.sample_indices, smallest_child.histograms)

largest_child.histograms = self.histogram_pool.get()
self.histogram_builder.compute_histograms_subtraction(
node.histograms, smallest_child.histograms,
largest_child.histograms)
self.total_compute_hist_time += time() - tic

tic = time()
@@ -485,11 +501,13 @@
# for leaf nodes since they won't be split.
for child in (left_child_node, right_child_node):
if child.is_leaf:
del child.histograms
self.histogram_pool.release(child.histograms)
child.histograms = None

# Release memory used by histograms as they are no longer needed for
# internal nodes once children histograms have been computed.
del node.histograms
self.histogram_pool.release(node.histograms)
node.histograms = None

return left_child_node, right_child_node

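
As the new `histogram_pool` parameter docstring above notes, passing a pool is optional: a standalone grower builds its own, while growers that share a pool recycle each other's arrays. A hedged sketch of both modes (the synthetic data and parameter values below are invented for illustration and assume the constructor signature shown in this diff):

import numpy as np
from sklearn.ensemble._hist_gradient_boosting._histogram_pool import HistogramPool
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE, X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower

rng = np.random.RandomState(0)
n_samples, n_features, n_bins = 100, 4, 16
X_binned = np.asfortranarray(
    rng.randint(0, n_bins - 1, size=(n_samples, n_features)).astype(X_BINNED_DTYPE))
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
hessians = np.ones(1, dtype=G_H_DTYPE)  # constant hessians

# Without an explicit pool, the grower creates (and resets) its own.
TreeGrower(X_binned, gradients, hessians, n_bins=n_bins).grow()

# With a shared pool, the second grower reuses the arrays of the first.
pool = HistogramPool(n_features=n_features, n_bins=n_bins)
TreeGrower(X_binned, gradients, hessians, histogram_pool=pool, n_bins=n_bins).grow()
TreeGrower(X_binned, gradients, hessians, histogram_pool=pool, n_bins=n_bins).grow()
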
35 changes: 13 additions & 22 deletions sklearn/ensemble/_hist_gradient_boosting/histogram.pyx
@@ -105,7 +105,8 @@ cdef class HistogramBuilder:

def compute_histograms_brute(
HistogramBuilder self,
const unsigned int [::1] sample_indices): # IN
const unsigned int [::1] sample_indices, # IN
hist_struct [:, ::1] histograms): # OUT
"""Compute the histograms of the node by scanning through all the data.

For a given feature, the complexity is O(n_samples)
@@ -115,10 +116,8 @@
sample_indices : array of int, shape (n_samples_at_node,)
The indices of the samples at the node to split.

Returns
-------
histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins)
The computed histograms of the current node.
Where the histograms of the current node will be stored.
"""
cdef:
int n_samples
@@ -132,11 +131,6 @@
G_H_DTYPE_C [::1] gradients = self.gradients
G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians
G_H_DTYPE_C [::1] hessians = self.hessians
# Histograms will be initialized to zero later within a prange
hist_struct [:, ::1] histograms = np.empty(
shape=(self.n_features, self.n_bins),
dtype=HISTOGRAM_DTYPE
)

with nogil:
n_samples = sample_indices.shape[0]
@@ -158,8 +152,6 @@
self._compute_histogram_brute_single_feature(
feature_idx, sample_indices, histograms)

return histograms

cdef void _compute_histogram_brute_single_feature(
HistogramBuilder self,
const int feature_idx,
@@ -179,7 +171,13 @@
unsigned char hessians_are_constant = \
self.hessians_are_constant
unsigned int bin_idx = 0


# Initialize histograms to 0 here as we might recycle a previously
# allocated histogram via the histogram_pool of the grower for
# memory efficiency purposes.
# Also, this delayed zero-init happens in the OpenMP parallel
# section for each feature, which yields a small additional
# performance improvement.
for bin_idx in range(self.n_bins):
histograms[feature_idx, bin_idx].sum_gradients = 0.
histograms[feature_idx, bin_idx].sum_hessians = 0.
@@ -207,7 +205,8 @@
def compute_histograms_subtraction(
HistogramBuilder self,
hist_struct [:, ::1] parent_histograms, # IN
hist_struct [:, ::1] sibling_histograms): # IN
hist_struct [:, ::1] sibling_histograms, # IN
hist_struct [:, ::1] histograms): # OUT
"""Compute the histograms of the node using the subtraction trick.

hist(parent) = hist(left_child) + hist(right_child)
@@ -224,20 +223,13 @@
sibling_histograms : ndarray of HISTOGRAM_DTYPE, \
shape (n_features, n_bins)
The histograms of the sibling.

Returns
-------
histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, n_bins)
The computed histograms of the current node.
Where the histograms of the current node will be stored.
"""

cdef:
int feature_idx
int n_features = self.n_features
hist_struct [:, ::1] histograms = np.empty(
shape=(self.n_features, self.n_bins),
dtype=HISTOGRAM_DTYPE
)

for feature_idx in prange(n_features, schedule='static', nogil=True):
# Compute histogram of each feature
@@ -246,7 +238,6 @@
parent_histograms,
sibling_histograms,
histograms)
return histograms


cpdef void _build_histogram_naive(
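
For reference, the subtraction trick documented in `compute_histograms_subtraction` — hist(parent) = hist(left_child) + hist(right_child) — is why only the child with fewer samples needs a full O(n_samples) scan; the other child's histograms follow in O(n_bins) per feature, now written into a pool-provided output array instead of a freshly allocated one. A small NumPy illustration of the identity on made-up per-bin gradient sums (the real code operates on structured HISTOGRAM_DTYPE arrays in Cython):

import numpy as np

rng = np.random.RandomState(42)
n_features, n_bins = 3, 8

# Invented per-bin sums of gradients for the two children of a split node.
left = rng.normal(size=(n_features, n_bins))
right = rng.normal(size=(n_features, n_bins))
parent = left + right              # hist(parent) = hist(left) + hist(right)

# Recover the larger child by subtraction instead of re-scanning its samples.
out = np.empty_like(right)         # caller-provided output array, as in the new API
np.subtract(parent, left, out=out)
np.testing.assert_allclose(out, right)
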
@@ -20,6 +20,7 @@
from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares
from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.utils import shuffle

@@ -673,8 +674,10 @@ def test_sum_hessians_are_sample_weight(loss_name):
# Build histogram
grower = TreeGrower(X_binned, gradients[0], hessians[0],
n_bins=bin_mapper.n_bins)
histograms = grower.histogram_builder.compute_histograms_brute(
grower.root.sample_indices)
histograms = np.empty(shape=(n_features, bin_mapper.n_bins),
dtype=HISTOGRAM_DTYPE)
grower.histogram_builder.compute_histograms_brute(
grower.root.sample_indices, histograms)

for feature_idx in range(n_features):
for bin_idx in range(bin_mapper.n_bins):
@@ -0,0 +1,43 @@
from sklearn.ensemble._hist_gradient_boosting._histogram_pool import (
HistogramPool
)
import pytest


def test_histograms_pool():
# Simple check of how HistogramPool manages its internal state
n_features, n_bins = 20, 5
pool = HistogramPool(n_features=n_features, n_bins=n_bins)

histograms1 = pool.get()
assert histograms1.shape == (n_features, n_bins)

assert pool.used_pool == [histograms1]
assert pool.available_pool == []

histograms2 = pool.get()
assert histograms2.shape == (n_features, n_bins)
assert pool.used_pool == [histograms1, histograms2]
assert pool.available_pool == []

pool.release(histograms1)
assert pool.used_pool == [histograms2]
assert pool.available_pool == [histograms1]

# Cannot release an already released histogram
with pytest.raises(ValueError):
pool.release(histograms1)

# when the pool is reset, histograms in the used pool are moved to the
# available pool
pool.reset()
assert pool.available_pool == [histograms1, histograms2]
assert pool.used_pool == []

histograms3 = pool.get()
assert histograms3.shape == (n_features, n_bins)

# only histograms1 is left in the available pool
assert pool.available_pool == [histograms1]
assert histograms3 is histograms2
assert pool.used_pool == [histograms3]
@@ -4,6 +4,7 @@
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
Splitter,
@@ -311,7 +312,8 @@ def test_bounded_value_min_gain_to_split():
min_hessian_to_split, min_samples_leaf,
min_gain_to_split, hessians_are_constant)

histograms = builder.compute_histograms_brute(sample_indices)
histograms = np.empty(shape=(1, n_bins), dtype=HISTOGRAM_DTYPE)
builder.compute_histograms_brute(sample_indices, histograms)

# Since the gradient array is [1, 1, 100, 1, 1]
# the max possible gain happens on the 3rd bin (or equivalently in the 2nd)