scikit-learn · jnothman · Mar 18, 2019 · Feb 26, 2019 · Mar 10, 2019 · Mar 12, 2019
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
@@ -144,6 +144,10 @@ Support for Python 3.4 and below has been officially dropped.
   by avoiding keeping in memory each tree prediction. :issue:`13260` by
   `Nicolas Goix`_.
 
+- |Efficiency| :class:`ensemble.IsolationForest` now uses chunks of data at
+  the prediction step, thus capping the memory usage. :issue:`13283` by
+  `Nicolas Goix`_.
+
 - |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
   the gradients would be incorrectly computed in multiclass classification
   problems. :issue:`12715` by :user:`Nicolas Hug<NicolasHug>`.

diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
@@ -9,9 +9,14 @@
 from warnings import warn
 
 from ..tree import ExtraTreeRegressor
-from ..utils import check_random_state, check_array
+from ..utils import (
+    check_random_state,
+    check_array,
+    gen_batches,
+    get_chunk_n_rows,
+)
 from ..utils.fixes import _joblib_parallel_args
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, _num_samples
 from ..base import OutlierMixin
 
 from .bagging import BaseBagging
@@ -388,21 +393,69 @@ def score_samples(self, X):
                              "match the input. Model n_features is {0} and "
                              "input n_features is {1}."
                              "".format(self.n_features_, X.shape[1]))
-        n_samples = X.shape[0]
 
-        n_samples_leaf = np.zeros(n_samples, order="f")
-        depths = np.zeros(n_samples, order="f")
+        # Take the opposite of the scores as bigger is better (here less
+        # abnormal)
+        return -self._compute_chunked_score_samples(X)
+
+    @property
+    def threshold_(self):
+        if self.behaviour != 'old':
+            raise AttributeError("threshold_ attribute does not exist when "
+                                 "behaviour != 'old'")
+        warn("threshold_ attribute is deprecated in 0.20 and will"
+             " be removed in 0.22.", DeprecationWarning)
+        return self._threshold_
+
+    def _compute_chunked_score_samples(self, X):
+
+        n_samples = _num_samples(X)
 
         if self._max_features == X.shape[1]:
             subsample_features = False
         else:
             subsample_features = True
 
+        # We get as many rows as possible within our working_memory budget
+        # (defined by sklearn.get_config()['working_memory']) to store
+        # self._max_features in each row during computation.
+        #
+        # Note:
+        #  - this will get at least 1 row, even if a single row of scores
+        #    exceeds working_memory.
+        #  - this only accounts for temporary memory usage while loading
+        #    the data needed to compute the scores -- the returned scores
+        #    themselves are 1D.
+
+        chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
+                                        max_n_rows=n_samples)
+        slices = gen_batches(n_samples, chunk_n_rows)
+
+        scores = np.zeros(n_samples, order="f")
+
+        for sl in slices:
+            # compute score on the slices of test samples:
+            scores[sl] = self._compute_score_samples(X[sl], subsample_features)
+
+        return scores
+
+    def _compute_score_samples(self, X, subsample_features):
+        """Compute the score of each sample in X going through the extra trees.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix
+
+        subsample_features : bool
+            Whether features should be subsampled.
+        """
+        n_samples = X.shape[0]
+
+        depths = np.zeros(n_samples, order="f")
+
         for tree, features in zip(self.estimators_, self.estimators_features_):
-            if subsample_features:
-                X_subset = X[:, features]
-            else:
-                X_subset = X
+            X_subset = X[:, features] if subsample_features else X
+
             leaves_index = tree.apply(X_subset)
             node_indicator = tree.decision_path(X_subset)
             n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
@@ -418,19 +471,7 @@ def score_samples(self, X):
             / (len(self.estimators_)
                * _average_path_length([self.max_samples_]))
         )
-
-        # Take the opposite of the scores as bigger is better (here less
-        # abnormal)
-        return -scores
-
-    @property
-    def threshold_(self):
-        if self.behaviour != 'old':
-            raise AttributeError("threshold_ attribute does not exist when "
-                                 "behaviour != 'old'")
-        warn("threshold_ attribute is deprecated in 0.20 and will"
-             " be removed in 0.22.", DeprecationWarning)
-        return self._threshold_
+        return scores
 
 
 def _average_path_length(n_samples_leaf):

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
@@ -29,6 +29,7 @@
 from sklearn.metrics import roc_auc_score
 
 from scipy.sparse import csc_matrix, csr_matrix
+from unittest.mock import Mock, patch
 
 rng = check_random_state(0)
 
@@ -325,3 +326,36 @@ def test_behaviour_param():
     clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
     assert_array_equal(clf1.decision_function([[2., 2.]]),
                        clf2.decision_function([[2., 2.]]))
+
+
+# mock get_chunk_n_rows to actually test more than one chunk (here one
+# chunk = 3 rows):
+@patch(
+    "sklearn.ensemble.iforest.get_chunk_n_rows",
+    side_effect=Mock(**{"return_value": 3}),
+)
+@pytest.mark.parametrize(
+    "contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
+)
+@pytest.mark.filterwarnings("ignore:threshold_ attribute")
+def test_iforest_chunks_works1(
+    mocked_get_chunk, contamination, n_predict_calls
+):
+    test_iforest_works(contamination)
+    assert mocked_get_chunk.call_count == n_predict_calls
+
+
+# idem with chunk_size = 10 rows
+@patch(
+    "sklearn.ensemble.iforest.get_chunk_n_rows",
+    side_effect=Mock(**{"return_value": 10}),
+)
+@pytest.mark.parametrize(
+    "contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
+)
+@pytest.mark.filterwarnings("ignore:threshold_ attribute")
+def test_iforest_chunks_works2(
+    mocked_get_chunk, contamination, n_predict_calls
+):
+    test_iforest_works(contamination)
+    assert mocked_get_chunk.call_count == n_predict_calls