8000 ENH iforest's score_samples uses chunks for fixed-memory computation … · scikit-learn/scikit-learn@c22b871 · GitHub
[go: up one dir, main page]

Skip to content

Commit c22b871

Browse files
ngoix authored and jnothman committed
ENH iforest's score_samples uses chunks for fixed-memory computation (#13283)
1 parent 568ef06 commit c22b871

File tree

3 files changed

+101
-22
lines changed

3 files changed

+101
-22
lines changed

doc/whats_new/v0.21.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,10 @@ Support for Python 3.4 and below has been officially dropped.
168168
by avoiding keeping in memory each tree prediction. :issue:`13260` by
169169
`Nicolas Goix`_.
170170

171+
- |Efficiency| :class:`ensemble.IsolationForest` now uses chunks of data at
172+
prediction step, thus capping the memory usage. :issue:`13283` by
173+
`Nicolas Goix`_.
174+
171175
- |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
172176
the gradients would be incorrectly computed in multiclass classification
173177
problems. :issue:`12715` by :user:`Nicolas Hug<NicolasHug>`.

sklearn/ensemble/iforest.py

Lines changed: 63 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,14 @@
99
from warnings import warn
1010

1111
from ..tree import ExtraTreeRegressor
12-
from ..utils import check_random_state, check_array
12+
from ..utils import (
13+
check_random_state,
14+
check_array,
15+
gen_batches,
16+
get_chunk_n_rows,
17+
)
1318
from ..utils.fixes import _joblib_parallel_args
14-
from ..utils.validation import check_is_fitted
19+
from ..utils.validation import check_is_fitted, _num_samples
1520
from ..base import OutlierMixin
1621

1722
from .bagging import BaseBagging
@@ -388,21 +393,69 @@ def score_samples(self, X):
388393
"match the input. Model n_features is {0} and "
389394
"input n_features is {1}."
390395
"".format(self.n_features_, X.shape[1]))
391-
n_samples = X.shape[0]
392396

393-
n_samples_leaf = np.zeros(n_samples, order="f")
394-
depths = np.zeros(n_samples, order="f")
397+
# Take the opposite of the scores as bigger is better (here less
398+
# abnormal)
399+
return -self._compute_chunked_score_samples(X)
400+
401+
@property
402+
def threshold_(self):
403+
if self.behaviour != 'old':
404+
raise AttributeError("threshold_ attribute does not exist when "
405+
"behaviour != 'old'")
406+
warn("threshold_ attribute is deprecated in 0.20 and will"
407+
" be removed in 0.22.", DeprecationWarning)
408+
return self._threshold_
409+
410+
def _compute_chunked_score_samples(self, X):
411+
412+
n_samples = _num_samples(X)
395413

396414
if self._max_features == X.shape[1]:
397415
subsample_features = False
398416
else:
399417
subsample_features = True
400418

419+
# We get as many rows as possible within our working_memory budget
420+
# (defined by sklearn.get_config()['working_memory']) to store
421+
# self._max_features in each row during computation.
422+
#
423+
# Note:
424+
# - this will get at least 1 row, even if 1 row of score will
425+
# exceed working_memory.
426+
# - this does only account for temporary memory usage while loading
427+
# the data needed to compute the scores -- the returned scores
428+
# themselves are 1D.
429+
430+
chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
431+
max_n_rows=n_samples)
432+
slices = gen_batches(n_samples, chunk_n_rows)
433+
434+
scores = np.zeros(n_samples, order="f")
435+
436+
for sl in slices:
437+
# compute score on the slices of test samples:
438+
scores[sl] = self._compute_score_samples(X[sl], subsample_features)
439+
440+
return scores
441+
442+
def _compute_score_samples(self, X, subsample_features):
443+
"""Compute the score of each samples in X going through the extra trees.
444+
445+
Parameters
446+
----------
447+
X : array-like or sparse matrix
448+
449+
subsample_features : bool,
450+
whether features should be subsampled
451+
"""
452+
n_samples = X.shape[0]
453+
454+
depths = np.zeros(n_samples, order="f")
455+
401456
for tree, features in zip(self.estimators_, self.estimators_features_):
402-
if subsample_features:
403-
X_subset = X[:, features]
404-
else:
405-
X_subset = X
457+
X_subset = X[:, features] if subsample_features else X
458+
406459
leaves_index = tree.apply(X_subset)
407460
node_indicator = tree.decision_path(X_subset)
408461
n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
@@ -418,19 +471,7 @@ def score_samples(self, X):
418471
/ (len(self.estimators_)
419472
* _average_path_length([self.max_samples_]))
420473
)
421-
422-
# Take the opposite of the scores as bigger is better (here less
423-
# abnormal)
424-
return -scores
425-
426-
@property
427-
def threshold_(self):
428-
if self.behaviour != 'old':
429-
raise AttributeError("threshold_ attribute does not exist when "
430-
"behaviour != 'old'")
431-
warn("threshold_ attribute is deprecated in 0.20 and will"
432-
" be removed in 0.22.", DeprecationWarning)
433-
return self._threshold_
474+
return scores
434475

435476

436477
def _average_path_length(n_samples_leaf):

sklearn/ensemble/tests/test_iforest.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from sklearn.metrics import roc_auc_score
3030

3131
from scipy.sparse import csc_matrix, csr_matrix
32+
from unittest.mock import Mock, patch
3233

3334
rng = check_random_state(0)
3435

@@ -325,3 +326,36 @@ def test_behaviour_param():
325326
clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
326327
assert_array_equal(clf1.decision_function([[2., 2.]]),
327328
clf2.decision_function([[2., 2.]]))
329+
330+
331+
# mock get_chunk_n_rows to actually test more than one chunk (here one
332+
# chunk = 3 rows:
333+
@patch(
334+
"sklearn.ensemble.iforest.get_chunk_n_rows",
335+
side_effect=Mock(**{"return_value": 3}),
336+
)
337+
@pytest.mark.parametrize(
338+
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
339+
)
340+
@pytest.mark.filterwarnings("ignore:threshold_ attribute")
341+
def test_iforest_chunks_works1(
342+
mocked_get_chunk, contamination, n_predict_calls
343+
):
344+
test_iforest_works(contamination)
345+
assert mocked_get_chunk.call_count == n_predict_calls
346+
347+
348+
# idem with chunk_size = 5 rows
349+
@patch(
350+
"sklearn.ensemble.iforest.get_chunk_n_rows",
351+
side_effect=Mock(**{"return_value": 10}),
352+
)
353+
@pytest.mark.parametrize(
354+
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
355+
)
356+
@pytest.mark.filterwarnings("ignore:threshold_ attribute")
357+
def test_iforest_chunks_works2(
358+
mocked_get_chunk, contamination, n_predict_calls
359+
):
360+
test_iforest_works(contamination)
361+
assert mocked_get_chunk.call_count == n_predict_calls

0 commit comments

Comments (0)