EFF Improve IsolationForest predict time (#25186) · scikit-learn/scikit-learn@9956210 · GitHub

Commit 9956210

fsiola (Felipe Breve Siola), glemaitre, ogrisel, and betatim authored
EFF Improve IsolationForest predict time (#25186)
Co-authored-by: Felipe Breve Siola <felipe.breve-siola@klarna.com>
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Tim Head <betatim@gmail.com>
1 parent: bd1d165 · commit: 9956210

File tree

4 files changed (+54 lines, -9 lines)

doc/whats_new/v1.3.rst
sklearn/ensemble/_iforest.py
sklearn/tree/_tree.pxd
sklearn/tree/_tree.pyx

doc/whats_new/v1.3.rst

Lines changed: 7 additions & 0 deletions

@@ -123,6 +123,13 @@ Changelog
   out-of-bag scores via the `oob_scores_` or `oob_score_` attributes.
   :pr:`24882` by :user:`Ashwin Mathur <awinml>`.
 
+- |Efficiency| :class:`ensemble.IsolationForest` predict time is now faster
+  (typically by a factor of 8 or more). Internally, the estimator now precomputes
+  decision path lengths per tree at `fit` time. It is therefore not possible
+  to load an estimator trained with scikit-learn 1.2 to make it predict with
+  scikit-learn 1.3: retraining with scikit-learn 1.3 is required.
+  :pr:`25186` by :user:`Felipe Breve Siola <fsiola>`.
+
 :mod:`sklearn.exceptions`
 .........................
 - |Feature| Added :class:`exceptions.InconsistentVersionWarning` which is raised
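The changelog entry above claims roughly an 8x faster predict. A minimal timing sketch of the kind of measurement behind such a claim follows; the dataset shape, estimator settings, and observed factor are illustrative assumptions, not taken from the commit:

import time

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = rng.randn(50_000, 8)

clf = IsolationForest(n_estimators=100, random_state=42).fit(X)

tic = time.perf_counter()
clf.predict(X)  # after this commit, reuses depths precomputed at fit time
print(f"predict took {time.perf_counter() - tic:.3f}s")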

sklearn/ensemble/_iforest.py

Lines changed: 20 additions & 9 deletions

@@ -327,6 +327,16 @@ def fit(self, X, y=None, sample_weight=None):
             check_input=False,
         )
 
+        self._average_path_length_per_tree, self._decision_path_lengths = zip(
+            *[
+                (
+                    _average_path_length(tree.tree_.n_node_samples),
+                    tree.tree_.compute_node_depths(),
+                )
+                for tree in self.estimators_
+            ]
+        )
+
         if self.contamination == "auto":
             # 0.5 plays a special role as described in the original paper.
             # we take the opposite as we consider the opposite of their score.
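The zip(*[...]) idiom above builds two tuples in a single pass over the fitted trees. As a readability aid, here is an equivalent un-zipped sketch as it would read inside fit (self, _average_path_length, and estimators_ come from _iforest.py; this is not the committed code):

average_path_length_per_tree = []
decision_path_lengths = []
for tree in self.estimators_:
    # c(n) per node: expected path length of an unsuccessful BST search,
    # evaluated on the number of training samples that reached each node
    average_path_length_per_tree.append(
        _average_path_length(tree.tree_.n_node_samples)
    )
    # depth of every node, computed once here instead of on every predict call
    decision_path_lengths.append(tree.tree_.compute_node_depths())
self._average_path_length_per_tree = tuple(average_path_length_per_tree)
self._decision_path_lengths = tuple(decision_path_lengths)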
@@ -422,14 +432,13 @@ def score_samples(self, X):
         check_is_fitted(self)
 
         # Check data
-        X = self._validate_data(X, accept_sparse="csr", reset=False)
+        X = self._validate_data(X, accept_sparse="csr", dtype=np.float32, reset=False)
 
         # Take the opposite of the scores as bigger is better (here less
         # abnormal)
         return -self._compute_chunked_score_samples(X)
 
     def _compute_chunked_score_samples(self, X):
-
         n_samples = _num_samples(X)
 
         if self._max_features == X.shape[1]:
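The new dtype=np.float32 argument matters because scikit-learn decision trees store feature values as float32 internally, so casting once during validation spares every subsequent tree.apply(..., check_input=False) call a per-tree check and cast. A hedged usage sketch (shapes and settings are illustrative):

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
# Passing float32 up front avoids even the single cast in _validate_data.
X = rng.randn(10_000, 16).astype(np.float32)

clf = IsolationForest(random_state=0).fit(X)
scores = clf.score_samples(X)  # bigger is better (less abnormal)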
@@ -477,19 +486,21 @@ def _compute_score_samples(self, X, subsample_features):
 
         depths = np.zeros(n_samples, order="f")
 
-        for tree, features in zip(self.estimators_, self.estimators_features_):
+        average_path_length_max_samples = _average_path_length([self._max_samples])
+
+        for tree_idx, (tree, features) in enumerate(
+            zip(self.estimators_, self.estimators_features_)
+        ):
             X_subset = X[:, features] if subsample_features else X
 
-            leaves_index = tree.apply(X_subset)
-            node_indicator = tree.decision_path(X_subset)
-            n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
+            leaves_index = tree.apply(X_subset, check_input=False)
 
             depths += (
-                np.ravel(node_indicator.sum(axis=1))
-                + _average_path_length(n_samples_leaf)
+                self._decision_path_lengths[tree_idx][leaves_index]
+                + self._average_path_length_per_tree[tree_idx][leaves_index]
                 - 1.0
             )
-        denominator = len(self.estimators_) * _average_path_length([self.max_samples_])
+        denominator = len(self.estimators_) * average_path_length_max_samples
         scores = 2 ** (
             # For a single training sample, denominator and depth are 0.
             # Therefore, we set the score manually to 1.
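The hunk above replaces a per-call sparse decision_path computation with two array lookups indexed by leaf. To see how the pieces combine into the Isolation Forest score s(x, n) = 2^(-E[h(x)] / c(n)), here is a small numeric sketch; the depth values are made up, and the helper mirrors the c(n) formula that _average_path_length implements:

import numpy as np

def average_path_length(n):
    # c(n) = 2 * (ln(n - 1) + Euler-Mascheroni) - 2 * (n - 1) / n for n > 2
    n = np.atleast_1d(n).astype(float)
    out = np.zeros_like(n)
    out[n == 2] = 1.0
    mask = n > 2
    out[mask] = (
        2.0 * (np.log(n[mask] - 1.0) + np.euler_gamma)
        - 2.0 * (n[mask] - 1.0) / n[mask]
    )
    return out

# depths accumulated over 100 trees for two samples (illustrative numbers)
depths = np.array([820.0, 310.0])
denominator = 100 * average_path_length([256])  # n_estimators * c(max_samples)
raw = 2.0 ** (-depths / denominator)
print(raw)  # larger raw value => shorter paths => more anomalous
# score_samples negates this so that bigger means less abnormal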

sklearn/tree/_tree.pxd

Lines changed: 1 addition & 0 deletions

@@ -75,6 +75,7 @@ cdef class Tree:
     cdef object _decision_path_dense(self, object X)
     cdef object _decision_path_sparse_csr(self, object X)
 
+    cpdef compute_node_depths(self)
     cpdef compute_feature_importances(self, normalize=*)
sklearn/tree/_tree.pyx

Lines changed: 26 additions & 0 deletions

@@ -1056,6 +1056,32 @@ cdef class Tree:
 
         return out
 
+    cpdef compute_node_depths(self):
+        """Compute the depth of each node in a tree.
+
+        .. versionadded:: 1.3
+
+        Returns
+        -------
+        depths : ndarray of shape (self.node_count,), dtype=np.int64
+            The depth of each node in the tree.
+        """
+        cdef:
+            cnp.int64_t[::1] depths = np.empty(self.node_count, dtype=np.int64)
+            cnp.npy_intp[:] children_left = self.children_left
+            cnp.npy_intp[:] children_right = self.children_right
+            cnp.npy_intp node_id
+            cnp.npy_intp node_count = self.node_count
+            cnp.int64_t depth
+
+        depths[0] = 1  # init root node
+        for node_id in range(node_count):
+            if children_left[node_id] != _TREE_LEAF:
+                depth = depths[node_id] + 1
+                depths[children_left[node_id]] = depth
+                depths[children_right[node_id]] = depth
+
+        return depths.base
 
     cpdef compute_feature_importances(self, normalize=True):
         """Computes the importance of each feature (aka variable)."""
