diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 005925ae42720..83affd2674d94 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -126,6 +126,10 @@ Support for Python 3.4 and below has been officially dropped.
   :issue:`13251` by :user:`Albert Thomas <albertcthomas>` and
   :user:`joshuakennethjones <joshuakennethjones>`.
 
+- |Efficiency| Make :class:`ensemble.IsolationForest` more memory efficient
+  by avoiding keeping in memory each tree prediction. :issue:`13260` by
+  `Nicolas Goix`_.
+
 - |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
   the gradients would be incorrectly computed in multiclass classification
   problems. :issue:`12715` by :user:`Nicolas Hug`.
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
index 7050fb18f96e9..a855d5a16281a 100644
--- a/sklearn/ensemble/iforest.py
+++ b/sklearn/ensemble/iforest.py
@@ -330,9 +330,10 @@ def decision_function(self, X):
 
         Parameters
         ----------
-        X : {array-like, sparse matrix}, shape (n_samples, n_features)
-            The training input samples. Sparse matrices are accepted only if
-            they are supported by the base estimator.
+        X : array-like or sparse matrix, shape (n_samples, n_features)
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csr_matrix``.
 
         Returns
         -------
@@ -361,9 +362,8 @@ def score_samples(self, X):
 
         Parameters
         ----------
-        X : {array-like, sparse matrix}, shape (n_samples, n_features)
-            The training input samples. Sparse matrices are accepted only if
-            they are supported by the base estimator.
+        X : array-like or sparse matrix, shape (n_samples, n_features)
+            The input samples.
 
         Returns
         -------
@@ -383,30 +383,34 @@ def score_samples(self, X):
                              "".format(self.n_features_, X.shape[1]))
         n_samples = X.shape[0]
 
-        n_samples_leaf = np.zeros((n_samples, self.n_estimators), order="f")
-        depths = np.zeros((n_samples, self.n_estimators), order="f")
+        n_samples_leaf = np.zeros(n_samples, order="f")
+        depths = np.zeros(n_samples, order="f")
 
         if self._max_features == X.shape[1]:
             subsample_features = False
         else:
             subsample_features = True
 
-        for i, (tree, features) in enumerate(zip(self.estimators_,
-                                                 self.estimators_features_)):
+        for tree, features in zip(self.estimators_, self.estimators_features_):
             if subsample_features:
                 X_subset = X[:, features]
             else:
                 X_subset = X
             leaves_index = tree.apply(X_subset)
             node_indicator = tree.decision_path(X_subset)
-            n_samples_leaf[:, i] = tree.tree_.n_node_samples[leaves_index]
-            depths[:, i] = np.ravel(node_indicator.sum(axis=1))
-            depths[:, i] -= 1
+            n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
 
-        depths += _average_path_length(n_samples_leaf)
+            depths += (
+                np.ravel(node_indicator.sum(axis=1))
+                + _average_path_length(n_samples_leaf)
+                - 1.0
+            )
 
-        scores = 2 ** (-depths.mean(axis=1) / _average_path_length(
-            self.max_samples_))
+        scores = 2 ** (
+            -depths
+            / (len(self.estimators_)
+               * _average_path_length([self.max_samples_]))
+        )
 
         # Take the opposite of the scores as bigger is better (here less
         # abnormal)
@@ -423,12 +427,12 @@ def threshold_(self):
 
 
 def _average_path_length(n_samples_leaf):
-    """ The average path length in a n_samples iTree, which is equal to
+    """The average path length in a n_samples iTree, which is equal to
     the average path length of an unsuccessful BST search since the
     latter has the same structure as an isolation tree.
 
     Parameters
     ----------
-    n_samples_leaf : array-like, shape (n_samples, n_estimators), or int.
+    n_samples_leaf : array-like, shape (n_samples,).
         The number of training samples in each test sample leaf, for
         each estimators.
@@ -437,29 +441,22 @@ def _average_path_length(n_samples_leaf):
 
     average_path_length : array, same shape as n_samples_leaf
 
     """
-    if isinstance(n_samples_leaf, INTEGER_TYPES):
-        if n_samples_leaf <= 1:
-            return 0.
-        elif n_samples_leaf <= 2:
-            return 1.
-        else:
-            return 2. * (np.log(n_samples_leaf - 1.) + np.euler_gamma) - 2. * (
-                n_samples_leaf - 1.) / n_samples_leaf
-    else:
+    n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False)
 
-        n_samples_leaf_shape = n_samples_leaf.shape
-        n_samples_leaf = n_samples_leaf.reshape((1, -1))
-        average_path_length = np.zeros(n_samples_leaf.shape)
+    n_samples_leaf_shape = n_samples_leaf.shape
+    n_samples_leaf = n_samples_leaf.reshape((1, -1))
+    average_path_length = np.zeros(n_samples_leaf.shape)
 
-        mask_1 = n_samples_leaf <= 1
-        mask_2 = n_samples_leaf == 2
-        not_mask = ~np.logical_or(mask_1, mask_2)
+    mask_1 = n_samples_leaf <= 1
+    mask_2 = n_samples_leaf == 2
+    not_mask = ~np.logical_or(mask_1, mask_2)
 
-        average_path_length[mask_1] = 0.
-        average_path_length[mask_2] = 1.
-        average_path_length[not_mask] = 2. * (
-            np.log(n_samples_leaf[not_mask] - 1.) + np.euler_gamma) - 2. * (
-            n_samples_leaf[not_mask] - 1.) / n_samples_leaf[not_mask]
+    average_path_length[mask_1] = 0.
+    average_path_length[mask_2] = 1.
+    average_path_length[not_mask] = (
+        2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma)
+        - 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask]
+    )
 
-        return average_path_length.reshape(n_samples_leaf_shape)
+    return average_path_length.reshape(n_samples_leaf_shape)
diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 66d6c906c5e9e..e33547a44e41a 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -219,21 +219,22 @@ def test_iforest_performance():
     assert_greater(roc_auc_score(y_test, y_pred), 0.98)
 
 
-@pytest.mark.filterwarnings('ignore:threshold_ attribute')
-def test_iforest_works():
+@pytest.mark.parametrize("contamination", [0.25, "auto"])
+@pytest.mark.filterwarnings("ignore:threshold_ attribute")
+def test_iforest_works(contamination):
     # toy sample (the last two samples are outliers)
     X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
 
     # Test IsolationForest
-    for contamination in [0.25, "auto"]:
-        clf = IsolationForest(behaviour='new', random_state=rng,
-                              contamination=contamination)
-        clf.fit(X)
-        decision_func = - clf.decision_function(X)
-        pred = clf.predict(X)
-        # assert detect outliers:
-        assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
-        assert_array_equal(pred, 6 * [1] + 2 * [-1])
+    clf = IsolationForest(
+        behaviour="new", random_state=rng, contamination=contamination
+    )
+    clf.fit(X)
+    decision_func = -clf.decision_function(X)
+    pred = clf.predict(X)
+    # assert detect outliers:
+    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
+    assert_array_equal(pred, 6 * [1] + 2 * [-1])
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
@@ -263,17 +264,17 @@ def test_iforest_average_path_length():
     # It tests non-regression for #8549 which used the wrong formula
     # for average path length, strictly for the integer case
     # Updated to check average path length when input is <= 2 (issue #11839)
-
-    result_one = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.
-    result_two = 2. * (np.log(998.) + np.euler_gamma) - 2. * 998. / 999.
-    assert _average_path_length(0) == pytest.approx(0)
-    assert _average_path_length(1) == pytest.approx(0)
-    assert _average_path_length(2) == pytest.approx(1)
-    assert_allclose(_average_path_length(5), result_one)
-    assert_allclose(_average_path_length(999), result_two)
-    assert_allclose(_average_path_length(np.array([1, 2, 5, 999])),
-                    [0., 1., result_one, result_two])
-
+    result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
+    result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
+    assert_allclose(_average_path_length([0]), [0.0])
+    assert_allclose(_average_path_length([1]), [0.0])
+    assert_allclose(_average_path_length([2]), [1.0])
+    assert_allclose(_average_path_length([5]), [result_one])
+    assert_allclose(_average_path_length([999]), [result_two])
+    assert_allclose(
+        _average_path_length(np.array([1, 2, 5, 999])),
+        [0.0, 1.0, result_one, result_two],
+    )
     # _average_path_length is increasing
     avg_path_length = _average_path_length(np.arange(5))
     assert_array_equal(avg_path_length, np.sort(avg_path_length))
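
The heart of the patch is the switch in `score_samples` from a per-tree depth matrix of shape (n_samples, n_estimators) to a single running sum of shape (n_samples,): the mean over trees is deferred to the final score, where dividing the accumulated depths by `len(self.estimators_)` recovers `depths.mean(axis=1)`. A minimal standalone sketch of that equivalence follows; it assumes only numpy, and the per-tree depths and the normalizer `c` are random or fixed stand-ins rather than real tree outputs:

    import numpy as np

    rng = np.random.RandomState(0)
    n_samples, n_estimators = 5, 3

    # Stand-in per-tree path depths, laid out column by column as the old
    # code stored them in an (n_samples, n_estimators) matrix.
    per_tree_depths = rng.uniform(1.0, 10.0, size=(n_samples, n_estimators))

    # Old approach: keep every column in memory, then average across trees.
    mean_depth_old = per_tree_depths.mean(axis=1)

    # New approach: one running sum of shape (n_samples,), updated once per
    # tree; the division by n_estimators happens only in the score formula.
    depths = np.zeros(n_samples)
    for i in range(n_estimators):
        depths += per_tree_depths[:, i]

    # c is a stand-in for _average_path_length([max_samples_]); the anomaly
    # score is s(x) = 2 ** (-E[h(x)] / c(max_samples)).
    c = 10.0
    scores_old = 2 ** (-mean_depth_old / c)
    scores_new = 2 ** (-depths / (n_estimators * c))
    assert np.allclose(scores_old, scores_new)

The normalizer itself uses the closed form exercised by the updated test, c(n) = 2 * (log(n - 1) + euler_gamma) - 2 * (n - 1) / n for n > 2; `result_one` above is exactly c(5).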