diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py
index 501f2425541e8..833adb02cc831 100644
--- a/sklearn/ensemble/_iforest.py
+++ b/sklearn/ensemble/_iforest.py
@@ -501,9 +501,7 @@ def _average_path_length(n_samples_leaf):
 
     average_path_length[mask_1] = 0.
     average_path_length[mask_2] = 1.
-    average_path_length[not_mask] = (
-        2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma)
-        - 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask]
-    )
+    average_path_length[not_mask] = 2.0 * (np.log(n_samples_leaf[not_mask]) +
+                                           np.euler_gamma - 1.0)
 
     return average_path_length.reshape(n_samples_leaf_shape)
diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 3593bc0422ff7..ebbb74c8f7ba3 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -235,19 +235,22 @@ def test_iforest_subsampled_features():
 
 
 def test_iforest_average_path_length():
+    def harmonic_number(n):
+        return np.sum(1.0/np.arange(1, n+1))
     # It tests non-regression for #8549 which used the wrong formula
     # for average path length, strictly for the integer case
     # Updated to check average path length when input is <= 2 (issue #11839)
-    result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
-    result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
+    result_one = 2.0 * harmonic_number(4.0) - 2.0 * 4.0 / 5.0
+    result_two = 2.0 * harmonic_number(998.0) - 2.0 * 998.0 / 999.0
     assert_allclose(_average_path_length([0]), [0.0])
     assert_allclose(_average_path_length([1]), [0.0])
     assert_allclose(_average_path_length([2]), [1.0])
-    assert_allclose(_average_path_length([5]), [result_one])
-    assert_allclose(_average_path_length([999]), [result_two])
+    assert_allclose(_average_path_length([5]), [result_one], rtol=0.1)
+    assert_allclose(_average_path_length([999]), [result_two], rtol=1e-4)
     assert_allclose(
         _average_path_length(np.array([1, 2, 5, 999])),
         [0.0, 1.0, result_one, result_two],
+        rtol=0.1
     )
     # _average_path_length is increasing
     avg_path_length = _average_path_length(np.arange(5))
@@ -322,37 +325,3 @@ def test_iforest_deprecation():
     warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24"
     with pytest.warns(FutureWarning, match=warn_msg):
         iforest.fit(iris.data)
-
-
-def test_iforest_with_uniform_data():
-    """Test whether iforest predicts inliers when using uniform data"""
-
-    # 2-d array of all 1s
-    X = np.ones((100, 10))
-    iforest = IsolationForest()
-    iforest.fit(X)
-
-    rng = np.random.RandomState(0)
-
-    assert all(iforest.predict(X) == 1)
-    assert all(iforest.predict(rng.randn(100, 10)) == 1)
-    assert all(iforest.predict(X + 1) == 1)
-    assert all(iforest.predict(X - 1) == 1)
-
-    # 2-d array where columns contain the same value across rows
-    X = np.repeat(rng.randn(1, 10), 100, 0)
-    iforest = IsolationForest()
-    iforest.fit(X)
-
-    assert all(iforest.predict(X) == 1)
-    assert all(iforest.predict(rng.randn(100, 10)) == 1)
-    assert all(iforest.predict(np.ones((100, 10))) == 1)
-
-    # Single row
-    X = rng.randn(1, 10)
-    iforest = IsolationForest()
-    iforest.fit(X)
-
-    assert all(iforest.predict(X) == 1)
-    assert all(iforest.predict(rng.randn(100, 10)) == 1)
-    assert all(iforest.predict(np.ones((100, 10))) == 1)