8000 Improve IsolationForest average depth evaluation by matwey · Pull Request #16721 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

Improve IsolationForest average depth evaluation #16721

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions sklearn/ensemble/_iforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,9 +501,7 @@ def _average_path_length(n_samples_leaf):

average_path_length[mask_1] = 0.
average_path_length[mask_2] = 1.
average_path_length[not_mask] = (
2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma)
- 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask]
)
average_path_length[not_mask] = 2.0 * (np.log(n_samples_leaf[not_mask])
+ np.euler_gamma - 1.0)
Comment on lines +504 to +505
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
average_path_length[not_mask] = 2.0 * (np.log(n_samples_leaf[not_mask])
+ np.euler_gamma - 1.0)
average_path_length[not_mask] = 2 * (np.log(n_samples_leaf[not_mask])
+ np.euler_gamma - 1)


return average_path_length.reshape(n_samples_leaf_shape)
45 changes: 7 additions & 38 deletions sklearn/ensemble/tests/test_iforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,19 +235,22 @@ def test_iforest_subsampled_features():


def test_iforest_average_path_length():
def harmonic_humber(n):
return np.sum(1.0/np.arange(1, n+1))
# It tests non-regression for #8549 which used the wrong formula
# for average path length, strictly for the integer case
# Updated to check average path length when input is <= 2 (issue #11839)
result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
result_one = 2.0 * harmonic_humber(4.0) - 2.0 * 4.0 / 5.0
result_two = 2.0 * harmonic_humber(998.0) - 2.0 * 998.0 / 999.0
assert_allclose(_average_path_length([0]), [0.0])
assert_allclose(_average_path_length([1]), [0.0])
assert_allclose(_average_path_length([2]), [1.0])
assert_allclose(_average_path_length([5]), [result_one])
assert_allclose(_average_path_length([999]), [result_two])
assert_allclose(_average_path_length([5]), [result_one], rtol=0.1)
assert_allclose(_average_path_length([999]), [result_two], rtol=1e-4)
assert_allclose(
_average_path_length(np.array([1, 2, 5, 999])),
[0.0, 1.0, result_one, result_two],
rtol=0.1
)
# _average_path_length is increasing
avg_path_length = _average_path_length(np.arange(5))
Expand Down Expand Up @@ -322,37 +325,3 @@ def test_iforest_deprecation():
warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24"
with pytest.warns(FutureWarning, match=warn_msg):
iforest.fit(iris.data)


def test_iforest_with_uniform_data():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We definitely don't want to remove this one.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't insist on removing this, but #7141 should be resolved in a proper way then.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for opening this PR, can you develop on why #7141 was not resolved in a proper way?
tbh I can't find what/where this issue was fixed, but this test seems to show it was fixed

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry just saw your previous comment - so you're saying it was fixed because the approximation error flipped side basically. I think we should go with @NicolasHug suggestion then - is_inlier[self.decision_function(X) < -1e-15] = -1 instead of is_inlier[self.decision_function(X) < 0] = -1

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ngoix Do you expect me to do it in this PR or in the separate PR?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

another PR would be the proper way, but then we'd have to merge this one only afterwards because we definitely shouldn't remove this test.
I'd be ok to do it in this PR if it is just this one-line change

"""Test whether iforest predicts inliers when using uniform data"""

# 2-d array of all 1s
X = np.ones((100, 10))
iforest = IsolationForest()
iforest.fit(X)

rng = np.random.RandomState(0)

assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(X + 1) == 1)
assert all(iforest.predict(X - 1) == 1)

# 2-d array where columns contain the same value across rows
X = np.repeat(rng.randn(1, 10), 100, 0)
iforest = IsolationForest()
iforest.fit(X)

assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)

# Single row
X = rng.randn(1, 10)
iforest = IsolationForest()
iforest.fit(X)

assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)
0