-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
Improve IsolationForest average depth evaluation #16721
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -235,19 +235,22 @@ def test_iforest_subsampled_features(): | |
|
||
|
||
def test_iforest_average_path_length(): | ||
def harmonic_humber(n): | ||
return np.sum(1.0/np.arange(1, n+1)) | ||
# It tests non-regression for #8549 which used the wrong formula | ||
# for average path length, strictly for the integer case | ||
# Updated to check average path length when input is <= 2 (issue #11839) | ||
result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0 | ||
result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0 | ||
result_one = 2.0 * harmonic_humber(4.0) - 2.0 * 4.0 / 5.0 | ||
result_two = 2.0 * harmonic_humber(998.0) - 2.0 * 998.0 / 999.0 | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assert_allclose(_average_path_length([0]), [0.0]) | ||
assert_allclose(_average_path_length([1]), [0.0]) | ||
assert_allclose(_average_path_length([2]), [1.0]) | ||
assert_allclose(_average_path_length([5]), [result_one]) | ||
assert_allclose(_average_path_length([999]), [result_two]) | ||
assert_allclose(_average_path_length([5]), [result_one], rtol=0.1) | ||
assert_allclose(_average_path_length([999]), [result_two], rtol=1e-4) | ||
assert_allclose( | ||
_average_path_length(np.array([1, 2, 5, 999])), | ||
[0.0, 1.0, result_one, result_two], | ||
rtol=0.1 | ||
) | ||
# _average_path_length is increasing | ||
avg_path_length = _average_path_length(np.arange(5)) | ||
|
@@ -322,37 +325,3 @@ def test_iforest_deprecation(): | |
warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24" | ||
with pytest.warns(FutureWarning, match=warn_msg): | ||
iforest.fit(iris.data) | ||
|
||
|
||
def test_iforest_with_uniform_data(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We definitely don't want to remove this one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't insist on removing this, but #7141 should be resolved in a proper way then. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks for opening this PR, can you develop on why #7141 was not resolved in a proper way? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sorry just saw your previous comment - so you're saying it was fixed because the approximation error flipped side basically. I think we should go with @NicolasHug suggestion then - There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ngoix Do you expect me to do it in this PR or in the separate PR? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. another PR would be the proper way, but then we'd have to merge this one only afterwards because we definitely shouldn't remove this test. |
||
"""Test whether iforest predicts inliers when using uniform data""" | ||
|
||
# 2-d array of all 1s | ||
X = np.ones((100, 10)) | ||
iforest = IsolationForest() | ||
iforest.fit(X) | ||
|
||
rng = np.random.RandomState(0) | ||
|
||
assert all(iforest.predict(X) == 1) | ||
assert all(iforest.predict(rng.randn(100, 10)) == 1) | ||
assert all(iforest.predict(X + 1) == 1) | ||
assert all(iforest.predict(X - 1) == 1) | ||
|
||
# 2-d array where columns contain the same value across rows | ||
X = np.repeat(rng.randn(1, 10), 100, 0) | ||
iforest = IsolationForest() | ||
iforest.fit(X) | ||
|
||
assert all(iforest.predict(X) == 1) | ||
assert all(iforest.predict(rng.randn(100, 10)) == 1) | ||
assert all(iforest.predict(np.ones((100, 10))) == 1) | ||
|
||
# Single row | ||
X = rng.randn(1, 10) | ||
iforest = IsolationForest() | ||
iforest.fit(X) | ||
|
||
assert all(iforest.predict(X) == 1) | ||
assert all(iforest.predict(rng.randn(100, 10)) == 1) | ||
assert all(iforest.predict(np.ones((100, 10))) == 1) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.