BUG: fix average path length in iforest (#13251) · xhluca/scikit-learn@00cea26 · GitHub
Commit 00cea26

albertcthomas authored and Xing committed
BUG: fix average path length in iforest (scikit-learn#13251)
1 parent 7450b55 commit 00cea26

File tree: 6 files changed (+110, −26 lines)

doc/whats_new/v0.21.rst

Lines changed: 5 additions & 0 deletions

@@ -109,6 +109,11 @@ Support for Python 3.4 and below has been officially dropped.
   communication overhead. :issue:`12543` by :user:`Isaac Storch <istorch>`
   and `Olivier Grisel`_.
 
+- |Fix| Fixed the output of the average path length computed in
+  :class:`ensemble.IsolationForest` when the input is either 0, 1 or 2.
+  :issue:`13251` by :user:`Albert Thomas <albertcthomas>`
+  and :user:`joshuakennethjones <joshuakennethjones>`.
+
 - |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
   the gradients would be incorrectly computed in multiclass classification
   problems. :issue:`12715` by :user:`Nicolas Hug<NicolasHug>`.

sklearn/ensemble/iforest.py

Lines changed: 7 additions & 3 deletions

@@ -439,6 +439,8 @@ def _average_path_length(n_samples_leaf):
     """
     if isinstance(n_samples_leaf, INTEGER_TYPES):
         if n_samples_leaf <= 1:
+            return 0.
+        elif n_samples_leaf <= 2:
             return 1.
         else:
             return 2. * (np.log(n_samples_leaf - 1.) + np.euler_gamma) - 2. * (

@@ -450,10 +452,12 @@ def _average_path_length(n_samples_leaf):
         n_samples_leaf = n_samples_leaf.reshape((1, -1))
         average_path_length = np.zeros(n_samples_leaf.shape)
 
-        mask = (n_samples_leaf <= 1)
-        not_mask = np.logical_not(mask)
+        mask_1 = n_samples_leaf <= 1
+        mask_2 = n_samples_leaf == 2
+        not_mask = ~np.logical_or(mask_1, mask_2)
 
-        average_path_length[mask] = 1.
+        average_path_length[mask_1] = 0.
+        average_path_length[mask_2] = 1.
         average_path_length[not_mask] = 2. * (
             np.log(n_samples_leaf[not_mask] - 1.) + np.euler_gamma) - 2. * (
             n_samples_leaf[not_mask] - 1.) / n_samples_leaf[not_mask]
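The quantity being fixed is the average path length c(n) of an unsuccessful search in a binary tree built on n samples, which the isolation forest paper approximates as c(n) = 2(ln(n - 1) + γ) - 2(n - 1)/n, with γ the Euler-Mascheroni constant. The special cases motivate the patch: a leaf holding 0 or 1 samples is never split (path length 0) and a leaf holding 2 samples needs exactly one split (path length 1), whereas the old code returned 1 for any n <= 1. A minimal standalone sketch of the corrected scalar branch, using plain Python numbers instead of scikit-learn's INTEGER_TYPES dispatch:

    import numpy as np

    def average_path_length(n):
        # corrected scalar logic, mirroring _average_path_length after this patch
        if n <= 1:
            return 0.   # 0 or 1 samples: the node is never split
        elif n == 2:
            return 1.   # 2 samples: exactly one split separates them
        # harmonic-number approximation: 2 * H(n - 1) - 2 * (n - 1) / n
        return 2. * (np.log(n - 1.) + np.euler_gamma) - 2. * (n - 1.) / n

    assert average_path_length(0) == 0.
    assert average_path_length(1) == 0.
    assert average_path_length(2) == 1.
    # non-decreasing in n, the property the new test asserts via np.sort
    values = [average_path_length(n) for n in range(6)]
    assert values == sorted(values)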

sklearn/ensemble/tests/test_iforest.py

Lines changed: 14 additions & 5 deletions

@@ -19,6 +19,7 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import ignore_warnings
+from sklearn.utils.testing import assert_allclose
 
 from sklearn.model_selection import ParameterGrid
 from sklearn.ensemble import IsolationForest

@@ -262,14 +263,22 @@ def test_iforest_subsampled_features():
 def test_iforest_average_path_length():
     # It tests non-regression for #8549 which used the wrong formula
     # for average path length, strictly for the integer case
+    # Updated to check average path length when input is <= 2 (issue #11839)
 
     result_one = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.
     result_two = 2. * (np.log(998.) + np.euler_gamma) - 2. * 998. / 999.
-    assert_almost_equal(_average_path_length(1), 1., decimal=10)
-    assert_almost_equal(_average_path_length(5), result_one, decimal=10)
-    assert_almost_equal(_average_path_length(999), result_two, decimal=10)
-    assert_array_almost_equal(_average_path_length(np.array([1, 5, 999])),
-                              [1., result_one, result_two], decimal=10)
+    assert _average_path_length(0) == pytest.approx(0)
+    assert _average_path_length(1) == pytest.approx(0)
+    assert _average_path_length(2) == pytest.approx(1)
+    assert_allclose(_average_path_length(5), result_one)
+    assert_allclose(_average_path_length(999), result_two)
+    assert_allclose(_average_path_length(np.array([1, 2, 5, 999])),
+                    [0., 1., result_one, result_two])
+
+    # _average_path_length is increasing
+    avg_path_length = _average_path_length(np.arange(5))
+    assert_array_equal(avg_path_length, np.sort(avg_path_length))
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')

sklearn/neighbors/tests/test_lof.py

Lines changed: 18 additions & 0 deletions

@@ -21,6 +21,7 @@
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.estimator_checks import check_estimator
+from sklearn.utils.estimator_checks import check_outlier_corruption
 
 from sklearn.datasets import load_iris
 

@@ -252,3 +253,20 @@ def test_contamination_future_warning():
                  'default contamination parameter 0.1 will change '
                  'in version 0.22 to "auto"',
                  neighbors.LocalOutlierFactor().fit, X)
+
+
+def test_predicted_outlier_number():
+    # the number of predicted outliers should be equal to the number of
+    # expected outliers unless there are ties in the abnormality scores.
+    X = iris.data
+    n_samples = X.shape[0]
+    expected_outliers = 30
+    contamination = float(expected_outliers)/n_samples
+
+    clf = neighbors.LocalOutlierFactor(contamination=contamination)
+    y_pred = clf.fit_predict(X)
+
+    num_outliers = np.sum(y_pred != 1)
+    if num_outliers != expected_outliers:
+        y_dec = clf.negative_outlier_factor_
+        check_outlier_corruption(num_outliers, expected_outliers, y_dec)
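Why LocalOutlierFactor fitted with contamination = 30/150 may not flag exactly 30 of the iris samples: the cutoff is a percentile of the abnormality scores, and when it lands inside a group of tied scores the outlier count shifts. A toy illustration with invented scores (the array and counts below are hypothetical, not taken from the test):

    import numpy as np

    # hypothetical abnormality scores with a tie straddling the cutoff
    scores = np.array([-2.0, -1.0, -1.0, -1.0, 0.5, 1.0])
    expected_outliers = 2
    contamination = expected_outliers / len(scores)

    # scikit-learn-style thresholding: samples scoring strictly below the
    # contamination percentile are predicted outliers
    offset = np.percentile(scores, 100. * contamination)   # -1.0, inside the tie
    num_outliers = int(np.sum(scores < offset))            # 1 instead of 2

    # the deviation is fully explained by the tie: the 'critical area'
    # sorted(scores)[1:3] holds a single repeated value
    assert num_outliers == 1
    assert len(np.unique(np.sort(scores)[num_outliers:expected_outliers + 1])) == 1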

sklearn/utils/estimator_checks.py

Lines changed: 53 additions & 16 deletions

@@ -18,7 +18,6 @@
 from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_not_equal
-from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_in
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal

@@ -1525,8 +1524,29 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False):
     assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))
 
 
+def check_outlier_corruption(num_outliers, expected_outliers, decision):
+    # Check for deviation from the precise given contamination level that may
+    # be due to ties in the anomaly scores.
+    if num_outliers < expected_outliers:
+        start = num_outliers
+        end = expected_outliers + 1
+    else:
+        start = expected_outliers
+        end = num_outliers + 1
+
+    # ensure that all values in the 'critical area' are tied,
+    # leading to the observed discrepancy between provided
+    # and actual contamination levels.
+    sorted_decision = np.sort(decision)
+    msg = ('The number of predicted outliers is not equal to the expected '
+           'number of outliers and this difference is not explained by the '
+           'number of ties in the decision_function values')
+    assert len(np.unique(sorted_decision[start:end])) == 1, msg
+
+
 def check_outliers_train(name, estimator_orig, readonly_memmap=True):
-    X, _ = make_blobs(n_samples=300, random_state=0)
+    n_samples = 300
+    X, _ = make_blobs(n_samples=n_samples, random_state=0)
     X = shuffle(X, random_state=7)
 
     if readonly_memmap:

@@ -1547,17 +1567,15 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     assert_array_equal(np.unique(y_pred), np.array([-1, 1]))
 
     decision = estimator.decision_function(X)
-    assert decision.dtype == np.dtype('float')
-
-    score = estimator.score_samples(X)
-    assert score.dtype == np.dtype('float')
+    scores = estimator.score_samples(X)
+    for output in [decision, scores]:
+        assert output.dtype == np.dtype('float')
+        assert output.shape == (n_samples,)
 
     # raises error on malformed input for predict
     assert_raises(ValueError, estimator.predict, X.T)
 
     # decision_function agrees with predict
-    decision = estimator.decision_function(X)
-    assert decision.shape == (n_samples,)
     dec_pred = (decision >= 0).astype(np.int)
     dec_pred[dec_pred == 0] = -1
     assert_array_equal(dec_pred, y_pred)

@@ -1566,9 +1584,7 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     assert_raises(ValueError, estimator.decision_function, X.T)
 
     # decision_function is a translation of score_samples
-    y_scores = estimator.score_samples(X)
-    assert y_scores.shape == (n_samples,)
-    y_dec = y_scores - estimator.offset_
+    y_dec = scores - estimator.offset_
     assert_allclose(y_dec, decision)
 
     # raises error on malformed input for score_samples

@@ -1581,11 +1597,21 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     # set to 'auto'. This is true for the training set and cannot thus be
     # checked as follows for estimators with a novelty parameter such as
     # LocalOutlierFactor (tested in check_outliers_fit_predict)
-    contamination = 0.1
+    expected_outliers = 30
+    contamination = expected_outliers / n_samples
     estimator.set_params(contamination=contamination)
     estimator.fit(X)
     y_pred = estimator.predict(X)
-    assert_almost_equal(np.mean(y_pred != 1), contamination)
+
+    num_outliers = np.sum(y_pred != 1)
+    # num_outliers should be equal to expected_outliers unless
+    # there are ties in the decision_function values. this can
+    # only be tested for estimators with a decision_function
+    # method, i.e. all estimators except LOF which is already
+    # excluded from this if branch.
+    if num_outliers != expected_outliers:
+        decision = estimator.decision_function(X)
+        check_outlier_corruption(num_outliers, expected_outliers, decision)
 
     # raises error when contamination is a scalar and not in [0,1]
     for contamination in [-0.5, 2.3]:

@@ -2356,7 +2382,8 @@ def check_decision_proba_consistency(name, estimator_orig):
 def check_outliers_fit_predict(name, estimator_orig):
     # Check fit_predict for outlier detectors.
 
-    X, _ = make_blobs(n_samples=300, random_state=0)
+    n_samples = 300
+    X, _ = make_blobs(n_samples=n_samples, random_state=0)
     X = shuffle(X, random_state=7)
     n_samples, n_features = X.shape
     estimator = clone(estimator_orig)

@@ -2378,10 +2405,20 @@ def check_outliers_fit_predict(name, estimator_orig):
     if hasattr(estimator, "contamination"):
         # proportion of outliers equal to contamination parameter when not
         # set to 'auto'
-        contamination = 0.1
+        expected_outliers = 30
+        contamination = float(expected_outliers)/n_samples
         estimator.set_params(contamination=contamination)
         y_pred = estimator.fit_predict(X)
-        assert_almost_equal(np.mean(y_pred != 1), contamination)
+
+        num_outliers = np.sum(y_pred != 1)
+        # num_outliers should be equal to expected_outliers unless
+        # there are ties in the decision_function values. this can
+        # only be tested for estimators with a decision_function
+        # method
+        if (num_outliers != expected_outliers and
+                hasattr(estimator, 'decision_function')):
+            decision = estimator.decision_function(X)
+            check_outlier_corruption(num_outliers, expected_outliers, decision)
 
     # raises error when contamination is a scalar and not in [0,1]
     for contamination in [-0.5, 2.3]:
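Read at the scale these checks use (numbers invented for illustration): if a detector flags 33 outliers where 30 were expected, start = 30 and end = 34, so the four sorted decision values at indices 30 through 33 must form a single tied value. A boolean restatement of the same slice logic:

    import numpy as np

    def explained_by_ties(num_outliers, expected_outliers, decision):
        # boolean version of check_outlier_corruption's critical-area test
        lo, hi = sorted([num_outliers, expected_outliers])
        return len(np.unique(np.sort(decision)[lo:hi + 1])) == 1

    # 30 distinct low scores, four tied values in the critical area, then
    # distinct high scores: predicting 33 instead of 30 is explained by the tie
    decision = np.concatenate([np.arange(30.) - 40.,   # indices 0..29
                               np.full(4, -0.5),       # indices 30..33, tied
                               np.arange(266.) + 1.])
    assert explained_by_ties(33, 30, decision)
    assert not explained_by_ties(33, 30, np.arange(300.))  # no ties, not explained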

sklearn/utils/tests/test_estimator_checks.py

Lines changed: 13 additions & 2 deletions

@@ -9,14 +9,16 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.utils import deprecated
 from sklearn.utils import _joblib
-from sklearn.utils.testing import (assert_raises_regex, assert_equal,
-                                   ignore_warnings, assert_warns)
+from sklearn.utils.testing import (assert_raises_regex,
+                                   assert_equal, ignore_warnings,
+                                   assert_warns, assert_raises)
 from sklearn.utils.estimator_checks import check_estimator
 from sklearn.utils.estimator_checks import set_random_state
 from sklearn.utils.estimator_checks import set_checking_parameters
 from sklearn.utils.estimator_checks import check_estimators_unfitted
 from sklearn.utils.estimator_checks import check_fit_score_takes_y
 from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
+from sklearn.utils.estimator_checks import check_outlier_corruption
 from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
 from sklearn.linear_model import LinearRegression, SGDClassifier
 from sklearn.mixture import GaussianMixture

@@ -360,6 +362,15 @@ def test_check_estimator():
     check_estimator(MultiTaskElasticNet())
 
 
+def test_check_outlier_corruption():
+    # should raise AssertionError
+    decision = np.array([0., 1., 1.5, 2.])
+    assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision)
+    # should pass
+    decision = np.array([0., 1., 1., 2.])
+    check_outlier_corruption(1, 2, decision)
+
+
 def test_check_estimator_transformer_no_mixin():
     # check that TransformerMixin is not required for transformer tests to run
     assert_raises_regex(AttributeError, '.*fit_transform.*',
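Tracing the two test cases through the slice logic (my own arithmetic, not part of the diff): with num_outliers = 1 and expected_outliers = 2, the critical area is sorted_decision[1:3].

    import numpy as np

    # failing case: sorted()[1:3] -> [1., 1.5], two distinct values, so the
    # discrepancy is not explained by ties and the assertion fires
    assert len(np.unique(np.sort(np.array([0., 1., 1.5, 2.]))[1:3])) == 2

    # passing case: sorted()[1:3] -> [1., 1.], a single tied value
    assert len(np.unique(np.sort(np.array([0., 1., 1., 2.]))[1:3])) == 1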
