Revert "BUG: fix average path length in iforest (#13251)" · xhluca/scikit-learn@7965662 · GitHub

Commit 7965662

Author: Xing

Revert "BUG: fix average path length in iforest (scikit-learn#13251)"

This reverts commit 00cea26.

1 parent 7918faa · commit 7965662

6 files changed: +26 −110 lines

doc/whats_new/v0.21.rst
Lines changed: 0 additions & 5 deletions

@@ -109,11 +109,6 @@ Support for Python 3.4 and below has been officially dropped.
   communication overhead. :issue:`12543` by :user:`Isaac Storch <istorch>`
   and `Olivier Grisel`_.
 
-- |Fix| Fixed the output of the average path length computed in
-  :class:`ensemble.IsolationForest` when the input is either 0, 1 or 2.
-  :issue:`13251` by :user:`Albert Thomas <albertcthomas>`
-  and :user:`joshuakennethjones <joshuakennethjones>`.
-
 - |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
   the gradients would be incorrectly computed in multiclass classification
   problems. :issue:`12715` by :user:`Nicolas Hug<NicolasHug>`.
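
For context, the quantity this changelog entry refers to is the average path length c(n) of an unsuccessful search in a binary search tree, which the code below writes as `2. * (np.log(n - 1.) + np.euler_gamma) - 2. * (n - 1.) / n`. Restated in standard notation (this restates the diff's formula; it is not text from the original page):

    c(n) = 2 * (ln(n - 1) + γ) - 2 * (n - 1) / n,   γ = Euler–Mascheroni constant

The reverted fix special-cased the small inputs as c(0) = c(1) = 0 and c(2) = 1; the restored code instead maps every n ≤ 1 to 1 and evaluates the formula from n = 2 on.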

sklearn/ensemble/iforest.py
Lines changed: 3 additions & 7 deletions

@@ -439,8 +439,6 @@ def _average_path_length(n_samples_leaf):
     """
     if isinstance(n_samples_leaf, INTEGER_TYPES):
         if n_samples_leaf <= 1:
-            return 0.
-        elif n_samples_leaf <= 2:
             return 1.
         else:
             return 2. * (np.log(n_samples_leaf - 1.) + np.euler_gamma) - 2. * (
@@ -452,12 +450,10 @@ def _average_path_length(n_samples_leaf):
         n_samples_leaf = n_samples_leaf.reshape((1, -1))
         average_path_length = np.zeros(n_samples_leaf.shape)
 
-        mask_1 = n_samples_leaf <= 1
-        mask_2 = n_samples_leaf == 2
-        not_mask = ~np.logical_or(mask_1, mask_2)
+        mask = (n_samples_leaf <= 1)
+        not_mask = np.logical_not(mask)
 
-        average_path_length[mask_1] = 0.
-        average_path_length[mask_2] = 1.
+        average_path_length[mask] = 1.
         average_path_length[not_mask] = 2. * (
             np.log(n_samples_leaf[not_mask] - 1.) + np.euler_gamma) - 2. * (
             n_samples_leaf[not_mask] - 1.) / n_samples_leaf[not_mask]
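
To see the behavioral difference this revert reintroduces, here is a minimal standalone sketch of the two scalar branches (the function names `_apl_restored` and `_apl_fixed` are invented for this comparison; the formulas and special cases come from the hunk above):

import numpy as np

def _apl_restored(n):
    # Behavior after this revert: every n <= 1 maps to 1.
    if n <= 1:
        return 1.
    return 2. * (np.log(n - 1.) + np.euler_gamma) - 2. * (n - 1.) / n

def _apl_fixed(n):
    # Behavior removed by this revert: 0. for n <= 1, 1. for n == 2.
    if n <= 1:
        return 0.
    elif n <= 2:
        return 1.
    return 2. * (np.log(n - 1.) + np.euler_gamma) - 2. * (n - 1.) / n

for n in (0, 1, 2, 5):
    print(n, _apl_restored(n), _apl_fixed(n))

At n = 2 the restored code falls through to the formula and yields 2 * (log(1) + γ) - 1 ≈ 0.154 rather than 1, which is exactly the discrepancy the reverted commit had fixed.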

sklearn/ensemble/tests/test_iforest.py
Lines changed: 5 additions & 14 deletions

@@ -19,7 +19,6 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import ignore_warnings
-from sklearn.utils.testing import assert_allclose
 
 from sklearn.model_selection import ParameterGrid
 from sklearn.ensemble import IsolationForest
@@ -263,22 +262,14 @@ def test_iforest_subsampled_features():
 def test_iforest_average_path_length():
     # It tests non-regression for #8549 which used the wrong formula
     # for average path length, strictly for the integer case
-    # Updated to check average path length when input is <= 2 (issue #11839)
 
     result_one = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.
     result_two = 2. * (np.log(998.) + np.euler_gamma) - 2. * 998. / 999.
-    assert _average_path_length(0) == pytest.approx(0)
-    assert _average_path_length(1) == pytest.approx(0)
-    assert _average_path_length(2) == pytest.approx(1)
-    assert_allclose(_average_path_length(5), result_one)
-    assert_allclose(_average_path_length(999), result_two)
-    assert_allclose(_average_path_length(np.array([1, 2, 5, 999])),
-                    [0., 1., result_one, result_two])
-
-    # _average_path_length is increasing
-    avg_path_length = _average_path_length(np.arange(5))
-    assert_array_equal(avg_path_length, np.sort(avg_path_length))
-
+    assert_almost_equal(_average_path_length(1), 1., decimal=10)
+    assert_almost_equal(_average_path_length(5), result_one, decimal=10)
+    assert_almost_equal(_average_path_length(999), result_two, decimal=10)
+    assert_array_almost_equal(_average_path_length(np.array([1, 5, 999])),
+                              [1., result_one, result_two], decimal=10)
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
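
The constants kept by the restored test are just the closed-form path lengths at n = 5 and n = 999; a quick standalone check of their values (the printed approximations are computed here, not part of the page):

import numpy as np

result_one = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.        # c(5)
result_two = 2. * (np.log(998.) + np.euler_gamma) - 2. * 998. / 999.  # c(999)
print(result_one)  # ~2.3270
print(result_two)  # ~12.9679

Note that the revert also swaps the assertion style back from `assert_allclose`/`pytest.approx` to `assert_almost_equal` with `decimal=10`.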

sklearn/neighbors/tests/test_lof.py
Lines changed: 0 additions & 18 deletions

@@ -21,7 +21,6 @@
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.estimator_checks import check_estimator
-from sklearn.utils.estimator_checks import check_outlier_corruption
 
 from sklearn.datasets import load_iris
 
@@ -253,20 +252,3 @@ def test_contamination_future_warning():
                          'default contamination parameter 0.1 will change '
                          'in version 0.22 to "auto"',
                          neighbors.LocalOutlierFactor().fit, X)
-
-
-def test_predicted_outlier_number():
-    # the number of predicted outliers should be equal to the number of
-    # expected outliers unless there are ties in the abnormality scores.
-    X = iris.data
-    n_samples = X.shape[0]
-    expected_outliers = 30
-    contamination = float(expected_outliers)/n_samples
-
-    clf = neighbors.LocalOutlierFactor(contamination=contamination)
-    y_pred = clf.fit_predict(X)
-
-    num_outliers = np.sum(y_pred != 1)
-    if num_outliers != expected_outliers:
-        y_dec = clf.negative_outlier_factor_
-        check_outlier_corruption(num_outliers, expected_outliers, y_dec)
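
The deleted test guarded against a real subtlety: a detector that thresholds an anomaly score at a contamination-derived cutoff can predict more (or fewer) outliers than requested when several scores tie at that cutoff. A small self-contained illustration of the effect (the scores and the percentile rule here are invented for the example, not LOF internals):

import numpy as np

# Scores for 6 samples; three of them tie at the would-be cutoff.
decision = np.array([-2.0, -1.0, -1.0, -1.0, 3.0, 4.0])
expected_outliers = 2
contamination = expected_outliers / len(decision)

# Threshold at the contamination quantile, as score-based detectors do:
threshold = np.percentile(decision, 100. * contamination)
y_pred = np.where(decision > threshold, 1, -1)

num_outliers = int(np.sum(y_pred != 1))
print(num_outliers)  # 4, not 2: the tie at the cutoff absorbs extra samples

This is why the deleted code only called `check_outlier_corruption` when the observed count deviated from the expected one.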

sklearn/utils/estimator_checks.py
Lines changed: 16 additions & 53 deletions

@@ -18,6 +18,7 @@
 from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_not_equal
+from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_in
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal
@@ -1524,29 +1525,8 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False):
         assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))
 
 
-def check_outlier_corruption(num_outliers, expected_outliers, decision):
-    # Check for deviation from the precise given contamination level that may
-    # be due to ties in the anomaly scores.
-    if num_outliers < expected_outliers:
-        start = num_outliers
-        end = expected_outliers + 1
-    else:
-        start = expected_outliers
-        end = num_outliers + 1
-
-    # ensure that all values in the 'critical area' are tied,
-    # leading to the observed discrepancy between provided
-    # and actual contamination levels.
-    sorted_decision = np.sort(decision)
-    msg = ('The number of predicted outliers is not equal to the expected '
-           'number of outliers and this difference is not explained by the '
-           'number of ties in the decision_function values')
-    assert len(np.unique(sorted_decision[start:end])) == 1, msg
-
-
 def check_outliers_train(name, estimator_orig, readonly_memmap=True):
-    n_samples = 300
-    X, _ = make_blobs(n_samples=n_samples, random_state=0)
+    X, _ = make_blobs(n_samples=300, random_state=0)
     X = shuffle(X, random_state=7)
 
     if readonly_memmap:
@@ -1567,15 +1547,17 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     assert_array_equal(np.unique(y_pred), np.array([-1, 1]))
 
     decision = estimator.decision_function(X)
-    scores = estimator.score_samples(X)
-    for output in [decision, scores]:
-        assert output.dtype == np.dtype('float')
-        assert output.shape == (n_samples,)
+    assert decision.dtype == np.dtype('float')
+
+    score = estimator.score_samples(X)
+    assert score.dtype == np.dtype('float')
 
     # raises error on malformed input for predict
     assert_raises(ValueError, estimator.predict, X.T)
 
     # decision_function agrees with predict
+    decision = estimator.decision_function(X)
+    assert decision.shape == (n_samples,)
     dec_pred = (decision >= 0).astype(np.int)
     dec_pred[dec_pred == 0] = -1
     assert_array_equal(dec_pred, y_pred)
@@ -1584,7 +1566,9 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     assert_raises(ValueError, estimator.decision_function, X.T)
 
     # decision_function is a translation of score_samples
-    y_dec = scores - estimator.offset_
+    y_scores = estimator.score_samples(X)
+    assert y_scores.shape == (n_samples,)
+    y_dec = y_scores - estimator.offset_
     assert_allclose(y_dec, decision)
 
     # raises error on malformed input for score_samples
@@ -1597,21 +1581,11 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     # set to 'auto'. This is true for the training set and cannot thus be
     # checked as follows for estimators with a novelty parameter such as
     # LocalOutlierFactor (tested in check_outliers_fit_predict)
-    expected_outliers = 30
-    contamination = expected_outliers / n_samples
+    contamination = 0.1
     estimator.set_params(contamination=contamination)
     estimator.fit(X)
     y_pred = estimator.predict(X)
-
-    num_outliers = np.sum(y_pred != 1)
-    # num_outliers should be equal to expected_outliers unless
-    # there are ties in the decision_function values. this can
-    # only be tested for estimators with a decision_function
-    # method, i.e. all estimators except LOF which is already
-    # excluded from this if branch.
-    if num_outliers != expected_outliers:
-        decision = estimator.decision_function(X)
-        check_outlier_corruption(num_outliers, expected_outliers, decision)
+    assert_almost_equal(np.mean(y_pred != 1), contamination)
 
     # raises error when contamination is a scalar and not in [0,1]
     for contamination in [-0.5, 2.3]:
@@ -2382,8 +2356,7 @@ def check_decision_proba_consistency(name, estimator_orig):
 def check_outliers_fit_predict(name, estimator_orig):
     # Check fit_predict for outlier detectors.
 
-    n_samples = 300
-    X, _ = make_blobs(n_samples=n_samples, random_state=0)
+    X, _ = make_blobs(n_samples=300, random_state=0)
     X = shuffle(X, random_state=7)
     n_samples, n_features = X.shape
     estimator = clone(estimator_orig)
@@ -2405,20 +2378,10 @@ def check_outliers_fit_predict(name, estimator_orig):
     if hasattr(estimator, "contamination"):
         # proportion of outliers equal to contamination parameter when not
         # set to 'auto'
-        expected_outliers = 30
-        contamination = float(expected_outliers)/n_samples
+        contamination = 0.1
         estimator.set_params(contamination=contamination)
         y_pred = estimator.fit_predict(X)
-
-        num_outliers = np.sum(y_pred != 1)
-        # num_outliers should be equal to expected_outliers unless
-        # there are ties in the decision_function values. this can
-        # only be tested for estimators with a decision_function
-        # method
-        if (num_outliers != expected_outliers and
-                hasattr(estimator, 'decision_function')):
-            decision = estimator.decision_function(X)
-            check_outlier_corruption(num_outliers, expected_outliers, decision)
+        assert_almost_equal(np.mean(y_pred != 1), contamination)
 
         # raises error when contamination is a scalar and not in [0,1]
         for contamination in [-0.5, 2.3]:
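
After the revert, the common estimator check goes back to asserting that the fraction of predicted outliers equals the `contamination` parameter exactly. A standalone sketch of what the restored assertion amounts to in practice (the estimator choice is illustrative; the data setup mirrors the check above):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.ensemble import IsolationForest
from sklearn.utils import shuffle

X, _ = make_blobs(n_samples=300, random_state=0)
X = shuffle(X, random_state=7)

contamination = 0.1
est = IsolationForest(contamination=contamination, random_state=0).fit(X)
y_pred = est.predict(X)

# The restored check: proportion of -1 predictions equals contamination.
np.testing.assert_almost_equal(np.mean(y_pred != 1), contamination)

Note the trade-off: this exact-proportion assertion is what the now-deleted `check_outlier_corruption` machinery had relaxed, since ties in the decision scores can make the observed proportion deviate legitimately.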

sklearn/utils/tests/test_estimator_checks.py
Lines changed: 2 additions & 13 deletions

@@ -9,16 +9,14 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.utils import deprecated
 from sklearn.utils import _joblib
-from sklearn.utils.testing import (assert_raises_regex,
-                                   assert_equal, ignore_warnings,
-                                   assert_warns, assert_raises)
+from sklearn.utils.testing import (assert_raises_regex, assert_equal,
+                                   ignore_warnings, assert_warns)
 from sklearn.utils.estimator_checks import check_estimator
 from sklearn.utils.estimator_checks import set_random_state
 from sklearn.utils.estimator_checks import set_checking_parameters
 from sklearn.utils.estimator_checks import check_estimators_unfitted
 from sklearn.utils.estimator_checks import check_fit_score_takes_y
 from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
-from sklearn.utils.estimator_checks import check_outlier_corruption
 from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
 from sklearn.linear_model import LinearRegression, SGDClassifier
 from sklearn.mixture import GaussianMixture
@@ -362,15 +360,6 @@ def test_check_estimator():
     check_estimator(MultiTaskElasticNet())
 
 
-def test_check_outlier_corruption():
-    # should raise AssertionError
-    decision = np.array([0., 1., 1.5, 2.])
-    assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision)
-    # should pass
-    decision = np.array([0., 1., 1., 2.])
-    check_outlier_corruption(1, 2, decision)
-
-
 def test_check_estimator_transformer_no_mixin():
     # check that TransformerMixin is not required for transformer tests to run
     assert_raises_regex(AttributeError, '.*fit_transform.*',
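
For reference, the deleted helper and its deleted test fit together as follows; this snippet is just the removed code from the hunks above, reassembled so it runs standalone:

import numpy as np

def check_outlier_corruption(num_outliers, expected_outliers, decision):
    # Check for deviation from the precise given contamination level
    # that may be due to ties in the anomaly scores.
    if num_outliers < expected_outliers:
        start, end = num_outliers, expected_outliers + 1
    else:
        start, end = expected_outliers, num_outliers + 1
    # All values in the 'critical area' must be tied, explaining the
    # discrepancy between provided and actual contamination levels.
    sorted_decision = np.sort(decision)
    assert len(np.unique(sorted_decision[start:end])) == 1

# passes: the critical area sorted_decision[1:3] == [1., 1.] is tied
check_outlier_corruption(1, 2, np.array([0., 1., 1., 2.]))

# raises AssertionError: sorted_decision[1:3] == [1., 1.5] is not tied
# check_outlier_corruption(1, 2, np.array([0., 1., 1.5, 2.]))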
