Revert "BUG: fix average path length in iforest (#13251)" · xhluca/scikit-learn@7965662 · GitHub

Commit 7965662

Author: Xing

Revert "BUG: fix average path length in iforest (scikit-learn#13251)"

This reverts commit 00cea26.

1 parent 7918faa · commit 7965662

6 files changed: +26 −110 lines

doc/whats_new/v0.21.rst
Lines changed: 0 additions & 5 deletions

@@ -109,11 +109,6 @@ Support for Python 3.4 and below has been officially dropped.
   communication overhead. :issue:`12543` by :user:`Isaac Storch <istorch>`
   and `Olivier Grisel`_.
 
-- |Fix| Fixed the output of the average path length computed in
-  :class:`ensemble.IsolationForest` when the input is either 0, 1 or 2.
-  :issue:`13251` by :user:`Albert Thomas <albertcthomas>`
-  and :user:`joshuakennethjones <joshuakennethjones>`.
-
 - |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
   the gradients would be incorrectly computed in multiclass classification
   problems. :issue:`12715` by :user:`Nicolas Hug<NicolasHug>`.
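
For context, the quantity this changelog entry refers to is the average path length c(n) of an unsuccessful search in a binary search tree, which the code below writes as `2. * (np.log(n - 1.) + np.euler_gamma) - 2. * (n - 1.) / n`. Restated in standard notation (this restates the diff's formula; it is not text from the original page):

    c(n) = 2 * (ln(n - 1) + γ) - 2 * (n - 1) / n,   γ = Euler–Mascheroni constant

The reverted fix special-cased the small inputs as c(0) = c(1) = 0 and c(2) = 1; the restored code instead maps every n ≤ 1 to 1 and evaluates the formula from n = 2 on.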

sklearn/ensemble/iforest.py
Lines changed: 3 additions & 7 deletions

@@ -439,8 +439,6 @@ def _average_path_length(n_samples_leaf):
     """
     if isinstance(n_samples_leaf, INTEGER_TYPES):
         if n_samples_leaf <= 1:
-            return 0.
-        elif n_samples_leaf <= 2:
             return 1.
         else:
             return 2. * (np.log(n_samples_leaf - 1.) + np.euler_gamma) - 2. * (
@@ -452,12 +450,10 @@ def _average_path_length(n_samples_leaf):
         n_samples_leaf = n_samples_leaf.reshape((1, -1))
         average_path_length = np.zeros(n_samples_leaf.shape)
 
-        mask_1 = n_samples_leaf <= 1
-        mask_2 = n_samples_leaf == 2
-        not_mask = ~np.logical_or(mask_1, mask_2)
+        mask = (n_samples_leaf <= 1)
+        not_mask = np.logical_not(mask)
 
-        average_path_length[mask_1] = 0.
-        average_path_length[mask_2] = 1.
+        average_path_length[mask] = 1.
         average_path_length[not_mask] = 2. * (
             np.log(n_samples_leaf[not_mask] - 1.) + np.euler_gamma) - 2. * (
             n_samples_leaf[not_mask] - 1.) / n_samples_leaf[not_mask]
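
To see the behavioral difference this revert reintroduces, here is a minimal standalone sketch of the two scalar branches (the function names `_apl_restored` and `_apl_fixed` are invented for this comparison; the formulas and special cases come from the hunk above):

import numpy as np

def _apl_restored(n):
    # Behavior after this revert: every n <= 1 maps to 1.
    if n <= 1:
        return 1.
    return 2. * (np.log(n - 1.) + np.euler_gamma) - 2. * (n - 1.) / n

def _apl_fixed(n):
    # Behavior removed by this revert: 0. for n <= 1, 1. for n == 2.
    if n <= 1:
        return 0.
    elif n <= 2:
        return 1.
    return 2. * (np.log(n - 1.) + np.euler_gamma) - 2. * (n - 1.) / n

for n in (0, 1, 2, 5):
    print(n, _apl_restored(n), _apl_fixed(n))

At n = 2 the restored code falls through to the formula and yields 2 * (log(1) + γ) - 1 ≈ 0.154 rather than 1, which is exactly the discrepancy the reverted commit had fixed.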

sklearn/ensemble/tests/test_iforest.py
Lines changed: 5 additions & 14 deletions

@@ -19,7 +19,6 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import ignore_warnings
-from sklearn.utils.testing import assert_allclose
 
 from sklearn.model_selection import ParameterGrid
 from sklearn.ensemble import IsolationForest
@@ -263,22 +262,14 @@ def test_iforest_subsampled_features():
 def test_iforest_average_path_length():
     # It tests non-regression for #8549 which used the wrong formula
     # for average path length, strictly for the integer case
-    # Updated to check average path length when input is <= 2 (issue #11839)
 
     result_one = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.
     result_two = 2. * (np.log(998.) + np.euler_gamma) - 2. * 998. / 999.
-    assert _average_path_length(0) == pytest.approx(0)
-    assert _average_path_length(1) == pytest.approx(0)
-    assert _average_path_length(2) == pytest.approx(1)
-    assert_allclose(_average_path_length(5), result_one)
-    assert_allclose(_average_path_length(999), result_two)
-    assert_allclose(_average_path_length(np.array([1, 2, 5, 999])),
-                    [0., 1., result_one, result_two])
-
-    # _average_path_length is increasing
-    avg_path_length = _average_path_length(np.arange(5))
-    assert_array_equal(avg_path_length, np.sort(avg_path_length))
-
+    assert_almost_equal(_average_path_length(1), 1., decimal=10)
+    assert_almost_equal(_average_path_length(5), result_one, decimal=10)
+    assert_almost_equal(_average_path_length(999), result_two, decimal=10)
+    assert_array_almost_equal(_average_path_length(np.array([1, 5, 999])),
+                              [1., result_one, result_two], decimal=10)
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
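
The constants kept by the restored test are just the closed-form path lengths at n = 5 and n = 999; a quick standalone check of their values (the printed approximations are computed here, not part of the page):

import numpy as np

result_one = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.        # c(5)
result_two = 2. * (np.log(998.) + np.euler_gamma) - 2. * 998. / 999.  # c(999)
print(result_one)  # ~2.3270
print(result_two)  # ~12.9679

Note that the revert also swaps the assertion style back from `assert_allclose`/`pytest.approx` to `assert_almost_equal` with `decimal=10`.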

sklearn/neighbors/tests/test_lof.py
Lines changed: 0 additions & 18 deletions

@@ -21,7 +21,6 @@
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.estimator_checks import check_estimator
-from sklearn.utils.estimator_checks import check_outlier_corruption
 
 from sklearn.datasets import load_iris
 
@@ -253,20 +252,3 @@ def test_contamination_future_warning():
                          'default contamination parameter 0.1 will change '
                          'in version 0.22 to "auto"',
                          neighbors.LocalOutlierFactor().fit, X)
-
-
-def test_predicted_outlier_number():
-    # the number of predicted outliers should be equal to the number of
-    # expected outliers unless there are ties in the abnormality scores.
-    X = iris.data
-    n_samples = X.shape[0]
-    expected_outliers = 30
-    contamination = float(expected_outliers)/n_samples
-
-    clf = neighbors.LocalOutlierFactor(contamination=contamination)
-    y_pred = clf.fit_predict(X)
-
-    num_outliers = np.sum(y_pred != 1)
-    if num_outliers != expected_outliers:
-        y_dec = clf.negative_outlier_factor_
-        check_outlier_corruption(num_outliers, expected_outliers, y_dec)
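
The deleted test guarded against a real subtlety: a detector that thresholds an anomaly score at a contamination-derived cutoff can predict more (or fewer) outliers than requested when several scores tie at that cutoff. A small self-contained illustration of the effect (the scores and the percentile rule here are invented for the example, not LOF internals):

import numpy as np

# Scores for 6 samples; three of them tie at the would-be cutoff.
decision = np.array([-2.0, -1.0, -1.0, -1.0, 3.0, 4.0])
expected_outliers = 2
contamination = expected_outliers / len(decision)

# Threshold at the contamination quantile, as score-based detectors do:
threshold = np.percentile(decision, 100. * contamination)
y_pred = np.where(decision > threshold, 1, -1)

num_outliers = int(np.sum(y_pred != 1))
print(num_outliers)  # 4, not 2: the tie at the cutoff absorbs extra samples

This is why the deleted code only called `check_outlier_corruption` when the observed count deviated from the expected one.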

sklearn/utils/estimator_checks.py
Lines changed: 16 additions & 53 deletions

@@ -18,6 +18,7 @@
 from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_not_equal
+from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_in
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal
@@ -1524,29 +1525,8 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False):
         assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))
 
 
-def check_outlier_corruption(num_outliers, expected_outliers, decision):
-    # Check for deviation from the precise given contamination level that may
-    # be due to ties in the anomaly scores.
-    if num_outliers < expected_outliers:
-        start = num_outliers
-        end = expected_outliers + 1
-    else:
-        start = expected_outliers
-        end = num_outliers + 1
-
-    # ensure that all values in the 'critical area' are tied,
-    # leading to the observed discrepancy between provided
-    # and actual contamination levels.
-    sorted_decision = np.sort(decision)
-    msg = ('The number of predicted outliers is not equal to the expected '
-           'number of outliers and this difference is not explained by the '
-           'number of ties in the decision_function values')
-    assert len(np.unique(sorted_decision[start:end])) == 1, msg
-
-
 def check_outliers_train(name, estimator_orig, readonly_memmap=True):
-    n_samples = 300
-    X, _ = make_blobs(n_samples=n_samples, random_state=0)
+    X, _ = make_blobs(n_samples=300, random_state=0)
     X = shuffle(X, random_state=7)
 
     if readonly_memmap:
@@ -1567,15 +1547,17 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     assert_array_equal(np.unique(y_pred), np.array([-1, 1]))
 
     decision = estimator.decision_function(X)
-    scores = estimator.score_samples(X)
-    for output in [decision, scores]:
-        assert output.dtype == np.dtype('float')
-        assert output.shape == (n_samples,)
+    assert decision.dtype == np.dtype('float')
+
+    score = estimator.score_samples(X)
+    assert score.dtype == np.dtype('float')
 
     # raises error on malformed input for predict
     assert_raises(ValueError, estimator.predict, X.T)
 
     # decision_function agrees with predict
+    decision = estimator.decision_function(X)
+    assert decision.shape == (n_samples,)
     dec_pred = (decision >= 0).astype(np.int)
     dec_pred[dec_pred == 0] = -1
     assert_array_equal(dec_pred, y_pred)
@@ -1584,7 +1566,9 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     assert_raises(ValueError, estimator.decision_function, X.T)
 
     # decision_function is a translation of score_samples
-    y_dec = scores - estimator.offset_
+    y_scores = estimator.score_samples(X)
+    assert y_scores.shape == (n_samples,)
+    y_dec = y_scores - estimator.offset_
     assert_allclose(y_dec, decision)
 
     # raises error on malformed input for score_samples
@@ -1597,21 +1581,11 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     # set to 'auto'. This is true for the training set and cannot thus be
     # checked as follows for estimators with a novelty parameter such as
     # LocalOutlierFactor (tested in check_outliers_fit_predict)
-    expected_outliers = 30
-    contamination = expected_outliers / n_samples
+    contamination = 0.1
     estimator.set_params(contamination=contamination)
     estimator.fit(X)
     y_pred = estimator.predict(X)
-
-    num_outliers = np.sum(y_pred != 1)
-    # num_outliers should be equal to expected_outliers unless
-    # there are ties in the decision_function values. this can
-    # only be tested for estimators with a decision_function
-    # method, i.e. all estimators except LOF which is already
-    # excluded from this if branch.
-    if num_outliers != expected_outliers:
-        decision = estimator.decision_function(X)
-        check_outlier_corruption(num_outliers, expected_outliers, decision)
+    assert_almost_equal(np.mean(y_pred != 1), contamination)
 
     # raises error when contamination is a scalar and not in [0,1]
     for contamination in [-0.5, 2.3]:
@@ -2382,8 +2356,7 @@ def check_decision_proba_consistency(name, estimator_orig):
 def check_outliers_fit_predict(name, estimator_orig):
     # Check fit_predict for outlier detectors.
 
-    n_samples = 300
-    X, _ = make_blobs(n_samples=n_samples, random_state=0)
+    X, _ = make_blobs(n_samples=300, random_state=0)
     X = shuffle(X, random_state=7)
     n_samples, n_features = X.shape
     estimator = clone(estimator_orig)
@@ -2405,20 +2378,10 @@ def check_outliers_fit_predict(name, estimator_orig):
     if hasattr(estimator, "contamination"):
         # proportion of outliers equal to contamination parameter when not
         # set to 'auto'
-        expected_outliers = 30
-        contamination = float(expected_outliers)/n_samples
+        contamination = 0.1
         estimator.set_params(contamination=contamination)
         y_pred = estimator.fit_predict(X)
-
-        num_outliers = np.sum(y_pred != 1)
-        # num_outliers should be equal to expected_outliers unless
-        # there are ties in the decision_function values. this can
-        # only be tested for estimators with a decision_function
-        # method
-        if (num_outliers != expected_outliers and
-                hasattr(estimator, 'decision_function')):
-            decision = estimator.decision_function(X)
-            check_outlier_corruption(num_outliers, expected_outliers, decision)
+        assert_almost_equal(np.mean(y_pred != 1), contamination)
 
         # raises error when contamination is a scalar and not in [0,1]
         for contamination in [-0.5, 2.3]:
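
After the revert, the common estimator check goes back to asserting that the fraction of predicted outliers equals the `contamination` parameter exactly. A standalone sketch of what the restored assertion amounts to in practice (the estimator choice is illustrative; the data setup mirrors the check above):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.ensemble import IsolationForest
from sklearn.utils import shuffle

X, _ = make_blobs(n_samples=300, random_state=0)
X = shuffle(X, random_state=7)

contamination = 0.1
est = IsolationForest(contamination=contamination, random_state=0).fit(X)
y_pred = est.predict(X)

# The restored check: proportion of -1 predictions equals contamination.
np.testing.assert_almost_equal(np.mean(y_pred != 1), contamination)

Note the trade-off: this exact-proportion assertion is what the now-deleted `check_outlier_corruption` machinery had relaxed, since ties in the decision scores can make the observed proportion deviate legitimately.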

sklearn/utils/tests/test_estimator_checks.py
Lines changed: 2 additions & 13 deletions

@@ -9,16 +9,14 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.utils import deprecated
 from sklearn.utils import _joblib
-from sklearn.utils.testing import (assert_raises_regex,
-                                   assert_equal, ignore_warnings,
-                                   assert_warns, assert_raises)
+from sklearn.utils.testing import (assert_raises_regex, assert_equal,
+                                   ignore_warnings, assert_warns)
 from sklearn.utils.estimator_checks import check_estimator
 from sklearn.utils.estimator_checks import set_random_state
 from sklearn.utils.estimator_checks import set_checking_parameters
 from sklearn.utils.estimator_checks import check_estimators_unfitted
 from sklearn.utils.estimator_checks import check_fit_score_takes_y
 from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
-from sklearn.utils.estimator_checks import check_outlier_corruption
 from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
 from sklearn.linear_model import LinearRegression, SGDClassifier
 from sklearn.mixture import GaussianMixture
@@ -362,15 +360,6 @@ def test_check_estimator():
     check_estimator(MultiTaskElasticNet())
 
 
-def test_check_outlier_corruption():
-    # should raise AssertionError
-    decision = np.array([0., 1., 1.5, 2.])
-    assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision)
-    # should pass
-    decision = np.array([0., 1., 1., 2.])
-    check_outlier_corruption(1, 2, decision)
-
-
 def test_check_estimator_transformer_no_mixin():
     # check that TransformerMixin is not required for transformer tests to run
     assert_raises_regex(AttributeError, '.*fit_transform.*',
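
For reference, the deleted helper and its deleted test fit together as follows; this snippet is just the removed code from the hunks above, reassembled so it runs standalone:

import numpy as np

def check_outlier_corruption(num_outliers, expected_outliers, decision):
    # Check for deviation from the precise given contamination level
    # that may be due to ties in the anomaly scores.
    if num_outliers < expected_outliers:
        start, end = num_outliers, expected_outliers + 1
    else:
        start, end = expected_outliers, num_outliers + 1
    # All values in the 'critical area' must be tied, explaining the
    # discrepancy between provided and actual contamination levels.
    sorted_decision = np.sort(decision)
    assert len(np.unique(sorted_decision[start:end])) == 1

# passes: the critical area sorted_decision[1:3] == [1., 1.] is tied
check_outlier_corruption(1, 2, np.array([0., 1., 1., 2.]))

# raises AssertionError: sorted_decision[1:3] == [1., 1.5] is not tied
# check_outlier_corruption(1, 2, np.array([0., 1., 1.5, 2.]))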
