BUG: fix average path length in iforest (#13251) · xhluca/scikit-learn@00cea26 · GitHub
Commit 00cea26

albertcthomas authored and Xing committed
BUG: fix average path length in iforest (scikit-learn#13251)
1 parent 7450b55 commit 00cea26

File tree: 6 files changed (+110, −26 lines)

doc/whats_new/v0.21.rst

Lines changed: 5 additions & 0 deletions

@@ -109,6 +109,11 @@ Support for Python 3.4 and below has been officially dropped.
   communication overhead. :issue:`12543` by :user:`Isaac Storch <istorch>`
   and `Olivier Grisel`_.
 
+- |Fix| Fixed the output of the average path length computed in
+  :class:`ensemble.IsolationForest` when the input is either 0, 1 or 2.
+  :issue:`13251` by :user:`Albert Thomas <albertcthomas>`
+  and :user:`joshuakennethjones <joshuakennethjones>`.
+
 - |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
   the gradients would be incorrectly computed in multiclass classification
   problems. :issue:`12715` by :user:`Nicolas Hug<NicolasHug>`.

sklearn/ensemble/iforest.py

Lines changed: 7 additions & 3 deletions

@@ -439,6 +439,8 @@ def _average_path_length(n_samples_leaf):
     """
     if isinstance(n_samples_leaf, INTEGER_TYPES):
         if n_samples_leaf <= 1:
+            return 0.
+        elif n_samples_leaf <= 2:
             return 1.
         else:
             return 2. * (np.log(n_samples_leaf - 1.) + np.euler_gamma) - 2. * (

@@ -450,10 +452,12 @@ def _average_path_length(n_samples_leaf):
         n_samples_leaf = n_samples_leaf.reshape((1, -1))
         average_path_length = np.zeros(n_samples_leaf.shape)
 
-        mask = (n_samples_leaf <= 1)
-        not_mask = np.logical_not(mask)
+        mask_1 = n_samples_leaf <= 1
+        mask_2 = n_samples_leaf == 2
+        not_mask = ~np.logical_or(mask_1, mask_2)
 
-        average_path_length[mask] = 1.
+        average_path_length[mask_1] = 0.
+        average_path_length[mask_2] = 1.
         average_path_length[not_mask] = 2. * (
             np.log(n_samples_leaf[not_mask] - 1.) + np.euler_gamma) - 2. * (
             n_samples_leaf[not_mask] - 1.) / n_samples_leaf[not_mask]
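The quantity being fixed is the average path length c(n) of an unsuccessful search in a binary tree built on n samples, which the isolation forest paper approximates as c(n) = 2(ln(n - 1) + γ) - 2(n - 1)/n, with γ the Euler-Mascheroni constant. The special cases motivate the patch: a leaf holding 0 or 1 samples is never split (path length 0) and a leaf holding 2 samples needs exactly one split (path length 1), whereas the old code returned 1 for any n <= 1. A minimal standalone sketch of the corrected scalar branch, using plain Python numbers instead of scikit-learn's INTEGER_TYPES dispatch:

    import numpy as np

    def average_path_length(n):
        # corrected scalar logic, mirroring _average_path_length after this patch
        if n <= 1:
            return 0.   # 0 or 1 samples: the node is never split
        elif n == 2:
            return 1.   # 2 samples: exactly one split separates them
        # harmonic-number approximation: 2 * H(n - 1) - 2 * (n - 1) / n
        return 2. * (np.log(n - 1.) + np.euler_gamma) - 2. * (n - 1.) / n

    assert average_path_length(0) == 0.
    assert average_path_length(1) == 0.
    assert average_path_length(2) == 1.
    # non-decreasing in n, the property the new test asserts via np.sort
    values = [average_path_length(n) for n in range(6)]
    assert values == sorted(values)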

sklearn/ensemble/tests/test_iforest.py

Lines changed: 14 additions & 5 deletions

@@ -19,6 +19,7 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import ignore_warnings
+from sklearn.utils.testing import assert_allclose
 
 from sklearn.model_selection import ParameterGrid
 from sklearn.ensemble import IsolationForest

@@ -262,14 +263,22 @@ def test_iforest_subsampled_features():
 def test_iforest_average_path_length():
     # It tests non-regression for #8549 which used the wrong formula
     # for average path length, strictly for the integer case
+    # Updated to check average path length when input is <= 2 (issue #11839)
 
     result_one = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.
     result_two = 2. * (np.log(998.) + np.euler_gamma) - 2. * 998. / 999.
-    assert_almost_equal(_average_path_length(1), 1., decimal=10)
-    assert_almost_equal(_average_path_length(5), result_one, decimal=10)
-    assert_almost_equal(_average_path_length(999), result_two, decimal=10)
-    assert_array_almost_equal(_average_path_length(np.array([1, 5, 999])),
-                              [1., result_one, result_two], decimal=10)
+    assert _average_path_length(0) == pytest.approx(0)
+    assert _average_path_length(1) == pytest.approx(0)
+    assert _average_path_length(2) == pytest.approx(1)
+    assert_allclose(_average_path_length(5), result_one)
+    assert_allclose(_average_path_length(999), result_two)
+    assert_allclose(_average_path_length(np.array([1, 2, 5, 999])),
+                    [0., 1., result_one, result_two])
+
+    # _average_path_length is increasing
+    avg_path_length = _average_path_length(np.arange(5))
+    assert_array_equal(avg_path_length, np.sort(avg_path_length))
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')

sklearn/neighbors/tests/test_lof.py

Lines changed: 18 additions & 0 deletions

@@ -21,6 +21,7 @@
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.estimator_checks import check_estimator
+from sklearn.utils.estimator_checks import check_outlier_corruption
 
 from sklearn.datasets import load_iris
 

@@ -252,3 +253,20 @@ def test_contamination_future_warning():
                  'default contamination parameter 0.1 will change '
                  'in version 0.22 to "auto"',
                  neighbors.LocalOutlierFactor().fit, X)
+
+
+def test_predicted_outlier_number():
+    # the number of predicted outliers should be equal to the number of
+    # expected outliers unless there are ties in the abnormality scores.
+    X = iris.data
+    n_samples = X.shape[0]
+    expected_outliers = 30
+    contamination = float(expected_outliers)/n_samples
+
+    clf = neighbors.LocalOutlierFactor(contamination=contamination)
+    y_pred = clf.fit_predict(X)
+
+    num_outliers = np.sum(y_pred != 1)
+    if num_outliers != expected_outliers:
+        y_dec = clf.negative_outlier_factor_
+        check_outlier_corruption(num_outliers, expected_outliers, y_dec)
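Why LocalOutlierFactor fitted with contamination = 30/150 may not flag exactly 30 of the iris samples: the cutoff is a percentile of the abnormality scores, and when it lands inside a group of tied scores the outlier count shifts. A toy illustration with invented scores (the array and counts below are hypothetical, not taken from the test):

    import numpy as np

    # hypothetical abnormality scores with a tie straddling the cutoff
    scores = np.array([-2.0, -1.0, -1.0, -1.0, 0.5, 1.0])
    expected_outliers = 2
    contamination = expected_outliers / len(scores)

    # scikit-learn-style thresholding: samples scoring strictly below the
    # contamination percentile are predicted outliers
    offset = np.percentile(scores, 100. * contamination)   # -1.0, inside the tie
    num_outliers = int(np.sum(scores < offset))            # 1 instead of 2

    # the deviation is fully explained by the tie: the 'critical area'
    # sorted(scores)[1:3] holds a single repeated value
    assert num_outliers == 1
    assert len(np.unique(np.sort(scores)[num_outliers:expected_outliers + 1])) == 1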

sklearn/utils/estimator_checks.py

Lines changed: 53 additions & 16 deletions

@@ -18,7 +18,6 @@
 from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_not_equal
-from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_in
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal

@@ -1525,8 +1524,29 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False):
     assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))
 
 
+def check_outlier_corruption(num_outliers, expected_outliers, decision):
+    # Check for deviation from the precise given contamination level that may
+    # be due to ties in the anomaly scores.
+    if num_outliers < expected_outliers:
+        start = num_outliers
+        end = expected_outliers + 1
+    else:
+        start = expected_outliers
+        end = num_outliers + 1
+
+    # ensure that all values in the 'critical area' are tied,
+    # leading to the observed discrepancy between provided
+    # and actual contamination levels.
+    sorted_decision = np.sort(decision)
+    msg = ('The number of predicted outliers is not equal to the expected '
+           'number of outliers and this difference is not explained by the '
+           'number of ties in the decision_function values')
+    assert len(np.unique(sorted_decision[start:end])) == 1, msg
+
+
 def check_outliers_train(name, estimator_orig, readonly_memmap=True):
-    X, _ = make_blobs(n_samples=300, random_state=0)
+    n_samples = 300
+    X, _ = make_blobs(n_samples=n_samples, random_state=0)
     X = shuffle(X, random_state=7)
 
     if readonly_memmap:

@@ -1547,17 +1567,15 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     assert_array_equal(np.unique(y_pred), np.array([-1, 1]))
 
     decision = estimator.decision_function(X)
-    assert decision.dtype == np.dtype('float')
-
-    score = estimator.score_samples(X)
-    assert score.dtype == np.dtype('float')
+    scores = estimator.score_samples(X)
+    for output in [decision, scores]:
+        assert output.dtype == np.dtype('float')
+        assert output.shape == (n_samples,)
 
     # raises error on malformed input for predict
     assert_raises(ValueError, estimator.predict, X.T)
 
     # decision_function agrees with predict
-    decision = estimator.decision_function(X)
-    assert decision.shape == (n_samples,)
     dec_pred = (decision >= 0).astype(np.int)
     dec_pred[dec_pred == 0] = -1
     assert_array_equal(dec_pred, y_pred)

@@ -1566,9 +1584,7 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     assert_raises(ValueError, estimator.decision_function, X.T)
 
     # decision_function is a translation of score_samples
-    y_scores = estimator.score_samples(X)
-    assert y_scores.shape == (n_samples,)
-    y_dec = y_scores - estimator.offset_
+    y_dec = scores - estimator.offset_
     assert_allclose(y_dec, decision)
 
     # raises error on malformed input for score_samples

@@ -1581,11 +1597,21 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True):
     # set to 'auto'. This is true for the training set and cannot thus be
     # checked as follows for estimators with a novelty parameter such as
     # LocalOutlierFactor (tested in check_outliers_fit_predict)
-    contamination = 0.1
+    expected_outliers = 30
+    contamination = expected_outliers / n_samples
     estimator.set_params(contamination=contamination)
     estimator.fit(X)
     y_pred = estimator.predict(X)
-    assert_almost_equal(np.mean(y_pred != 1), contamination)
+
+    num_outliers = np.sum(y_pred != 1)
+    # num_outliers should be equal to expected_outliers unless
+    # there are ties in the decision_function values. this can
+    # only be tested for estimators with a decision_function
+    # method, i.e. all estimators except LOF which is already
+    # excluded from this if branch.
+    if num_outliers != expected_outliers:
+        decision = estimator.decision_function(X)
+        check_outlier_corruption(num_outliers, expected_outliers, decision)
 
     # raises error when contamination is a scalar and not in [0,1]
     for contamination in [-0.5, 2.3]:

@@ -2356,7 +2382,8 @@ def check_decision_proba_consistency(name, estimator_orig):
 def check_outliers_fit_predict(name, estimator_orig):
     # Check fit_predict for outlier detectors.
 
-    X, _ = make_blobs(n_samples=300, random_state=0)
+    n_samples = 300
+    X, _ = make_blobs(n_samples=n_samples, random_state=0)
     X = shuffle(X, random_state=7)
     n_samples, n_features = X.shape
     estimator = clone(estimator_orig)

@@ -2378,10 +2405,20 @@ def check_outliers_fit_predict(name, estimator_orig):
     if hasattr(estimator, "contamination"):
         # proportion of outliers equal to contamination parameter when not
         # set to 'auto'
-        contamination = 0.1
+        expected_outliers = 30
+        contamination = float(expected_outliers)/n_samples
         estimator.set_params(contamination=contamination)
         y_pred = estimator.fit_predict(X)
-        assert_almost_equal(np.mean(y_pred != 1), contamination)
+
+        num_outliers = np.sum(y_pred != 1)
+        # num_outliers should be equal to expected_outliers unless
+        # there are ties in the decision_function values. this can
+        # only be tested for estimators with a decision_function
+        # method
+        if (num_outliers != expected_outliers and
+                hasattr(estimator, 'decision_function')):
+            decision = estimator.decision_function(X)
+            check_outlier_corruption(num_outliers, expected_outliers, decision)
 
     # raises error when contamination is a scalar and not in [0,1]
     for contamination in [-0.5, 2.3]:
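Read at the scale these checks use (numbers invented for illustration): if a detector flags 33 outliers where 30 were expected, start = 30 and end = 34, so the four sorted decision values at indices 30 through 33 must form a single tied value. A boolean restatement of the same slice logic:

    import numpy as np

    def explained_by_ties(num_outliers, expected_outliers, decision):
        # boolean version of check_outlier_corruption's critical-area test
        lo, hi = sorted([num_outliers, expected_outliers])
        return len(np.unique(np.sort(decision)[lo:hi + 1])) == 1

    # 30 distinct low scores, four tied values in the critical area, then
    # distinct high scores: predicting 33 instead of 30 is explained by the tie
    decision = np.concatenate([np.arange(30.) - 40.,   # indices 0..29
                               np.full(4, -0.5),       # indices 30..33, tied
                               np.arange(266.) + 1.])
    assert explained_by_ties(33, 30, decision)
    assert not explained_by_ties(33, 30, np.arange(300.))  # no ties, not explained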

sklearn/utils/tests/test_estimator_checks.py

Lines changed: 13 additions & 2 deletions

@@ -9,14 +9,16 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.utils import deprecated
 from sklearn.utils import _joblib
-from sklearn.utils.testing import (assert_raises_regex, assert_equal,
-                                   ignore_warnings, assert_warns)
+from sklearn.utils.testing import (assert_raises_regex,
+                                   assert_equal, ignore_warnings,
+                                   assert_warns, assert_raises)
 from sklearn.utils.estimator_checks import check_estimator
 from sklearn.utils.estimator_checks import set_random_state
 from sklearn.utils.estimator_checks import set_checking_parameters
 from sklearn.utils.estimator_checks import check_estimators_unfitted
 from sklearn.utils.estimator_checks import check_fit_score_takes_y
 from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
+from sklearn.utils.estimator_checks import check_outlier_corruption
 from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
 from sklearn.linear_model import LinearRegression, SGDClassifier
 from sklearn.mixture import GaussianMixture

@@ -360,6 +362,15 @@ def test_check_estimator():
     check_estimator(MultiTaskElasticNet())
 
 
+def test_check_outlier_corruption():
+    # should raise AssertionError
+    decision = np.array([0., 1., 1.5, 2.])
+    assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision)
+    # should pass
+    decision = np.array([0., 1., 1., 2.])
+    check_outlier_corruption(1, 2, decision)
+
+
 def test_check_estimator_transformer_no_mixin():
     # check that TransformerMixin is not required for transformer tests to run
     assert_raises_regex(AttributeError, '.*fit_transform.*',
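Tracing the two test cases through the slice logic (my own arithmetic, not part of the diff): with num_outliers = 1 and expected_outliers = 2, the critical area is sorted_decision[1:3].

    import numpy as np

    # failing case: sorted()[1:3] -> [1., 1.5], two distinct values, so the
    # discrepancy is not explained by ties and the assertion fires
    assert len(np.unique(np.sort(np.array([0., 1., 1.5, 2.]))[1:3])) == 2

    # passing case: sorted()[1:3] -> [1., 1.], a single tied value
    assert len(np.unique(np.sort(np.array([0., 1., 1., 2.]))[1:3])) == 1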
