8000 FIX iForest average path length for small samples (issue #15724) · Konrad0/scikit-learn@9860306 · GitHub
[go: up one dir, main page]

Skip to content

Commit 9860306

Browse files
committed
FIX iForest average path length for small samples (issue scikit-learn#15724)
An improved approximation for the average path length in isolation forests is introduced, based on more a accurate harmonic number calculation. The routine _average_path_length() is mostly replaced.
1 parent 6b4f824 commit 9860306

File tree

1 file changed

+30
-7
lines changed

1 file changed

+30
-7
lines changed

sklearn/ensemble/_iforest.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,23 @@ def _more_tags(self):
463463
}
464464

465465

466+
def _average_path_length_small(n_samples_leaf):
467+
c = (0.0, 0.0, 1.0, 1.6666666666666666667,
468+
2.1666666666666666667, 2.5666666666666666667, 2.9000000000000000000, 3.1857142857142857143,
469+
3.4357142857142857143, 3.6579365079365079365, 3.8579365079365079365, 4.0397546897546897547,
470+
4.2064213564213564214, 4.3602675102675102675, 4.5031246531246531247, 4.6364579864579864580,
471+
4.7614579864579864580, 4.8791050452815158698, 4.9902161563926269809, 5.0954793142873638230,
472+
5.1954793142873638230, 5.2907174095254590611, 5.3816265004345499702, 5.4685830221736804049,
473+
5.5519163555070137383, 5.6319163555070137383, 5.7088394324300906613, 5.7829135065041647354,
474+
5.8543420779327361640, 5.9233075951741154743, 5.9899742618407821410, 6.0544903908730402055,
475+
6.1169903908730402055, 6.1775964514791008116, 6.2364199808908655175, 6.2935628380337226603,
476+
6.3491183935892782159, 6.4031724476433322699, 6.4558040265907006910, 6.5070860778727519730,
477+
6.5570860778727519730, 6.6058665656776300218, 6.6534856132966776409, 6.6999972412036543850,
478+
6.7454517866581998396, 6.7898962311026442840, 6.8333744919722095014, 6.8759276834615712036,
479+
6.9175943501282378702, 6.9584106766588501151, 6.9984106766588501151, 7.0376263629333599190)
480+
#return np.array(c)[n_samples_leaf]
481+
return np.array(c)[np.maximum(0, n_samples_leaf)]
482+
466483
def _average_path_length(n_samples_leaf):
467484
"""
468485
The average path length in a n_samples iTree, which is equal to
@@ -485,15 +502,21 @@ def _average_path_length(n_samples_leaf):
485502
n_samples_leaf = n_samples_leaf.reshape((1, -1))
486503
average_path_length = np.zeros(n_samples_leaf.shape)
487504

488-
mask_1 = n_samples_leaf <= 1
489-
mask_2 = n_samples_leaf == 2
490-
not_mask = ~np.logical_or(mask_1, mask_2)
505+
mask_small = n_samples_leaf < 52
506+
not_mask = ~mask_small
507+
508+
average_path_length[mask_small] = _average_path_length_small(n_samples_leaf[mask_small])
509+
510+
# Average path length given by 2*(H(n)-1), where H(n) is the nth harmonic number.
511+
# For the harmonic number calculation, see the following publications and references therein
512+
# Villarino, M. Ramanujan's Harmonic Number Expansion into Negative Powers of a Triangular Number. https://arxiv.org/abs/0707.3950
513+
# or
514+
# Wang, W. Harmonic Number Expansions of the Ramanujan Type. Results Math 73, 161 (2018). https://doi.org/10.1007/s00025-018-0920-8
491515

492-
average_path_length[mask_1] = 0.
493-
average_path_length[mask_2] = 1.
516+
tmp = 1.0/np.square(n_samples_leaf[not_mask])
494517
average_path_length[not_mask] = (
495-
2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma)
496-
- 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask]
518+
2.0 * (np.log(n_samples_leaf[not_mask]) - 1.0 + np.euler_gamma)
519+
+ 1.0/n_samples_leaf[not_mask] - tmp*(1.0/6.0 - tmp*(1.0/60.0 - tmp/126.0))
497520
)
498521

499522
return average_path_length.reshape(n_samples_leaf_shape)

0 commit comments

Comments
 (0)
0