3
3
IsolationForest benchmark
4
4
==========================================
5
5
A test of IsolationForest on classical anomaly detection datasets.
6
+
7
+ The benchmark is run as follows:
8
+ 1. The dataset is randomly split into a training set and a test set, both
9
+ assumed to contain outliers.
10
+ 2. Isolation Forest is trained on the training set.
11
+ 3. The ROC curve is computed on the test set using the knowledge of the labels.
12
+
13
+ Note that the smtp dataset contains a very small proportion of outliers.
14
+ Therefore, depending on the seed of the random number generator, randomly
15
+ splitting the data set might lead to a test set containing no outliers. In this
16
+ case a warning is raised when computing the ROC curve.
6
17
"""
7
18
8
19
from time import time
@@ -30,14 +41,13 @@ def print_outlier_ratio(y):
30
41
print ("----- Outlier ratio: %.5f" % (np .min (cnt ) / len (y )))
31
42
32
43
33
- np . random . seed ( 1 )
44
+ SEED = 1
34
45
fig_roc , ax_roc = plt .subplots (1 , 1 , figsize = (8 , 5 ))
35
46
36
47
# Set this to true for plotting score histograms for each dataset:
37
48
with_decision_function_histograms = False
38
49
39
- # Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
40
- # datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
50
+ # datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
41
51
datasets = ['http' , 'smtp' , 'SA' , 'SF' , 'shuttle' , 'forestcover' ]
42
52
43
53
# Loop over all datasets for fitting and scoring the estimator:
@@ -47,15 +57,16 @@ def print_outlier_ratio(y):
47
57
print ('====== %s ======' % dat )
48
58
print ('--- Fetching data...' )
49
59
if dat in ['http' , 'smtp' , 'SF' , 'SA' ]:
50
- dataset = fetch_kddcup99 (subset = dat , shuffle = True , percent10 = True )
60
+ dataset = fetch_kddcup99 (subset = dat , shuffle = True ,
61
+ percent10 = True , random_state = SEED )
51
62
X = dataset .data
52
63
y = dataset .target
53
64
54
65
if dat == 'shuttle' :
55
66
dataset = fetch_mldata ('shuttle' )
56
67
X = dataset .data
57
68
y = dataset .target
58
- X , y = sh (X , y )
69
+ X , y = sh (X , y , random_state = SEED )
59
70
# we remove data with label 4
60
71
# normal data are then those of class 1
61
72
s = (y != 4 )
@@ -65,7 +76,7 @@ def print_outlier_ratio(y):
65
76
print ('----- ' )
66
77
67
78
if dat == 'forestcover' :
68
- dataset = fetch_covtype (shuffle = True )
79
+ dataset = fetch_covtype (shuffle = True , random_state = SEED )
69
80
X = dataset .data
70
81
y = dataset .target
71
82
# normal data are those with attribute 2
@@ -108,7 +119,7 @@ def print_outlier_ratio(y):
108
119
y_test = y [n_samples_train :]
109
120
110
121
print ('--- Fitting the IsolationForest estimator...' )
111
- model = IsolationForest (n_jobs = - 1 )
122
+ model = IsolationForest (n_jobs = - 1 , random_state = SEED )
112
123
tstart = time ()
113
124
model .fit (X_train )
114
125
fit_time = time () - tstart
0 commit comments