3
3
IsolationForest benchmark
4
4
==========================================
5
5
A test of IsolationForest on classical anomaly detection datasets.
6
+
7
+ The benchmark is run as follows:
8
+ 1. The dataset is randomly split into a training set and a test set, both
9
+ assumed to contain outliers.
10
+ 2. Isolation Forest is trained on the training set.
11
+ 3. The ROC curve is computed on the test set using the ground-truth labels.
12
+
13
+ Note that the smtp dataset contains a very small proportion of outliers.
14
+ Therefore, depending on the seed of the random number generator, randomly
15
+ splitting the dataset might lead to a test set containing no outliers. In this
16
+ case a warning is raised when computing the ROC curve.
6
17
"""
7
18
8
19
from time import time
12
23
from sklearn .ensemble import IsolationForest
13
24
from sklearn .metrics import roc_curve , auc
14
25
from sklearn .datasets import fetch_kddcup99 , fetch_covtype , fetch_mldata
15
- from sklearn .preprocessing import MultiLabelBinarizer
26
+ from sklearn .preprocessing import LabelBinarizer
16
27
from sklearn .utils import shuffle as sh
17
28
18
29
print (__doc__ )
@@ -30,15 +41,14 @@ def print_outlier_ratio(y):
30
41
print ("----- Outlier ratio: %.5f" % (np .min (cnt ) / len (y )))
31
42
32
43
33
- np . random . seed ( 1 )
44
+ random_state = 1
34
45
fig_roc , ax_roc = plt .subplots (1 , 1 , figsize = (8 , 5 ))
35
46
36
47
# Set this to True to plot score histograms for each dataset:
37
48
with_decision_function_histograms = False
38
49
39
- # Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
40
- # datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
41
- datasets = ['http' , 'smtp' , 'SA' , 'SF' , 'forestcover' ]
50
+ # datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
51
+ datasets = ['http' , 'smtp' , 'SA' , 'SF' , 'shuttle' , 'forestcover' ]
42
52
43
53
# Loop over all datasets for fitting and scoring the estimator:
44
54
for dat in datasets :
@@ -47,15 +57,16 @@ def print_outlier_ratio(y):
47
57
print ('====== %s ======' % dat )
48
58
print ('--- Fetching data...' )
49
59
if dat in ['http' , 'smtp' , 'SF' , 'SA' ]:
50
- dataset = fetch_kddcup99 (subset = dat , shuffle = True , percent10 = True )
60
+ dataset = fetch_kddcup99 (subset = dat , shuffle = True ,
61
+ percent10 = True , random_state = random_state )
51
62
X = dataset .data
52
63
y = dataset .target
53
64
54
65
if dat == 'shuttle' :
55
66
dataset = fetch_mldata ('shuttle' )
56
67
X = dataset .data
57
68
y = dataset .target
58
- X , y = sh (X , y )
69
+ X , y = sh (X , y , random_state = random_state )
59
70
# we remove data with label 4
60
71
# normal data are then those of class 1
61
72
s = (y != 4 )
@@ -65,7 +76,7 @@ def print_outlier_ratio(y):
65
76
print ('----- ' )
66
77
67
78
if dat == 'forestcover' :
68
- dataset = fetch_covtype (shuffle = True )
79
+ dataset = fetch_covtype (shuffle = True , random_state = random_state )
69
80
X = dataset .data
70
81
y = dataset .target
71
82
# normal data are those with attribute 2
@@ -79,17 +90,17 @@ def print_outlier_ratio(y):
79
90
print ('--- Vectorizing data...' )
80
91
81
92
if dat == 'SF' :
82
- lb = MultiLabelBinarizer ()
83
- x1 = lb .fit_transform (X [:, 1 ])
93
+ lb = LabelBinarizer ()
94
+ x1 = lb .fit_transform (X [:, 1 ]. astype ( str ) )
84
95
X = np .c_ [X [:, :1 ], x1 , X [:, 2 :]]
85
96
y = (y != b'normal.' ).astype (int )
86
97
print_outlier_ratio (y )
87
98
88
99
if dat == 'SA' :
89
- lb = MultiLabelBinarizer ()
90
- x1 = lb .fit_transform (X [:, 1 ])
91
- x2 = lb .fit_transform (X [:, 2 ])
92
- x3 = lb .fit_transform (X [:, 3 ])
100
+ lb = LabelBinarizer ()
101
+ x1 = lb .fit_transform (X [:, 1 ]. astype ( str ) )
102
+ x2 = lb .fit_transform (X [:, 2 ]. astype ( str ) )
103
+ x3 = lb .fit_transform (X [:, 3 ]. astype ( str ) )
9E81
93
104
X = np .c_ [X [:, :1 ], x1 , x2 , x3 , X [:, 4 :]]
94
105
y = (y != b'normal.' ).astype (int )
95
106
print_outlier_ratio (y )
@@ -108,7 +119,7 @@ def print_outlier_ratio(y):
108
119
y_test = y [n_samples_train :]
109
120
110
121
print ('--- Fitting the IsolationForest estimator...' )
111
- model = IsolationForest (n_jobs = - 1 )
122
+ model = IsolationForest (n_jobs = - 1 , random_state = random_state )
112
123
tstart = time ()
113
124
model .fit (X_train )
114
125
fit_time = time () - tstart
0 commit comments