[MRG+1] Fix LOF and Isolation benchmarks (#9798) · maskani-moh/scikit-learn@e9d5a24 · GitHub
[go: up one dir, main page]

Skip to content

Commit e9d5a24

Browse files
albertcthomas authored and maskani-moh committed
[MRG+1] Fix LOF and Isolation benchmarks (scikit-learn#9798)
1 parent 564dd6c commit e9d5a24

File tree

2 files changed

+54
-56
lines changed

2 files changed

+54
-56
lines changed

benchmarks/bench_isolation_forest.py

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,17 @@
33
IsolationForest benchmark
44
==========================================
55
A test of IsolationForest on classical anomaly detection datasets.
6+
7+
The benchmark is run as follows:
8+
1. The dataset is randomly split into a training set and a test set, both
9+
assumed to contain outliers.
10+
2. Isolation Forest is trained on the training set.
11+
3. The ROC curve is computed on the test set using the knowledge of the labels.
12+
13+
Note that the smtp dataset contains a very small proportion of outliers.
14+
Therefore, depending on the seed of the random number generator, randomly
15+
splitting the data set might lead to a test set containing no outliers. In this
16+
case a warning is raised when computing the ROC curve.
617
"""
718

819
from time import time
@@ -12,7 +23,7 @@
1223
from sklearn.ensemble import IsolationForest
1324
from sklearn.metrics import roc_curve, auc
1425
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
15-
from sklearn.preprocessing import MultiLabelBinarizer
26+
from sklearn.preprocessing import LabelBinarizer
1627
from sklearn.utils import shuffle as sh
1728

1829
print(__doc__)
@@ -30,15 +41,14 @@ def print_outlier_ratio(y):
3041
print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))
3142

3243

33-
np.random.seed(1)
44+
random_state = 1
3445
fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))
3546

3647
# Set this to true for plotting score histograms for each dataset:
3748
with_decision_function_histograms = False
3849

39-
# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
40-
# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
41-
datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
50+
# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
51+
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
4252

4353
# Loop over all datasets for fitting and scoring the estimator:
4454
for dat in datasets:
@@ -47,15 +57,16 @@ def print_outlier_ratio(y):
4757
print('====== %s ======' % dat)
4858
print('--- Fetching data...')
4959
if dat in ['http', 'smtp', 'SF', 'SA']:
50-
dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
60+
dataset = fetch_kddcup99(subset=dat, shuffle=True,
61+
percent10=True, random_state=random_state)
5162
X = dataset.data
5263
y = dataset.target
5364

5465
if dat == 'shuttle':
5566
dataset = fetch_mldata('shuttle')
5667
X = dataset.data
5768
y = dataset.target
58-
X, y = sh(X, y)
69+
X, y = sh(X, y, random_state=random_state)
5970
# we remove data with label 4
6071
# normal data are then those of class 1
6172
s = (y != 4)
@@ -65,7 +76,7 @@ def print_outlier_ratio(y):
6576
print('----- ')
6677

6778
if dat == 'forestcover':
68-
dataset = fetch_covtype(shuffle=True)
79+
dataset = fetch_covtype(shuffle=True, random_state=random_state)
6980
X = dataset.data
7081
y = dataset.target
7182
# normal data are those with attribute 2
@@ -79,17 +90,17 @@ def print_outlier_ratio(y):
7990
print('--- Vectorizing data...')
8091

8192
if dat == 'SF':
82-
lb = MultiLabelBinarizer()
83-
x1 = lb.fit_transform(X[:, 1])
93+
lb = LabelBinarizer()
94+
x1 = lb.fit_transform(X[:, 1].astype(str))
8495
X = np.c_[X[:, :1], x1, X[:, 2:]]
8596
y = (y != b'normal.').astype(int)
8697
print_outlier_ratio(y)
8798

8899
if dat == 'SA':
89-
lb = MultiLabelBinarizer()
90-
x1 = lb.fit_transform(X[:, 1])
91-
x2 = lb.fit_transform(X[:, 2])
92-
x3 = lb.fit_transform(X[:, 3])
100+
lb = LabelBinarizer()
101+
x1 = lb.fit_transform(X[:, 1].astype(str))
102+
x2 = lb.fit_transform(X[:, 2].astype(str))
103+
x3 = lb.fit_transform(X[:, 3].astype(str))
93104
X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
94105
y = (y != b'normal.').astype(int)
95106
print_outlier_ratio(y)
@@ -108,7 +119,7 @@ def print_outlier_ratio(y):
108119
y_test = y[n_samples_train:]
109120

110121
print('--- Fitting the IsolationForest estimator...')
111-
model = IsolationForest(n_jobs=-1)
122+
model = IsolationForest(n_jobs=-1, random_state=random_state)
112123
tstart = time()
113124
model.fit(X_train)
114125
fit_time = time() - tstart

benchmarks/bench_lof.py

Lines changed: 28 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@
55
66
A test of LocalOutlierFactor on classical anomaly detection datasets.
77
8+
Note that LocalOutlierFactor is not meant to predict on a test set and its
9+
performance is assessed in an outlier detection context:
10+
1. The model is trained on the whole dataset which is assumed to contain
11+
outliers.
12+
2. The ROC curve is computed on the same dataset using the knowledge of the
13+
labels.
14+
In this context there is no need to shuffle the dataset because the model
15+
is trained and tested on the whole dataset. The randomness of this benchmark
16+
is only caused by the random selection of anomalies in the SA dataset.
17+
818
"""
919

1020
from time import time
@@ -14,31 +24,28 @@
1424
from sklearn.metrics import roc_curve, auc
1525
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
1626
from sklearn.preprocessing import LabelBinarizer
17-
from sklearn.utils import shuffle as sh
1827

1928
print(__doc__)
2029

21-
np.random.seed(2)
30+
random_state = 2 # to control the random selection of anomalies in SA
2231

2332
# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
24-
datasets = ['shuttle']
25-
26-
novelty_detection = True # if False, training set polluted by outliers
33+
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
2734

35+
plt.figure()
2836
for dataset_name in datasets:
2937
# loading and vectorization
3038
print('loading data')
3139
if dataset_name in ['http', 'smtp', 'SA', 'SF']:
32-
dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
33-
percent10=False)
40+
dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
41+
random_state=random_state)
3442
X = dataset.data
3543
y = dataset.target
3644

3745
if dataset_name == 'shuttle':
3846
dataset = fetch_mldata('shuttle')
3947
X = dataset.data
4048
y = dataset.target
41-
X, y = sh(X, y)
4249
# we remove data with label 4
4350
# normal data are then those of class 1
4451
s = (y != 4)
@@ -47,7 +54,7 @@
4754
y = (y != 1).astype(int)
4855

4956
if dataset_name == 'forestcover':
50-
dataset = fetch_covtype(shuffle=True)
57+
dataset = fetch_covtype()
5158
X = dataset.data
5259
y = dataset.target
5360
# normal data are those with attribute 2
@@ -61,54 +68,34 @@
6168

6269
if dataset_name == 'SF':
6370
lb = LabelBinarizer()
64-
lb.fit(X[:, 1])
65-
x1 = lb.transform(X[:, 1])
71+
x1 = lb.fit_transform(X[:, 1].astype(str))
6672
X = np.c_[X[:, :1], x1, X[:, 2:]]
67-
y = (y != 'normal.').astype(int)
73+
y = (y != b'normal.').astype(int)
6874

6975
if dataset_name == 'SA':
7076
lb = LabelBinarizer()
71-
lb.fit(X[:, 1])
72-
x1 = lb.transform(X[:, 1])
73-
lb.fit(X[:, 2])
74-
x2 = lb.transform(X[:, 2])
75-
lb.fit(X[:, 3])
76-
x3 = lb.transform(X[:, 3])
77+
x1 = lb.fit_transform(X[:, 1].astype(str))
78+
x2 = lb.fit_transform(X[:, 2].astype(str))
79+
x3 = lb.fit_transform(X[:, 3].astype(str))
7780
X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
78-
y = (y != 'normal.').astype(int)
81+
y = (y != b'normal.').astype(int)
7982

8083
if dataset_name == 'http' or dataset_name == 'smtp':
81-
y = (y != 'normal.').astype(int)
82-
83-
n_samples, n_features = np.shape(X)
84-
n_samples_train = n_samples // 2
85-
n_samples_test = n_samples - n_samples_train
84+
y = (y != b'normal.').astype(int)
8685

8786
X = X.astype(float)
88-
X_train = X[:n_samples_train, :]
89-
X_test = X[n_samples_train:, :]
90-
y_train = y[:n_samples_train]
91-
y_test = y[n_samples_train:]
92-
93-
if novelty_detection:
94-
X_train = X_train[y_train == 0]
95-
y_train = y_train[y_train == 0]
9687

9788
print('LocalOutlierFactor processing...')
9889
model = LocalOutlierFactor(n_neighbors=20)
9990
tstart = time()
100-
model.fit(X_train)
91+
model.fit(X)
10192
fit_time = time() - tstart
102-
tstart = time()
103-
104-
scoring = -model.decision_function(X_test) # the lower, the more normal
105-
predict_time = time() - tstart
106-
fpr, tpr, thresholds = roc_curve(y_test, scoring)
93+
scoring = -model.negative_outlier_factor_ # the lower, the more normal
94+
fpr, tpr, thresholds = roc_curve(y, scoring)
10795
AUC = auc(fpr, tpr)
10896
plt.plot(fpr, tpr, lw=1,
109-
label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
110-
'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
111-
predict_time)))
97+
label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
98+
% (dataset_name, AUC, fit_time)))
11299

113100
plt.xlim([-0.05, 1.05])
114101
plt.ylim([-0.05, 1.05])

0 commit comments

Comments
 (0)
0