8000 [MRG+1] Fix LOF and Isolation benchmarks by albertcthomas · Pull Request #9798 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG+1] Fix LOF and Isolation benchmarks #9798

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits on
Oct 25, 2017
41 changes: 26 additions & 15 deletions benchmarks/bench_isolation_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@
IsolationForest benchmark
==========================================
A test of IsolationForest on classical anomaly detection datasets.

The benchmark is run as follows:
1. The dataset is randomly split into a training set and a test set, both
assumed to contain outliers.
2. Isolation Forest is trained on the training set.
3. The ROC curve is computed on the test set using the knowledge of the labels.

Note that the smtp dataset contains a very small proportion of outliers.
Therefore, depending on the seed of the random number generator, randomly
splitting the data set might lead to a test set containing no outliers. In this
case a warning is raised when computing the ROC curve.
"""

from time import time
Expand All @@ -12,7 +23,7 @@
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle as sh

print(__doc__)
Expand All @@ -30,15 +41,14 @@ def print_outlier_ratio(y):
print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))


np.random.seed(1)
random_state = 1
fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))

# Set this to true for plotting score histograms for each dataset:
with_decision_function_histograms = False

# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that the shuttle dataset is run by default, we can remove this comment.

datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

# Loop over all datasets for fitting and scoring the estimator:
for dat in datasets:
Expand All @@ -47,15 +57,16 @@ def print_outlier_ratio(y):
print('====== %s ======' % dat)
print('--- Fetching data...')
if dat in ['http', 'smtp', 'SF', 'SA']:
dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
dataset = fetch_kddcup99(subset=dat, shuffle=True,
percent10=True, random_state=random_state)
X = dataset.data
y = dataset.target

if dat == 'shuttle':
dataset = fetch_mldata('shuttle')
X = dataset.data
y = dataset.target
X, y = sh(X, y)
X, y = sh(X, y, random_state=random_state)
# we remove data with label 4
# normal data are then those of class 1
s = (y != 4)
Expand All @@ -65,7 +76,7 @@ def print_outlier_ratio(y):
print('----- ')

if dat == 'forestcover':
dataset = fetch_covtype(shuffle=True)
dataset = fetch_covtype(shuffle=True, random_state=random_state)
X = dataset.data
y = dataset.target
# normal data are those with attribute 2
Expand All @@ -79,17 +90,17 @@ def print_outlier_ratio(y):
print('--- Vectorizing data...')

if dat == 'SF':
lb = MultiLabelBinarizer()
x1 = lb.fit_transform(X[:, 1])
lb = LabelBinarizer()
x1 = lb.fit_transform(X[:, 1].astype(str))
X = np.c_[X[:, :1], x1, X[:, 2:]]
y = (y != b'normal.').astype(int)
print_outlier_ratio(y)

if dat == 'SA':
lb = MultiLabelBinarizer()
x1 = lb.fit_transform(X[:, 1])
x2 = lb.fit_transform(X[:, 2])
x3 = lb.fit_transform(X[:, 3])
lb = LabelBinarizer()
x1 = lb.fit_transform(X[:, 1].astype(str))
x2 = lb.fit_transform(X[:, 2].astype(str))
x3 = lb.fit_transform(X[:, 3].astype(str))
X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
y = (y != b'normal.').astype(int)
print_outlier_ratio(y)
Expand All @@ -108,7 +119,7 @@ def print_outlier_ratio(y):
y_test = y[n_samples_train:]

print('--- Fitting the IsolationForest estimator...')
model = IsolationForest(n_jobs=-1)
model = IsolationForest(n_jobs=-1, random_state=random_state)
tstart = time()
model.fit(X_train)
fit_time = time() - tstart
Expand Down
69 changes: 28 additions & 41 deletions benchmarks/bench_lof.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@

A test of LocalOutlierFactor on classical anomaly detection datasets.

Note that LocalOutlierFactor is not meant to predict on a test set and its
performance is assessed in an outlier detection context:
1. The model is trained on the whole dataset which is assumed to contain
outliers.
2. The ROC curve is computed on the same dataset using the knowledge of the
labels.
In this context there is no need to shuffle the dataset because the model
is trained and tested on the whole dataset. The randomness of this benchmark
is only caused by the random selection of anomalies in the SA dataset.

"""

from time import time
Expand All @@ -14,31 +24,28 @@
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle as sh

print(__doc__)

np.random.seed(2)
random_state = 2 # to control the random selection of anomalies in SA

# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['shuttle']

novelty_detection = True # if False, training set polluted by outliers
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

plt.figure()
for dataset_name in datasets:
# loading and vectorization
print('loading data')
if dataset_name in ['http', 'smtp', 'SA', 'SF']:
dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
percent10=False)
dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
random_state=random_state)
X = dataset.data
y = dataset.target

if dataset_name == 'shuttle':
dataset = fetch_mldata('shuttle')
X = dataset.data
y = dataset.target
X, y = sh(X, y)
# we remove data with label 4
# normal data are then those of class 1
s = (y != 4)
Expand All @@ -47,7 +54,7 @@
y = (y != 1).astype(int)

if dataset_name == 'forestcover':
dataset = fetch_covtype(shuffle=True)
dataset = fetch_covtype()
X = dataset.data
y = dataset.target
# normal data are those with attribute 2
Expand All @@ -61,54 +68,34 @@

if dataset_name == 'SF':
lb = LabelBinarizer()
lb.fit(X[:, 1])
x1 = lb.transform(X[:, 1])
x1 = lb.fit_transform(X[:, 1].astype(str))
X = np.c_[X[:, :1], x1, X[:, 2:]]
y = (y != 'normal.').astype(int)
y = (y != b'normal.').astype(int)

if dataset_name == 'SA':
lb = LabelBinarizer()
lb.fit(X[:, 1])
x1 = lb.transform(X[:, 1])
lb.fit(X[:, 2])
x2 = lb.transform(X[:, 2])
lb.fit(X[:, 3])
x3 = lb.transform(X[:, 3])
x1 = lb.fit_transform(X[:, 1].astype(str))
x2 = lb.fit_transform(X[:, 2].astype(str))
x3 = lb.fit_transform(X[:, 3].astype(str))
X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
y = (y != 'normal.').astype(int)
y = (y != b'normal.').astype(int)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

works with python2 and python3?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes


if dataset_name == 'http' or dataset_name == 'smtp':
y = (y != 'normal.').astype(int)

n_samples, n_features = np.shape(X)
n_samples_train = n_samples // 2
n_samples_test = n_samples - n_samples_train
y = (y != b'normal.').astype(int)

X = X.astype(float)
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

if novelty_detection:
X_train = X_train[y_train == 0]
y_train = y_train[y_train == 0]

print('LocalOutlierFactor processing...')
model = LocalOutlierFactor(n_neighbors=20)
tstart = time()
model.fit(X_train)
model.fit(X)
fit_time = time() - tstart
tstart = time()

scoring = -model.decision_function(X_test) # the lower, the more normal
predict_time = time() - tstart
fpr, tpr, thresholds = roc_curve(y_test, scoring)
scoring = -model.negative_outlier_factor_ # the lower, the more normal
fpr, tpr, thresholds = roc_curve(y, scoring)
AUC = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1,
label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
predict_time)))
label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
% (dataset_name, AUC, fit_time)))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
Expand Down
0