diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py
index 4d9f3037b2758..547b4f3ed2ddc 100644
--- a/benchmarks/bench_isolation_forest.py
+++ b/benchmarks/bench_isolation_forest.py
@@ -3,6 +3,17 @@
 IsolationForest benchmark
 ==========================================
 A test of IsolationForest on classical anomaly detection datasets.
+
+The benchmark is run as follows:
+1. The dataset is randomly split into a training set and a test set, both
+assumed to contain outliers.
+2. Isolation Forest is trained on the training set.
+3. The ROC curve is computed on the test set using the knowledge of the labels.
+
+Note that the smtp dataset contains a very small proportion of outliers.
+Therefore, depending on the seed of the random number generator, randomly
+splitting the data set might lead to a test set containing no outliers. In this
+case a warning is raised when computing the ROC curve.
 """
 
 from time import time
@@ -12,7 +23,7 @@
 from sklearn.ensemble import IsolationForest
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils import shuffle as sh
 
 print(__doc__)
@@ -30,15 +41,14 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))
 
-np.random.seed(1)
+random_state = 1
 
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))
 
 # Set this to true for plotting score histograms for each dataset:
 with_decision_function_histograms = False
 
-# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
-# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
+# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
 # Loop over all datasets for fitting and scoring the estimator:
 for dat in datasets:
@@ -47,7 +57,8 @@ def print_outlier_ratio(y):
     print('====== %s ======' % dat)
     print('--- Fetching data...')
     if dat in ['http', 'smtp', 'SF', 'SA']:
-        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
+        dataset = fetch_kddcup99(subset=dat, shuffle=True,
+                                 percent10=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
 
@@ -55,7 +66,7 @@ def print_outlier_ratio(y):
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
+        X, y = sh(X, y, random_state=random_state)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -65,7 +76,7 @@ def print_outlier_ratio(y):
     print('----- ')
 
     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype(shuffle=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -79,17 +90,17 @@ def print_outlier_ratio(y):
     print('--- Vectorizing data...')
 
     if dat == 'SF':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
 
     if dat == 'SA':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
-        x2 = lb.fit_transform(X[:, 2])
-        x3 = lb.fit_transform(X[:, 3])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
@@ -108,7 +119,7 @@ def print_outlier_ratio(y):
     y_test = y[n_samples_train:]
 
     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1)
+    model = IsolationForest(n_jobs=-1, random_state=random_state)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart
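For reference, the evaluation scheme described by the new module docstring boils down to the following minimal sketch. The 'http' subset and the 50/50 split are illustrative choices, not the full benchmark loop:

```python
import numpy as np
from sklearn.datasets import fetch_kddcup99
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc

random_state = 1
dataset = fetch_kddcup99(subset='http', shuffle=True, percent10=True,
                         random_state=random_state)
X = dataset.data.astype(float)
y = (dataset.target != b'normal.').astype(int)  # 1 = outlier

# 1. Split into train/test halves, both assumed to contain outliers.
n_train = X.shape[0] // 2
X_train, X_test = X[:n_train], X[n_train:]
y_test = y[n_train:]

# 2. Fit on the (polluted) training set.
model = IsolationForest(n_jobs=-1, random_state=random_state)
model.fit(X_train)

# 3. Score the test set: decision_function is lower for more abnormal
# points, so negate it to get an outlier score for the ROC curve.
scoring = -model.decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test, scoring)
print("AUC: %0.3f" % auc(fpr, tpr))
```

Threading `random_state` through every fetch, shuffle and estimator, as the patch does, is what makes the benchmark reproducible from run to run.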
diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py
index 620adc3d43b0c..4d063b8100fcd 100644
--- a/benchmarks/bench_lof.py
+++ b/benchmarks/bench_lof.py
@@ -5,6 +5,16 @@
 
 A test of LocalOutlierFactor on classical anomaly detection datasets.
 
+Note that LocalOutlierFactor is not meant to predict on a test set and its
+performance is assessed in an outlier detection context:
+1. The model is trained on the whole dataset which is assumed to contain
+outliers.
+2. The ROC curve is computed on the same dataset using the knowledge of the
+labels.
+In this context there is no need to shuffle the dataset because the model
+is trained and tested on the whole dataset. The randomness of this benchmark
+is only caused by the random selection of anomalies in the SA dataset.
+
 """
 from time import time
 
@@ -14,23 +24,21 @@
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
 from sklearn.preprocessing import LabelBinarizer
-from sklearn.utils import shuffle as sh
 
 print(__doc__)
 
-np.random.seed(2)
+random_state = 2  # to control the random selection of anomalies in SA
 
 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['shuttle']
-
-novelty_detection = True  # if False, training set polluted by outliers
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
+plt.figure()
 for dataset_name in datasets:
     # loading and vectorization
     print('loading data')
     if dataset_name in ['http', 'smtp', 'SA', 'SF']:
-        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
-                                 percent10=False)
+        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
+                                 random_state=random_state)
         X = dataset.data
         y = dataset.target
 
@@ -38,7 +46,6 @@
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -47,7 +54,7 @@
         y = (y != 1).astype(int)
 
     if dataset_name == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype()
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -61,54 +68,34 @@
 
     if dataset_name == 'SF':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
 
     if dataset_name == 'SA':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
-        lb.fit(X[:, 2])
-        x2 = lb.transform(X[:, 2])
-        lb.fit(X[:, 3])
-        x3 = lb.transform(X[:, 3])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
 
     if dataset_name == 'http' or dataset_name == 'smtp':
-        y = (y != 'normal.').astype(int)
-
-    n_samples, n_features = np.shape(X)
-    n_samples_train = n_samples // 2
-    n_samples_test = n_samples - n_samples_train
+        y = (y != b'normal.').astype(int)
 
     X = X.astype(float)
-    X_train = X[:n_samples_train, :]
-    X_test = X[n_samples_train:, :]
-    y_train = y[:n_samples_train]
-    y_test = y[n_samples_train:]
-
-    if novelty_detection:
-        X_train = X_train[y_train == 0]
-        y_train = y_train[y_train == 0]
 
     print('LocalOutlierFactor processing...')
     model = LocalOutlierFactor(n_neighbors=20)
    tstart = time()
-    model.fit(X_train)
+    model.fit(X)
     fit_time = time() - tstart
-    tstart = time()
-
-    scoring = -model.decision_function(X_test)  # the lower, the more normal
-    predict_time = time() - tstart
-    fpr, tpr, thresholds = roc_curve(y_test, scoring)
+    scoring = -model.negative_outlier_factor_  # the lower, the more normal
+    fpr, tpr, thresholds = roc_curve(y, scoring)
     AUC = auc(fpr, tpr)
     plt.plot(fpr, tpr, lw=1,
-             label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
-                    'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
-                                            predict_time)))
+             label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
+                    % (dataset_name, AUC, fit_time)))
 
 plt.xlim([-0.05, 1.05])
 plt.ylim([-0.05, 1.05])
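The reworked bench_lof.py thus assesses LOF strictly in-sample, ranking the training points by the fitted `negative_outlier_factor_` attribute instead of calling `decision_function` on a held-out set. A minimal sketch of that evaluation on synthetic data (the sizes, seed, and two-Gaussian-free layout are placeholders, not the benchmark's datasets):

```python
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(2)
X_inliers = rng.randn(200, 2)                          # dense normal cluster
X_outliers = rng.uniform(low=-6, high=6, size=(10, 2))  # scattered anomalies
X = np.r_[X_inliers, X_outliers]
y = np.r_[np.zeros(200), np.ones(10)]  # 1 = outlier

model = LocalOutlierFactor(n_neighbors=20)
model.fit(X)  # no separate test set: LOF scores its own training points

# negative_outlier_factor_ is close to -1 for inliers and strongly negative
# for outliers, so its negation serves as the outlier score for the ROC.
scoring = -model.negative_outlier_factor_
fpr, tpr, _ = roc_curve(y, scoring)
print("AUC: %0.3f" % auc(fpr, tpr))
```

This is also why the patch drops the shuffle and the train/test split: with the whole dataset used for both fitting and scoring, only the SA subset's random anomaly selection (controlled by `random_state`) remains a source of randomness.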