From 67a85ecdb50a5a2459e1682d3ce2de29addf716f Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Tue, 19 Sep 2017 12:13:45 +0200 Subject: [PATCH 1/8] fix lof bench --- benchmarks/bench_lof.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py index 620adc3d43b0c..1cbdfc08056a8 100644 --- a/benchmarks/bench_lof.py +++ b/benchmarks/bench_lof.py @@ -21,7 +21,7 @@ np.random.seed(2) # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] -datasets = ['shuttle'] +datasets = ['SA', 'SF', 'shuttle'] novelty_detection = True # if False, training set polluted by outliers @@ -61,24 +61,20 @@ if dataset_name == 'SF': lb = LabelBinarizer() - lb.fit(X[:, 1]) - x1 = lb.transform(X[:, 1]) + x1 = lb.fit_transform(X[:, 1].astype(str)) X = np.c_[X[:, :1], x1, X[:, 2:]] - y = (y != 'normal.').astype(int) + y = (y != b'normal.').astype(int) if dataset_name == 'SA': lb = LabelBinarizer() - lb.fit(X[:, 1]) - x1 = lb.transform(X[:, 1]) - lb.fit(X[:, 2]) - x2 = lb.transform(X[:, 2]) - lb.fit(X[:, 3]) - x3 = lb.transform(X[:, 3]) + x1 = lb.fit_transform(X[:, 1].astype(str)) + x2 = lb.fit_transform(X[:, 2].astype(str)) + x3 = lb.fit_transform(X[:, 3].astype(str)) X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] - y = (y != 'normal.').astype(int) + y = (y != b'normal.').astype(int) if dataset_name == 'http' or dataset_name == 'smtp': - y = (y != 'normal.').astype(int) + y = (y != b'normal.').astype(int) n_samples, n_features = np.shape(X) n_samples_train = n_samples // 2 @@ -101,7 +97,7 @@ fit_time = time() - tstart tstart = time() - scoring = -model.decision_function(X_test) # the lower, the more normal + scoring = -model._decision_function(X_test) # the lower, the more normal predict_time = time() - tstart fpr, tpr, thresholds = roc_curve(y_test, scoring) AUC = auc(fpr, tpr) From 5a0c557e9a8d4335ef9b4e188b53c5a8d38b3b4d Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Tue, 19 Sep 2017 12:14:05 +0200 Subject: [PATCH 2/8] fix iforest bench --- benchmarks/bench_isolation_forest.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py index 4d9f3037b2758..2905b5b7860cf 100644 --- a/benchmarks/bench_isolation_forest.py +++ b/benchmarks/bench_isolation_forest.py @@ -12,7 +12,7 @@ from sklearn.ensemble import IsolationForest from sklearn.metrics import roc_curve, auc from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata -from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.preprocessing import LabelBinarizer from sklearn.utils import shuffle as sh print(__doc__) @@ -38,7 +38,7 @@ def print_outlier_ratio(y): # Removed the shuttle dataset because as of 2017-03-23 mldata.org is down: # datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] -datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover'] +datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] # Loop over all datasets for fitting and scoring the estimator: for dat in datasets: @@ -79,17 +79,17 @@ def print_outlier_ratio(y): print('--- Vectorizing data...') if dat == 'SF': - lb = MultiLabelBinarizer() - x1 = lb.fit_transform(X[:, 1]) + lb = LabelBinarizer() + x1 = lb.fit_transform(X[:, 1].astype(str)) X = np.c_[X[:, :1], x1, X[:, 2:]] y = (y != b'normal.').astype(int) print_outlier_ratio(y) if dat == 'SA': - lb = MultiLabelBinarizer() - x1 = lb.fit_transform(X[:, 1]) - x2 = lb.fit_transform(X[:, 2]) - x3 = lb.fit_transform(X[:, 3]) + lb = LabelBinarizer() + x1 = lb.fit_transform(X[:, 1].astype(str)) + x2 = lb.fit_transform(X[:, 2].astype(str)) + x3 = lb.fit_transform(X[:, 3].astype(str)) X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] y = (y != b'normal.').astype(int) print_outlier_ratio(y) From 98256235990bb112f0a0b759452923ab2381c6d3 Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Wed, 20 Sep 2017 16:53:09 +0200 Subject: [PATCH 3/8] make LOF benchmark an outlier detection benchmark --- benchmarks/bench_lof.py | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py index 1cbdfc08056a8..d95d373e6997a 100644 --- a/benchmarks/bench_lof.py +++ b/benchmarks/bench_lof.py @@ -5,6 +5,12 @@ A test of LocalOutlierFactor on classical anomaly detection datasets. +Note that LocalOutlierFactor is not meant to predict on a test set and its +performance is assessed in an outlier detection context: +1. The model is trained on a dataset containing outliers. +2. The ROC curve is computed on the whole dataset using the knowledge of the +labels. + """ from time import time @@ -21,9 +27,7 @@ np.random.seed(2) # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] -datasets = ['SA', 'SF', 'shuttle'] - -novelty_detection = True # if False, training set polluted by outliers +datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] for dataset_name in datasets: # loading and vectorization @@ -76,30 +80,16 @@ if dataset_name == 'http' or dataset_name == 'smtp': y = (y != b'normal.').astype(int) - n_samples, n_features = np.shape(X) - n_samples_train = n_samples // 2 - n_samples_test = n_samples - n_samples_train - - X = X.astype(float) - X_train = X[:n_samples_train, :] - X_test = X[n_samples_train:, :] - y_train = y[:n_samples_train] - y_test = y[n_samples_train:] - - if novelty_detection: - X_train = X_train[y_train == 0] - y_train = y_train[y_train == 0] - print('LocalOutlierFactor processing...') model = LocalOutlierFactor(n_neighbors=20) tstart = time() - model.fit(X_train) + model.fit(X) fit_time = time() - tstart tstart = time() - scoring = -model._decision_function(X_test) # the lower, the more normal + scoring = -model.negative_outlier_factor_ # the lower, the more normal predict_time = time() - tstart - fpr, tpr, thresholds = roc_curve(y_test, scoring) + fpr, tpr, thresholds = roc_curve(y, scoring) AUC = auc(fpr, tpr) plt.plot(fpr, tpr, lw=1, label=('ROC for %s (area = %0.3f, train-time: %0.2fs,' From 16804a10b96131245a93e5eb60d9756d9a551005 Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Wed, 20 Sep 2017 17:12:09 +0200 Subject: [PATCH 4/8] remove predict_time for LOF as there is no predict on a test set --- benchmarks/bench_lof.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py index d95d373e6997a..9e21807466c02 100644 --- a/benchmarks/bench_lof.py +++ b/benchmarks/bench_lof.py @@ -85,16 +85,12 @@ tstart = time() model.fit(X) fit_time = time() - tstart - tstart = time() - scoring = -model.negative_outlier_factor_ # the lower, the more normal - predict_time = time() - tstart fpr, tpr, thresholds = roc_curve(y, scoring) AUC = auc(fpr, tpr) plt.plot(fpr, tpr, lw=1, - label=('ROC for %s (area = %0.3f, train-time: %0.2fs,' - 'test-time: %0.2fs)' % (dataset_name, AUC, fit_time, - predict_time))) + label=('ROC for %s (area = %0.3f, train-time: %0.2fs)' + % (dataset_name, AUC, fit_time))) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) From 809fbe0922b90866993e60f20b16e006f3f9b47d Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Thu, 21 Sep 2017 16:25:41 +0200 Subject: [PATCH 5/8] add X.astype(float) that was inadvertently removed in a previous commit --- benchmarks/bench_lof.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py index 9e21807466c02..64009cfb7a4c7 100644 --- a/benchmarks/bench_lof.py +++ b/benchmarks/bench_lof.py @@ -80,6 +80,8 @@ if dataset_name == 'http' or dataset_name == 'smtp': y = (y != b'normal.').astype(int) + X = X.astype(float) + print('LocalOutlierFactor processing...') model = LocalOutlierFactor(n_neighbors=20) tstart = time() From 8696938883978fb0d6aa0984f12a5fe140c78a92 Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Mon, 25 Sep 2017 13:49:57 +0200 Subject: [PATCH 6/8] fix and clarify randomness in LOF benchmark --- benchmarks/bench_lof.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py index 64009cfb7a4c7..a085042fa9167 100644 --- a/benchmarks/bench_lof.py +++ b/benchmarks/bench_lof.py @@ -7,9 +7,13 @@ Note that LocalOutlierFactor is not meant to predict on a test set and its performance is assessed in an outlier detection context: -1. The model is trained on a dataset containing outliers. -2. The ROC curve is computed on the whole dataset using the knowledge of the +1. The model is trained on the whole dataset which is assumed to contain +outliers. +2. The ROC curve is computed on the same dataset using the knowledge of the labels. +In this context there is no need to shuffle the dataset because the model +is trained and tested on the whole dataset. The randomness of this benchmark +is only caused by the random selection of anomalies in the SA dataset. """ @@ -20,21 +24,21 @@ from sklearn.metrics import roc_curve, auc from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata from sklearn.preprocessing import LabelBinarizer -from sklearn.utils import shuffle as sh print(__doc__) -np.random.seed(2) +SEED = 2 # to control the random selection of anomalies in the SA dataset # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] +plt.figure() for dataset_name in datasets: # loading and vectorization print('loading data') if dataset_name in ['http', 'smtp', 'SA', 'SF']: - dataset = fetch_kddcup99(subset=dataset_name, shuffle=True, - percent10=False) + dataset = fetch_kddcup99(subset=dataset_name, percent10=True, + random_state=SEED) X = dataset.data y = dataset.target @@ -42,7 +46,6 @@ dataset = fetch_mldata('shuttle') X = dataset.data y = dataset.target - X, y = sh(X, y) # we remove data with label 4 # normal data are then those of class 1 s = (y != 4) @@ -51,7 +54,7 @@ y = (y != 1).astype(int) if dataset_name == 'forestcover': - dataset = fetch_covtype(shuffle=True) + dataset = fetch_covtype() X = dataset.data y = dataset.target # normal data are those with attribute 2 From aaf9e514b930464aa7e5e87d3e842b4801e110cc Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Mon, 25 Sep 2017 14:25:34 +0200 Subject: [PATCH 7/8] fix and clarify randomness in iforest benchmark --- benchmarks/bench_isolation_forest.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py index 2905b5b7860cf..e0542ad72c012 100644 --- a/benchmarks/bench_isolation_forest.py +++ b/benchmarks/bench_isolation_forest.py @@ -3,6 +3,17 @@ IsolationForest benchmark ========================================== A test of IsolationForest on classical anomaly detection datasets. + +The benchmark is run as follows: +1. The dataset is randomly split into a training set and a test set, both +assumed to contain outliers. +2. Isolation Forest is trained on the training set. +3. The ROC curve is computed on the test set using the knowledge of the labels. + +Note that the smtp dataset contains a very small proportion of outliers. +Therefore, depending on the seed of the random number generator, randomly +splitting the data set might lead to a test set containing no outliers. In this +case a warning is raised when computing the ROC curve. """ from time import time @@ -30,14 +41,13 @@ def print_outlier_ratio(y): print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y))) -np.random.seed(1) +SEED = 1 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5)) # Set this to true for plotting score histograms for each dataset: with_decision_function_histograms = False -# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down: -# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] +# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] # Loop over all datasets for fitting and scoring the estimator: @@ -47,7 +57,8 @@ def print_outlier_ratio(y): print('====== %s ======' % dat) print('--- Fetching data...') if dat in ['http', 'smtp', 'SF', 'SA']: - dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True) + dataset = fetch_kddcup99(subset=dat, shuffle=True, + percent10=True, random_state=SEED) X = dataset.data y = dataset.target @@ -55,7 +66,7 @@ def print_outlier_ratio(y): dataset = fetch_mldata('shuttle') X = dataset.data y = dataset.target - X, y = sh(X, y) + X, y = sh(X, y, random_state=SEED) # we remove data with label 4 # normal data are then those of class 1 s = (y != 4) @@ -65,7 +76,7 @@ def print_outlier_ratio(y): print('----- ') if dat == 'forestcover': - dataset = fetch_covtype(shuffle=True) + dataset = fetch_covtype(shuffle=True, random_state=SEED) X = dataset.data y = dataset.target # normal data are those with attribute 2 @@ -108,7 +119,7 @@ def print_outlier_ratio(y): y_test = y[n_samples_train:] print('--- Fitting the IsolationForest estimator...') - model = IsolationForest(n_jobs=-1) + model = IsolationForest(n_jobs=-1, random_state=SEED) tstart = time() model.fit(X_train) fit_time = time() - tstart From 3e06c31d82e498baa2a0f112e7bcb500d6c86e5e Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Mon, 9 Oct 2017 14:40:31 +0200 Subject: [PATCH 8/8] SEED -> random_state --- benchmarks/bench_isolation_forest.py | 10 +++++----- benchmarks/bench_lof.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py index e0542ad72c012..547b4f3ed2ddc 100644 --- a/benchmarks/bench_isolation_forest.py +++ b/benchmarks/bench_isolation_forest.py @@ -41,7 +41,7 @@ def print_outlier_ratio(y): print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y))) -SEED = 1 +random_state = 1 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5)) # Set this to true for plotting score histograms for each dataset: @@ -58,7 +58,7 @@ def print_outlier_ratio(y): print('--- Fetching data...') if dat in ['http', 'smtp', 'SF', 'SA']: dataset = fetch_kddcup99(subset=dat, shuffle=True, - percent10=True, random_state=SEED) + percent10=True, random_state=random_state) X = dataset.data y = dataset.target @@ -66,7 +66,7 @@ def print_outlier_ratio(y): dataset = fetch_mldata('shuttle') X = dataset.data y = dataset.target - X, y = sh(X, y, random_state=SEED) + X, y = sh(X, y, random_state=random_state) # we remove data with label 4 # normal data are then those of class 1 s = (y != 4) @@ -76,7 +76,7 @@ def print_outlier_ratio(y): print('----- ') if dat == 'forestcover': - dataset = fetch_covtype(shuffle=True, random_state=SEED) + dataset = fetch_covtype(shuffle=True, random_state=random_state) X = dataset.data y = dataset.target # normal data are those with attribute 2 @@ -119,7 +119,7 @@ def print_outlier_ratio(y): y_test = y[n_samples_train:] print('--- Fitting the IsolationForest estimator...') - model = IsolationForest(n_jobs=-1, random_state=SEED) + model = IsolationForest(n_jobs=-1, random_state=random_state) tstart = time() model.fit(X_train) fit_time = time() - tstart diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py index a085042fa9167..4d063b8100fcd 100644 --- a/benchmarks/bench_lof.py +++ b/benchmarks/bench_lof.py @@ -27,7 +27,7 @@ print(__doc__) -SEED = 2 # to control the random selection of anomalies in the SA dataset +random_state = 2 # to control the random selection of anomalies in SA # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] @@ -38,7 +38,7 @@ print('loading data') if dataset_name in ['http', 'smtp', 'SA', 'SF']: dataset = fetch_kddcup99(subset=dataset_name, percent10=True, - random_state=SEED) + random_state=random_state) X = dataset.data y = dataset.target