From 67a85ecdb50a5a2459e1682d3ce2de29addf716f Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Tue, 19 Sep 2017 12:13:45 +0200
Subject: [PATCH 1/8] fix lof bench

---
 benchmarks/bench_lof.py | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py
index 620adc3d43b0c..1cbdfc08056a8 100644
--- a/benchmarks/bench_lof.py
+++ b/benchmarks/bench_lof.py
@@ -21,7 +21,7 @@
 np.random.seed(2)
 
 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['shuttle']
+datasets = ['SA', 'SF', 'shuttle']
 
 novelty_detection = True  # if False, training set polluted by outliers
 
@@ -61,24 +61,20 @@
 
     if dataset_name == 'SF':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
 
     if dataset_name == 'SA':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
-        lb.fit(X[:, 2])
-        x2 = lb.transform(X[:, 2])
-        lb.fit(X[:, 3])
-        x3 = lb.transform(X[:, 3])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
 
     if dataset_name == 'http' or dataset_name == 'smtp':
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
 
     n_samples, n_features = np.shape(X)
     n_samples_train = n_samples // 2
@@ -101,7 +97,7 @@
     fit_time = time() - tstart
     tstart = time()
 
-    scoring = -model.decision_function(X_test)  # the lower, the more normal
+    scoring = -model._decision_function(X_test)  # the lower, the more normal
     predict_time = time() - tstart
     fpr, tpr, thresholds = roc_curve(y_test, scoring)
     AUC = auc(fpr, tpr)

From 5a0c557e9a8d4335ef9b4e188b53c5a8d38b3b4d Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Tue, 19 Sep 2017 12:14:05 +0200
Subject: [PATCH 2/8] fix iforest bench

---
 benchmarks/bench_isolation_forest.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py
index 4d9f3037b2758..2905b5b7860cf 100644
--- a/benchmarks/bench_isolation_forest.py
+++ b/benchmarks/bench_isolation_forest.py
@@ -12,7 +12,7 @@
 from sklearn.ensemble import IsolationForest
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils import shuffle as sh
 
 print(__doc__)
@@ -38,7 +38,7 @@ def print_outlier_ratio(y):
 
 # Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
 # datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
 # Loop over all datasets for fitting and scoring the estimator:
 for dat in datasets:
@@ -79,17 +79,17 @@ def print_outlier_ratio(y):
     print('--- Vectorizing data...')
 
     if dat == 'SF':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
 
     if dat == 'SA':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
-        x2 = lb.fit_transform(X[:, 2])
-        x3 = lb.fit_transform(X[:, 3])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)

From 98256235990bb112f0a0b759452923ab2381c6d3 Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Wed, 20 Sep 2017 16:53:09 +0200
Subject: [PATCH 3/8] make LOF benchmark an outlier detection benchmark

---
 benchmarks/bench_lof.py | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py
index 1cbdfc08056a8..d95d373e6997a 100644
--- a/benchmarks/bench_lof.py
+++ b/benchmarks/bench_lof.py
@@ -5,6 +5,12 @@
 
 A test of LocalOutlierFactor on classical anomaly detection datasets.
 
+Note that LocalOutlierFactor is not meant to predict on a test set and its
+performance is assessed in an outlier detection context:
+1. The model is trained on a dataset containing outliers.
+2. The ROC curve is computed on the whole dataset using the knowledge of the
+labels.
+
 """
 
 from time import time
@@ -21,9 +27,7 @@
 np.random.seed(2)
 
 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['SA', 'SF', 'shuttle']
-
-novelty_detection = True  # if False, training set polluted by outliers
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
 for dataset_name in datasets:
     # loading and vectorization
@@ -76,30 +80,16 @@
     if dataset_name == 'http' or dataset_name == 'smtp':
         y = (y != b'normal.').astype(int)
 
-    n_samples, n_features = np.shape(X)
-    n_samples_train = n_samples // 2
-    n_samples_test = n_samples - n_samples_train
-
-    X = X.astype(float)
-    X_train = X[:n_samples_train, :]
-    X_test = X[n_samples_train:, :]
-    y_train = y[:n_samples_train]
-    y_test = y[n_samples_train:]
-
-    if novelty_detection:
-        X_train = X_train[y_train == 0]
-        y_train = y_train[y_train == 0]
-
     print('LocalOutlierFactor processing...')
     model = LocalOutlierFactor(n_neighbors=20)
     tstart = time()
-    model.fit(X_train)
+    model.fit(X)
     fit_time = time() - tstart
     tstart = time()
 
-    scoring = -model._decision_function(X_test)  # the lower, the more normal
+    scoring = -model.negative_outlier_factor_  # the lower, the more normal
     predict_time = time() - tstart
-    fpr, tpr, thresholds = roc_curve(y_test, scoring)
+    fpr, tpr, thresholds = roc_curve(y, scoring)
     AUC = auc(fpr, tpr)
     plt.plot(fpr, tpr, lw=1,
              label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'

From 16804a10b96131245a93e5eb60d9756d9a551005 Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Wed, 20 Sep 2017 17:12:09 +0200
Subject: [PATCH 4/8] remove predict_time for LOF as there is no predict on a
 test set

---
 benchmarks/bench_lof.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py
index d95d373e6997a..9e21807466c02 100644
--- a/benchmarks/bench_lof.py
+++ b/benchmarks/bench_lof.py
@@ -85,16 +85,12 @@
     tstart = time()
     model.fit(X)
     fit_time = time() - tstart
-    tstart = time()
-
     scoring = -model.negative_outlier_factor_  # the lower, the more normal
-    predict_time = time() - tstart
     fpr, tpr, thresholds = roc_curve(y, scoring)
     AUC = auc(fpr, tpr)
     plt.plot(fpr, tpr, lw=1,
-             label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
-                    'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
-                                            predict_time)))
+             label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
+                    % (dataset_name, AUC, fit_time)))
 
 plt.xlim([-0.05, 1.05])
 plt.ylim([-0.05, 1.05])

From 809fbe0922b90866993e60f20b16e006f3f9b47d Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Thu, 21 Sep 2017 16:25:41 +0200
Subject: [PATCH 5/8] add X.astype(float) that was inadvertently removed in a
 previous commit

---
 benchmarks/bench_lof.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py
index 9e21807466c02..64009cfb7a4c7 100644
--- a/benchmarks/bench_lof.py
+++ b/benchmarks/bench_lof.py
@@ -80,6 +80,8 @@
     if dataset_name == 'http' or dataset_name == 'smtp':
         y = (y != b'normal.').astype(int)
 
+    X = X.astype(float)
+
     print('LocalOutlierFactor processing...')
     model = LocalOutlierFactor(n_neighbors=20)
     tstart = time()

From 8696938883978fb0d6aa0984f12a5fe140c78a92 Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Mon, 25 Sep 2017 13:49:57 +0200
Subject: [PATCH 6/8] fix and clarify randomness in LOF benchmark

---
 benchmarks/bench_lof.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py
index 64009cfb7a4c7..a085042fa9167 100644
--- a/benchmarks/bench_lof.py
+++ b/benchmarks/bench_lof.py
@@ -7,9 +7,13 @@
 
 Note that LocalOutlierFactor is not meant to predict on a test set and its
 performance is assessed in an outlier detection context:
-1. The model is trained on a dataset containing outliers.
-2. The ROC curve is computed on the whole dataset using the knowledge of the
+1. The model is trained on the whole dataset, which is assumed to contain
+outliers.
+2. The ROC curve is computed on the same dataset using the knowledge of the
 labels.
+In this context there is no need to shuffle the dataset because the model
+is trained and tested on the whole dataset. The randomness of this benchmark
+is only caused by the random selection of anomalies in the SA dataset.
 
 """
 
@@ -20,21 +24,21 @@
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
 from sklearn.preprocessing import LabelBinarizer
-from sklearn.utils import shuffle as sh
 
 print(__doc__)
 
-np.random.seed(2)
+SEED = 2  # to control the random selection of anomalies in the SA dataset
 
 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
+plt.figure()
 for dataset_name in datasets:
     # loading and vectorization
     print('loading data')
     if dataset_name in ['http', 'smtp', 'SA', 'SF']:
-        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
-                                 percent10=False)
+        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
+                                 random_state=SEED)
         X = dataset.data
         y = dataset.target
 
@@ -42,7 +46,6 @@
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -51,7 +54,7 @@
         y = (y != 1).astype(int)
 
     if dataset_name == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype()
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2

From aaf9e514b930464aa7e5e87d3e842b4801e110cc Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Mon, 25 Sep 2017 14:25:34 +0200
Subject: [PATCH 7/8] fix and clarify randomness in iforest benchmark

---
 benchmarks/bench_isolation_forest.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py
index 2905b5b7860cf..e0542ad72c012 100644
--- a/benchmarks/bench_isolation_forest.py
+++ b/benchmarks/bench_isolation_forest.py
@@ -3,6 +3,17 @@
 IsolationForest benchmark
 ==========================================
 A test of IsolationForest on classical anomaly detection datasets.
+
+The benchmark is run as follows:
+1. The dataset is randomly split into a training set and a test set, both
+assumed to contain outliers.
+2. Isolation Forest is trained on the training set.
+3. The ROC curve is computed on the test set using the knowledge of the labels.
+
+Note that the smtp dataset contains a very small proportion of outliers.
+Therefore, depending on the seed of the random number generator, randomly
+splitting the dataset might lead to a test set containing no outliers. In this
+case a warning is raised when computing the ROC curve.
 """
 
 from time import time
@@ -30,14 +41,13 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))
 
 
-np.random.seed(1)
+SEED = 1
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))
 
 # Set this to true for plotting score histograms for each dataset:
 with_decision_function_histograms = False
 
-# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
-# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
 # Loop over all datasets for fitting and scoring the estimator:
@@ -47,7 +57,8 @@ def print_outlier_ratio(y):
     print('====== %s ======' % dat)
     print('--- Fetching data...')
     if dat in ['http', 'smtp', 'SF', 'SA']:
-        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
+        dataset = fetch_kddcup99(subset=dat, shuffle=True,
+                                 percent10=True, random_state=SEED)
         X = dataset.data
         y = dataset.target
 
@@ -55,7 +66,7 @@ def print_outlier_ratio(y):
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
+        X, y = sh(X, y, random_state=SEED)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -65,7 +76,7 @@ def print_outlier_ratio(y):
         print('----- ')
 
     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype(shuffle=True, random_state=SEED)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -108,7 +119,7 @@ def print_outlier_ratio(y):
     y_test = y[n_samples_train:]
 
     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1)
+    model = IsolationForest(n_jobs=-1, random_state=SEED)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart

From 3e06c31d82e498baa2a0f112e7bcb500d6c86e5e Mon Sep 17 00:00:00 2001
From: Albert Thomas <albertthomas88@gmail.com>
Date: Mon, 9 Oct 2017 14:40:31 +0200
Subject: [PATCH 8/8] SEED -> random_state

---
 benchmarks/bench_isolation_forest.py | 10 +++++-----
 benchmarks/bench_lof.py              |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py
index e0542ad72c012..547b4f3ed2ddc 100644
--- a/benchmarks/bench_isolation_forest.py
+++ b/benchmarks/bench_isolation_forest.py
@@ -41,7 +41,7 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))
 
 
-SEED = 1
+random_state = 1
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))
 
 # Set this to true for plotting score histograms for each dataset:
@@ -58,7 +58,7 @@ def print_outlier_ratio(y):
     print('--- Fetching data...')
     if dat in ['http', 'smtp', 'SF', 'SA']:
         dataset = fetch_kddcup99(subset=dat, shuffle=True,
-                                 percent10=True, random_state=SEED)
+                                 percent10=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
 
@@ -66,7 +66,7 @@ def print_outlier_ratio(y):
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y, random_state=SEED)
+        X, y = sh(X, y, random_state=random_state)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -76,7 +76,7 @@ def print_outlier_ratio(y):
         print('----- ')
 
     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True, random_state=SEED)
+        dataset = fetch_covtype(shuffle=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -119,7 +119,7 @@ def print_outlier_ratio(y):
     y_test = y[n_samples_train:]
 
     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1, random_state=SEED)
+    model = IsolationForest(n_jobs=-1, random_state=random_state)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart
diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py
index a085042fa9167..4d063b8100fcd 100644
--- a/benchmarks/bench_lof.py
+++ b/benchmarks/bench_lof.py
@@ -27,7 +27,7 @@
 
 print(__doc__)
 
-SEED = 2  # to control the random selection of anomalies in the SA dataset
+random_state = 2  # to control the random selection of anomalies in SA
 
 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
@@ -38,7 +38,7 @@
     print('loading data')
     if dataset_name in ['http', 'smtp', 'SA', 'SF']:
         dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
-                                 random_state=SEED)
+                                 random_state=random_state)
         X = dataset.data
         y = dataset.target