fix and clarify randomness in iforest benchmark

albertcthomas · albertcthomas · commit aaf9e514b930 · 2017-09-25T14:25:34.000+02:00
diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py
@@ -3,6 +3,17 @@
 IsolationForest benchmark
 ==========================================
 A test of IsolationForest on classical anomaly detection datasets.
+
+The benchmark is run as follows:
+1. The dataset is randomly split into a training set and a test set, both
+assumed to contain outliers.
+2. Isolation Forest is trained on the training set.
+3. The ROC curve is computed on the test set using the knowledge of the labels.
+
+Note that the smtp dataset contains a very small proportion of outliers.
+Therefore, depending on the seed of the random number generator, randomly
+splitting the data set might lead to a test set containing no outliers. In this
+case a warning is raised when computing the ROC curve.
 """
 
 from time import time
@@ -30,14 +41,13 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))
 
 
-np.random.seed(1)
+SEED = 1
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))
 
 # Set this to true for plotting score histograms for each dataset:
 with_decision_function_histograms = False
 
-# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
-# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
 # Loop over all datasets for fitting and scoring the estimator:
@@ -47,15 +57,16 @@ def print_outlier_ratio(y):
     print('====== %s ======' % dat)
     print('--- Fetching data...')
     if dat in ['http', 'smtp', 'SF', 'SA']:
-        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
+        dataset = fetch_kddcup99(subset=dat, shuffle=True,
+                                 percent10=True, random_state=SEED)
         X = dataset.data
         y = dataset.target
 
     if dat == 'shuttle':
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
+        X, y = sh(X, y, random_state=SEED)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -65,7 +76,7 @@ def print_outlier_ratio(y):
         print('----- ')
 
     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype(shuffle=True, random_state=SEED)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -108,7 +119,7 @@ def print_outlier_ratio(y):
     y_test = y[n_samples_train:]
 
     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1)
+    model = IsolationForest(n_jobs=-1, random_state=SEED)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart