Merge branch 'master' of github.com:scikit-learn/scikit-learn into do… · donigian/scikit-learn@3bdcb15 · GitHub

Commit 3bdcb15

Merge branch 'master' of github.com:scikit-learn/scikit-learn into docs/donigian-update-contribution-guidelines
* 'master' of github.com:scikit-learn/scikit-learn: (23 commits)
  fixes scikit-learn#10031: fix attribute name and shape in documentation (scikit-learn#10033)
  [MRG+1] add changelog entry for fixed and merged PR scikit-learn#10005 issue scikit-learn#9633 (scikit-learn#10025)
  [MRG] Fix LogisticRegression see also should include LogisticRegressionCV (scikit-learn#9995) (scikit-learn#10022)
  [MRG + 1] Labels of clustering should start at 0 or -1 if noise (scikit-learn#10015)
  MAINT Remove redundancy in scikit-learn#9552 (scikit-learn#9573)
  [MRG+1] correct comparison in GaussianNB for 'priors' (scikit-learn#10005)
  [MRG + 1] ENH add check_inverse in FunctionTransformer (scikit-learn#9399)
  [MRG] FIX bug in nested set_params usage (scikit-learn#9999)
  [MRG+1] Fix LOF and Isolation benchmarks (scikit-learn#9798)
  [MRG + 1] Fix negative inputs checking in mean_squared_log_error (scikit-learn#9968)
  DOC Fix typo (scikit-learn#9996)
  DOC Fix typo: x axis -> y axis (scikit-learn#9985)
  improve example plot_forest_iris.py (scikit-learn#9989)
  [MRG+1] Deprecate pooling_func unused parameter in AgglomerativeClustering (scikit-learn#9875)
  DOC update news
  DOC Fix three typos in manifold documentation (scikit-learn#9990)
  DOC add missing dot in docstring
  DOC Add what's new for 0.19.1 (scikit-learn#9983)
  Improve readability of outlier detection example. (scikit-learn#9973)
  DOC: Fixed typo (scikit-learn#9977)
  ...
2 parents e8e2b39 + 5cf3fb7 commit 3bdcb15

37 files changed, +445 −121 lines changed

benchmarks/bench_isolation_forest.py

Lines changed: 26 additions & 15 deletions
@@ -3,6 +3,17 @@
 IsolationForest benchmark
 ==========================================
 A test of IsolationForest on classical anomaly detection datasets.
+
+The benchmark is run as follows:
+1. The dataset is randomly split into a training set and a test set, both
+assumed to contain outliers.
+2. Isolation Forest is trained on the training set.
+3. The ROC curve is computed on the test set using the knowledge of the labels.
+
+Note that the smtp dataset contains a very small proportion of outliers.
+Therefore, depending on the seed of the random number generator, randomly
+splitting the data set might lead to a test set containing no outliers. In this
+case a warning is raised when computing the ROC curve.
 """
 
 from time import time
@@ -12,7 +23,7 @@
 from sklearn.ensemble import IsolationForest
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils import shuffle as sh
 
 print(__doc__)
@@ -30,15 +41,14 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))
 
 
-np.random.seed(1)
+random_state = 1
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))
 
 # Set this to true for plotting score histograms for each dataset:
 with_decision_function_histograms = False
 
-# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
-# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
+# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
 # Loop over all datasets for fitting and scoring the estimator:
 for dat in datasets:
@@ -47,15 +57,16 @@ def print_outlier_ratio(y):
     print('====== %s ======' % dat)
     print('--- Fetching data...')
     if dat in ['http', 'smtp', 'SF', 'SA']:
-        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
+        dataset = fetch_kddcup99(subset=dat, shuffle=True,
+                                 percent10=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
 
     if dat == 'shuttle':
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
+        X, y = sh(X, y, random_state=random_state)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -65,7 +76,7 @@ def print_outlier_ratio(y):
         print('----- ')
 
     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype(shuffle=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -79,17 +90,17 @@ def print_outlier_ratio(y):
     print('--- Vectorizing data...')
 
     if dat == 'SF':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
 
     if dat == 'SA':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
-        x2 = lb.fit_transform(X[:, 2])
-        x3 = lb.fit_transform(X[:, 3])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
@@ -108,7 +119,7 @@ def print_outlier_ratio(y):
     y_test = y[n_samples_train:]
 
     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1)
+    model = IsolationForest(n_jobs=-1, random_state=random_state)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart
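
The pattern behind this benchmark — fit IsolationForest on one random half, rank the other half by anomaly score, and summarize with ROC/AUC — can be reproduced in a few lines. A minimal standalone sketch on synthetic data (the make_blobs inliers and uniform outliers are assumptions of this illustration, not part of the benchmark itself):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(1)

# Synthetic stand-in for the benchmark datasets: clustered inliers plus
# uniform outliers, so both random halves contain some anomalies.
X_inliers, _ = make_blobs(n_samples=1000, centers=2, random_state=1)
X_outliers = rng.uniform(low=-10, high=10, size=(50, 2))
X = np.vstack([X_inliers, X_outliers])
y = np.r_[np.zeros(len(X_inliers)), np.ones(len(X_outliers))]  # 1 = outlier

# 1. random train/test split (both halves may contain outliers)
idx = rng.permutation(len(X))
train, test = idx[:len(X) // 2], idx[len(X) // 2:]

# 2. fit on the training half
model = IsolationForest(n_jobs=-1, random_state=1)
model.fit(X[train])

# 3. ROC on the test half; decision_function is lower for more abnormal
# points, so negate it to obtain an outlier score.
scoring = -model.decision_function(X[test])
fpr, tpr, _ = roc_curve(y[test], scoring)
print("AUC: %0.3f" % auc(fpr, tpr))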

benchmarks/bench_lof.py

Lines changed: 28 additions & 41 deletions
@@ -5,6 +5,16 @@
 
 A test of LocalOutlierFactor on classical anomaly detection datasets.
 
+Note that LocalOutlierFactor is not meant to predict on a test set and its
+performance is assessed in an outlier detection context:
+1. The model is trained on the whole dataset which is assumed to contain
+outliers.
+2. The ROC curve is computed on the same dataset using the knowledge of the
+labels.
+In this context there is no need to shuffle the dataset because the model
+is trained and tested on the whole dataset. The randomness of this benchmark
+is only caused by the random selection of anomalies in the SA dataset.
+
 """
 
 from time import time
@@ -14,31 +24,28 @@
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
 from sklearn.preprocessing import LabelBinarizer
-from sklearn.utils import shuffle as sh
 
 print(__doc__)
 
-np.random.seed(2)
+random_state = 2  # to control the random selection of anomalies in SA
 
 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['shuttle']
-
-novelty_detection = True  # if False, training set polluted by outliers
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
+plt.figure()
 for dataset_name in datasets:
     # loading and vectorization
     print('loading data')
     if dataset_name in ['http', 'smtp', 'SA', 'SF']:
-        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
-                                 percent10=False)
+        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
+                                 random_state=random_state)
         X = dataset.data
         y = dataset.target
 
     if dataset_name == 'shuttle':
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -47,7 +54,7 @@
     y = (y != 1).astype(int)
 
     if dataset_name == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype()
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -61,54 +68,34 @@
 
     if dataset_name == 'SF':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
 
     if dataset_name == 'SA':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
-        lb.fit(X[:, 2])
-        x2 = lb.transform(X[:, 2])
-        lb.fit(X[:, 3])
-        x3 = lb.transform(X[:, 3])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
 
     if dataset_name == 'http' or dataset_name == 'smtp':
-        y = (y != 'normal.').astype(int)
-
-    n_samples, n_features = np.shape(X)
-    n_samples_train = n_samples // 2
-    n_samples_test = n_samples - n_samples_train
+        y = (y != b'normal.').astype(int)
 
     X = X.astype(float)
-    X_train = X[:n_samples_train, :]
-    X_test = X[n_samples_train:, :]
-    y_train = y[:n_samples_train]
-    y_test = y[n_samples_train:]
-
-    if novelty_detection:
-        X_train = X_train[y_train == 0]
-        y_train = y_train[y_train == 0]
 
     print('LocalOutlierFactor processing...')
    model = LocalOutlierFactor(n_neighbors=20)
     tstart = time()
-    model.fit(X_train)
+    model.fit(X)
     fit_time = time() - tstart
-    tstart = time()
-
-    scoring = -model.decision_function(X_test)  # the lower, the more normal
-    predict_time = time() - tstart
-    fpr, tpr, thresholds = roc_curve(y_test, scoring)
+    scoring = -model.negative_outlier_factor_  # the lower, the more normal
+    fpr, tpr, thresholds = roc_curve(y, scoring)
     AUC = auc(fpr, tpr)
     plt.plot(fpr, tpr, lw=1,
-             label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
-                    'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
-                                            predict_time)))
+             label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
                    % (dataset_name, AUC, fit_time)))
 
 plt.xlim([-0.05, 1.05])
 plt.ylim([-0.05, 1.05])
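
The substance of this fix is the scoring API: LocalOutlierFactor does not score unseen data here, so the benchmark now reads the fitted negative_outlier_factor_ attribute and evaluates on the training set itself. A minimal sketch of that pattern on synthetic data (the Gaussian inliers and uniform outliers are assumptions of this illustration):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(2)

# Gaussian inliers around the origin plus a handful of uniform outliers.
X_inliers = 0.3 * rng.randn(200, 2)
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
X = np.vstack([X_inliers, X_outliers])
y = np.r_[np.zeros(len(X_inliers)), np.ones(len(X_outliers))]  # 1 = outlier

# LOF is fit and evaluated on the same data: no train/test split.
model = LocalOutlierFactor(n_neighbors=20)
model.fit(X)

# negative_outlier_factor_ is close to -1 for inliers and strongly negative
# for outliers; negate it so that larger scores mean more abnormal.
scoring = -model.negative_outlier_factor_
fpr, tpr, _ = roc_curve(y, scoring)
print("AUC: %0.3f" % auc(fpr, tpr))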

doc/datasets/index.rst

Lines changed: 3 additions & 0 deletions
@@ -321,6 +321,7 @@ writing data in that format.
    labeled_faces
    covtype
    rcv1
+   kddcup99
 
 
 .. include:: olivetti_faces.rst
@@ -335,6 +336,8 @@ writing data in that format.
 
 .. include:: rcv1.rst
 
+.. include:: kddcup99.rst
+
 .. _boston_house_prices:
 
 .. include:: ../../sklearn/datasets/descr/boston_house_prices.rst
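
With kddcup99 now part of the dataset documentation, the corresponding loader can be exercised directly. A minimal usage sketch (the choice of subset and parameters is illustrative only):

from sklearn.datasets import fetch_kddcup99

# Download (and cache) the KDD Cup 99 intrusion detection data;
# subset='SA' is mostly normal traffic plus a small proportion of attacks.
dataset = fetch_kddcup99(subset='SA', percent10=True, random_state=0)
X, y = dataset.data, dataset.target

# Targets are byte strings; b'normal.' marks non-attack connections.
print(X.shape, (y != b'normal.').mean())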

doc/index.rst

Lines changed: 2 additions & 0 deletions
@@ -207,6 +207,8 @@
             <li><em>On-going development:</em>
             <a href="/dev/whats_new.html"><em>What's new</em> (Changelog)</a>
             </li>
+            <li><em>October 2017.</em> scikit-learn 0.19.1 is available for download (<a href="whats_new.html#version-0-19">Changelog</a>).
+            </li>
             <li><em>July 2017.</em> scikit-learn 0.19.0 is available for download (<a href="whats_new/v0.19.html#version-0-19">Changelog</a>).
             </li>
             <li><em>June 2017.</em> scikit-learn 0.18.2 is available for download (<a href="whats_new/v0.18.html#version-0-18-2">Changelog</a>).

doc/modules/manifold.rst

Lines changed: 3 additions & 3 deletions
@@ -533,7 +533,7 @@ the quality of the resulting embedding:
 * maximum number of iterations
 * angle (not used in the exact method)
 
-The perplexity is defined as :math:`k=2^(S)` where :math:`S` is the Shannon
+The perplexity is defined as :math:`k=2^{(S)}` where :math:`S` is the Shannon
 entropy of the conditional probability distribution. The perplexity of a
 :math:`k`-sided die is :math:`k`, so that :math:`k` is effectively the number of
 nearest neighbors t-SNE considers when generating the conditional probabilities.
@@ -598,8 +598,8 @@ where label regions largely overlap. This is a strong clue that this data can
 be well separated by non linear methods that focus on the local structure (e.g.
 an SVM with a Gaussian RBF kernel). However, failing to visualize well
 separated homogeneously labeled groups with t-SNE in 2D does not necessarily
-implie that the data cannot be correctly classified by a supervised model. It
-might be the case that 2 dimensions are not enough low to accurately represents
+imply that the data cannot be correctly classified by a supervised model. It
+might be the case that 2 dimensions are not low enough to accurately represents
 the internal structure of the data.
 
 
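
The corrected formula is easy to sanity-check numerically: a uniform distribution over :math:`k` outcomes has Shannon entropy :math:`S = \log_2 k`, so its perplexity :math:`2^S` is exactly :math:`k`. A small illustrative sketch (not part of the documentation change):

import numpy as np

def perplexity(p):
    """Perplexity 2**S, where S is the Shannon entropy of p in bits."""
    p = p[p > 0]  # treat 0 * log(0) as 0
    S = -np.sum(p * np.log2(p))
    return 2 ** S

# A fair k-sided die has perplexity exactly k...
print(perplexity(np.ones(8) / 8))               # 8.0
# ...while concentrating the mass lowers the effective number of outcomes.
print(perplexity(np.array([0.9, 0.05, 0.05])))  # ~1.48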

doc/modules/preprocessing.rst

Lines changed: 9 additions & 0 deletions
@@ -610,6 +610,15 @@ a transformer that applies a log transformation in a pipeline, do::
     array([[ 0.        ,  0.69314718],
            [ 1.09861229,  1.38629436]])
 
+You can ensure that ``func`` and ``inverse_func`` are the inverse of each other
+by setting ``check_inverse=True`` and calling ``fit`` before
+``transform``. Please note that a warning is raised and can be turned into an
+error with a ``filterwarnings``::
+
+  >>> import warnings
+  >>> warnings.filterwarnings("error", message=".*check_inverse*.",
+  ...                         category=UserWarning, append=False)
+
 For a full code example that demonstrates using a :class:`FunctionTransformer`
 to do custom feature selection,
 see :ref:`sphx_glr_auto_examples_preprocessing_plot_function_transformer.py`

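The new check_inverse option documented above can be exercised end to end. A minimal sketch (the deliberately wrong inverse is an assumption of this illustration, chosen to trigger the warning):

import numpy as np
from sklearn.preprocessing import FunctionTransformer

# A correct pair: expm1 really is the inverse of log1p, so fit() is silent.
ok = FunctionTransformer(np.log1p, inverse_func=np.expm1, check_inverse=True)
ok.fit(np.array([[0.0, 1.0], [2.0, 3.0]]))

# A wrong pair: exp(log1p(x)) equals 1 + x rather than x, so fit() warns
# that the provided functions are not strictly inverse of each other.
bad = FunctionTransformer(np.log1p, inverse_func=np.exp, check_inverse=True)
bad.fit(np.array([[0.0, 1.0], [2.0, 3.0]]))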