[MRG + 1] Isolation forest - new anomaly detection algo by ngoix · Pull Request #4163 · scikit-learn/scikit-learn · GitHub

[MRG + 1] Isolation forest - new anomaly detection algo #4163

Merged: 1 commit, merged on Oct 24, 2015
108 changes: 108 additions & 0 deletions benchmarks/bench_isolation_forest.py
@@ -0,0 +1,108 @@
"""
==========================================
IsolationForest benchmark
==========================================

A test of IsolationForest on classical anomaly detection datasets.

"""
print(__doc__)

from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle as sh

np.random.seed(1)


datasets = ['http']  # , 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'

for dat in datasets:
    # loading and vectorization
    print('loading data')
    if dat in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
        X = dataset.data
        y = dataset.target

    if dat == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        # shuffle returns shuffled copies, so reassign X and y
        X, y = sh(X, y)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dat == 'forestcover':
        dataset = fetch_covtype(shuffle=True)
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)

    print('vectorizing data')

    if dat == 'SF':
        # binarize the categorical protocol feature (column 1)
        lb = LabelBinarizer()
        lb.fit(X[:, 1])
        x1 = lb.transform(X[:, 1])
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != 'normal.').astype(int)

    if dat == 'SA':
        # binarize the three categorical features (columns 1, 2 and 3)
        lb = LabelBinarizer()
        lb.fit(X[:, 1])
        x1 = lb.transform(X[:, 1])
        lb.fit(X[:, 2])
        x2 = lb.transform(X[:, 2])
        lb.fit(X[:, 3])
        x3 = lb.transform(X[:, 3])
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != 'normal.').astype(int)

    if dat == 'http' or dat == 'smtp':
        y = (y != 'normal.').astype(int)

    n_samples, n_features = np.shape(X)
    n_samples_train = n_samples // 2
    n_samples_test = n_samples - n_samples_train

    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    print('IsolationForest processing...')
    model = IsolationForest(bootstrap=True, n_jobs=-1)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

    scoring = model.predict(X_test)  # the lower, the more normal
    predict_time = time() - tstart
    fpr, tpr, thresholds = roc_curve(y_test, scoring)
[Review comment from a project member] Can you add timing to your benchmark?
    AUC = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1,
             label=('ROC for %s (area = %0.3f, train-time: %0.2fs, '
                    'test-time: %0.2fs)' % (dat, AUC, fit_time, predict_time)))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
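In this version of the estimator, predict returns a continuous anomaly score (the inline comment above notes that lower values are more normal), which is exactly what roc_curve consumes. If hard inlier/outlier labels were wanted instead, one possible post-processing step is to threshold the scores at a percentile matching an assumed contamination rate. The sketch below is illustrative only: the contamination value is an assumption, it reuses the scoring array from the last dataset processed above, and this step is not part of the benchmark itself.

import numpy as np

contamination = 0.01  # assumed fraction of anomalies in the test set
threshold = np.percentile(scoring, 100 * (1 - contamination))
y_pred = (scoring > threshold).astype(int)  # 1 = flagged as anomalous
print('flagged %d of %d test samples' % (y_pred.sum(), len(y_pred)))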
36 changes: 36 additions & 0 deletions doc/datasets/kddcup99.rst
@@ -0,0 +1,36 @@

.. _kddcup99:

Kddcup 99 dataset
=================

The KDD Cup '99 dataset was created by processing the tcpdump portions
of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset,
created by MIT Lincoln Lab. The artificial data (described on the `dataset's
homepage <http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html>`_) was
generated using a closed network and hand-injected attacks to produce a
large number of different types of attack with normal activity in the
background. As the initial goal was to produce a large training set for
supervised learning algorithms, there is a large proportion (80.1%) of
abnormal data, which is unrealistic in the real world and inappropriate for
unsupervised anomaly detection, which aims at detecting 'abnormal' data, i.e.
1) qualitatively different from normal data, and
2) a large minority among the observations.
We thus transform the KDD dataset into two different datasets: SA and SF.

- SA is obtained by simply selecting all the normal data, and a small
  proportion of abnormal data, giving an anomaly proportion of 1%.

- SF is obtained as in [2] by simply picking up the data whose attribute
  logged_in is positive, thus focusing on the intrusion attack, which gives
  a proportion of 0.3% of attacks.

- http and smtp are two subsets of SF corresponding to the third feature
  being equal to 'http' (resp. 'smtp').

:func:`sklearn.datasets.fetch_kddcup99` will load the kddcup99 dataset;
it returns a dictionary-like object
with the feature matrix in the ``data`` member
and the target values in ``target``.
The dataset will be downloaded from the web if necessary.
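A minimal usage sketch, following the call made in the benchmark script above (the subset and percent10 keyword arguments are taken from that script):

from sklearn.datasets import fetch_kddcup99

# Load the 'http' subset (10% sample of the full data), as in the benchmark.
dataset = fetch_kddcup99(subset='http', shuffle=True, percent10=True)
X, y = dataset.data, dataset.target
print(X.shape, y.shape)  # feature matrix and string labels such as 'normal.'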
2 changes: 2 additions & 0 deletions doc/modules/classes.rst
@@ -221,6 +221,7 @@ Loaders
datasets.fetch_olivetti_faces
datasets.fetch_california_housing
datasets.fetch_covtype
datasets.fetch_kddcup99
datasets.fetch_rcv1
datasets.load_mlcomp
datasets.load_sample_image
@@ -351,6 +352,7 @@ Samples generator
ensemble.ExtraTreesRegressor
ensemble.GradientBoostingClassifier
ensemble.GradientBoostingRegressor
ensemble.IsolationForest
ensemble.RandomForestClassifier
ensemble.RandomTreesEmbedding
ensemble.RandomForestRegressor
41 changes: 41 additions & 0 deletions doc/modules/outlier_detection.rst
@@ -192,4 +192,45 @@ multiple modes.
an outlier detection method) and a covariance-based outlier
detection with :class:`covariance.MinCovDet`.

Isolation Forest
----------------

One efficient way of performing outlier detection in high-dimensional datasets
is to use random forests.
:class:`ensemble.IsolationForest` 'isolates' the observations by randomly
selecting a feature and then randomly selecting a split value between the
maximum and minimum values of the selected feature.

Since recursive partitioning can be represented by a tree structure, the
number of splittings required to isolate a point is equivalent to the path
length from the root node to a terminating node.

This path length, averaged over a forest of such random trees, is a
measure of abnormality and our decision function.

Random partitioning indeed produces noticeably shorter paths for anomalies.
Hence, when a forest of random trees collectively produces shorter path
lengths for particular points, those points are highly likely to be
anomalies.

This strategy is illustrated below.

.. figure:: ../auto_examples/ensemble/images/plot_isolation_forest_001.png
   :target: ../auto_examples/ensemble/plot_isolation_forest.html
   :align: center
   :scale: 75%

.. topic:: Examples:

   * See :ref:`example_ensemble_plot_isolation_forest.py` for
     an illustration of the use of IsolationForest.

   * See :ref:`example_covariance_plot_outlier_detection.py` for a
     comparison of :class:`ensemble.IsolationForest` with
     :class:`svm.OneClassSVM` (tuned to perform like an outlier detection
     method) and a covariance-based outlier detection with
     :class:`covariance.MinCovDet`.

.. topic:: References:

   .. [LTZ2008] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation
      forest." Data Mining, 2008. ICDM'08. Eighth IEEE International
      Conference on.
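For illustration, a rough sketch of the scoring formula described in [LTZ2008], which the path-length discussion above refers to. This is not part of this PR (the estimator computes its decision function internally); it is only meant to make the path-length normalisation concrete:

import numpy as np

def average_path_length(n):
    # c(n): expected path length of an unsuccessful search in a binary
    # search tree built on n samples, c(n) = 2 * H(n - 1) - 2 * (n - 1) / n,
    # with the harmonic number approximated by H(i) ~ ln(i) + Euler's constant.
    if n <= 1:
        return 1.0
    harmonic = np.log(n - 1) + np.euler_gamma
    return 2.0 * harmonic - 2.0 * (n - 1) / n

def anomaly_score(mean_depth, n_samples):
    # s(x, n) = 2 ** (-E[h(x)] / c(n)): higher for anomalies (short average
    # path), well below 0.5 for normal points (long average path).
    return 2.0 ** (-mean_depth / average_path_length(n_samples))

print(anomaly_score(mean_depth=3.0, n_samples=256))   # short path -> higher score
print(anomaly_score(mean_depth=12.0, n_samples=256))  # long path -> lower score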
16 changes: 12 additions & 4 deletions examples/covariance/plot_outlier_detection.py
@@ -3,7 +3,7 @@
Outlier detection with several methods.
==========================================

When the amount of contamination is known, this example illustrates two
When the amount of contamination is known, this example illustrates three
different ways of performing :ref:`outlier_detection`:

- based on a robust estimator of covariance, which is assuming that the
@@ -14,6 +14,10 @@
  data set, hence performing better when the data is strongly
  non-Gaussian, i.e. with two well-separated clusters;

- using the Isolation Forest algorithm, which is based on random forests and
  hence more adapted to large-dimensional settings, even if it performs
  quite well in the examples below.

The ground truth about inliers and outliers is given by the points' colors
while the orange-filled area indicates which points are reported as inliers
by each method.
@@ -32,6 +36,9 @@

from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Example settings
n_samples = 200
@@ -42,7 +49,8 @@
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "robust covariance estimator": EllipticEnvelope(contamination=.1)}
    "robust covariance estimator": EllipticEnvelope(contamination=.1),
    "Isolation Forest": IsolationForest(max_samples=n_samples, random_state=rng)}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
@@ -61,7 +69,7 @@
    # Add outliers
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit the model with the One-Class SVM
    # Fit the model
    plt.figure(figsize=(10, 5))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        # fit the data and tag outliers
@@ -74,7 +82,7 @@
        # plot the levels lines and the points
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(1, 2, i + 1)
        subplot = plt.subplot(1, 3, i + 1)
        subplot.set_title("Outlier detection")
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=plt.cm.Blues_r)
69 changes: 69 additions & 0 deletions examples/ensemble/plot_isolation_forest.py
@@ -0,0 +1,69 @@
"""
==========================================
IsolationForest example
==========================================

An example using IsolationForest for anomaly detection.

IsolationForest 'isolates' the observations by randomly selecting a feature
and then randomly selecting a split value between the maximum and minimum
values of the selected feature.

Since recursive partitioning can be represented by a tree structure, the
number of splittings required to isolate a sample is equivalent to the path
length from the root node to a terminating node.

This path length, averaged over a forest of such random trees, is a measure
of abnormality and our decision function.

Random partitioning indeed produces noticeably shorter paths for anomalies.
Hence, when a forest of random trees collectively produces shorter path
lengths for particular samples, those samples are highly likely to be
anomalies.

.. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
    Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.

"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([b1, b2, c],
           ["training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper left")
plt.show()
2 changes: 2 additions & 0 deletions sklearn/datasets/__init__.py
@@ -16,6 +16,7 @@
from .base import load_sample_images
from .base import load_sample_image
from .covtype import fetch_covtype
from .kddcup99 import fetch_kddcup99
from .mlcomp import load_mlcomp
from .lfw import load_lfw_pairs
from .lfw import load_lfw_people
@@ -65,6 +66,7 @@
'fetch_california_housing',
'fetch_covtype',
'fetch_rcv1',
'fetch_kddcup99',
'get_data_home',
'load_boston',
'load_diabetes',