|
| 1 | +""" |
| 2 | +========================================== |
| 3 | +Evaluation of outlier detection estimators |
| 4 | +========================================== |
| 5 | +
|
| 6 | +This example benchmarks outlier detection algorithms, :ref:`local_outlier_factor` |
| 7 | +(LOF) and :ref:`isolation_forest` (IForest), using ROC curves on |
| 8 | +classical anomaly detection datasets. The algorithm performance |
| 9 | +is assessed in an outlier detection context: |
| 10 | +
|
| 11 | +1. The algorithms are trained on the whole dataset which is assumed to |
| 12 | +contain outliers. |
| 13 | +
|
| 14 | +2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed |
| 15 | +on the same dataset using the knowledge of the labels. |
| 16 | +
|
| 17 | +""" |
| 18 | + |
| 19 | +# Author: Pharuj Rajborirug <pharuj.ra@kmitl.ac.th> |
| 20 | +# License: BSD 3 clause |
| 21 | + |
| 22 | +print(__doc__) |
| 23 | + |
| 24 | +# %% |
| 25 | +# Define a data preprocessing function |
| 26 | +# ------------------------------------ |
| 27 | +# |
| 28 | +# The example uses real-world datasets available in |
| 29 | +# :class:`sklearn.datasets` and the sample size of some datasets is reduced |
| 30 | +# to speed up computation. After the data preprocessing, the datasets' targets |
| 31 | +# will have two classes, 0 representing inliers and 1 representing outliers. |
| 32 | +# The `preprocess_dataset` function returns data and target. |
| 33 | + |
| 34 | +import numpy as np |
| 35 | +from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml |
| 36 | +from sklearn.preprocessing import LabelBinarizer |
| 37 | +import pandas as pd |
| 38 | + |
| 39 | +rng = np.random.RandomState(42) |
| 40 | + |
| 41 | + |
def _subsample(X, y, fraction=0.1):
    # Keep a random `fraction` of the rows, drawn without replacement from the
    # module-level `rng`, to reduce the sample size and speed up computation.
    idx = rng.choice(X.shape[0], int(X.shape[0] * fraction), replace=False)
    return X[idx], y[idx]


def preprocess_dataset(dataset_name):
    """Load one benchmark dataset and turn its target into inlier/outlier labels.

    Parameters
    ----------
    dataset_name : str
        One of "http", "smtp", "SA", "SF" (loaded with `fetch_kddcup99`),
        "forestcover" (loaded with `fetch_covtype`), or "glass", "wdbc",
        "cardiotocography" (loaded with `fetch_openml`).

    Returns
    -------
    X : array-like
        The (possibly subsampled and one-hot expanded) data.
    y : pandas.Series of dtype "category"
        Binary target where 0 represents inliers and 1 represents outliers.

    Raises
    ------
    ValueError
        If `dataset_name` is not one of the supported names.
    """
    # loading and vectorization
    print(f"Loading {dataset_name} data")
    if dataset_name in ("http", "smtp", "SA", "SF"):
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng)
        X = dataset.data
        y = dataset.target
        lb = LabelBinarizer()

        if dataset_name == "SF":
            X, y = _subsample(X, y)
            # column 1 is categorical: replace it by its binarized encoding
            x1 = lb.fit_transform(X[:, 1].astype(str))
            X = np.c_[X[:, :1], x1, X[:, 2:]]
        elif dataset_name == "SA":
            X, y = _subsample(X, y)
            # columns 1 to 3 are categorical: replace them by their
            # binarized encodings
            x1 = lb.fit_transform(X[:, 1].astype(str))
            x2 = lb.fit_transform(X[:, 2].astype(str))
            x3 = lb.fit_transform(X[:, 3].astype(str))
            X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        # every record that is not labelled "normal." is an outlier
        y = (y != b"normal.").astype(int)
    elif dataset_name == "forestcover":
        dataset = fetch_covtype()
        X, y = _subsample(dataset.data, dataset.target)

        # inliers are those with attribute 2
        # outliers are those with attribute 4
        s = (y == 2) | (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)
    elif dataset_name in ("glass", "wdbc", "cardiotocography"):
        dataset = fetch_openml(name=dataset_name, version=1, as_frame=False)
        X = dataset.data
        y = dataset.target

        if dataset_name == "glass":
            s = y == "tableware"
            y = s.astype(int)
        elif dataset_name == "wdbc":
            s = y == "2"
            y = s.astype(int)
            X_mal, y_mal = X[s], y[s]
            X_ben, y_ben = X[~s], y[~s]

            # downsampled to 39 points (9.8% outliers)
            idx = rng.choice(y_mal.shape[0], 39, replace=False)
            X_mal2 = X_mal[idx]
            y_mal2 = y_mal[idx]
            X = np.concatenate((X_ben, X_mal2), axis=0)
            y = np.concatenate((y_ben, y_mal2), axis=0)
        else:  # "cardiotocography"
            s = y == "3"
            y = s.astype(int)
    else:
        raise ValueError(
            f"Unknown dataset_name {dataset_name!r}; expected one of 'http', "
            "'smtp', 'SA', 'SF', 'forestcover', 'glass', 'wdbc', "
            "'cardiotocography'."
        )

    # 0 represents inliers, and 1 represents outliers
    y = pd.Series(y, dtype="category")
    return (X, y)
| 107 | + |
| 108 | + |
| 109 | +# %% |
| 110 | +# Define an outlier prediction function |
| 111 | +# ------------------------------------- |
| 112 | +# There is no particular reason to choose algorithms |
| 113 | +# :class:`~sklearn.neighbors.LocalOutlierFactor` and |
| 114 | +# :class:`~sklearn.ensemble.IsolationForest`. The goal is to show that |
| 115 | +# different algorithms perform well on different datasets. The following |
| 116 | +# `compute_prediction` function returns the outlier score of each sample of X. |
| 117 | + |
| 118 | + |
| 119 | +from sklearn.neighbors import LocalOutlierFactor |
| 120 | +from sklearn.ensemble import IsolationForest |
| 121 | + |
| 122 | + |
def compute_prediction(X, model_name):
    """Fit the requested outlier detector on X and score the same samples.

    Parameters
    ----------
    X : array-like
        Data the model is both fitted on and scored on.
    model_name : {"LOF", "IForest"}
        "LOF" uses `LocalOutlierFactor` and returns its
        `negative_outlier_factor_`; "IForest" uses `IsolationForest` and
        returns `decision_function(X)`.

    Returns
    -------
    y_pred : array-like
        One score per sample of X, as produced by the selected model.

    Raises
    ------
    ValueError
        If `model_name` is not "LOF" or "IForest".
    """
    print(f"Computing {model_name} prediction...")
    if model_name == "LOF":
        clf = LocalOutlierFactor(n_neighbors=20, contamination="auto")
        clf.fit(X)
        y_pred = clf.negative_outlier_factor_
    elif model_name == "IForest":
        clf = IsolationForest(random_state=rng, contamination="auto")
        y_pred = clf.fit(X).decision_function(X)
    else:
        # Fail with a clear message instead of an UnboundLocalError on y_pred.
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'LOF' or 'IForest'."
        )
    return y_pred
| 134 | + |
| 135 | + |
| 136 | +# %% |
| 137 | +# Plot and interpret results |
| 138 | +# -------------------------- |
| 139 | +# |
| 140 | +# The algorithm performance relates to how good the true positive rate (TPR) |
| 141 | +# is at low values of the false positive rate (FPR). The best algorithms |
| 142 | +# have the curve on the top-left of the plot and the area under curve (AUC) |
| 143 | +# close to 1. The diagonal dashed line represents a random classification |
| 144 | +# of outliers and inliers. |
| 145 | + |
| 146 | + |
| 147 | +import math |
| 148 | +import matplotlib.pyplot as plt |
| 149 | +from sklearn.metrics import RocCurveDisplay |
| 150 | + |
# Datasets to benchmark; each name must be one handled by `preprocess_dataset`.
datasets_name = [
    "http",
    "smtp",
    "SA",
    "SF",
    "forestcover",
    "glass",
    "wdbc",
    "cardiotocography",
]

# Models to compare; each name must be one handled by `compute_prediction`.
models_name = [
    "LOF",
    "IForest",
]

# plotting parameters
cols = 2
linewidth = 1
# Label 0 (inliers) is treated as the positive class of the ROC curve; this
# matches the scores of both models, which are higher for more normal samples.
pos_label = 0
rows = math.ceil(len(datasets_name) / cols)

# squeeze=False keeps `axs` two-dimensional even when the grid has a single
# row, so the `axs[row, col]` indexing below always works.
fig, axs = plt.subplots(rows, cols, figsize=(10, rows * 3), squeeze=False)

for i, dataset_name in enumerate(datasets_name):
    (X, y) = preprocess_dataset(dataset_name=dataset_name)
    ax = axs[i // cols, i % cols]

    # one ROC curve per model, all drawn on the dataset's subplot
    for model_name in models_name:
        y_pred = compute_prediction(X, model_name=model_name)
        RocCurveDisplay.from_predictions(
            y,
            y_pred,
            pos_label=pos_label,
            name=model_name,
            linewidth=linewidth,
            ax=ax,
        )
    # diagonal reference line: random classification of outliers and inliers
    ax.plot([0, 1], [0, 1], linewidth=linewidth, linestyle=":")
    ax.set_title(dataset_name)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")

# hide grid cells left empty when len(datasets_name) is not a multiple of cols
for unused_ax in axs.ravel()[len(datasets_name):]:
    unused_ax.set_visible(False)

plt.tight_layout(pad=2.0)  # spacing between subplots
plt.show()
0 commit comments