8000 DOC Example: Evaluation of outlier detection estimators (#16606) · scikit-learn/scikit-learn@3b5f460 · GitHub
[go: up one dir, main page]

Skip to content

Commit 3b5f460

Browse files
MaiRajborirugjeremiedbbArturoAmorQ
authored
DOC Example: Evaluation of outlier detection estimators (#16606)
Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> Co-authored-by: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
1 parent d0c53f7 commit 3b5f460

File tree

2 files changed

+200
-0
lines changed

2 files changed

+200
-0
lines changed

doc/modules/outlier_detection.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,12 @@ sections hereunder.
131131
:class:`neighbors.LocalOutlierFactor` and
132132
:class:`covariance.EllipticEnvelope`.
133133

134+
* See :ref:`sphx_glr_auto_examples_miscellaneous_plot_outlier_detection_bench.py`
135+
for an example showing how to evaluate outlier detection estimators,
136+
the :class:`neighbors.LocalOutlierFactor` and the
137+
:class:`ensemble.IsolationForest`, using ROC curves from
138+
:class:`metrics.RocCurveDisplay`.
139+
134140
Novelty Detection
135141
=================
136142

@@ -310,6 +316,7 @@ allows you to add more trees to an already fitted model::
310316
* Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
311317
Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
312318

319+
.. _local_outlier_factor:
313320

314321
Local Outlier Factor
315322
--------------------
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
"""
2+
==========================================
3+
Evaluation of outlier detection estimators
4+
==========================================
5+
6+
This example benchmarks outlier detection algorithms, :ref:`local_outlier_factor`
7+
(LOF) and :ref:`isolation_forest` (IForest), using ROC curves on
8+
classical anomaly detection datasets. The algorithm performance
9+
is assessed in an outlier detection context:
10+
11+
1. The algorithms are trained on the whole dataset which is assumed to
12+
contain outliers.
13+
14+
2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed
15+
on the same dataset using the knowledge of the labels.
16+
17+
"""
18+
19+
# Author: Pharuj Rajborirug <pharuj.ra@kmitl.ac.th>
20+
# License: BSD 3 clause
21+
22+
# Echo the module docstring above when the example is executed.
print(__doc__)
23+
24+
# %%
25+
# Define a data preprocessing function
26+
# ----------------------------------
27+
#
28+
# The example uses real-world datasets available in
29+
# :class:`sklearn.datasets` and the sample size of some datasets is reduced
30+
# to speed up computation. After the data preprocessing, the datasets' targets
31+
# will have two classes, 0 representing inliers and 1 representing outliers.
32+
# The `preprocess_dataset` function returns data and target.
33+
34+
import numpy as np
35+
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
36+
from sklearn.preprocessing import LabelBinarizer
37+
import pandas as pd
38+
39+
# Shared, seeded random state. It is passed to the KDDCup99 fetcher, used to
# subsample the larger datasets and given to IsolationForest as `random_state`.
rng = np.random.RandomState(42)
40+
41+
42+
def _subsample(X, y, fraction=0.1):
    """Draw a random `fraction` of the rows of `X` and `y` without replacement."""
    n_samples = X.shape[0]
    idx = rng.choice(n_samples, int(n_samples * fraction), replace=False)
    return X[idx], y[idx]


def preprocess_dataset(dataset_name):
    """Load a benchmark dataset and turn its target into inlier/outlier labels.

    Parameters
    ----------
    dataset_name : str
        One of "http", "smtp", "SA", "SF" (subsets of KDDCup99), "forestcover",
        "glass", "wdbc" or "cardiotocography" (the last three are fetched from
        OpenML).

    Returns
    -------
    X : array-like
        The data returned by the fetcher. "SA", "SF" and "forestcover" are
        subsampled to 10% of their rows; for "SA" and "SF" the string-valued
        columns are additionally encoded with `LabelBinarizer`.
    y : pandas.Series of dtype "category"
        The binary target: 0 represents inliers and 1 represents outliers.

    Raises
    ------
    ValueError
        If `dataset_name` is not one of the supported names.
    """
    # loading and vectorization
    print(f"Loading {dataset_name} data")
    if dataset_name in ["http", "smtp", "SA", "SF"]:
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng)
        X = dataset.data
        y = dataset.target
        lb = LabelBinarizer()

        if dataset_name == "SF":
            X, y = _subsample(X, y)  # reduce the sample size
            x1 = lb.fit_transform(X[:, 1].astype(str))
            X = np.c_[X[:, :1], x1, X[:, 2:]]
        elif dataset_name == "SA":
            X, y = _subsample(X, y)  # reduce the sample size
            x1 = lb.fit_transform(X[:, 1].astype(str))
            x2 = lb.fit_transform(X[:, 2].astype(str))
            x3 = lb.fit_transform(X[:, 3].astype(str))
            X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        # every record that is not labelled "normal." is an outlier
        y = (y != b"normal.").astype(int)
    elif dataset_name == "forestcover":
        dataset = fetch_covtype()
        X, y = _subsample(dataset.data, dataset.target)  # reduce the sample size

        # inliers are the samples with target 2,
        # outliers are the samples with target 4; all other targets are dropped
        s = (y == 2) | (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)
    elif dataset_name in ["glass", "wdbc", "cardiotocography"]:
        dataset = fetch_openml(name=dataset_name, version=1, as_frame=False)
        X = dataset.data
        y = dataset.target

        if dataset_name == "glass":
            s = y == "tableware"
            y = s.astype(int)
        elif dataset_name == "wdbc":
            s = y == "2"
            y = s.astype(int)
            X_mal, y_mal = X[s], y[s]
            X_ben, y_ben = X[~s], y[~s]

            # the outlier class is downsampled to 39 points (9.8% outliers)
            idx = rng.choice(y_mal.shape[0], 39, replace=False)
            X = np.concatenate((X_ben, X_mal[idx]), axis=0)
            y = np.concatenate((y_ben, y_mal[idx]), axis=0)
        elif dataset_name == "cardiotocography":
            s = y == "3"
            y = s.astype(int)
    else:
        # Fail early with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown dataset_name {dataset_name!r}; expected one of 'http', "
            "'smtp', 'SA', 'SF', 'forestcover', 'glass', 'wdbc' or "
            "'cardiotocography'."
        )
    # 0 represents inliers, and 1 represents outliers
    y = pd.Series(y, dtype="category")
    return (X, y)
107+
108+
109+
# %%
110+
# Define an outlier prediction function
111+
# -------------------------------------
112+
# There is no particular reason to choose algorithms
113+
# :class:`~sklearn.neighbors.LocalOutlierFactor` and
114+
# :class:`~sklearn.ensemble.IsolationForest`. The goal is to show that
115+
# different algorithms perform well on different datasets. The following
116+
# `compute_prediction` function returns an outlier score for each sample of X.
117+
118+
119+
from sklearn.neighbors import LocalOutlierFactor
120+
from sklearn.ensemble import IsolationForest
121+
122+
123+
def compute_prediction(X, model_name):
    """Fit the requested outlier detector on `X` and score every sample of `X`.

    Parameters
    ----------
    X : array-like
        The data on which the estimator is both fitted and scored.
    model_name : {"LOF", "IForest"}
        Which estimator to use: `LocalOutlierFactor` or `IsolationForest`.

    Returns
    -------
    y_pred : ndarray
        For "LOF", the `negative_outlier_factor_` of the fitted estimator;
        for "IForest", the output of `decision_function(X)`. In scikit-learn
        both scores are higher for more normal samples.

    Raises
    ------
    ValueError
        If `model_name` is neither "LOF" nor "IForest".
    """
    print(f"Computing {model_name} prediction...")
    if model_name == "LOF":
        clf = LocalOutlierFactor(n_neighbors=20, contamination="auto")
        clf.fit(X)
        y_pred = clf.negative_outlier_factor_
    elif model_name == "IForest":
        clf = IsolationForest(random_state=rng, contamination="auto")
        y_pred = clf.fit(X).decision_function(X)
    else:
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'LOF' or 'IForest'."
        )
    return y_pred
134+
135+
136+
# %%
137+
# Plot and interpret results
138+
# --------------------------
139+
#
140+
# The algorithm performance relates to how good the true positive rate (TPR)
141+
# is at low values of the false positive rate (FPR). The best algorithms
142+
# have the curve on the top-left of the plot and the area under curve (AUC)
143+
# close to 1. The diagonal dotted line represents a random classification
144+
# of outliers and inliers.
145+
146+
147+
import math
148+
import matplotlib.pyplot as plt
149+
from sklearn.metrics import RocCurveDisplay
150+
151+
datasets_name = [
    "http",
    "smtp",
    "SA",
    "SF",
    "forestcover",
    "glass",
    "wdbc",
    "cardiotocography",
]

models_name = [
    "LOF",
    "IForest",
]

# plotting parameters
cols = 2
linewidth = 1
# Label 0 (inliers) is the positive class because the scores returned by
# `compute_prediction` are passed unchanged to the ROC computation.
pos_label = 0
rows = math.ceil(len(datasets_name) / cols)

# squeeze=False keeps `axs` two-dimensional even when there is a single row
# or column, so the `[row, col]` indexing below always works.
fig, axs = plt.subplots(rows, cols, figsize=(10, rows * 3), squeeze=False)

for i, dataset_name in enumerate(datasets_name):
    (X, y) = preprocess_dataset(dataset_name=dataset_name)
    ax = axs[i // cols, i % cols]  # one subplot per dataset, filled row by row

    for model_name in models_name:
        y_pred = compute_prediction(X, model_name=model_name)
        RocCurveDisplay.from_predictions(
            y,
            y_pred,
            pos_label=pos_label,
            name=model_name,
            linewidth=linewidth,
            ax=ax,
        )
    # chance-level diagonal
    ax.plot([0, 1], [0, 1], linewidth=linewidth, linestyle=":")
    ax.set_title(dataset_name)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")

# Hide leftover axes when the number of datasets is not a multiple of `cols`.
for unused_ax in axs.ravel()[len(datasets_name):]:
    unused_ax.set_visible(False)
plt.tight_layout(pad=2.0)  # spacing between subplots
plt.show()

0 commit comments

Comments
 (0)
0