scikit-learn/scikit-learn

Commit 8e599c6

agramfort authored and jnothman committed
DOC Comparison plot for anomaly detection methods. (#10004)
1 parent 4a5cea2 commit 8e599c6

File tree

2 files changed: +132 −0 lines changed

doc/modules/outlier_detection.rst

Lines changed: 11 additions & 0 deletions
@@ -33,6 +33,17 @@ new observations can then be sorted as inliers or outliers with a
 
 Inliers are labeled 1, while outliers are labeled -1.
 
+Overview of outlier detection methods
+=====================================
+
+.. figure:: ../auto_examples/images/sphx_glr_plot_anomaly_comparison_001.png
+   :target: ../auto_examples/plot_anomaly_comparison.html
+   :align: center
+   :scale: 50
+
+   A comparison of the outlier detection algorithms in scikit-learn
+
+
 Novelty Detection
 =================
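The context line in this hunk states the library-wide labeling convention: inlier predictions are +1 and outlier predictions are -1. A minimal sketch of that convention (not part of this commit; the data and parameters here are purely illustrative):

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X = np.concatenate([rng.randn(95, 2),                        # dense inlier cloud
                    rng.uniform(low=-6, high=6, size=(5, 2))])  # sparse noise
labels = IsolationForest(contamination=0.05, random_state=0).fit(X).predict(X)
print(np.unique(labels))  # [-1  1] -- -1 marks outliers, +1 marks inliers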

examples/plot_anomaly_comparison.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
"""
============================================================================
Comparing anomaly detection algorithms for outlier detection on toy datasets
============================================================================

This example shows characteristics of different anomaly detection algorithms
on 2D datasets. Datasets contain one or two modes (regions of high density)
to illustrate the ability of algorithms to cope with multimodal data.

For each dataset, 15% of samples are generated as random uniform noise. This
proportion is the value given to the nu parameter of the OneClassSVM and the
contamination parameter of the other outlier detection algorithms.
Decision boundaries between inliers and outliers are displayed in black.

Local Outlier Factor (LOF) does not show a decision boundary in black, as it
has no predict method to apply to new data.

While these examples give some intuition about the algorithms, this
intuition might not apply to very high-dimensional data.

Finally, note that the parameters of the models have been handpicked here,
but in practice they need to be adjusted. In the absence of labelled data,
the problem is completely unsupervised, so model selection can be a challenge.
"""

# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Albert Thomas <albert.thomas@telecom-paristech.fr>
# License: BSD 3 clause

import time

import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

matplotlib.rcParams['contour.negative_linestyle'] = 'solid'

# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
                                      gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction,
                                         random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(
        n_neighbors=35, contamination=outliers_fraction))]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
               **blobs_params)[0],
    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
          np.array([0.5, 0.25])),
    14. * (np.random.RandomState(42).rand(n_samples, 2) - 0.5)]

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 150),
                     np.linspace(-7, 7, 150))

plt.figure(figsize=(len(anomaly_algorithms) * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)

plot_num = 1
rng = np.random.RandomState(42)

for i_dataset, X in enumerate(datasets):
    # Add outliers
    X = np.concatenate([X, rng.uniform(low=-6, high=6,
                                       size=(n_outliers, 2))], axis=0)

    for name, algorithm in anomaly_algorithms:
        t0 = time.time()
        algorithm.fit(X)
        t1 = time.time()
        plt.subplot(len(datasets), len(anomaly_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)

        # tag outliers on the training data
        if name == "Local Outlier Factor":
            # LOF has no predict method; fit_predict labels the training set
            y_pred = algorithm.fit_predict(X)
        else:
            y_pred = algorithm.predict(X)  # already fitted above

        # plot the level lines and the points
        if name != "Local Outlier Factor":  # LOF does not implement predict
            Z = algorithm.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')

        colors = np.array(['#377eb8', '#ff7f00'])
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred + 1) // 2])

        plt.xlim(-7, 7)
        plt.ylim(-7, 7)
        plt.xticks(())
        plt.yticks(())
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1

plt.show()
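Two API details from the docstring, shown as a minimal standalone sketch (not part of the commit; the data and parameters here are illustrative): the expected outlier fraction enters OneClassSVM through nu but the other estimators through contamination, and LocalOutlierFactor can only label the data it was fit on via fit_predict, since it exposes no predict method for new samples.

import numpy as np
from sklearn import svm
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(42)
X = np.concatenate([rng.randn(85, 2),                            # inliers
                    rng.uniform(low=-6, high=6, size=(15, 2))])  # 15% noise

# OneClassSVM takes the expected outlier fraction as `nu` ...
ocsvm = svm.OneClassSVM(nu=0.15, kernel="rbf", gamma=0.1).fit(X)
print(ocsvm.predict(X[:5]))    # +1/-1 labels; also works on unseen data

# ... LOF takes it as `contamination`, and only labels its training data
lof = LocalOutlierFactor(n_neighbors=35, contamination=0.15)
print(lof.fit_predict(X)[:5])  # no predict() here for new samples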
