diff --git a/examples/miscellaneous/plot_outlier_detection_bench.py b/examples/miscellaneous/plot_outlier_detection_bench.py index a0901a31b8a2e..b4fae93131971 100644 --- a/examples/miscellaneous/plot_outlier_detection_bench.py +++ b/examples/miscellaneous/plot_outlier_detection_bench.py @@ -3,197 +3,451 @@ Evaluation of outlier detection estimators ========================================== -This example benchmarks outlier detection algorithms, :ref:`local_outlier_factor` -(LOF) and :ref:`isolation_forest` (IForest), using ROC curves on -classical anomaly detection datasets. The algorithm performance -is assessed in an outlier detection context: +This example compares two outlier detection algorithms, namely +:ref:`local_outlier_factor` (LOF) and :ref:`isolation_forest` (IForest), on +real-world datasets available in :class:`sklearn.datasets`. The goal is to show +that different algorithms perform well on different datasets. -1. The algorithms are trained on the whole dataset which is assumed to -contain outliers. +The algorithms are trained in an outlier detection context: -2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed -on the same dataset using the knowledge of the labels. +1. The ROC curves are computed using knowledge of the ground-truth labels +and displayed using :class:`~sklearn.metrics.RocCurveDisplay`. +2. The performance is assessed in terms of the ROC-AUC. """ # Author: Pharuj Rajborirug +# Arturo Amor # License: BSD 3 clause -print(__doc__) +# %% +# Dataset preprocessing and model training +# ======================================== +# +# Different outlier detection models require different preprocessing. In the +# presence of categorical variables, +# :class:`~sklearn.preprocessing.OrdinalEncoder` is often a good strategy for +# tree-based models such as :class:`~sklearn.ensemble.IsolationForest`, whereas +# neighbors-based models such as :class:`~sklearn.neighbors.LocalOutlierFactor` +# would be impacted by the ordering induced by ordinal encoding. To avoid +# inducing an ordering, on should rather use +# :class:`~sklearn.preprocessing.OneHotEncoder`. +# +# Neighbors-based models may also require scaling of the numerical features (see +# for instance :ref:`neighbors_scaling`). In the presence of outliers, a good +# option is to use a :class:`~sklearn.preprocessing.RobustScaler`. + +from sklearn.compose import ColumnTransformer +from sklearn.ensemble import IsolationForest +from sklearn.neighbors import LocalOutlierFactor +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import ( + OneHotEncoder, + OrdinalEncoder, + RobustScaler, +) + + +def make_estimator(name, categorical_columns=None, iforest_kw=None, lof_kw=None): + """Create an outlier detection estimator based on its name.""" + if name == "LOF": + outlier_detector = LocalOutlierFactor(**(lof_kw or {})) + if categorical_columns is None: + preprocessor = RobustScaler() + else: + preprocessor = ColumnTransformer( + transformers=[("categorical", OneHotEncoder(), categorical_columns)], + remainder=RobustScaler(), + ) + else: # name == "IForest" + outlier_detector = IsolationForest(**(iforest_kw or {})) + if categorical_columns is None: + preprocessor = None + else: + ordinal_encoder = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 + ) + preprocessor = ColumnTransformer( + transformers=[ + ("categorical", ordinal_encoder, categorical_columns), + ], + remainder="passthrough", + ) + + return make_pipeline(preprocessor, outlier_detector) + # %% -# Define a data preprocessing function -# ------------------------------------ +# The following `fit_predict` function returns the average outlier score of X. + +from time import perf_counter + + +def fit_predict(estimator, X): + tic = perf_counter() + if estimator[-1].__class__.__name__ == "LocalOutlierFactor": + estimator.fit(X) + y_pred = estimator[-1].negative_outlier_factor_ + else: # "IsolationForest" + y_pred = estimator.fit(X).decision_function(X) + toc = perf_counter() + print(f"Duration for {model_name}: {toc - tic:.2f} s") + return y_pred + + +# %% +# On the rest of the example we process one dataset per section. After loading +# the data, the targets are modified to consist of two classes: 0 representing +# inliers and 1 representing outliers. Due to computational constraints of the +# scikit-learn documentation, the sample size of some datasets is reduced using +# a stratified :class:`~sklearn.model_selection.train_test_split`. +# +# Furthermore, we set `n_neighbors` to match the expected number of anomalies +# `expected_n_anomalies = n_samples * expected_anomaly_fraction`. This is a good +# heuristic as long as the proportion of outliers is not very low, the reason +# being that `n_neighbors` should be at least greater than the number of samples +# in the less populated cluster (see +# :ref:`sphx_glr_auto_examples_neighbors_plot_lof_outlier_detection.py`). +# +# KDDCup99 - SA dataset +# --------------------- # -# The example uses real-world datasets available in -# :class:`sklearn.datasets` and the sample size of some datasets is reduced -# to speed up computation. After the data preprocessing, the datasets' targets -# will have two classes, 0 representing inliers and 1 representing outliers. -# The `preprocess_dataset` function returns data and target. +# The :ref:`kddcup99_dataset` was generated using a closed network and +# hand-injected attacks. The SA dataset is a subset of it obtained by simply +# selecting all the normal data and an anomaly proportion of around 3%. +# %% import numpy as np -import pandas as pd - -from sklearn.datasets import fetch_covtype, fetch_kddcup99, fetch_openml -from sklearn.preprocessing import LabelBinarizer - -rng = np.random.RandomState(42) - - -def preprocess_dataset(dataset_name): - # loading and vectorization - print(f"Loading {dataset_name} data") - if dataset_name in ["http", "smtp", "SA", "SF"]: - dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng) - X = dataset.data - y = dataset.target - lb = LabelBinarizer() - - if dataset_name == "SF": - idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False) - X = X[idx] # reduce the sample size - y = y[idx] - x1 = lb.fit_transform(X[:, 1].astype(str)) - X = np.c_[X[:, :1], x1, X[:, 2:]] - elif dataset_name == "SA": - idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False) - X = X[idx] # reduce the sample size - y = y[idx] - x1 = lb.fit_transform(X[:, 1].astype(str)) - x2 = lb.fit_transform(X[:, 2].astype(str)) - x3 = lb.fit_transform(X[:, 3].astype(str)) - X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] - y = (y != b"normal.").astype(int) - if dataset_name == "forestcover": - dataset = fetch_covtype() - X = dataset.data - y = dataset.target - idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False) - X = X[idx] # reduce the sample size - y = y[idx] - - # inliers are those with attribute 2 - # outliers are those with attribute 4 - s = (y == 2) + (y == 4) - X = X[s, :] - y = y[s] - y = (y != 2).astype(int) - if dataset_name in ["glass", "wdbc", "cardiotocography"]: - dataset = fetch_openml( - name=dataset_name, version=1, as_frame=False, parser="pandas" - ) - X = dataset.data - y = dataset.target - - if dataset_name == "glass": - s = y == "tableware" - y = s.astype(int) - if dataset_name == "wdbc": - s = y == "2" - y = s.astype(int) - X_mal, y_mal = X[s], y[s] - X_ben, y_ben = X[~s], y[~s] - - # downsampled to 39 points (9.8% outliers) - idx = rng.choice(y_mal.shape[0], 39, replace=False) - X_mal2 = X_mal[idx] - y_mal2 = y_mal[idx] - X = np.concatenate((X_ben, X_mal2), axis=0) - y = np.concatenate((y_ben, y_mal2), axis=0) - if dataset_name == "cardiotocography": - s = y == "3" - y = s.astype(int) - # 0 represents inliers, and 1 represents outliers - y = pd.Series(y, dtype="category") - return (X, y) - - -# %% -# Define an outlier prediction function -# ------------------------------------- -# There is no particular reason to choose algorithms -# :class:`~sklearn.neighbors.LocalOutlierFactor` and -# :class:`~sklearn.ensemble.IsolationForest`. The goal is to show that -# different algorithm performs well on different datasets. The following -# `compute_prediction` function returns average outlier score of X. +from sklearn.datasets import fetch_kddcup99 +from sklearn.model_selection import train_test_split -from sklearn.ensemble import IsolationForest -from sklearn.neighbors import LocalOutlierFactor +X, y = fetch_kddcup99( + subset="SA", percent10=True, random_state=42, return_X_y=True, as_frame=True +) +y = (y != b"normal.").astype(np.int32) +X, _, y, _ = train_test_split(X, y, train_size=0.1, stratify=y, random_state=42) +n_samples, anomaly_frac = X.shape[0], y.mean() +print(f"{n_samples} datapoints with {y.sum()} anomalies ({anomaly_frac:.02%})") -def compute_prediction(X, model_name): - print(f"Computing {model_name} prediction...") - if model_name == "LOF": - clf = LocalOutlierFactor(n_neighbors=20, contamination="auto") - clf.fit(X) - y_pred = clf.negative_outlier_factor_ - if model_name == "IForest": - clf = IsolationForest(random_state=rng, contamination="auto") - y_pred = clf.fit(X).decision_function(X) - return y_pred +# %% +# The SA dataset contains 41 features out of which 3 are categorical: +# "protocol_type", "service" and "flag". +# %% +y_true = {} +y_pred = {"LOF": {}, "IForest": {}} +model_names = ["LOF", "IForest"] +cat_columns = ["protocol_type", "service", "flag"] + +y_true["KDDCup99 - SA"] = y +for model_name in model_names: + model = make_estimator( + name=model_name, + categorical_columns=cat_columns, + lof_kw={"n_neighbors": int(n_samples * anomaly_frac)}, + iforest_kw={"random_state": 42}, + ) + y_pred[model_name]["KDDCup99 - SA"] = fit_predict(model, X) # %% -# Plot and interpret results -# -------------------------- +# Forest covertypes dataset +# ------------------------- # -# The algorithm performance relates to how good the true positive rate (TPR) -# is at low value of the false positive rate (FPR). The best algorithms -# have the curve on the top-left of the plot and the area under curve (AUC) -# close to 1. The diagonal dashed line represents a random classification -# of outliers and inliers. +# The :ref:`covtype_dataset` is a multiclass dataset where the target is the +# dominant species of tree in a given patch of forest. It contains 54 features, +# some of which ("Wilderness_Area" and "Soil_Type") are already binary encoded. +# Though originally meant as a classification task, one can regard inliers as +# samples encoded with label 2 and outliers as those with label 4. +# %% +from sklearn.datasets import fetch_covtype -import math +X, y = fetch_covtype(return_X_y=True, as_frame=True) +s = (y == 2) + (y == 4) +X = X.loc[s] +y = y.loc[s] +y = (y != 2).astype(np.int32) + +X, _, y, _ = train_test_split(X, y, train_size=0.05, stratify=y, random_state=42) +X_forestcover = X # save X for later use +n_samples, anomaly_frac = X.shape[0], y.mean() +print(f"{n_samples} datapoints with {y.sum()} anomalies ({anomaly_frac:.02%})") + +# %% +y_true["forestcover"] = y +for model_name in model_names: + model = make_estimator( + name=model_name, + lof_kw={"n_neighbors": int(n_samples * anomaly_frac)}, + iforest_kw={"random_state": 42}, + ) + y_pred[model_name]["forestcover"] = fit_predict(model, X) + +# %% +# Ames Housing dataset +# -------------------- +# +# The `Ames housing dataset `_ is originally a +# regression dataset where the target are sales prices of houses in Ames, Iowa. +# Here we convert it into an outlier detection problem by regarding houses with +# price over 70 USD/sqft. To make the problem easier, we drop intermediate +# prices between 40 and 70 USD/sqft. + +# %% import matplotlib.pyplot as plt -from sklearn.metrics import RocCurveDisplay +from sklearn.datasets import fetch_openml -datasets_name = [ - "http", - "smtp", - "SA", - "SF", - "forestcover", - "glass", - "wdbc", - "cardiotocography", -] +X, y = fetch_openml( + name="ames_housing", version=1, return_X_y=True, as_frame=True, parser="pandas" +) +y = y.div(X["Lot_Area"]) -models_name = [ - "LOF", - "IForest", -] +# None values in pandas 1.5.1 were mapped to np.nan in pandas 2.0.1 +X["Misc_Feature"] = X["Misc_Feature"].cat.add_categories("NoInfo").fillna("NoInfo") +X["Mas_Vnr_Type"] = X["Mas_Vnr_Type"].cat.add_categories("NoInfo").fillna("NoInfo") + +X.drop(columns="Lot_Area", inplace=True) +mask = (y < 40) | (y > 70) +X = X.loc[mask] +y = y.loc[mask] +y.hist(bins=20, edgecolor="black") +plt.xlabel("House price in USD/sqft") +_ = plt.title("Distribution of house prices in Ames") + +# %% +y = (y > 70).astype(np.int32) + +n_samples, anomaly_frac = X.shape[0], y.mean() +print(f"{n_samples} datapoints with {y.sum()} anomalies ({anomaly_frac:.02%})") + +# %% +# The dataset contains 46 categorical features. In this case it is easier use a +# :class:`~sklearn.compose.make_column_selector` to find them instead of passing +# a list made by hand. + +# %% +from sklearn.compose import make_column_selector as selector + +categorical_columns_selector = selector(dtype_include="category") +cat_columns = categorical_columns_selector(X) + +y_true["ames_housing"] = y +for model_name in model_names: + model = make_estimator( + name=model_name, + categorical_columns=cat_columns, + lof_kw={"n_neighbors": int(n_samples * anomaly_frac)}, + iforest_kw={"random_state": 42}, + ) + y_pred[model_name]["ames_housing"] = fit_predict(model, X) + +# %% +# Cardiotocography dataset +# ------------------------ +# +# The `Cardiotocography dataset `_ is a multiclass +# dataset of fetal cardiotocograms, the classes being the fetal heart rate (FHR) +# pattern encoded with labels from 1 to 10. Here we set class 3 (the minority +# class) to represent the outliers. It contains 30 numerical features, some of +# which are binary encoded and some are continuous. + +# %% +X, y = fetch_openml( + name="cardiotocography", version=1, return_X_y=True, as_frame=False, parser="pandas" +) +X_cardiotocography = X # save X for later use +s = y == "3" +y = s.astype(np.int32) + +n_samples, anomaly_frac = X.shape[0], y.mean() +print(f"{n_samples} datapoints with {y.sum()} anomalies ({anomaly_frac:.02%})") + +# %% +y_true["cardiotocography"] = y +for model_name in model_names: + model = make_estimator( + name=model_name, + lof_kw={"n_neighbors": int(n_samples * anomaly_frac)}, + iforest_kw={"random_state": 42}, + ) + y_pred[model_name]["cardiotocography"] = fit_predict(model, X) + +# %% +# Plot and interpret results +# ========================== +# +# The algorithm performance relates to how good the true positive rate (TPR) is +# at low value of the false positive rate (FPR). The best algorithms have the +# curve on the top-left of the plot and the area under curve (AUC) close to 1. +# The diagonal dashed line represents a random classification of outliers and +# inliers. + +# %% +import math + +from sklearn.metrics import RocCurveDisplay -# plotting parameters cols = 2 -linewidth = 1 pos_label = 0 # mean 0 belongs to positive class -rows = math.ceil(len(datasets_name) / cols) - -fig, axs = plt.subplots(rows, cols, figsize=(10, 10 * 2), sharex=True, sharey=True) +datasets_names = y_true.keys() +rows = math.ceil(len(datasets_names) / cols) -for ax, dataset_name in zip(axs.ravel(), datasets_name): - (X, y) = preprocess_dataset(dataset_name=dataset_name) +fig, axs = plt.subplots(nrows=rows, ncols=cols, squeeze=False, figsize=(10, rows * 4)) - for model_idx, model_name in enumerate(models_name): - y_pred = compute_prediction(X, model_name=model_name) +for ax, dataset_name in zip(axs.ravel(), datasets_names): + for model_idx, model_name in enumerate(model_names): display = RocCurveDisplay.from_predictions( - y, - y_pred, + y_true[dataset_name], + y_pred[model_name][dataset_name], pos_label=pos_label, name=model_name, - linewidth=linewidth, ax=ax, - plot_chance_level=(model_idx == len(models_name) - 1), - chance_level_kw={ - "linewidth": linewidth, - "linestyle": ":", - }, + plot_chance_level=(model_idx == len(model_names) - 1), + chance_level_kw={"linestyle": ":"}, ) ax.set_title(dataset_name) - ax.legend(loc="lower right") -plt.tight_layout(pad=2.0) # spacing between subplots +_ = plt.tight_layout(pad=2.0) # spacing between subplots + +# %% +# We observe that once the number of neighbors is tuned, LOF and IForest perform +# similarly in terms of ROC AUC for the forestcover and cardiotocography +# datasets. The score for IForest is slightly better for the SA dataset and LOF +# performs considerably better on the Ames housing dataset than IForest. +# +# Ablation study +# ============== +# +# In this section we explore the impact of the hyperparameter `n_neighbors` and +# the choice of scaling the numerical variables on the LOF model. Here we use +# the :ref:`covtype_dataset` dataset as the binary encoded categories introduce +# a natural scale of euclidean distances between 0 and 1. We then want a scaling +# method to avoid granting a privilege to non-binary features and that is robust +# enough to outliers so that the task of finding them does not become too +# difficult. + +# %% +X = X_forestcover +y = y_true["forestcover"] + +n_samples = X.shape[0] +n_neighbors_list = (n_samples * np.array([0.2, 0.02, 0.01, 0.001])).astype(np.int32) +model = make_pipeline(RobustScaler(), LocalOutlierFactor()) + +linestyles = ["solid", "dashed", "dashdot", ":", (5, (10, 3))] + +fig, ax = plt.subplots() +for model_idx, (linestyle, n_neighbors) in enumerate(zip(linestyles, n_neighbors_list)): + model.set_params(localoutlierfactor__n_neighbors=n_neighbors) + model.fit(X) + y_pred = model[-1].negative_outlier_factor_ + display = RocCurveDisplay.from_predictions( + y, + y_pred, + pos_label=pos_label, + name=f"n_neighbors = {n_neighbors}", + ax=ax, + plot_chance_level=(model_idx == len(n_neighbors_list) - 1), + chance_level_kw={"linestyle": (0, (1, 10))}, + linestyle=linestyle, + linewidth=2, + ) +_ = ax.set_title("RobustScaler with varying n_neighbors\non forestcover dataset") + +# %% +# We observe that the number of neighbors has a big impact on the performance of +# the model. If one has access to (at least some) ground truth labels, it is +# then important to tune `n_neighbors` accordingly. A convenient way to do so is +# to explore values for `n_neighbors` of the order of magnitud of the expected +# contamination. + +# %% +from sklearn.preprocessing import MinMaxScaler, SplineTransformer, StandardScaler + +preprocessor_list = [ + None, + RobustScaler(), + StandardScaler(), + MinMaxScaler(), + SplineTransformer(), +] +expected_anomaly_fraction = 0.02 +lof = LocalOutlierFactor(n_neighbors=int(n_samples * expected_anomaly_fraction)) + +fig, ax = plt.subplots() +for model_idx, (linestyle, preprocessor) in enumerate( + zip(linestyles, preprocessor_list) +): + model = make_pipeline(preprocessor, lof) + model.fit(X) + y_pred = model[-1].negative_outlier_factor_ + display = RocCurveDisplay.from_predictions( + y, + y_pred, + pos_label=pos_label, + name=str(preprocessor).split("(")[0], + ax=ax, + plot_chance_level=(model_idx == len(preprocessor_list) - 1), + chance_level_kw={"linestyle": (0, (1, 10))}, + linestyle=linestyle, + linewidth=2, + ) +_ = ax.set_title("Fixed n_neighbors with varying preprocessing\non forestcover dataset") + +# %% +# On the one hand, :class:`~sklearn.preprocessing.RobustScaler` scales each +# feature independently by using the interquartile range (IQR) by default, which +# is the range between the 25th and 75th percentiles of the data. It centers the +# data by subtracting the median and then scale it by dividing by the IQR. The +# IQR is robust to outliers: the median and interquartile range are less +# affected by extreme values than the range, the mean and the standard +# deviation. Furthermore, :class:`~sklearn.preprocessing.RobustScaler` does not +# squash marginal outlier values, contrary to +# :class:`~sklearn.preprocessing.StandardScaler`. +# +# On the other hand, :class:`~sklearn.preprocessing.MinMaxScaler` scales each +# feature individually such that its range maps into the range between zero and +# one. If there are outliers in the data, they can skew it towards either the +# minimum or maximum values, leading to a completely different distribution of +# data with large marginal outliers: all non-outlier values can be collapsed +# almost together as a result. +# +# We also evaluated no preprocessing at all (by passing `None` to the pipeline), +# :class:`~sklearn.preprocessing.StandardScaler` and +# :class:`~sklearn.preprocessing.SplineTransformer`. Please refer to their +# respective documentation for more details. +# +# Note that the optimal preprocessing depends on the dataset, as shown below: + +# %% +X = X_cardiotocography +y = y_true["cardiotocography"] + +n_samples, expected_anomaly_fraction = X.shape[0], 0.025 +lof = LocalOutlierFactor(n_neighbors=int(n_samples * expected_anomaly_fraction)) + +fig, ax = plt.subplots() +for model_idx, (linestyle, preprocessor) in enumerate( + zip(linestyles, preprocessor_list) +): + model = make_pipeline(preprocessor, lof) + model.fit(X) + y_pred = model[-1].negative_outlier_factor_ + display = RocCurveDisplay.from_predictions( + y, + y_pred, + pos_label=pos_label, + name=str(preprocessor).split("(")[0], + ax=ax, + plot_chance_level=(model_idx == len(preprocessor_list) - 1), + chance_level_kw={"linestyle": (0, (1, 10))}, + linestyle=linestyle, + linewidth=2, + ) +ax.set_title( + "Fixed n_neighbors with varying preprocessing\non cardiotocography dataset" +) plt.show() diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index 6e0ae0ae1c109..5b78fe0636d8c 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -52,6 +52,8 @@ scaled_X_train = scaler.fit_transform(X_train) # %% +# .. _neighbors_scaling: +# # Effect of rescaling on a k-neighbors models # =========================================== #