8000 DOC example on feature selection using negative `tol` values (#26205) · scikit-learn/scikit-learn@6ef2b26 · GitHub

Commit 6ef2b26

rprkh and ArturoAmorQ authored and committed
DOC example on feature selection using negative tol values (#26205)
Co-authored-by: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
1 parent 000c78c commit 6ef2b26


examples/feature_selection/plot_select_from_model_diabetes.py

Lines changed: 50 additions & 3 deletions
@@ -122,9 +122,6 @@
 print(f"Done in {toc_bwd - tic_bwd:.3f}s")
 
 # %%
-# Discussion
-# ----------
-#
 # Interestingly, forward and backward selection have selected the same set of
 # features. In general, this isn't the case and the two methods would lead to
 # different results.
@@ -145,3 +142,53 @@
 # attribute. The forward SFS is faster than the backward SFS because it only
 # needs to perform `n_features_to_select = 2` iterations, while the backward
 # SFS needs to perform `n_features - n_features_to_select = 8` iterations.
+#
+# Using negative tolerance values
+# -------------------------------
+#
+# :class:`~sklearn.feature_selection.SequentialFeatureSelector` can be used
+# to remove features from the dataset and return a smaller subset of the
+# original features by setting `direction="backward"` together with a
+# negative value of `tol`.
+#
+# We begin by loading the Breast Cancer dataset, consisting of 30 different
+# features and 569 samples.
156+
from sklearn.datasets import load_breast_cancer
157+
import numpy as np
158+
159+
breast_cancer_data = load_breast_cancer()
160+
X, y = breast_cancer_data.data, breast_cancer_data.target
161+
feature_names = np.array(breast_cancer_data.feature_names)
162+
print(breast_cancer_data.DESCR)
163+
164+
# %%
165+
# We will make use of the :class:`~sklearn.linear_model.LogisticRegression`
166+
# estimator with :class:`~sklearn.feature_selection.SequentialFeatureSelector`
167+
# to perform the feature selection.
168+
from sklearn.linear_model import LogisticRegression
169+
from sklearn.pipeline import make_pipeline
170+
from sklearn.preprocessing import StandardScaler
171+
from sklearn.metrics import roc_auc_score
172+
173+
for tol in [-1e-2, -1e-3, -1e-4]:
174+
start = time()
175+
feature_selector = SequentialFeatureSelector(
176+
LogisticRegression(),
177+
n_features_to_select="auto",
178+
direction="backward",
179+
scoring="roc_auc",
180+
tol=tol,
181+
n_jobs=2,
182+
)
183+
model = make_pipeline(StandardScaler(), feature_selector, LogisticRegression())
184+
model.fit(X, y)
185+
end = time()
186+
print(f"\ntol: {tol}")
187+
print(f"Features selected: {feature_names[model[1].get_support()]}")
188+
print(f"ROC AUC score: {roc_auc_score(y, model.predict_proba(X)[:, 1]):.3f}")
189+
print(f"Done in {end - start:.3f}s")
190+
191+
+# %%
+# We can see that the number of selected features tends to increase as
+# negative values of `tol` approach zero. The time taken for feature
+# selection also decreases as the values of `tol` come closer to zero.
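
The closing observation above follows from how the `tol` threshold interacts with `n_features_to_select="auto"`. Below is a minimal hand-rolled sketch of that stopping rule as we read it (an illustration only, not code from this commit nor scikit-learn's actual internals): backward selection keeps dropping the least harmful feature while the cross-validated score does not fall by more than `|tol|`, so a `tol` closer to zero stops earlier, keeping more features and spending less time.

from sklearn.model_selection import cross_val_score


def backward_sfs_sketch(estimator, X, y, tol, scoring="roc_auc"):
    """Hand-rolled backward elimination with an SFS-style `tol` stop (sketch)."""
    remaining = list(range(X.shape[1]))
    old_score = cross_val_score(estimator, X[:, remaining], y, scoring=scoring).mean()
    while len(remaining) > 1:
        # Score every candidate subset obtained by dropping one feature.
        candidate_scores = {
            f: cross_val_score(
                estimator, X[:, [r for r in remaining if r != f]], y, scoring=scoring
            ).mean()
            for f in remaining
        }
        best_feature, new_score = max(candidate_scores.items(), key=lambda kv: kv[1])
        # Stop when even the best removal fails to improve the score by at
        # least `tol`; a negative `tol` tolerates small drops, so a more
        # negative value prunes further before this test triggers.
        if new_score - old_score < tol:
            break
        remaining.remove(best_feature)
        old_score = new_score
    return remaining

Calling, say, `backward_sfs_sketch(LogisticRegression(max_iter=1000), X, y, tol=-1e-2)` on the breast cancer data above should mirror the trend the loop in the diff reports: the more negative `tol` is, the fewer features survive.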
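
Earlier, the context lines of the second hunk argue that forward SFS is faster because it runs `n_features_to_select = 2` iterations while backward SFS runs `n_features - n_features_to_select = 8`. Here is a short sketch to check that timing claim, assuming the diabetes data and `RidgeCV` estimator that this example file uses outside the hunks shown here:

from time import time

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeCV

X, y = load_diabetes(return_X_y=True)  # 10 features
ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5))

for direction in ("forward", "backward"):
    tic = time()
    sfs = SequentialFeatureSelector(
        ridge, n_features_to_select=2, direction=direction
    ).fit(X, y)
    toc = time()
    # Forward runs 2 selection rounds; backward runs 10 - 2 = 8, so it
    # refits many more candidate models and takes longer.
    print(f"{direction}: {sfs.get_support().sum()} features in {toc - tic:.3f}s")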
