[MRG+1] Select k-best features in SelectFromModel #9616
Changes from all commits
@@ -1,3 +1,4 @@
+import pytest
 import numpy as np
 
 from sklearn.utils.testing import assert_true
@@ -8,6 +9,7 @@
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_almost_equal
+from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import skip_if_32bit
 
@@ -17,6 +19,7 @@
 from sklearn.feature_selection import SelectFromModel
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import PassiveAggressiveClassifier
+from sklearn.base import BaseEstimator
 
 iris = datasets.load_iris()
 data, y = iris.data, iris.target
@@ -40,6 +43,121 @@ def test_input_estimator_unchanged():
     assert_true(transformer.estimator is est)
 
 
+@pytest.mark.parametrize(
+    "max_features, err_type, err_msg",
+    [(-1, ValueError, "'max_features' should be 0 and"),
+     (data.shape[1] + 1, ValueError, "'max_features' should be 0 and"),
+     ('gobbledigook', TypeError, "should be an integer"),
+     ('all', TypeError, "should be an integer")]
+)
+def test_max_features_error(max_features, err_type, err_msg):
+    clf = RandomForestClassifier(n_estimators=50, random_state=0)
+
+    transformer = SelectFromModel(estimator=clf,
+                                  max_features=max_features,
+                                  threshold=-np.inf)
+    with pytest.raises(err_type, match=err_msg):
+        transformer.fit(data, y)
+
+
+@pytest.mark.parametrize("max_features", [0, 2, data.shape[1]])
+def test_max_features_dim(max_features):
+    clf = RandomForestClassifier(n_estimators=50, random_state=0)
+    transformer = SelectFromModel(estimator=clf,
+                                  max_features=max_features,
+                                  threshold=-np.inf)
+    X_trans = transformer.fit_transform(data, y)
+    assert X_trans.shape[1] == max_features
+
+
+class FixedImportanceEstimator(BaseEstimator):
+    def __init__(self, importances):
+        self.importances = importances
+
+    def fit(self, X, y=None):
+        self.feature_importances_ = np.array(self.importances)
+
+
+def test_max_features():
+    # Test max_features parameter using various values
+    X, y = datasets.make_classification(
+        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
+        n_repeated=0, shuffle=False, random_state=0)
+    max_features = X.shape[1]
+    est = RandomForestClassifier(n_estimators=50, random_state=0)
+
+    transformer1 = SelectFromModel(estimator=est,
+                                   threshold=-np.inf)
+    transformer2 = SelectFromModel(estimator=est,
+                                   max_features=max_features,
+                                   threshold=-np.inf)
+    X_new1 = transformer1.fit_transform(X, y)
+    X_new2 = transformer2.fit_transform(X, y)
+    assert_allclose(X_new1, X_new2)
+
+    # Test max_features against actual model.
+    transformer1 = SelectFromModel(estimator=Lasso(alpha=0.025,
+                                                   random_state=42))
+    X_new1 = transformer1.fit_transform(X, y)
+    scores1 = np.abs(transformer1.estimator_.coef_)
+    candidate_indices1 = np.argsort(-scores1, kind='mergesort')
+
+    for n_features in range(1, X_new1.shape[1] + 1):
+        transformer2 = SelectFromModel(estimator=Lasso(alpha=0.025,
+                                                       random_state=42),
+                                       max_features=n_features,
+                                       threshold=-np.inf)
+        X_new2 = transformer2.fit_transform(X, y)
+        scores2 = np.abs(transformer2.estimator_.coef_)
+        candidate_indices2 = np.argsort(-scores2, kind='mergesort')
+        assert_allclose(X[:, candidate_indices1[:n_features]],
+                        X[:, candidate_indices2[:n_features]])
+    assert_allclose(transformer1.estimator_.coef_,
+                    transformer2.estimator_.coef_)
+
+
+def test_max_features_tiebreak():
+    # Test if max_features can break tie among feature importance
+    X, y = datasets.make_classification(
+        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
+        n_repeated=0, shuffle=False, random_state=0)
+    max_features = X.shape[1]
+
+    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
+    for n_features in range(1, max_features + 1):
+        transformer = SelectFromModel(
+            FixedImportanceEstimator(feature_importances),
+            max_features=n_features,
+            threshold=-np.inf)
+        X_new = transformer.fit_transform(X, y)
+        selected_feature_indices = np.where(transformer._get_support_mask())[0]
+        assert_array_equal(selected_feature_indices, np.arange(n_features))
+        assert X_new.shape[1] == n_features

Review comment: I'm okay with this approach, but wonder if we'd be better off taking …

Review comment: I personally think it is better to give users exactly the number of features they ask for. From my experience as a user, I don't care so much which of feature X or feature Y I get when both are tied in importance as much as I do that when I ask for Z features, I get Z features and not less. Consistency with …
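To make the behavior under discussion concrete, here is a minimal sketch (not part of the PR's diff) of how max_features resolves ties: because the ranking uses a stable mergesort-based argsort, the earliest tied features win, and the caller always gets exactly the number of features requested. The toy FixedImportanceEstimator mirrors the helper defined in the tests above; the importance values are illustrative:

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectFromModel


class FixedImportanceEstimator(BaseEstimator):
    # Toy estimator reporting a fixed importance vector, as in the tests.
    def __init__(self, importances):
        self.importances = importances

    def fit(self, X, y=None):
        self.feature_importances_ = np.array(self.importances)
        return self


X = np.random.RandomState(0).rand(5, 10)
# Features 0-3 tie at the highest importance, but asking for 2 features
# still yields exactly 2: the stable sort keeps the earliest tied columns.
selector = SelectFromModel(
    FixedImportanceEstimator([4, 4, 4, 4, 3, 3, 3, 2, 2, 1]),
    max_features=2, threshold=-np.inf)
X_new = selector.fit_transform(X)
print(X_new.shape)  # (5, 2): two columns despite the four-way tie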
+
+
+def test_threshold_and_max_features():
+    X, y = datasets.make_classification(
+        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
+        n_repeated=0, shuffle=False, random_state=0)
+    est = RandomForestClassifier(n_estimators=50, random_state=0)
+
+    transformer1 = SelectFromModel(estimator=est, max_features=3,
+                                   threshold=-np.inf)
+    X_new1 = transformer1.fit_transform(X, y)
+
+    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
+    X_new2 = transformer2.fit_transform(X, y)
+
+    transformer3 = SelectFromModel(estimator=est, max_features=3,
+                                   threshold=0.04)
+    X_new3 = transformer3.fit_transform(X, y)
+    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
+    selected_indices = transformer3.transform(
+        np.arange(X.shape[1])[np.newaxis, :])
+    assert_allclose(X_new3, X[:, selected_indices[0]])
+
+
 @skip_if_32bit
 def test_feature_importances():
     X, y = datasets.make_classification(
         n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
@@ -87,7 +205,8 @@ def test_coef_default_threshold():
     n_repeated=0, shuffle=False, random_state=0)
 
     # For the Lasso and related models, the threshold defaults to 1e-5
-    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
+    transformer = SelectFromModel(estimator=Lasso(alpha=0.1,
+                                                  random_state=42))
     transformer.fit(X, y)
     X_new = transformer.transform(X)
     mask = np.abs(transformer.estimator_.coef_) > 1e-5
Review comment: It probably wasn't your doing, but generally you should avoid touching code not related to the change.
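Stepping back from the review thread: the semantics this PR settles on, as exercised by test_threshold_and_max_features above, is that a feature is kept only if it passes both the threshold cut and the max_features cut. A minimal usage sketch under that assumption (the estimator and parameter values are illustrative, borrowed from the tests):

import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
    n_repeated=0, shuffle=False, random_state=0)
est = RandomForestClassifier(n_estimators=50, random_state=0)

# Keep at most 3 features, and only those whose importance clears 0.04.
selector = SelectFromModel(estimator=est, max_features=3, threshold=0.04)
X_new = selector.fit_transform(X, y)

assert X_new.shape[1] <= 3
kept = selector.get_support()
assert np.all(selector.estimator_.feature_importances_[kept] >= 0.04)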