8000 [MRG+1] Select k-best features in SelectFromModel by nsheth12 · Pull Request #9616 · scikit-learn/scikit-learn · GitHub

Merged · 20 commits · Jul 16, 2018
7 changes: 7 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -336,6 +336,13 @@ Linear, kernelized and related models
underlying implementation is not random.
:issue:`9497` by :user:`Albert Thomas <albertcthomas>`.

Preprocessing and feature selection

- Added the ability to select the k best features to
  :class:`feature_selection.SelectFromModel` via the new ``max_features``
  parameter.
:issue:`6689` by :user:`Nihar Sheth <nsheth12>` and
:user:`Quazi Rahman <qmaruf>`.

Decomposition, manifold learning and clustering

- Deprecate ``precomputed`` parameter in function
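As a quick illustration of the behavior this changelog entry describes, here is a minimal usage sketch of the new parameter; the dataset and estimator are arbitrary choices for illustration, not part of the PR:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# threshold=-np.inf disables the importance cutoff, so selection is
# driven purely by max_features: keep the 3 highest-scoring features.
selector = SelectFromModel(RandomForestClassifier(n_estimators=50,
                                                  random_state=0),
                           max_features=3, threshold=-np.inf)
X_reduced = selector.fit_transform(X, y)
print(X_reduced.shape)  # (200, 3)
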
38 changes: 33 additions & 5 deletions sklearn/feature_selection/from_model.py
@@ -2,6 +2,7 @@
# License: BSD 3 clause

import numpy as np
import numbers

from .base import SelectorMixin
from ..base import BaseEstimator, clone, MetaEstimatorMixin
@@ -113,6 +114,13 @@ class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin):
``threshold`` in the case where the ``coef_`` attribute of the
estimator is of dimension 2.

max_features : int or None, optional
    The maximum number of features to select, among those scoring
    above ``threshold``. To disable ``threshold`` and select based
    only on ``max_features``, set ``threshold=-np.inf``.

.. versionadded:: 0.20

Attributes
----------
estimator_ : an estimator
@@ -123,11 +131,13 @@ class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin):
threshold_ : float
The threshold value used for feature selection.
"""
-def __init__(self, estimator, threshold=None, prefit=False, norm_order=1):
+def __init__(self, estimator, threshold=None, prefit=False,
+             norm_order=1, max_features=None):
self.estimator = estimator
self.threshold = threshold
self.prefit = prefit
self.norm_order = norm_order
self.max_features = max_features

def _get_support_mask(self):
# SelectFromModel can directly call on transform.
@@ -136,12 +146,20 @@ def _get_support_mask(self):
elif hasattr(self, 'estimator_'):
estimator = self.estimator_
else:
-raise ValueError(
-    'Either fit SelectFromModel before transform or set "prefit='
-    'True" and pass a fitted estimator to the constructor.')
+raise ValueError('Either fit the model before transform or set'
+                 ' "prefit=True" while passing the fitted'
+                 ' estimator to the constructor.')

Member:
It probably wasn't your doing, but generally you should avoid touching code not related to the change.
scores = _get_feature_importances(estimator, self.norm_order)
threshold = _calculate_threshold(estimator, scores, self.threshold)
-return scores >= threshold
+if self.max_features is not None:
+    mask = np.zeros_like(scores, dtype=bool)
+    candidate_indices = \
+        np.argsort(-scores, kind='mergesort')[:self.max_features]
+    mask[candidate_indices] = True
+else:
+    mask = np.ones_like(scores, dtype=bool)
+mask[scores < threshold] = False
+return mask
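
To make the new mask logic concrete, the following standalone sketch replays the same computation on a made-up score vector (scores and cutoffs invented for illustration):

import numpy as np

scores = np.array([0.30, 0.05, 0.25, 0.25, 0.15])
threshold, max_features = 0.10, 3

mask = np.zeros_like(scores, dtype=bool)
# Stable mergesort keeps the earlier feature when two scores tie,
# so index 2 wins over index 3 here.
candidate_indices = np.argsort(-scores, kind='mergesort')[:max_features]
mask[candidate_indices] = True        # top 3 by score: indices 0, 2, 3
mask[scores < threshold] = False      # then apply the threshold cutoff
print(mask)                           # [ True False  True  True False]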

def fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer.
@@ -161,6 +179,16 @@ def fit(self, X, y=None, **fit_params):
-------
self : object
"""
if self.max_features is not None:
    if not isinstance(self.max_features, numbers.Integral):
        raise TypeError("'max_features' should be an integer between"
                        " 0 and {} features. Got {!r} instead."
                        .format(X.shape[1], self.max_features))
    elif self.max_features < 0 or self.max_features > X.shape[1]:
        raise ValueError("'max_features' should be between 0 and {}"
                         " features. Got {} instead."
                         .format(X.shape[1], self.max_features))

if self.prefit:
raise NotFittedError(
"Since 'prefit=True', call transform directly")
121 changes: 120 additions & 1 deletion sklearn/feature_selection/tests/test_from_model.py
@@ -1,3 +1,4 @@
import pytest
import numpy as np

from sklearn.utils.testing import assert_true
@@ -8,6 +9,7 @@
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import skip_if_32bit

@@ -17,6 +19,7 @@
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.base import BaseEstimator

iris = datasets.load_iris()
data, y = iris.data, iris.target
@@ -40,6 +43,121 @@ def test_input_estimator_unchanged():
assert_true(transformer.estimator is est)


@pytest.mark.parametrize(
"max_features, err_type, err_msg",
[(-1, ValueError, "'max_features' should be between 0 and"),
(data.shape[1] + 1, ValueError, "'max_features' should be between 0 and"),
('gobbledigook', TypeError, "should be an integer"),
('all', TypeError, "should be an integer")]
)
def test_max_features_error(max_features, err_type, err_msg):
clf = RandomForestClassifier(n_estimators=50, random_state=0)

transformer = SelectFromModel(estimator=clf,
max_features=max_features,
threshold=-np.inf)
with pytest.raises(err_type, match=err_msg):
transformer.fit(data, y)


@pytest.mark.parametrize("max_features", [0, 2, data.shape[1]])
def test_max_features_dim(max_features):
clf = RandomForestClassifier(n_estimators=50, random_state=0)
transformer = SelectFromModel(estimator=clf,
max_features=max_features,
threshold=-np.inf)
X_trans = transformer.fit_transform(data, y)
assert X_trans.shape[1] == max_features


class FixedImportanceEstimator(BaseEstimator):
    # Stub estimator that reports a preset feature_importances_ array,
    # used below to exercise deterministic tie-breaking.
    def __init__(self, importances):
        self.importances = importances

    def fit(self, X, y=None):
        self.feature_importances_ = np.array(self.importances)
        return self


def test_max_features():
# Test max_features parameter using various values
X, y = datasets.make_classification(
n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
n_repeated=0, shuffle=False, random_state=0)
max_features = X.shape[1]
est = RandomForestClassifier(n_estimators=50, random_state=0)

transformer1 = SelectFromModel(estimator=est,
threshold=-np.inf)
transformer2 = SelectFromModel(estimator=est,
max_features=max_features,
threshold=-np.inf)
X_new1 = transformer1.fit_transform(X, y)
X_new2 = transformer2.fit_transform(X, y)
assert_allclose(X_new1, X_new2)

# Test max_features against actual model.
transformer1 = SelectFromModel(estimator=Lasso(alpha=0.025,
random_state=42))
X_new1 = transformer1.fit_transform(X, y)
scores1 = np.abs(transformer1.estimator_.coef_)
candidate_indices1 = np.argsort(-scores1, kind='mergesort')

for n_features in range(1, X_new1.shape[1] + 1):
transformer2 = SelectFromModel(estimator=Lasso(alpha=0.025,
random_state=42),
max_features=n_features,
threshold=-np.inf)
X_new2 = transformer2.fit_transform(X, y)
scores2 = np.abs(transformer2.estimator_.coef_)
candidate_indices2 = np.argsort(-scores2, kind='mergesort')
assert_allclose(X[:, candidate_indices1[:n_features]],
X[:, candidate_indices2[:n_features]])
assert_allclose(transformer1.estimator_.coef_,
transformer2.estimator_.coef_)


def test_max_features_tiebreak():
# Test if max_features can break tie among feature importance
X, y = datasets.make_classification(
n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
n_repeated=0, shuffle=False, random_state=0)
max_features = X.shape[1]

feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
for n_features in range(1, max_features + 1):
transformer = SelectFromModel(
FixedImportanceEstimator(feature_importances),
max_features=n_features,
threshold=-np.inf)
X_new = transformer.fit_transform(X, y)
selected_feature_indices = np.where(transformer._get_support_mask())[0]
assert_array_equal(selected_feature_indices, np.arange(n_features))
assert X_new.shape[1] == n_features

Member:
I'm okay with this approach, but wonder if we'd be better off taking max_features literally and returning none of the tying features at the cutoff (to avoid users being surprised by the tie-breaking; although we do break ties like this in SelectKBest and SelectPercentile, and perhaps we should remain consistent). WDYT? Perhaps it rarely matters.

Contributor Author:
I personally think it is better to give users exactly the number of features they ask for. From my experience as a user, I don't care so much which of feature X or feature Y I get when both are tied in importance as much as I do that when I ask for Z features, I get Z features and not fewer. Consistency with SelectKBest and SelectPercentile would be another argument in favor of keeping it as is. However, this is just my 2 cents, and I'll defer to you on the final decision.
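Condensing the loop above to one case, a sketch of the behavior under discussion (reusing X, y and FixedImportanceEstimator from this test file):

importances = [4, 4, 4, 4, 3, 3, 3, 2, 2, 1]
sfm = SelectFromModel(FixedImportanceEstimator(importances),
                      max_features=2, threshold=-np.inf)
X_sel = sfm.fit_transform(X, y)
print(X_sel.shape[1])                  # 2: exactly max_features, despite the tie
print(sfm.get_support(indices=True))   # [0 1]: ties resolve to lower indices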


def test_threshold_and_max_features():
X, y = datasets.make_classification(
n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
n_repeated=0, shuffle=False, random_state=0)
est = RandomForestClassifier(n_estimators=50, random_state=0)

transformer1 = SelectFromModel(estimator=est, max_features=3,
threshold=-np.inf)
X_new1 = transformer1.fit_transform(X, y)

transformer2 = SelectFromModel(estimator=est, threshold=0.04)
X_new2 = transformer2.fit_transform(X, y)

transformer3 = SelectFromModel(estimator=est, max_features=3,
threshold=0.04)
X_new3 = transformer3.fit_transform(X, y)
assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
selected_indices = transformer3.transform(
np.arange(X.shape[1])[np.newaxis, :])
assert_allclose(X_new3, X[:, selected_indices[0]])
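
Since both criteria keep the highest-scoring features, one mask is always nested inside the other, so their intersection has exactly min(...) features, which is what the assertion above checks. A standalone sketch of that invariant with made-up scores:

import numpy as np

scores = np.array([0.50, 0.20, 0.12, 0.08, 0.04,
                   0.03, 0.02, 0.01, 0.005, 0.001])
max_features, threshold = 3, 0.04

top_k = np.zeros_like(scores, dtype=bool)
top_k[np.argsort(-scores, kind='mergesort')[:max_features]] = True
above = scores >= threshold           # 5 features clear the threshold
combined = top_k & above              # the two criteria intersected
print(combined.sum())                 # 3 == min(top_k.sum(), above.sum())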


@skip_if_32bit
def test_feature_importances():
X, y = datasets.make_classification(
n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
@@ -87,7 +205,8 @@ def test_coef_default_threshold():
n_repeated=0, shuffle=False, random_state=0)

# For the Lasso and related models, the threshold defaults to 1e-5
-transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
+transformer = SelectFromModel(estimator=Lasso(alpha=0.1,
+                                              random_state=42))
transformer.fit(X, y)
X_new = transformer.transform(X)
mask = np.abs(transformer.estimator_.coef_) > 1e-5
Expand Down
0