|
10 | 10 | from sklearn.utils.testing import assert_almost_equal
|
11 | 11 | from sklearn.utils.testing import assert_warns
|
12 | 12 | from sklearn.utils.testing import skip_if_32bit
|
| 13 | +from sklearn.utils.testing import assert_equal |
13 | 14 |
|
14 | 15 | from sklearn import datasets
|
15 | 16 | from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso
|
16 | 17 | from sklearn.svm import LinearSVC
|
17 | 18 | from sklearn.feature_selection import SelectFromModel
|
18 | 19 | from sklearn.ensemble import RandomForestClassifier
|
19 | 20 | from sklearn.linear_model import PassiveAggressiveClassifier
|
| 21 | +from sklearn.base import BaseEstimator |
20 | 22 |
|
21 | 23 | iris = datasets.load_iris()
|
22 | 24 | data, y = iris.data, iris.target
|
@@ -63,6 +65,112 @@ def test_input_estimator_unchanged():
|
63 | 65 | assert_true(transformer.estimator is est)
|
64 | 66 |
|
65 | 67 |
|
def check_invalid_max_features(est, X, y):
    """Check that fitting fails for unusable ``max_features`` values.

    The values tried are a negative count, a count one larger than the
    number of columns in ``X``, and an unrecognised string. Each one is
    expected to make ``SelectFromModel.fit`` raise ``ValueError``.
    """
    n_columns = X.shape[1]
    bad_values = (-1, n_columns + 1, 'gobbledigook')
    for bad_value in bad_values:
        selector = SelectFromModel(estimator=est, max_features=bad_value)
        assert_raises(ValueError, selector.fit, X, y)
| 74 | + |
| 75 | + |
def check_valid_max_features(est, X, y):
    """Check that accepted ``max_features`` values give that many columns.

    The values tried are ``0``, the number of columns in ``X``, the string
    ``'all'`` and a small fixed count. For each one the transformed output
    must have exactly the requested number of columns, where ``'all'``
    stands for every column of ``X``.
    """
    n_features = X.shape[1]
    # A request for 5 features is only legal when X has at least five
    # columns (check_invalid_max_features expects anything above
    # X.shape[1] to raise), so clamp the fixed count to the data width.
    candidates = [0, n_features, 'all', min(5, n_features)]
    for candidate in candidates:
        transformer = SelectFromModel(estimator=est, max_features=candidate)
        X_new = transformer.fit_transform(X, y)
        # Resolve 'all' without rebinding the loop variable.
        expected_width = n_features if candidate == 'all' else candidate
        assert_equal(X_new.shape[1], expected_width)
| 85 | + |
| 86 | + |
class FixedImportanceEstimator(BaseEstimator):
    """Stub estimator whose feature importances are given by the caller.

    Parameters
    ----------
    importances : array-like
        Values exposed as ``feature_importances_`` once ``fit`` has run.
    """

    def __init__(self, importances):
        self.importances = importances

    def fit(self, X, y=None):
        """Expose the preset importances as ``feature_importances_``.

        ``X`` and ``y`` are accepted but never read.

        Returns
        -------
        self : FixedImportanceEstimator
            The estimator itself, so that calls can be chained.
        """
        self.feature_importances_ = np.array(self.importances)
        # Estimators are expected to return themselves from fit; the
        # original implicitly returned None.
        return self
| 93 | + |
| 94 | + |
def check_max_features(est, X, y):
    """Exercise the ``max_features`` option of ``SelectFromModel``.

    Runs the valid/invalid value helpers, then checks that
    ``max_features='all'`` matches an explicit full column count, that
    limiting a Lasso-based selector keeps the leading columns of the
    unlimited selection, and that ties between equal importances are
    resolved in favour of the lower column indices.
    """
    X = X.copy()
    n_columns = X.shape[1]

    check_valid_max_features(est, X, y)
    check_invalid_max_features(est, X, y)

    # 'all' and an explicit full count must select the same columns.
    select_all = SelectFromModel(estimator=est, max_features='all')
    select_count = SelectFromModel(estimator=est, max_features=n_columns)
    X_all = select_all.fit_transform(X, y)
    X_count = select_count.fit_transform(X, y)
    assert_array_equal(X_all, X_count)

    # Test max_features against actual model.
    unlimited = SelectFromModel(estimator=Lasso(alpha=0.025))
    X_unlimited = unlimited.fit_transform(X, y)
    n_kept = X_unlimited.shape[1]
    for limit in range(1, n_kept + 1):
        limited = SelectFromModel(estimator=Lasso(alpha=0.025),
                                  max_features=limit)
        X_limited = limited.fit_transform(X, y)
        assert_array_equal(X_unlimited[:, :limit], X_limited)
        assert_array_equal(unlimited.estimator_.coef_,
                           limited.estimator_.coef_)

    # Test if max_features can break tie among feature importance
    tied_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for limit in range(1, n_columns + 1):
        selector = SelectFromModel(
            FixedImportanceEstimator(tied_importances),
            max_features=limit)
        X_selected = selector.fit_transform(X, y)
        chosen = np.flatnonzero(selector._get_support_mask())
        assert_array_equal(chosen, np.arange(limit))
        assert_equal(X_selected.shape[1], limit)
| 132 | + |
| 133 | + |
def check_threshold_and_max_features(est, X, y):
    """Check how ``threshold`` and ``max_features`` combine.

    With ``est``, a selector given both options must keep as many columns
    as the smaller of the two single-option selections, and those columns
    must match the ones its own ``transform`` reports. The remaining
    checks use ``Lasso(alpha=0.1)`` to cover the cases where neither,
    one, or both options are supplied.
    """
    transformer1 = SelectFromModel(estimator=est, max_features=3)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert_equal(X_new3.shape[1], min(X_new1.shape[1], X_new2.shape[1]))

    # Send one 2-D row holding the column indices through transform; the
    # surviving entries are the indices of the selected columns. The row
    # is made 2-D *before* the call because transform expects 2-D input.
    column_indices = np.arange(X.shape[1])[np.newaxis, :]
    selected_indices = transformer3.transform(column_indices)
    assert_array_equal(X_new3, X[:, selected_indices[0]])

    # If threshold and max_features are not provided, all features are
    # returned, use threshold=None if it is not required.
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    X_new = transformer.fit_transform(X, y)
    assert_array_equal(X, X_new)

    transformer = SelectFromModel(estimator=Lasso(alpha=0.1), max_features=3)
    X_new = transformer.fit_transform(X, y)
    assert_equal(X_new.shape[1], 3)

    # Threshold will be applied if it is not None
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1), threshold=1e-5)
    X_new = transformer.fit_transform(X, y)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_equal(X_new, X[:, mask])

    transformer = SelectFromModel(estimator=Lasso(alpha=0.1), threshold=1e-5,
                                  max_features=4)
    X_new = transformer.fit_transform(X, y)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_equal(X_new, X[:, mask])
| 172 | + |
| 173 | + |
66 | 174 | @skip_if_32bit
|
67 | 175 | def test_feature_importances():
|
68 | 176 | X, y = datasets.make_classification(
|
@@ -95,12 +203,16 @@ def test_feature_importances():
|
95 | 203 | assert_almost_equal(importances, importances_bis)
|
96 | 204 |
|
97 | 205 | # For the Lasso and related models, the threshold defaults to 1e-5
|
98 |
| - transformer = SelectFromModel(estimator=Lasso(alpha=0.1)) |
| 206 | + transformer = SelectFromModel(estimator=Lasso(alpha=0.1), threshold=1e-5) |
99 | 207 | transformer.fit(X, y)
|
100 | 208 | X_new = transformer.transform(X)
|
101 | 209 | mask = np.abs(transformer.estimator_.coef_) > 1e-5
|
102 | 210 | assert_array_equal(X_new, X[:, mask])
|
103 | 211 |
|
| 212 | + # Test max_features parameter using various values |
| 213 | + check_max_features(est, X, y) |
| 214 | + check_threshold_and_max_features(est, X, y) |
| 215 | + |
104 | 216 |
|
105 | 217 | def test_partial_fit():
|
106 | 218 | est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
|
|
0 commit comments