Merge pull request #1657 from glouppe/feature-importances · seckcoder/scikit-learn@a54809e · GitHub

Commit a54809e

Merge pull request scikit-learn#1657 from glouppe/feature-importances

[MRG] Remove compute_importances

2 parents 1adfed9 + 7130c99 · commit a54809e

File tree

11 files changed: +1395 −1552 lines changed

doc/whats_new.rst

Lines changed: 12 additions & 5 deletions
@@ -16,12 +16,19 @@ Changelog
    scoring function such as area under the ROC curve and f-beta scores.
    See :ref:`score_func_objects` for details. By `Andreas Müller`_.
    Passing a function from :mod:`sklearn.metrics` as ``score_func`` is
-   deprecated.
+   deprecated.
+
+ - Added :class:`ensemble.AdaBoostClassifier` and
+   :class:`ensemble.AdaBoostRegressor`, by `Noel Dawe`_ and
+   `Gilles Louppe`_. See the :ref:`AdaBoost <adaboost>` section of the user
+   guide for details and examples.
+
+ - Feature importances in :class:`tree.DecisionTreeClassifier`,
+   :class:`tree.DecisionTreeRegressor` and all derived ensemble estimators
+   are now computed on the fly when accessing the ``feature_importances_``
+   attribute. Setting ``compute_importances=True`` is no longer required.
+   By `Gilles Louppe`_.

- - Added :class:`ensemble.AdaBoostClassifier` and
-   :class:`ensemble.AdaBoostRegressor`, by `Noel Dawe`_ and `Gilles Louppe`_.
-   See the :ref:`AdaBoost <adaboost>` section of the user guide for
-   details and examples.

 .. _changes_0_13:
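For reference, a minimal usage sketch of the behaviour this changelog entry describes: importances are available directly after fit, with no compute_importances flag. The dataset and settings below are illustrative, not from the commit.

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)

    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X, y)
    print(clf.feature_importances_)  # computed lazily on attribute access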

sklearn/ensemble/forest.py

Lines changed: 23 additions & 24 deletions
@@ -75,7 +75,6 @@ def _parallel_build_trees(n_trees, forest, X, y, sample_weight,
         seed = random_state.randint(MAX_INT)

         tree = forest._make_estimator(append=False)
-        tree.set_params(compute_importances=forest.compute_importances)
         tree.set_params(random_state=check_random_state(seed))

         if forest.bootstrap:

@@ -230,6 +229,13 @@ def __init__(self,
             estimator_params=estimator_params)

         self.bootstrap = bootstrap
+
+        if compute_importances:
+            warn("Setting compute_importances=True is no longer "
+                 "required. Variable importances are now computed on the fly "
+                 "when accessing the feature_importances_ attribute. This "
+                 "parameter will be removed in 0.15.", DeprecationWarning)
+
         self.compute_importances = compute_importances
         self.oob_score = oob_score
         self.n_jobs = n_jobs

@@ -239,7 +245,6 @@ def __init__(self,
         self.n_outputs_ = None
         self.classes_ = None
         self.n_classes_ = None
-        self.feature_importances_ = None

         self.verbose = verbose

@@ -453,14 +458,24 @@ def fit(self, X, y, sample_weight=None):
             self.oob_score_ /= self.n_outputs_

-        # Sum the importances
-        if self.compute_importances:
-            self.feature_importances_ = \
-                sum(tree.feature_importances_ for tree in self.estimators_) \
-                / self.n_estimators
-
         return self

+    @property
+    def feature_importances_(self):
+        """Return the feature importances (the higher, the more important the
+           feature).
+
+        Returns
+        -------
+        feature_importances_ : array, shape = [n_features]
+        """
+        if self.estimators_ is None or len(self.estimators_) == 0:
+            raise ValueError("Estimator not fitted, "
+                             "call `fit` before `feature_importances_`.")
+
+        return sum(tree.feature_importances_
+                   for tree in self.estimators_) / self.n_estimators
+

 class ForestClassifier(BaseForest, ClassifierMixin):
     """Base class for forest of trees-based classifiers.

@@ -731,10 +746,6 @@ class RandomForestClassifier(ForestClassifier):
     bootstrap : boolean, optional (default=True)
         Whether bootstrap samples are used when building trees.

-    compute_importances : boolean, optional (default=True)
-        Whether feature importances are computed and stored into the
-        ``feature_importances_`` attribute when calling fit.
-
     oob_score : bool
         Whether to use out-of-bag samples to estimate
         the generalization error.

@@ -877,10 +888,6 @@ class RandomForestRegressor(ForestRegressor):
     bootstrap : boolean, optional (default=True)
         Whether bootstrap samples are used when building trees.

-    compute_importances : boolean, optional (default=True)
-        Whether feature importances are computed and stored into the
-        ``feature_importances_`` attribute when calling fit.
-
     oob_score : bool
         whether to use out-of-bag samples to estimate
         the generalization error.

@@ -1015,10 +1022,6 @@ class ExtraTreesClassifier(ForestClassifier):
     bootstrap : boolean, optional (default=False)
         Whether bootstrap samples are used when building trees.

-    compute_importances : boolean, optional (default=True)
-        Whether feature importances are computed and stored into the
-        ``feature_importances_`` attribute when calling fit.
-
     oob_score : bool
         Whether to use out-of-bag samples to estimate
         the generalization error.

@@ -1166,10 +1169,6 @@ class ExtraTreesRegressor(ForestRegressor):
         Whether bootstrap samples are used when building trees.
         Note: this parameter is tree-specific.

-    compute_importances : boolean, optional (default=True)
-        Whether feature importances are computed and stored into the
-        ``feature_importances_`` attribute when calling fit.
-
     oob_score : bool
         Whether to use out-of-bag samples to estimate
         the generalization error.
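The core of this change is the lazy-property pattern above: the aggregate is derived from the fitted sub-estimators on attribute access rather than stored during fit. A self-contained sketch of that pattern under made-up names (TinyEnsemble and its inputs are illustrative, not part of scikit-learn):

    import numpy as np

    class TinyEnsemble(object):
        def __init__(self, n_estimators=3):
            self.n_estimators = n_estimators
            self.estimators_ = None

        def fit(self, importances_per_tree):
            # Stand-in for real tree fitting: each "estimator" just records
            # a per-feature importance vector.
            self.estimators_ = [np.asarray(imp) for imp in importances_per_tree]
            return self

        @property
        def feature_importances_(self):
            # Same guard and aggregation shape as BaseForest: refuse before
            # fit, otherwise average the per-tree vectors on the fly.
            if not self.estimators_:
                raise ValueError("Estimator not fitted, "
                                 "call `fit` before `feature_importances_`.")
            return sum(self.estimators_) / self.n_estimators

    ens = TinyEnsemble().fit([[0.5, 0.5], [0.7, 0.3], [0.6, 0.4]])
    print(ens.feature_importances_)  # [0.6 0.4]

Nothing is cached, so repeated access recomputes the sum; the trade-off the PR makes is a small recomputation cost in exchange for never paying for importances you do not ask for.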

sklearn/ensemble/gradient_boosting.py

Lines changed: 10 additions & 4 deletions
@@ -463,7 +463,6 @@ def _fit_stage(self, i, X, X_argsorted, y, y_pred, sample_mask,
             min_samples_leaf=self.min_samples_leaf,
             min_density=self.min_density,
             max_features=self.max_features,
-            compute_importances=False,
             random_state=random_state)

         tree.fit(X, residual, sample_mask, X_argsorted, check_input=False)

@@ -681,14 +680,21 @@ def staged_decision_function(self, X):

     @property
     def feature_importances_(self):
+        """Return the feature importances (the higher, the more important the
+           feature).
+
+        Returns
+        -------
+        feature_importances_ : array, shape = [n_features]
+        """
         if self.estimators_ is None or len(self.estimators_) == 0:
             raise ValueError("Estimator not fitted, "
                              "call `fit` before `feature_importances_`.")
+
         total_sum = np.zeros((self.n_features, ), dtype=np.float64)
         for stage in self.estimators_:
-            stage_sum = sum(
-                tree.tree_.compute_feature_importances(method='gini')
-                for tree in stage) / len(stage)
+            stage_sum = sum(tree.feature_importances_
+                            for tree in stage) / len(stage)
             total_sum += stage_sum

         importances = total_sum / len(self.estimators_)
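The rewritten property averages importances in two steps: within each stage, then across stages. A small NumPy sketch of that arithmetic, with invented numbers laid out like estimators_ (stages × trees):

    import numpy as np

    stages = [
        [np.array([0.8, 0.2]), np.array([0.6, 0.4])],  # stage 1: two trees
        [np.array([0.5, 0.5]), np.array([0.7, 0.3])],  # stage 2: two trees
    ]

    total_sum = np.zeros(2, dtype=np.float64)
    for stage in stages:
        # Mean over the trees of one boosting stage.
        total_sum += sum(tree for tree in stage) / len(stage)

    importances = total_sum / len(stages)  # mean over stages
    print(importances)  # [0.65 0.35]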

sklearn/ensemble/tests/test_forest.py

Lines changed: 1 addition & 5 deletions
@@ -183,7 +183,7 @@ def test_importances():
                                 shuffle=False,
                                 random_state=0)

-    clf = RandomForestClassifier(n_estimators=10, compute_importances=True)
+    clf = RandomForestClassifier(n_estimators=10)
     clf.fit(X, y)
     importances = clf.feature_importances_
     n_important = sum(importances > 0.1)

@@ -194,10 +194,6 @@ def test_importances():
     X_new = clf.transform(X, threshold="mean")
     assert_less(0 < X_new.shape[1], X.shape[1])

-    clf = RandomForestClassifier(n_estimators=10)
-    clf.fit(X, y)
-    assert_true(clf.feature_importances_ is None)
-

 def test_oob_score_classification():
     """Check that oob prediction is a good estimation of the generalization

sklearn/ensemble/tests/test_weight_boosting.py

Lines changed: 1 addition & 5 deletions
@@ -164,7 +164,7 @@ def test_importances():
                                random_state=1)

     for alg in ['SAMME', 'SAMME.R']:
-        clf = AdaBoostClassifier(algorithm=alg, compute_importances=True)
+        clf = AdaBoostClassifier(algorithm=alg)

         clf.fit(X, y)
         importances = clf.feature_importances_

@@ -173,10 +173,6 @@ def test_importances():
         assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                      True)

-    clf = AdaBoostClassifier()
-    clf.fit(X, y)
-    assert_true(clf.feature_importances_ is None)
-

 def test_error():
     """Test that it gives proper exception on deficient input."""

sklearn/ensemble/weight_boosting.py

Lines changed: 30 additions & 42 deletions
@@ -50,8 +50,7 @@ def __init__(self,
                  base_estimator,
                  n_estimators=50,
                  estimator_params=tuple(),
-                 learning_rate=1.,
-                 compute_importances=False):
+                 learning_rate=1.):

         super(BaseWeightBoosting, self).__init__(
             base_estimator=base_estimator,

@@ -61,8 +60,6 @@ def __init__(self,
         self.estimator_weights_ = None
         self.estimator_errors_ = None
         self.learning_rate = learning_rate
-        self.compute_importances = compute_importances
-        self.feature_importances_ = None

     def fit(self, X, y, sample_weight=None):
         """Build a boosted classifier/regressor from the training set (X, y).

@@ -89,9 +86,6 @@ def fit(self, X, y, sample_weight=None):
         if self.learning_rate <= 0:
             raise ValueError("learning_rate must be greater than zero")

-        if self.compute_importances:
-            self.base_estimator.set_params(compute_importances=True)
-
         # Check data
         X, y = check_arrays(X, y, sparse_format="dense")

@@ -142,21 +136,6 @@ def fit(self, X, y, sample_weight=None):
             # Normalize
             sample_weight /= sample_weight_sum

-        # Sum the importances
-        try:
-            if self.compute_importances:
-                norm = self.estimator_weights_.sum()
-                self.feature_importances_ = (
-                    sum(weight * clf.feature_importances_ for weight, clf
-                        in zip(self.estimator_weights_, self.estimators_))
-                    / norm)
-
-        except AttributeError:
-            raise AttributeError(
-                "Unable to compute feature importances "
-                "since base_estimator does not have a "
-                "feature_importances_ attribute")
-
         return self

     @abstractmethod

@@ -213,18 +192,41 @@ def staged_score(self, X, y):
         Returns
         -------
         z : float
-
         """
         for y_pred in self.staged_predict(X):
             if isinstance(self, ClassifierMixin):
                 yield accuracy_score(y, y_pred)
             else:
                 yield r2_score(y, y_pred)

+    @property
+    def feature_importances_(self):
+        """Return the feature importances (the higher, the more important the
+           feature).
+
+        Returns
+        -------
+        feature_importances_ : array, shape = [n_features]
+        """
+        if self.estimators_ is None or len(self.estimators_) == 0:
+            raise ValueError("Estimator not fitted, "
+                             "call `fit` before `feature_importances_`.")
+
+        try:
+            norm = self.estimator_weights_.sum()
+            return (sum(weight * clf.feature_importances_ for weight, clf
+                        in zip(self.estimator_weights_, self.estimators_))
+                    / norm)
+
+        except AttributeError:
+            raise AttributeError(
+                "Unable to compute feature importances "
+                "since base_estimator does not have a "
+                "feature_importances_ attribute")
+

 def _samme_proba(estimator, n_classes, X):
-    """
-    Calculate algorithm 4, step 2, equation c) of Zhu et al [1]
+    """Calculate algorithm 4, step 2, equation c) of Zhu et al [1].

     References
     ----------

@@ -277,10 +279,6 @@ class AdaBoostClassifier(BaseWeightBoosting, ClassifierMixin):
         The SAMME.R algorithm typically converges faster than SAMME,
         achieving a lower test error with fewer boosting iterations.

-    compute_importances : boolean, optional (default=False)
-        Whether feature importances are computed and stored in the
-        ``feature_importances_`` attribute when calling fit.
-
     Attributes
     ----------
     `estimators_` : list of classifiers

@@ -301,7 +299,6 @@ class AdaBoostClassifier(BaseWeightBoosting, ClassifierMixin):

     `feature_importances_` : array of shape = [n_features]
         The feature importances if supported by the ``base_estimator``.
-        Only computed if ``compute_importances=True``.

     See also
     --------

@@ -319,14 +316,12 @@ def __init__(self,
                  base_estimator=DecisionTreeClassifier(max_depth=1),
                  n_estimators=50,
                  learning_rate=1.,
-                 algorithm='SAMME.R',
-                 compute_importances=False):
+                 algorithm='SAMME.R'):

         super(AdaBoostClassifier, self).__init__(
             base_estimator=base_estimator,
             n_estimators=n_estimators,
-            learning_rate=learning_rate,
-            compute_importances=compute_importances)
+            learning_rate=learning_rate)

         self.algorithm = algorithm

@@ -801,10 +796,6 @@ class AdaBoostRegressor(BaseWeightBoosting, RegressorMixin):
         The loss function to use when updating the weights after each
         boosting iteration.

-    compute_importances : boolean, optional (default=False)
-        Whether feature importances are computed and stored in the
-        ``feature_importances_`` attribute when calling fit.
-
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;

@@ -824,7 +815,6 @@ class AdaBoostRegressor(BaseWeightBoosting, RegressorMixin):

     `feature_importances_` : array of shape = [n_features]
         The feature importances if supported by the ``base_estimator``.
-        Only computed if ``compute_importances=True``.

     See also
     --------

@@ -843,14 +833,12 @@ def __init__(self,
                  n_estimators=50,
                  learning_rate=1.,
                  loss='linear',
-                 compute_importances=False,
                  random_state=None):

         super(AdaBoostRegressor, self).__init__(
             base_estimator=base_estimator,
             n_estimators=n_estimators,
-            learning_rate=learning_rate,
-            compute_importances=compute_importances)
+            learning_rate=learning_rate)

         self.loss = loss
         self.random_state = random_state
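Unlike the forests, the new AdaBoost property computes a weighted average: per-estimator importances weighted by estimator_weights_ and normalized by their sum. A sketch of that arithmetic with invented numbers:

    import numpy as np

    estimator_weights = np.array([1.0, 0.5, 0.25])
    per_estimator = [np.array([0.9, 0.1]),
                     np.array([0.4, 0.6]),
                     np.array([0.5, 0.5])]

    norm = estimator_weights.sum()
    importances = sum(w * imp for w, imp in zip(estimator_weights,
                                                per_estimator)) / norm
    print(importances)  # [0.7 0.3] -- still sums to 1 when each input does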
