[WIP] Allow for multiple scoring metrics in RFECV by sokolat · Pull Request #28964 · scikit-learn/scikit-learn · GitHub
[WIP] Allow for multiple scoring metrics in RFECV #28964

Closed
wants to merge 3 commits into from
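
The diff below extends RFECV so that `scoring` may also be given as a list, tuple, set, or dict of metrics rather than a single scorer. As a rough sketch of the usage this WIP targets (hypothetical call; the metric names and the resulting `cv_results_` keys are inferred from the diff, not from a released API):

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Hypothetical multi-metric usage targeted by this PR; constructing the
# estimator works on any recent scikit-learn, but fitting with a list of
# metrics would require the patch in this diff.
selector = RFECV(
    LogisticRegression(max_iter=1000),
    cv=3,
    scoring=["accuracy", "balanced_accuracy"],
)

# Following the cv_results_ construction below, a fitted selector would expose
# per-metric keys such as "mean_test_accuracy", "std_test_accuracy",
# "split0_test_accuracy" and "rank_test_accuracy" alongside "n_features",
# instead of the single "mean_test_score" family.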
135 changes: 96 additions & 39 deletions sklearn/feature_selection/_rfe.py
@@ -7,24 +7,31 @@
"""Recursive feature elimination for feature ranking"""

import warnings
from collections import defaultdict
from numbers import Integral

import numpy as np
from joblib import effective_n_jobs

from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
from ..metrics import check_scoring
from ..model_selection import check_cv
from ..model_selection._validation import _score
from ..utils._param_validation import HasMethods, Interval, RealNotInt
from ..utils.metadata_routing import (
from sklearn.base import (
BaseEstimator,
MetaEstimatorMixin,
_fit_context,
clone,
is_classifier,
)
from sklearn.feature_selection._base import SelectorMixin, _get_feature_importances
from sklearn.metrics import check_scoring
from sklearn.model_selection import check_cv
from sklearn.model_selection._validation import _score
from sklearn.utils._param_validation import HasMethods, Interval, RealNotInt
from sklearn.utils.metadata_routing import (
_raise_for_unsupported_routing,
_RoutingNotSupportedMixin,
)
from ..utils.metaestimators import _safe_split, available_if
from ..utils.parallel import Parallel, delayed
from ..utils.validation import check_is_fitted
from ._base import SelectorMixin, _get_feature_importances
from sklearn.utils.metaestimators import _safe_split, available_if
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import check_is_fitted


def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
@@ -667,7 +674,7 @@
**RFE._parameter_constraints,
"min_features_to_select": [Interval(Integral, 0, None, closed="neither")],
"cv": ["cv_object"],
"scoring": [None, str, callable],
"scoring": [None, str, callable, list, tuple, dict, set],
"n_jobs": [None, Integral],
}
_parameter_constraints.pop("n_features_to_select")
@@ -781,36 +788,86 @@
scores, step_n_features = zip(*scores_features)

step_n_features_rev = np.array(step_n_features[0])[::-1]
scores = np.array(scores)

# Reverse order such that lowest number of features is selected in case of tie.
scores_sum_rev = np.sum(scores, axis=0)[::-1]
n_features_to_select = step_n_features_rev[np.argmax(scores_sum_rev)]
if isinstance(self.scoring, (list, tuple, set, dict)):

multi_scores = {score_name: [] for score_name in scores[0][0]}
for score in scores:
buffer = defaultdict(list)

for multi_score in score:
for score_name, score_value in multi_score.items():
buffer[score_name].append(score_value)

for score_name, score_values in buffer.items():
multi_scores[score_name].append(score_values)

multi_scores = {
score_name: np.array(score_values)
for score_name, score_values in multi_scores.items()
}

n_features_to_select = {score_name: 0 for score_name in multi_scores}

for score_name, score_values in multi_scores.items():
score_values_sum_rev = np.sum(score_values, axis=0)[::-1]
n_features_to_select[score_name] = step_n_features_rev[

np.argmax(score_values_sum_rev)
]

for score_name in multi_scores:
multi_scores[score_name] = multi_scores[score_name][:, ::-1]

self.cv_results_ = {"n_features": step_n_features_rev}

for score_name, scores in multi_scores.items():
self.cv_results_[f"mean_test_{score_name}"] = np.mean(scores, axis=0)
self.cv_results_[f"std_test_{score_name}"] = np.std(scores, axis=0)

self.cv_results_.update(
{
f"split{i}_test_{score_name}": scores[i]
for i in range(scores.shape[0])
}
)
self.cv_results_[f"rank_test_{score_name}"] = (

np.argsort(self.cv_results_[f"mean_test_{score_name}"])[::-1] + 1
)

# Re-execute an elimination with best_k over the whole set
rfe = RFE(
estimator=self.estimator,
n_features_to_select=n_features_to_select,
step=self.step,
importance_getter=self.importance_getter,
verbose=self.verbose,
)
else:

rfe.fit(X, y)
scores = np.array(scores)

# Reverse order such that lowest
# number of features is selected in case of tie.
scores_sum_rev = np.sum(scores, axis=0)[::-1]
n_features_to_select = step_n_features_rev[np.argmax(scores_sum_rev)]

# Re-execute an elimination with best_k over the whole set
rfe = RFE(
estimator=self.estimator,
n_features_to_select=n_features_to_select,
step=self.step,
importance_getter=self.importance_getter,
verbose=self.verbose,
)

rfe.fit(X, y)

# Set final attributes
self.support_ = rfe.support_
self.n_features_ = rfe.n_features_
self.ranking_ = rfe.ranking_
self.estimator_ = clone(self.estimator)
self.estimator_.fit(self._transform(X), y)

# reverse to stay consistent with before
scores_rev = scores[:, ::-1]

self.cv_results_ = {
"mean_test_score": np.mean(scores_rev, axis=0),
"std_test_score": np.std(scores_rev, axis=0),
**{
f"split{i}_test_score": scores_rev[i]
for i in range(scores.shape[0])
},
"n_features": step_n_features_rev,
}

# Set final attributes
self.support_ = rfe.support_
self.n_features_ = rfe.n_features_
self.ranking_ = rfe.ranking_
self.estimator_ = clone(self.estimator)
self.estimator_.fit(self._transform(X), y)

# reverse to stay consistent with before
scores_rev = scores[:, ::-1]
self.cv_results_ = {
"mean_test_score": np.mean(scores_rev, axis=0),
"std_test_score": np.std(scores_rev, axis=0),
**{f"split{i}_test_score": scores_rev[i] for i in range(scores.shape[0])},
"n_features": step_n_features_rev,
}
return self
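
For orientation, a small standalone sketch of the per-metric regrouping done in the new multi-metric branch above. The nesting (folds, then elimination steps, then one dict of scores per step) and the variable names follow the diff; the numbers are made up:

import numpy as np
from collections import defaultdict

# Two CV folds, three elimination steps, two metrics (made-up values).
scores = [
    [{"accuracy": 0.80, "f1": 0.70}, {"accuracy": 0.82, "f1": 0.72}, {"accuracy": 0.79, "f1": 0.71}],
    [{"accuracy": 0.78, "f1": 0.69}, {"accuracy": 0.81, "f1": 0.73}, {"accuracy": 0.80, "f1": 0.70}],
]

# Regroup into one (n_folds, n_steps) array per metric, as in the diff.
multi_scores = {score_name: [] for score_name in scores[0][0]}
for fold_scores in scores:
    buffer = defaultdict(list)
    for step_scores in fold_scores:
        for score_name, score_value in step_scores.items():
            buffer[score_name].append(score_value)
    for score_name, score_values in buffer.items():
        multi_scores[score_name].append(score_values)

multi_scores = {name: np.array(vals) for name, vals in multi_scores.items()}
print(multi_scores["accuracy"].shape)         # (2, 3): folds x elimination steps
print(multi_scores["accuracy"].mean(axis=0))  # per-step mean, cf. mean_test_accuracy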
14 changes: 14 additions & 0 deletions sklearn/utils/_param_validation.py
@@ -139,6 +139,8 @@
return constraint
if isinstance(constraint, str) and constraint == "nan":
return _NanConstraint()
if isinstance(constraint, (list, tuple, set, dict)):
return _Multimetric()

raise ValueError(f"Unknown constraint type: {constraint}")


@@ -745,6 +747,18 @@
)


class _Multimetric(_Constraint):
"""Constraint representing multimeric scorers"""

def is_satisfied_by(self, val):
from ..metrics._scorer import _MultimetricScorer

return isinstance(val, _MultimetricScorer)

def __str__(self):
return "a multimetric scorer"


class Hidden:
"""Class encapsulating a constraint not meant to be exposed to the user.

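
A minimal sketch of what the new `_Multimetric` constraint accepts, assuming this patch is applied. `make_constraint` and `_MultimetricScorer` are private scikit-learn helpers, so the import paths and the keyword-only `scorers` argument are assumptions that may differ between versions:

from sklearn.metrics import get_scorer
from sklearn.metrics._scorer import _MultimetricScorer
from sklearn.utils._param_validation import make_constraint

# With the patch, a literal list/tuple/set/dict constraint resolves to
# _Multimetric, which is satisfied only by _MultimetricScorer instances.
constraint = make_constraint(["accuracy", "f1"])

scorers = {name: get_scorer(name) for name in ("accuracy", "f1")}
multimetric = _MultimetricScorer(scorers=scorers)

print(constraint.is_satisfied_by(multimetric))  # True
print(constraint.is_satisfied_by("accuracy"))   # False: a plain string is rejected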