From 355954451be003e38e61f33b81262ecb0a414b11 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Wed, 23 Oct 2013 13:02:08 -0400
Subject: [PATCH 1/9] Minimum redundancy maximum relevance feature selection

---
 sklearn/feature_selection/__init__.py             |   3 +
 .../multivariate_filtering.py                     | 188 ++++++++++++++++++
 .../tests/test_multivariate_filtering.py          |  26 +++
 3 files changed, 217 insertions(+)
 create mode 100644 sklearn/feature_selection/multivariate_filtering.py
 create mode 100644 sklearn/feature_selection/tests/test_multivariate_filtering.py

diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py
index 3d27638091995..cdad116b3dc4f 100644
--- a/sklearn/feature_selection/__init__.py
+++ b/sklearn/feature_selection/__init__.py
@@ -17,10 +17,13 @@
 
 from .variance_threshold import VarianceThreshold
 
+from .multivariate_filtering import MinRedundancyMaxRelevance
+
 from .rfe import RFE
 from .rfe import RFECV
 
 __all__ = ['GenericUnivariateSelect',
+           'MinRedundancyMaxRelevance',
            'RFE',
            'RFECV',
            'SelectFdr',
diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
new file mode 100644
index 0000000000000..5196cf024dc50
--- /dev/null
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -0,0 +1,188 @@
+# Author: Andrea Bravi
+# License: 3-clause BSD
+
+import numpy as np
+from ..base import BaseEstimator
+from .base import SelectorMixin
+from ..metrics.cluster.supervised import mutual_info_score
+
+
+class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
+    """
+    Select the subset of features having minimal redundancy with each other
+    and maximal relevance (mRMR) to the outcome.
+
+    IMPORTANT: This version only supports data in categorical or integer form.
+
+    Attributes
+    ----------
+    k : int, default=2
+        Number of features to select (selected_features)
+    mask : list, len=selected_features
+        Integer list of the features ordered by maximal relevance and
+        minimal redundancy
+    score : array, shape=[selected_features]
+        mRMR score associated with each entry in mask
+    relevance : array, shape=[n_features]
+        Relevance of all the features
+    redundancy : array, shape=[n_features, n_features]
+        Pairwise redundancy of all the features
+    rule : string, default='diff'
+        Rule to combine relevance and redundancy, either
+        'diff' - difference between the two
+        'prod' - product between the two
+    X : array, shape=[n_samples, n_features]
+        Input dataset, must be either integer or categorical
+    y : array, shape=[n_samples]
+        Label vector, must be either integer or categorical
+
+    Methods
+    -------
+    _compute_mRMR(X, y)
+        Computes the minimal redundancy maximal relevance of each feature,
+        returning mask and score
+
+    References
+    ----------
+    .. [1] H. Peng, F. Long, and C. Ding, "Feature selection based on mutual
+           information: criteria of max-dependency, max-relevance, and
+           min-redundancy", IEEE Transactions on Pattern Analysis and Machine
+           Intelligence, Vol. 27, No. 8, pp. 1226-1238, 2005.
+
+    Example
+    -------
+    Consider the following:
+
+    >>>X = np.array([[1, 3, 1],
+                     [3, 3, 3],
+                     [1, 3, 1],
+                     [1, 3, 3],
+                     [1, 3, 1]])
+    >>>y = np.array([3, 1, 3, 1, 3])
+
+    X has three features: the first is partially related to y, the second is
+    totally unrelated, and the third is the opposite of y.
+
+    If we run:
+
+    >>> MinRedundancyMaxRelevance(k=1).fit_transform(X, y)
+    array([[1],
+           [3],
+           [1],
+           [3],
+           [1]])
+
+    we have just selected the most important feature (k=1) of X and reduced its
+    dimensionality.
+    """
+    def __init__(self, k=2, rule='diff'):
+        """
+        Parameters
+        ----------
+        k : int, default=2
+            Number of features to select
+        rule : string, default='diff'
+            Rule to combine relevance and redundancy, either
+            'diff' - difference between the two
+            'prod' - product between the two
+        """
+        self.k = k
+        self.rule = rule
+        self._rule_function = self._get_rule_function(rule)
+
+    def fit(self, X, y):
+        """
+        Parameters
+        ----------
+        X : array, shape=[n_samples, n_features]
+            Input dataset, must be either integer or categorical
+        y : array, shape=[n_samples]
+            Label vector, must be either integer or categorical
+        """
+        self.X = X
+        self.y = y
+        self.mask, self.score = self._compute_mRMR(X, y)
+        return self
+
+    def _get_support_mask(self):
+        """
+        Returns
+        -------
+        support : array, dtype=bool, shape=[n_features]
+            Boolean mask with True for the selected features
+        """
+
+        support = np.zeros(self.n_features, dtype=bool)
+        support[[self.mask]] = True
+        return support
+
+    def _compute_mRMR(self, X, y):
+        """
+        Parameters
+        ----------
+        X : array, shape=[n_samples, n_features]
+            Input dataset, must be either integer or categorical
+        y : array, shape=[n_samples]
+            Label vector, must be either integer or categorical
+
+        Returns
+        -------
+        mask : list, len=selected_features
+            Integer list of the features ordered by maximal relevance and
+            minimal redundancy
+        score : list, len=selected_features
+            mRMR score associated with each entry in mask
+        """
+        M = X.shape[1]  # Number of features
+        self.n_features = M
+
+        # Computation of relevance and redundancy
+        relevance = np.zeros(M)
+        redundancy = np.zeros([M, M])
+        for m1 in range(0, M):
+            relevance[m1] = mutual_info_score(X[:, m1], y)
+            for m2 in range(m1+1, M):
+                redundancy[m1, m2] = mutual_info_score(X[:, m1],
+                                                       X[:, m2])
+                redundancy[m2, m1] = redundancy[m1, m2]
+
+        self.relevance = relevance
+        self.redundancy = redundancy
+
+        # Sequential search optimization
+        mask = []
+        score = []
+        search_space = range(0, M)
+
+        score.append(max(relevance))
+        ind = int(relevance.argmax(0))  # Optimal feature
+        mask.append(ind)
+        search_space.pop(ind)
+
+        fun = self._rule_function
+        for m in range(0, self.k-1):
+            tmp_score = fun(relevance[search_space],
+                            np.mean(redundancy[:, search_space].
+                                    take(mask, axis=0), 0))
+            score.append(max(tmp_score))
+            ind = tmp_score.argmax(0)
+            mask.append(search_space[ind])
+            search_space.pop(ind)
+
+        return mask, score
+
+    def _get_rule_function(self, rule):
+        """
+        Returns
+        -------
+        fun : function
+            Function used to combine relevance (k) and redundancy (h) arrays
+        """
+        if rule == 'diff':
+            fun = lambda k, h: k-h
+        elif rule == 'prod':
+            fun = lambda k, h: k*h
+        else:
+            raise ValueError("rule should be either 'diff' or 'prod'")
+
+        return fun
diff --git a/sklearn/feature_selection/tests/test_multivariate_filtering.py b/sklearn/feature_selection/tests/test_multivariate_filtering.py
new file mode 100644
index 0000000000000..4c156eefec301
--- /dev/null
+++ b/sklearn/feature_selection/tests/test_multivariate_filtering.py
@@ -0,0 +1,26 @@
+from sklearn.utils.testing import (assert_array_equal, assert_raises)
+
+import numpy as np
+
+from sklearn.feature_selection import MinRedundancyMaxRelevance
+
+X = np.array([[1, 3, 1],
+              [3, 3, 3],
+              [1, 3, 1],
+              [1, 3, 3],
+              [1, 3, 1]])
+
+y = np.array([3, 1, 3, 1, 3])
+
+def test_mRMR():
+    """
+    Test MinRedundancyMaxRelevance with default setting.
+    """
+
+    m = MinRedundancyMaxRelevance().fit(X, y)
+
+    assert_array_equal([2, 0], m.mask)
+
+    assert_array_equal(0.6730116670092563, m.score[0])
+
+    assert_raises(ValueError, MinRedundancyMaxRelevance, rule='none')

From 9f47461254965824511fd89422b48ec775ff3e4d Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Thu, 24 Oct 2013 22:52:17 -0400
Subject: [PATCH 2/9] Corrected docstring indentation error

pep8 somehow does not identify this error, while Travis CI does
---
 sklearn/feature_selection/multivariate_filtering.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index 5196cf024dc50..a886a65d1454d 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -53,12 +53,12 @@ class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
     -------
     Consider the following:
 
-    >>>X = np.array([[1, 3, 1],
+    >>> X = np.array([[1, 3, 1],
                      [3, 3, 3],
                      [1, 3, 1],
                      [1, 3, 3],
                      [1, 3, 1]])
-    >>>y = np.array([3, 1, 3, 1, 3])
+    >>> y = np.array([3, 1, 3, 1, 3])
 
     X has three features: the first is partially related to y, the second is
     totally unrelated, and the third is the opposite of y.

From 73d55aa3da7d92519c77f5ade353d9ca1edda2cd Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Thu, 24 Oct 2013 23:28:59 -0400
Subject: [PATCH 3/9] Corrected docstring 2

---
 .../multivariate_filtering.py | 26 -------------------
 1 file changed, 26 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index a886a65d1454d..bafb2bb9a02cf 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -48,32 +48,6 @@ class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
            information: criteria of max-dependency, max-relevance, and
            min-redundancy", IEEE Transactions on Pattern Analysis and Machine
            Intelligence, Vol. 27, No. 8, pp. 1226-1238, 2005.
-
-    Example
-    -------
-    Consider the following:
-
-    >>> X = np.array([[1, 3, 1],
-                      [3, 3, 3],
-                      [1, 3, 1],
-                      [1, 3, 3],
-                      [1, 3, 1]])
-    >>> y = np.array([3, 1, 3, 1, 3])
-
-    X has three features: the first is partially related to y, the second is
-    totally unrelated, and the third is the opposite of y.
-
-    If we run:
-
-    >>> MinRedundancyMaxRelevance(k=1).fit_transform(X, y)
-    array([[1],
-           [3],
-           [1],
-           [3],
-           [1]])
-
-    we have just selected the most important feature (k=1) of X and reduced its
-    dimensionality.
     """
     def __init__(self, k=2, rule='diff'):
         """

From 14a7cf0ca6e93d82b0fed148b03905c7d9f0fe98 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Thu, 24 Oct 2013 23:58:25 -0400
Subject: [PATCH 4/9] Checking for NaNs and Inf during fit()

---
 sklearn/feature_selection/multivariate_filtering.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index bafb2bb9a02cf..6b0774c2e5181 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -5,7 +5,7 @@
 from ..base import BaseEstimator
 from .base import SelectorMixin
 from ..metrics.cluster.supervised import mutual_info_score
-
+from ..utils import safe_asarray
 
 class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
     """
@@ -73,6 +73,7 @@ def fit(self, X, y):
         y : array, shape=[n_samples]
             Label vector, must be either integer or categorical
         """
+        X = safe_asarray(X)
         self.X = X
         self.y = y
         self.mask, self.score = self._compute_mRMR(X, y)

From ec17096ce214f28813a02a4971fe66db9c981784 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Fri, 25 Oct 2013 00:34:55 -0400
Subject: [PATCH 5/9] Substituted lambda functions because of pickle problem

---
 sklearn/feature_selection/multivariate_filtering.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index 6b0774c2e5181..63518269af744 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -5,7 +5,8 @@
 from ..base import BaseEstimator
 from .base import SelectorMixin
 from ..metrics.cluster.supervised import mutual_info_score
-from ..utils import safe_asarray
+from ..utils.validation import array2d
+
 
 class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
     """
@@ -73,7 +74,8 @@ def fit(self, X, y):
         y : array, shape=[n_samples]
             Label vector, must be either integer or categorical
         """
-        X = safe_asarray(X)
+        X = array2d(X)
+
         self.X = X
         self.y = y
         self.mask, self.score = self._compute_mRMR(X, y)
@@ -154,9 +156,11 @@ def _get_rule_function(self, rule):
             Function used to combine relevance (k) and redundancy (h) arrays
         """
         if rule == 'diff':
-            fun = lambda k, h: k-h
+            def fun(a, b):
+                return a-b
         elif rule == 'prod':
-            fun = lambda k, h: k*h
+            def fun(a, b):
+                return a*b
         else:
             raise ValueError("rule should be either 'diff' or 'prod'")

From 62cf55bc1143c324e893970367a9e2c34617d886 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Fri, 25 Oct 2013 01:12:38 -0400
Subject: [PATCH 6/9] Solved pickle problem and added test

---
 .../multivariate_filtering.py            | 47 ++++++++-----------
 .../tests/test_multivariate_filtering.py |  7 ++-
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index 63518269af744..ad5dd419df229 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -63,7 +63,6 @@ def __init__(self, k=2, rule='diff'):
         """
         self.k = k
         self.rule = rule
-        self._rule_function = self._get_rule_function(rule)
 
     def fit(self, X, y):
         """
@@ -136,32 +135,26 @@ def _compute_mRMR(self, X, y):
         mask.append(ind)
         search_space.pop(ind)
 
-        fun = self._rule_function
-        for m in range(0, self.k-1):
-            tmp_score = fun(relevance[search_space],
-                            np.mean(redundancy[:, search_space].
-                                    take(mask, axis=0), 0))
-            score.append(max(tmp_score))
-            ind = tmp_score.argmax(0)
-            mask.append(search_space[ind])
-            search_space.pop(ind)
-
-        return mask, score
-
-    def _get_rule_function(self, rule):
-        """
-        Returns
-        -------
-        fun : function
-            Function used to combine relevance (k) and redundancy (h) arrays
-        """
-        if rule == 'diff':
-            def fun(a, b):
-                return a-b
-        elif rule == 'prod':
-            def fun(a, b):
-                return a*b
+        if self.rule == 'diff':
+            for m in range(0, self.k-1):
+                tmp_score = relevance[search_space] - \
+                    np.mean(redundancy[:, search_space]
+                            .take(mask, axis=0), 0)
+                score.append(max(tmp_score))
+                ind = tmp_score.argmax(0)
+                mask.append(search_space[ind])
+                search_space.pop(ind)
+
+        elif self.rule == 'prod':
+            for m in range(0, self.k-1):
+                tmp_score = relevance[search_space] * \
+                    np.mean(redundancy[:, search_space]
+                            .take(mask, axis=0), 0)
+                score.append(max(tmp_score))
+                ind = tmp_score.argmax(0)
+                mask.append(search_space[ind])
+                search_space.pop(ind)
         else:
             raise ValueError("rule should be either 'diff' or 'prod'")
 
-        return fun
+        return mask, score
diff --git a/sklearn/feature_selection/tests/test_multivariate_filtering.py b/sklearn/feature_selection/tests/test_multivariate_filtering.py
index 4c156eefec301..173f7bb68bfb4 100644
--- a/sklearn/feature_selection/tests/test_multivariate_filtering.py
+++ b/sklearn/feature_selection/tests/test_multivariate_filtering.py
@@ -12,6 +12,7 @@
 
 y = np.array([3, 1, 3, 1, 3])
 
+
 def test_mRMR():
     """
     Test MinRedundancyMaxRelevance with default setting.
@@ -23,4 +24,8 @@ def test_mRMR():
 
     assert_array_equal(0.6730116670092563, m.score[0])
 
-    assert_raises(ValueError, MinRedundancyMaxRelevance, rule='none')
+    m = MinRedundancyMaxRelevance(rule='prod').fit(X, y)
+
+    assert_array_equal(0.049793044493117354, m.score[1])
+
+    assert_raises(ValueError, MinRedundancyMaxRelevance(rule='none').fit, X, y)

From c2d18523e58a9f3f228344aef688d71552ab301c Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Tue, 29 Oct 2013 13:29:14 -0400
Subject: [PATCH 7/9] Implemented suggested corrections

---
 sklearn/feature_selection/multivariate_filtering.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index ad5dd419df229..8c6e601ba46f8 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -37,12 +37,6 @@ class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
     y : array, shape=[n_samples]
         Label vector, must be either integer or categorical
 
-    Methods
-    -------
-    _compute_mRMR(X, y)
-        Computes the minimal redundancy maximal relevance of each feature,
-        returning mask and score
-
     References
     ----------
     .. [1] H. Peng, F. Long, and C. Ding, "Feature selection based on mutual
@@ -89,7 +83,7 @@ def _get_support_mask(self):
         """
 
         support = np.zeros(self.n_features, dtype=bool)
-        support[[self.mask]] = True
+        support[self.mask] = True
         return support
 
     def _compute_mRMR(self, X, y):
@@ -110,7 +104,6 @@ def _compute_mRMR(self, X, y):
             mRMR score associated with each entry in mask
         """
         M = X.shape[1]  # Number of features
-        self.n_features = M
 
         # Computation of relevance and redundancy
         relevance = np.zeros(M)
@@ -122,9 +115,6 @@ def _compute_mRMR(self, X, y):
                                                        X[:, m2])
                 redundancy[m2, m1] = redundancy[m1, m2]
 
-        self.relevance = relevance
-        self.redundancy = redundancy
-
         # Sequential search optimization
         mask = []
         score = []
@@ -157,4 +147,8 @@ def _compute_mRMR(self, X, y):
         else:
             raise ValueError("rule should be either 'diff' or 'prod'")
 
+        self.n_features = M
+        self.relevance = relevance
+        self.redundancy = redundancy
+
         return mask, score

From 0f47ccab360499083da360ed634b3c93ec8bbe61 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Mon, 4 Nov 2013 16:06:10 -0500
Subject: [PATCH 8/9] Example comparing mRMR with other selection algorithms

---
 examples/plot_mRMR.py | 69 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 examples/plot_mRMR.py

diff --git a/examples/plot_mRMR.py b/examples/plot_mRMR.py
new file mode 100644
index 0000000000000..f9acf5d7d62ef
--- /dev/null
+++ b/examples/plot_mRMR.py
@@ -0,0 +1,69 @@
+"""
+===========================================
+Minimum redundancy maximum relevance (mRMR)
+===========================================
+
+Mutual information is a metric assessing the degree of statistical dependence
+between two random variables.
+
+mRMR feature selection consists of selecting a subset of the available features
+showing high mutual information with the target and low mutual information with
+each other.
+
+This example compares mRMR feature selection with Recursive feature
+elimination (RFE) and Univariate feature selection (Uni), using a synthetic
+dataset.
+
+The dataset has 100 samples and 3 features: A, B and C, which correctly
+classify 60%, 50% and 40% of the data, respectively.
+
+Let's assume the plan is to choose only 2 of those 3 features. Given that A
+and B have higher accuracy, we would expect a selection algorithm to pick
+those two. However, it turns out that A and B are redundant with each other
+(i.e. they are able to classify the same samples). Conversely, C has lower
+accuracy, but provides independent information with respect to A and B.
+
+As expected, mRMR selects features A and C, while the other two selection
+algorithms select features A and B.
+
+
+.. note::
+
+    See also :ref:`example_plot_rfe_digits.py`,
+    :ref:`example_plot_feature_selection.py`
+
+"""
+print(__doc__)
+
+import numpy as np
+from sklearn.feature_selection import RFE, SelectKBest, chi2, \
+    MinRedundancyMaxRelevance
+from sklearn.linear_model import LogisticRegression
+
+
+# Number of samples in the dataset
+N = 100
+
+# Associating a class to each sample in the dataset
+y = np.array([0] * 50 + [1] * 50)
+
+# Creating a feature able to classify 60% of the samples
+A = np.array([0] * 30 + [1] * 20 + [1] * 20 + [2] * 30)
+
+# Creating a feature able to classify 50% of the samples
+B = np.array([2] * 25 + [1] * 25 + [1] * 25 + [0] * 25)
+
+# Creating a feature able to classify 40% of the samples
+C = np.array([2] * 20 + [0] * 30 + [1] * 30 + [2] * 20)
+
+X = np.array([A, B, C]).T
+feature = ['A', 'B', 'C']
+
+# We will be using the following three selectors
+selectors = [('RFE', RFE(LogisticRegression(), 2)),
+             ('Uni', SelectKBest(chi2, k=2)),
+             ('mRMR', MinRedundancyMaxRelevance(k=2))]
+
+for name, selector in selectors:
+    k = selector.fit(X, y).get_support(True).tolist()
+    print('%s selected %s and %s' % (name, feature[k[0]], feature[k[1]]))

From 4b97111c098ba5f76c6f2cd2a2070db3ebe25688 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Mon, 4 Nov 2013 18:03:09 -0500
Subject: [PATCH 9/9] mRMR rst description

---
 doc/modules/feature_selection.rst | 38 +++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst
index 0f0c5aaa15781..01dd03b0dd0c0 100644
--- a/doc/modules/feature_selection.rst
+++ b/doc/modules/feature_selection.rst
@@ -247,10 +247,38 @@ features::
   * :ref:`example_ensemble_plot_forest_importances_faces.py`: example on
     face recognition data.
 
+.. _mRMR:
+
+Minimum Redundancy Maximum Relevance (mRMR)
+===========================================
+
+This filter feature selector was proposed by Peng et al. in 2005. mRMR
+identifies a subset of features having maximal mutual information with the
+target (i.e. relevance), and minimal mutual information with each other
+(i.e. redundancy).
+
+The algorithm expects discretized features. Peng et al. suggest using the
+mean and standard deviation of each feature for that purpose. For instance,
+divide a feature into three levels:
+
+    (-Inf, mean - std)
+    [mean - std, mean + std]
+    (mean + std, +Inf)
+
+:class:`MinRedundancyMaxRelevance`
+
+.. topic:: References:
+
+    * H. Peng, F. Long, C. Ding, "Feature selection based on Mutual
+      Information: Criteria of Max-Dependency, Max-Relevance, and
+      Min-Redundancy", IEEE Transactions on Pattern Analysis and Machine
+      Intelligence, Vol. 27, No. 8 (2005)
+
+
 Feature selection as part of a pipeline
 =======================================
 
-Feature selection is usually used as a pre-processing step before doing 
+Feature selection is usually used as a pre-processing step before doing
 the actual learning. The recommended way to do this in scikit-learn is
 to use a :class:`sklearn.pipeline.Pipeline`::
 
@@ -260,10 +288,10 @@ to use a :class:`sklearn.pipeline.Pipeline`::
   ])
   clf.fit(X, y)
 
-In this snippet we make use of a :class:`sklearn.svm.LinearSVC` 
+In this snippet we make use of a :class:`sklearn.svm.LinearSVC`
 to evaluate feature importances and select the most relevant features.
-Then, a class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the 
-transformed output, i.e. using only relevant features. You can perform 
+Then, a :class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the
+transformed output, i.e. using only relevant features. You can perform
 similar operations with the other feature selection methods and also
-classifiers that provide a way to evaluate feature importances of course. 
+classifiers that provide a way to evaluate feature importances of course.
 See the :class:`sklearn.pipeline.Pipeline` examples for more details.
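
Two appendix sketches for readers of this series. First, the rst section added
in patch 9 asks for discretized features but leaves the mean/std binning to the
reader. A minimal NumPy sketch of that step; the `discretize` helper below is
hypothetical and not part of any patch::

    import numpy as np

    def discretize(X):
        # Hypothetical helper: map each continuous feature to three integer
        # levels using its mean and standard deviation, as suggested in the
        # patch 9 documentation: below mean - std, within one std of the
        # mean, and above mean + std.
        mu = X.mean(axis=0)
        sigma = X.std(axis=0)
        levels = np.ones(X.shape, dtype=int)  # middle band -> 1
        levels[X < mu - sigma] = 0            # low band -> 0
        levels[X > mu + sigma] = 2            # high band -> 2
        return levels

    rng = np.random.RandomState(0)
    X_disc = discretize(rng.randn(100, 5))    # integer-coded features

Each column of the result is integer-coded, which is the only input form the
estimator supports.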
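
Second, patch 3 removed the usage example from the class docstring. Here is
roughly the same illustration, rewritten against the API as it stands at the
end of the series and reusing the toy data from the unit test. The selected
indices follow the test's assertion; treat this as a sketch rather than an
official doctest::

    import numpy as np
    from sklearn.feature_selection import MinRedundancyMaxRelevance

    X = np.array([[1, 3, 1],
                  [3, 3, 3],
                  [1, 3, 1],
                  [1, 3, 3],
                  [1, 3, 1]])
    y = np.array([3, 1, 3, 1, 3])

    # rule='diff' scores each candidate by relevance minus mean redundancy;
    # rule='prod' multiplies the two quantities instead.
    selector = MinRedundancyMaxRelevance(k=2, rule='diff').fit(X, y)
    print(selector.mask)          # [2, 0], as asserted in the unit test
    print(selector.transform(X))  # keeps only the two selected columns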