ENH: Small improvements/fixes to mutual info feature selection · scikit-learn/scikit-learn@562a8e8 · GitHub

Commit 562a8e8

Author: Nikolay Mayorov (committed)

ENH: Small improvements/fixes to mutual info feature selection

1 parent 96a9a81 · commit 562a8e8

File tree: 3 files changed, +27 −20 lines

sklearn/feature_selection/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -17,11 +17,11 @@
 
 from .variance_threshold import VarianceThreshold
 
-from .multivariate_filtering import MinRedundancyMaxRelevance
-
 from .rfe import RFE
 from .rfe import RFECV
 
+from .mutual_info import MinRedundancyMaxRelevance
+
 __all__ = ['GenericUnivariateSelect',
            'MinRedundancyMaxRelevance',
            'RFE',
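
Note that the public import path is unaffected by the module rename; only the private module moves. A minimal sketch, assuming this development branch is installed (MinRedundancyMaxRelevance is not part of any scikit-learn release):

    # Public path, re-exported via __all__ -- unchanged by this commit.
    from sklearn.feature_selection import MinRedundancyMaxRelevance

    # Private module path -- moved by this commit from multivariate_filtering
    # to mutual_info (direct imports of private modules are discouraged and
    # shown only for illustration).
    from sklearn.feature_selection.mutual_info import MinRedundancyMaxRelevance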

sklearn/feature_selection/multivariate_filtering.py renamed to sklearn/feature_selection/mutual_info.py

Lines changed: 19 additions & 13 deletions
@@ -108,9 +108,12 @@ class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
 
     Parameters
     ----------
-    n_features_to_select : None or int, optional (default=None)
-        Number of features to select. If None, half of the features
-        will be selected.
+    n_features_to_select : float or int, optional (default=0.5)
+        Number of features to select. A value greater than or equal to 1
+        is interpreted as the absolute number of features to select. A
+        value within (0.0, 1.0) is interpreted as the fraction of the
+        initial number of features to keep (rounded down, but at least
+        one feature). By default half of the features are selected.
     categorical_features : bool or array_like with shape (n_features),
                            optional (default=False)
         If bool, then determines whether to consider all features categorical
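
A worked example of the new semantics, mirroring the validation logic added to fit() further down. Note the boundary case: 1.0 falls into the >= 1 branch, so it selects exactly one feature, not all of them.

    # Plain-Python sketch of how n_features_to_select resolves to a count
    # for a hypothetical dataset with 10 features.
    n_features = 10

    for value in (3, 0.25, 0.5, 1.0):
        if value >= 1:
            k = int(value)                       # absolute count: 3 -> 3, 1.0 -> 1
        elif 0 < value < 1:
            k = max(1, int(value * n_features))  # fraction: 0.25 -> 2, 0.5 -> 5
        else:
            raise ValueError("`n_features_to_select` must be positive.")
        print(value, "->", k, "features")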
@@ -124,8 +127,6 @@ class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
 
     Attributes
     ----------
-    n_features_ : int
-        Number of selected features.
     support_ : ndarray, shape (n_features,)
         Mask of selected features.
     relevance_ : ndarray, shape (n_features,)
@@ -147,7 +148,7 @@ class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
     .. [3] B. C. Ross "Mutual Information between Discrete and Continuous
        Data Sets". PLoS ONE 9(2), 2014.
     """
-    def __init__(self, n_features_to_select=None, categorical_features=False,
+    def __init__(self, n_features_to_select=0.5, categorical_features=False,
                  categorical_target=False, n_neighbors=3):
         self.n_features_to_select = n_features_to_select
         self.categorical_features = categorical_features
@@ -171,9 +172,19 @@ def fit(self, X, y):
         -------
         self
         """
-        X, y = check_X_y(X, y, accept_sparse='csc')
+        X, y = check_X_y(X, y, accept_sparse='csc',
+                         y_numeric=not self.categorical_target)
 
         n_features = X.shape[1]
+
+        if self.n_features_to_select >= 1:
+            n_features_to_select = int(self.n_features_to_select)
+        elif 0 < self.n_features_to_select < 1:
+            n_features_to_select = max(
+                1, int(self.n_features_to_select * n_features))
+        else:
+            raise ValueError("`n_features_to_select` must be positive.")
+
         if isinstance(self.categorical_features, bool):
             categorical_features = np.empty(n_features, dtype=bool)
             categorical_features.fill(self.categorical_features)
@@ -203,14 +214,9 @@ def fit(self, X, y):
                     xi, xj, categorical_features[i], categorical_features[j],
                     self.n_neighbors)
 
-        if self.n_features_to_select is None:
-            self.n_features_ = (n_features + 1) // 2
-        else:
-            self.n_features_ = self.n_features_to_select
-
         support = np.zeros(n_features, dtype=bool)
         support[np.argmax(relevance)] = True
-        for i in range(self.n_features_ - 1):
+        for i in range(n_features_to_select - 1):
             selected = np.nonzero(support)[0]
             candidates = np.nonzero(~support)[0]
             D = relevance[candidates]
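
For context, the loop above is a greedy forward selection: seed with the single most relevant feature, then repeatedly add the candidate that best trades relevance against redundancy with the already-selected set. The hunk cuts off right after the relevance term, so the scoring that follows is an assumption here; this sketch uses the classic mRMR difference criterion (relevance minus mean redundancy) on precomputed mutual-information scores:

    import numpy as np

    def mrmr_select(relevance, redundancy, n_features_to_select):
        """Greedy mRMR on precomputed MI scores (illustrative sketch).

        relevance  : (n_features,) MI between each feature and the target.
        redundancy : (n_features, n_features) pairwise MI between features.
        """
        n_features = relevance.shape[0]
        support = np.zeros(n_features, dtype=bool)
        support[np.argmax(relevance)] = True       # seed: most relevant feature

        for _ in range(n_features_to_select - 1):
            selected = np.nonzero(support)[0]
            candidates = np.nonzero(~support)[0]
            D = relevance[candidates]              # relevance of each candidate
            # Mean redundancy of each candidate w.r.t. the selected set; the
            # difference criterion D - R is an assumption, not taken verbatim
            # from this commit.
            R = redundancy[np.ix_(candidates, selected)].mean(axis=1)
            support[candidates[np.argmax(D - R)]] = True

        return support

    # Tiny demo: feature 2 is highly relevant but redundant with feature 0,
    # so the less redundant feature 1 is chosen second.
    rel = np.array([0.9, 0.3, 0.8, 0.1])
    red = np.array([[0.0, 0.1, 0.7, 0.0],
                    [0.1, 0.0, 0.1, 0.0],
                    [0.7, 0.1, 0.0, 0.1],
                    [0.0, 0.0, 0.1, 0.0]])
    print(mrmr_select(rel, red, 2))   # -> [ True  True False False]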

sklearn/feature_selection/tests/test_multivariate_filtering.py renamed to sklearn/feature_selection/tests/test_mutual_info.py

Lines changed: 6 additions & 5 deletions
@@ -7,7 +7,7 @@
 from sklearn.utils.testing import (
     assert_array_equal, assert_almost_equal, assert_true)
 from sklearn.feature_selection import MinRedundancyMaxRelevance
-from sklearn.feature_selection.multivariate_filtering import _compute_mi
+from sklearn.feature_selection.mutual_info import _compute_mi
 
 
 class TestMIComputation(object):
@@ -108,8 +108,9 @@ def test_categorical(self):
         # (thus redundant) and x[3] is weakly informative. So the algorithm
         # should pick features 0 and 2.
 
-        m = MinRedundancyMaxRelevance(categorical_features=True,
-                                      categorical_target=True)
+        m = MinRedundancyMaxRelevance(
+            categorical_features=True, categorical_target=True,
+            n_features_to_select=2)
         m.fit(X, y)
         assert_array_equal(m.support_, np.array([True, False, True, False]))
 
@@ -136,7 +137,7 @@ def test_continuous(self):
 
         y = Z[:, 0]
         X = Z[:, 1:]
-        m = MinRedundancyMaxRelevance()
+        m = MinRedundancyMaxRelevance(n_features_to_select=2)
         m.fit(X, y)
         assert_array_equal(m.support_, np.array([True, False, True]))
 
@@ -148,7 +149,7 @@ def test_mixed(self):
         X[:, 2] = X[:, 2] > 0.5
 
         m = MinRedundancyMaxRelevance(
-            categorical_features=[False, False, True],
+            categorical_features=[False, False, True], n_features_to_select=2,
             categorical_target=True)
         m.fit(X, y)
         assert_array_equal(m.support_, np.array([True, False, True]))
