From 355954451be003e38e61f33b81262ecb0a414b11 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Wed, 23 Oct 2013 13:02:08 -0400
Subject: [PATCH 1/9] Minimum redundancy maximum relevance feature selection

---
 sklearn/feature_selection/__init__.py             |   3 +
 .../multivariate_filtering.py                     | 188 ++++++++++++++++++
 .../tests/test_multivariate_filtering.py          |  26 +++
 3 files changed, 217 insertions(+)
 create mode 100644 sklearn/feature_selection/multivariate_filtering.py
 create mode 100644 sklearn/feature_selection/tests/test_multivariate_filtering.py

diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py
index 3d27638091995..cdad116b3dc4f 100644
--- a/sklearn/feature_selection/__init__.py
+++ b/sklearn/feature_selection/__init__.py
@@ -17,10 +17,13 @@
 
 from .variance_threshold import VarianceThreshold
 
+from .multivariate_filtering import MinRedundancyMaxRelevance
+
 from .rfe import RFE
 from .rfe import RFECV
 
 __all__ = ['GenericUnivariateSelect',
+           'MinRedundancyMaxRelevance',
            'RFE',
            'RFECV',
            'SelectFdr',
diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
new file mode 100644
index 0000000000000..5196cf024dc50
--- /dev/null
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -0,0 +1,188 @@
+# Author: Andrea Bravi
+# License: 3-clause BSD
+
+import numpy as np
+from ..base import BaseEstimator
+from .base import SelectorMixin
+from ..metrics.cluster.supervised import mutual_info_score
+
+
+class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
+    """
+    Select the subset of features having minimal redundancy with each other
+    and maximal relevance (mRMR) to the outcome.
+
+    IMPORTANT: This version only supports data in categorical or integer form.
+
+    Attributes
+    ----------
+    k : int, default=2
+        Number of features to select (selected_features)
+    mask : list, len=selected_features
+        Integer list of the features ordered by maximal relevance and
+        minimal redundancy
+    score : array, shape=[selected_features]
+        mRMR score associated with each entry in mask
+    relevance : array, shape=[n_features]
+        Relevance of all the features
+    redundancy : array, shape=[n_features, n_features]
+        Pairwise redundancy of all the features
+    rule : string, default='diff'
+        Rule to combine relevance and redundancy, either
+        'diff' - difference between the two
+        'prod' - product between the two
+    X : array, shape=[n_samples, n_features]
+        Input dataset, must be either integer or categorical
+    y : array, shape=[n_samples]
+        Label vector, must be either integer or categorical
+
+    Methods
+    -------
+    _compute_mRMR(X, y)
+        Computes the minimal redundancy maximal relevance of each feature,
+        returning mask and score
+
+    References
+    ----------
+    .. [1] H. Peng, F. Long, and C. Ding, "Feature selection based on mutual
+           information: criteria of max-dependency, max-relevance, and
+           min-redundancy", IEEE Transactions on Pattern Analysis and Machine
+           Intelligence, Vol. 27, No. 8, pp. 1226-1238, 2005.
+
+    Example
+    -------
+    Consider the following:
+
+    >>>X = np.array([[1, 3, 1],
+                     [3, 3, 3],
+                     [1, 3, 1],
+                     [1, 3, 3],
+                     [1, 3, 1]])
+    >>>y = np.array([3, 1, 3, 1, 3])
+
+    X has three features: the first is partially related to y, the second is
+    totally unrelated, and the third is the opposite of y.
+
+    If we run:
+
+    >>> MinRedundancyMaxRelevance(k=1).fit_transform(X, y)
+    array([[1],
+           [3],
+           [1],
+           [3],
+           [1]])
+
+    we have just selected the most important feature (k=1) of X and reduced its
+    dimensionality.
+    """
+    def __init__(self, k=2, rule='diff'):
+        """
+        Parameters
+        ----------
+        k : int, default=2
+            Number of features to select
+        rule : string, default='diff'
+            Rule to combine relevance and redundancy, either
+            'diff' - difference between the two
+            'prod' - product between the two
+        """
+        self.k = k
+        self.rule = rule
+        self._rule_function = self._get_rule_function(rule)
+
+    def fit(self, X, y):
+        """
+        Parameters
+        ----------
+        X : array, shape=[n_samples, n_features]
+            Input dataset, must be either integer or categorical
+        y : array, shape=[n_samples]
+            Label vector, must be either integer or categorical
+        """
+        self.X = X
+        self.y = y
+        self.mask, self.score = self._compute_mRMR(X, y)
+        return self
+
+    def _get_support_mask(self):
+        """
+        Returns
+        -------
+        support : array, dtype=bool, shape=[n_features]
+            Boolean mask with True for the selected features
+        """
+
+        support = np.zeros(self.n_features, dtype=bool)
+        support[[self.mask]] = True
+        return support
+
+    def _compute_mRMR(self, X, y):
+        """
+        Parameters
+        ----------
+        X : array, shape=[n_samples, n_features]
+            Input dataset, must be either integer or categorical
+        y : array, shape=[n_samples]
+            Label vector, must be either integer or categorical
+
+        Returns
+        -------
+        mask : list, len=selected_features
+            Integer list of the features ordered by maximal relevance and
+            minimal redundancy
+        score : list, len=selected_features
+            mRMR score associated with each entry in mask
+        """
+        M = X.shape[1]  # Number of features
+        self.n_features = M
+
+        # Computation of relevance and redundancy
+        relevance = np.zeros(M)
+        redundancy = np.zeros([M, M])
+        for m1 in range(0, M):
+            relevance[m1] = mutual_info_score(X[:, m1], y)
+            for m2 in range(m1+1, M):
+                redundancy[m1, m2] = mutual_info_score(X[:, m1],
+                                                       X[:, m2])
+                redundancy[m2, m1] = redundancy[m1, m2]
+
+        self.relevance = relevance
+        self.redundancy = redundancy
+
+        # Sequential search optimization
+        mask = []
+        score = []
+        search_space = range(0, M)
+
+        score.append(max(relevance))
+        ind = int(relevance.argmax(0))  # Optimal feature
+        mask.append(ind)
+        search_space.pop(ind)
+
+        fun = self._rule_function
+        for m in range(0, self.k-1):
+            tmp_score = fun(relevance[search_space],
+                            np.mean(redundancy[:, search_space].
+                                    take(mask, axis=0), 0))
+            score.append(max(tmp_score))
+            ind = tmp_score.argmax(0)
+            mask.append(search_space[ind])
+            search_space.pop(ind)
+
+        return mask, score
+
+    def _get_rule_function(self, rule):
+        """
+        Returns
+        -------
+        fun : function
+            Function used to combine relevance (k) and redundancy (h) arrays
+        """
+        if rule == 'diff':
+            fun = lambda k, h: k-h
+        elif rule == 'prod':
+            fun = lambda k, h: k*h
+        else:
+            raise ValueError("rule should be either 'diff' or 'prod'")
+
+        return fun
diff --git a/sklearn/feature_selection/tests/test_multivariate_filtering.py b/sklearn/feature_selection/tests/test_multivariate_filtering.py
new file mode 100644
index 0000000000000..4c156eefec301
--- /dev/null
+++ b/sklearn/feature_selection/tests/test_multivariate_filtering.py
@@ -0,0 +1,26 @@
+from sklearn.utils.testing import (assert_array_equal, assert_raises)
+
+import numpy as np
+
+from sklearn.feature_selection import MinRedundancyMaxRelevance
+
+X = np.array([[1, 3, 1],
+              [3, 3, 3],
+              [1, 3, 1],
+              [1, 3, 3],
+              [1, 3, 1]])
+
+y = np.array([3, 1, 3, 1, 3])
+
+def test_mRMR():
+    """
+    Test MinRedundancyMaxRelevance with default setting.
+    """
+
+    m = MinRedundancyMaxRelevance().fit(X, y)
+
+    assert_array_equal([2, 0], m.mask)
+
+    assert_array_equal(0.6730116670092563, m.score[0])
+
+    assert_raises(ValueError, MinRedundancyMaxRelevance, rule='none')

From 9f47461254965824511fd89422b48ec775ff3e4d Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Thu, 24 Oct 2013 22:52:17 -0400
Subject: [PATCH 2/9] Corrected docstring indentation error

pep8 somehow does not identify this error, while Travis CI does
---
 sklearn/feature_selection/multivariate_filtering.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index 5196cf024dc50..a886a65d1454d 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -53,12 +53,12 @@ class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
     -------
     Consider the following:
 
-    >>>X = np.array([[1, 3, 1],
+    >>> X = np.array([[1, 3, 1],
                      [3, 3, 3],
                      [1, 3, 1],
                      [1, 3, 3],
                      [1, 3, 1]])
-    >>>y = np.array([3, 1, 3, 1, 3])
+    >>> y = np.array([3, 1, 3, 1, 3])
 
     X has three features: the first is partially related to y, the second is
     totally unrelated, and the third is the opposite of y.

From 73d55aa3da7d92519c77f5ade353d9ca1edda2cd Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Thu, 24 Oct 2013 23:28:59 -0400
Subject: [PATCH 3/9] Corrected docstring 2

---
 .../multivariate_filtering.py | 26 -------------------
 1 file changed, 26 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index a886a65d1454d..bafb2bb9a02cf 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -48,32 +48,6 @@ class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
            information: criteria of max-dependency, max-relevance, and
            min-redundancy", IEEE Transactions on Pattern Analysis and Machine
            Intelligence, Vol. 27, No. 8, pp. 1226-1238, 2005.
-
-    Example
-    -------
-    Consider the following:
-
-    >>> X = np.array([[1, 3, 1],
-                      [3, 3, 3],
-                      [1, 3, 1],
-                      [1, 3, 3],
-                      [1, 3, 1]])
-    >>> y = np.array([3, 1, 3, 1, 3])
-
-    X has three features: the first is partially related to y, the second is
-    totally unrelated, and the third is the opposite of y.
-
-    If we run:
-
-    >>> MinRedundancyMaxRelevance(k=1).fit_transform(X, y)
-    array([[1],
-           [3],
-           [1],
-           [3],
-           [1]])
-
-    we have just selected the most important feature (k=1) of X and reduced its
-    dimensionality.
     """
     def __init__(self, k=2, rule='diff'):
         """

From 14a7cf0ca6e93d82b0fed148b03905c7d9f0fe98 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Thu, 24 Oct 2013 23:58:25 -0400
Subject: [PATCH 4/9] Checking for NaNs and Inf during fit()

---
 sklearn/feature_selection/multivariate_filtering.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index bafb2bb9a02cf..6b0774c2e5181 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -5,7 +5,7 @@
 from ..base import BaseEstimator
 from .base import SelectorMixin
 from ..metrics.cluster.supervised import mutual_info_score
-
+from ..utils import safe_asarray
 
 class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
     """
@@ -73,6 +73,7 @@ def fit(self, X, y):
         y : array, shape=[n_samples]
             Label vector, must be either integer or categorical
         """
+        X = safe_asarray(X)
         self.X = X
         self.y = y
         self.mask, self.score = self._compute_mRMR(X, y)

From ec17096ce214f28813a02a4971fe66db9c981784 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Fri, 25 Oct 2013 00:34:55 -0400
Subject: [PATCH 5/9] Substituted lambda functions because of pickle problem

---
 sklearn/feature_selection/multivariate_filtering.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index 6b0774c2e5181..63518269af744 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -5,7 +5,8 @@
 from ..base import BaseEstimator
 from .base import SelectorMixin
 from ..metrics.cluster.supervised import mutual_info_score
-from ..utils import safe_asarray
+from ..utils.validation import array2d
+
 
 class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
     """
@@ -73,7 +74,8 @@ def fit(self, X, y):
         y : array, shape=[n_samples]
             Label vector, must be either integer or categorical
         """
-        X = safe_asarray(X)
+        X = array2d(X)
+
         self.X = X
         self.y = y
         self.mask, self.score = self._compute_mRMR(X, y)
@@ -154,9 +156,11 @@ def _get_rule_function(self, rule):
             Function used to combine relevance (k) and redundancy (h) arrays
         """
         if rule == 'diff':
-            fun = lambda k, h: k-h
+            def fun(a, b):
+                return a-b
         elif rule == 'prod':
-            fun = lambda k, h: k*h
+            def fun(a, b):
+                return a*b
         else:
             raise ValueError("rule should be either 'diff' or 'prod'")

From 62cf55bc1143c324e893970367a9e2c34617d886 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Fri, 25 Oct 2013 01:12:38 -0400
Subject: [PATCH 6/9] Solved pickle problem and added test

---
 .../multivariate_filtering.py            | 47 ++++++++-----------
 .../tests/test_multivariate_filtering.py |  7 ++-
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index 63518269af744..ad5dd419df229 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -63,7 +63,6 @@ def __init__(self, k=2, rule='diff'):
         """
         self.k = k
         self.rule = rule
-        self._rule_function = self._get_rule_function(rule)
 
     def fit(self, X, y):
         """
@@ -136,32 +135,26 @@ def _compute_mRMR(self, X, y):
         mask.append(ind)
         search_space.pop(ind)
 
-        fun = self._rule_function
-        for m in range(0, self.k-1):
-            tmp_score = fun(relevance[search_space],
-                            np.mean(redundancy[:, search_space].
-                                    take(mask, axis=0), 0))
-            score.append(max(tmp_score))
-            ind = tmp_score.argmax(0)
-            mask.append(search_space[ind])
-            search_space.pop(ind)
-
-        return mask, score
-
-    def _get_rule_function(self, rule):
-        """
-        Returns
-        -------
-        fun : function
-            Function used to combine relevance (k) and redundancy (h) arrays
-        """
-        if rule == 'diff':
-            def fun(a, b):
-                return a-b
-        elif rule == 'prod':
-            def fun(a, b):
-                return a*b
+        if self.rule == 'diff':
+            for m in range(0, self.k-1):
+                tmp_score = relevance[search_space] - \
+                    np.mean(redundancy[:, search_space]
+                            .take(mask, axis=0), 0)
+                score.append(max(tmp_score))
+                ind = tmp_score.argmax(0)
+                mask.append(search_space[ind])
+                search_space.pop(ind)
+
+        elif self.rule == 'prod':
+            for m in range(0, self.k-1):
+                tmp_score = relevance[search_space] * \
+                    np.mean(redundancy[:, search_space]
+                            .take(mask, axis=0), 0)
+                score.append(max(tmp_score))
+                ind = tmp_score.argmax(0)
+                mask.append(search_space[ind])
+                search_space.pop(ind)
         else:
             raise ValueError("rule should be either 'diff' or 'prod'")
 
-        return fun
+        return mask, score
diff --git a/sklearn/feature_selection/tests/test_multivariate_filtering.py b/sklearn/feature_selection/tests/test_multivariate_filtering.py
index 4c156eefec301..173f7bb68bfb4 100644
--- a/sklearn/feature_selection/tests/test_multivariate_filtering.py
+++ b/sklearn/feature_selection/tests/test_multivariate_filtering.py
@@ -12,6 +12,7 @@
 
 y = np.array([3, 1, 3, 1, 3])
 
+
 def test_mRMR():
     """
     Test MinRedundancyMaxRelevance with default setting.
@@ -23,4 +24,8 @@ def test_mRMR():
 
     assert_array_equal(0.6730116670092563, m.score[0])
 
-    assert_raises(ValueError, MinRedundancyMaxRelevance, rule='none')
+    m = MinRedundancyMaxRelevance(rule='prod').fit(X, y)
+
+    assert_array_equal(0.049793044493117354, m.score[1])
+
+    assert_raises(ValueError, MinRedundancyMaxRelevance(rule='none').fit, X, y)

From c2d18523e58a9f3f228344aef688d71552ab301c Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Tue, 29 Oct 2013 13:29:14 -0400
Subject: [PATCH 7/9] Implemented suggested corrections

---
 sklearn/feature_selection/multivariate_filtering.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
index ad5dd419df229..8c6e601ba46f8 100644
--- a/sklearn/feature_selection/multivariate_filtering.py
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -37,12 +37,6 @@ class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
     y : array, shape=[n_samples]
         Label vector, must be either integer or categorical
 
-    Methods
-    -------
-    _compute_mRMR(X, y)
-        Computes the minimal redundancy maximal relevance of each feature,
-        returning mask and score
-
     References
     ----------
     .. [1] H. Peng, F. Long, and C. Ding, "Feature selection based on mutual
@@ -89,7 +83,7 @@ def _get_support_mask(self):
         """
 
         support = np.zeros(self.n_features, dtype=bool)
-        support[[self.mask]] = True
+        support[self.mask] = True
         return support
 
     def _compute_mRMR(self, X, y):
@@ -110,7 +104,6 @@ def _compute_mRMR(self, X, y):
             mRMR score associated with each entry in mask
         """
         M = X.shape[1]  # Number of features
-        self.n_features = M
 
         # Computation of relevance and redundancy
         relevance = np.zeros(M)
@@ -122,9 +115,6 @@ def _compute_mRMR(self, X, y):
                                                        X[:, m2])
                 redundancy[m2, m1] = redundancy[m1, m2]
 
-        self.relevance = relevance
-        self.redundancy = redundancy
-
         # Sequential search optimization
         mask = []
         score = []
@@ -157,4 +147,8 @@ def _compute_mRMR(self, X, y):
         else:
             raise ValueError("rule should be either 'diff' or 'prod'")
 
+        self.n_features = M
+        self.relevance = relevance
+        self.redundancy = redundancy
+
         return mask, score

From 0f47ccab360499083da360ed634b3c93ec8bbe61 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Mon, 4 Nov 2013 16:06:10 -0500
Subject: [PATCH 8/9] Example comparing mRMR with other selection algorithms

---
 examples/plot_mRMR.py | 69 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 examples/plot_mRMR.py

diff --git a/examples/plot_mRMR.py b/examples/plot_mRMR.py
new file mode 100644
index 0000000000000..f9acf5d7d62ef
--- /dev/null
+++ b/examples/plot_mRMR.py
@@ -0,0 +1,69 @@
+"""
+===========================================
+Minimum redundancy maximum relevance (mRMR)
+===========================================
+
+Mutual information is a metric assessing the degree of statistical dependence
+between two random variables.
+
+mRMR feature selection consists of selecting a subset of the available features
+showing high mutual information with the target and low mutual information with
+each other.
+
+This example compares mRMR feature selection with Recursive feature
+elimination (RFE) and Univariate feature selection (Uni), using a synthetic
+dataset.
+
+The dataset has 100 samples and 3 features: A, B and C, which correctly
+classify 60%, 50% and 40% of the data, respectively.
+
+Let's assume the plan is to choose only 2 of those 3 features. Given that A
+and B have higher accuracy, we would expect a selection algorithm to pick
+those two. However, it turns out that A and B are redundant with each other
+(i.e. they are able to classify the same samples). Conversely, C has lower
+accuracy, but provides independent information with respect to A and B.
+
+As expected, mRMR selects features A and C, while the other two selection
+algorithms select features A and B.
+
+
+.. note::
+
+    See also :ref:`example_plot_rfe_digits.py`,
+    :ref:`example_plot_feature_selection.py`
+
+"""
+print(__doc__)
+
+import numpy as np
+from sklearn.feature_selection import RFE, SelectKBest, chi2, \
+    MinRedundancyMaxRelevance
+from sklearn.linear_model import LogisticRegression
+
+
+# Number of samples in the dataset
+N = 100
+
+# Associating a class to each sample in the dataset
+y = np.array([0] * 50 + [1] * 50)
+
+# Creating a feature able to classify 60% of the samples
+A = np.array([0] * 30 + [1] * 20 + [1] * 20 + [2] * 30)
+
+# Creating a feature able to classify 50% of the samples
+B = np.array([2] * 25 + [1] * 25 + [1] * 25 + [0] * 25)
+
+# Creating a feature able to classify 40% of the samples
+C = np.array([2] * 20 + [0] * 30 + [1] * 30 + [2] * 20)
+
+X = np.array([A, B, C]).T
+feature = ['A', 'B', 'C']
+
+# We will be using the following three selectors
+selectors = [('RFE', RFE(LogisticRegression(), 2)),
+             ('Uni', SelectKBest(chi2, k=2)),
+             ('mRMR', MinRedundancyMaxRelevance(k=2))]
+
+for name, selector in selectors:
+    k = selector.fit(X, y).get_support(True).tolist()
+    print('%s selected %s and %s' % (name, feature[k[0]], feature[k[1]]))

From 4b97111c098ba5f76c6f2cd2a2070db3ebe25688 Mon Sep 17 00:00:00 2001
From: AndreaBravi
Date: Mon, 4 Nov 2013 18:03:09 -0500
Subject: [PATCH 9/9] mRMR rst description

---
 doc/modules/feature_selection.rst | 38 +++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst
index 0f0c5aaa15781..01dd03b0dd0c0 100644
--- a/doc/modules/feature_selection.rst
+++ b/doc/modules/feature_selection.rst
@@ -247,10 +247,38 @@ features::
   * :ref:`example_ensemble_plot_forest_importances_faces.py`: example on
     face recognition data.
 
+.. _mRMR:
+
+Minimum Redundancy Maximum Relevance (mRMR)
+===========================================
+
+This filter feature selector was proposed by Peng et al. in 2005. mRMR
+identifies a subset of features having maximal mutual information with the
+target (i.e. relevance), and minimal mutual information with each other
+(i.e. redundancy).
+
+The algorithm expects discretized features. Peng et al. suggest using the
+mean and standard deviation of each feature for that purpose. For instance,
+divide a feature into three levels:
+
+    (-Inf, mean - std)
+    [mean - std, mean + std]
+    (mean + std, +Inf)
+
+:class:`MinRedundancyMaxRelevance`
+
+.. topic:: References:
+
+    * H. Peng, F. Long, C. Ding, "Feature selection based on Mutual
+      Information: Criteria of Max-Dependency, Max-Relevance, and
+      Min-Redundancy", IEEE Transactions on Pattern Analysis and Machine
+      Intelligence, Vol. 27, No. 8 (2005)
+
+
 Feature selection as part of a pipeline
 =======================================
 
-Feature selection is usually used as a pre-processing step before doing 
+Feature selection is usually used as a pre-processing step before doing
 the actual learning. The recommended way to do this in scikit-learn is
 to use a :class:`sklearn.pipeline.Pipeline`::
 
@@ -260,10 +288,10 @@ to use a :class:`sklearn.pipeline.Pipeline`::
   ])
   clf.fit(X, y)
 
-In this snippet we make use of a :class:`sklearn.svm.LinearSVC` 
+In this snippet we make use of a :class:`sklearn.svm.LinearSVC`
 to evaluate feature importances and select the most relevant features.
-Then, a class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the 
-transformed output, i.e. using only relevant features. You can perform 
+Then, a :class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the
+transformed output, i.e. using only relevant features. You can perform
 similar operations with the other feature selection methods and also
-classifiers that provide a way to evaluate feature importances of course. 
+classifiers that provide a way to evaluate feature importances of course.
 See the :class:`sklearn.pipeline.Pipeline` examples for more details.
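
Two appendix sketches for readers of this series. First, the rst section added
in patch 9 asks for discretized features but leaves the mean/std binning to the
reader. A minimal NumPy sketch of that step; the `discretize` helper below is
hypothetical and not part of any patch::

    import numpy as np

    def discretize(X):
        # Hypothetical helper: map each continuous feature to three integer
        # levels using its mean and standard deviation, as suggested in the
        # patch 9 documentation: below mean - std, within one std of the
        # mean, and above mean + std.
        mu = X.mean(axis=0)
        sigma = X.std(axis=0)
        levels = np.ones(X.shape, dtype=int)  # middle band -> 1
        levels[X < mu - sigma] = 0            # low band -> 0
        levels[X > mu + sigma] = 2            # high band -> 2
        return levels

    rng = np.random.RandomState(0)
    X_disc = discretize(rng.randn(100, 5))    # integer-coded features

Each column of the result is integer-coded, which is the only input form the
estimator supports.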
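
Second, patch 3 removed the usage example from the class docstring. Here is
roughly the same illustration, rewritten against the API as it stands at the
end of the series and reusing the toy data from the unit test. The selected
indices follow the test's assertion; treat this as a sketch rather than an
official doctest::

    import numpy as np
    from sklearn.feature_selection import MinRedundancyMaxRelevance

    X = np.array([[1, 3, 1],
                  [3, 3, 3],
                  [1, 3, 1],
                  [1, 3, 3],
                  [1, 3, 1]])
    y = np.array([3, 1, 3, 1, 3])

    # rule='diff' scores each candidate by relevance minus mean redundancy;
    # rule='prod' multiplies the two quantities instead.
    selector = MinRedundancyMaxRelevance(k=2, rule='diff').fit(X, y)
    print(selector.mask)          # [2, 0], as asserted in the unit test
    print(selector.transform(X))  # keeps only the two selected columns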