[MRG+1] Return list instead of 3d array for MultiOutputClassifier.predict_proba · scikit-learn/scikit-learn@dd2e48c · GitHub

Commit dd2e48c

pjbull authored and raghavrv committed
[MRG+1] Return list instead of 3d array for MultiOutputClassifier.predict_proba (#8095)
* Return list instead of 3d array for MultiOutputClassifier.predict_proba
* Update flake8, docstring, variable name
  - Changed `rs` to `rng` to follow convention.
  - Made sure changes were flake8 approved.
  - Add `\` to continue docstring for `predict_proba` return value.
* Sub random.choice for np.random.choice
  `np.random.choice` isn't available in Numpy 1.6, so opt for the Python version instead.
* Make test labels deterministic
* Remove hanging chad...
* Add bug fix and API change to whats new
1 parent b685494 commit dd2e48c

File tree (3 files changed: +64 −8 lines)

  doc/whats_new.rst
  sklearn/multioutput.py
  sklearn/tests/test_multioutput.py

doc/whats_new.rst (+15)

@@ -152,6 +152,12 @@ Bug fixes
      wrong values when calling ``__call__``.
      :issue:`8087` by :user:`Alexis Mignon <AlexisMignon>`
 
+   - Fix :func:`sklearn.multioutput.MultiOutputClassifier.predict_proba` to
+     return a list of 2d arrays, rather than a 3d array. In the case where
+     different target columns had different numbers of classes, a `ValueError`
+     would be raised on trying to stack matrices with different dimensions.
+     :issue:`8093` by :user:`Peter Bull <pjbull>`.
+
 API changes summary
 -------------------
 
@@ -167,6 +173,15 @@ API changes summary
      needed for the perplexity calculation. :issue:`7954` by
      :user:`Gary Foreman <garyForeman>`.
 
+   - The :func:`sklearn.multioutput.MultiOutputClassifier.predict_proba`
+     function used to return a 3d array (``n_samples``, ``n_classes``,
+     ``n_outputs``). In the case where different target columns had different
+     numbers of classes, a `ValueError` would be raised on trying to stack
+     matrices with different dimensions. This function now returns a list of
+     arrays where the length of the list is ``n_outputs``, and each array is
+     (``n_samples``, ``n_classes``) for that particular output.
+     :issue:`8093` by :user:`Peter Bull <pjbull>`.
+
 .. _changes_0_18_1:
 
 Version 0.18.1
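
A minimal sketch of the behaviour described in the two changelog entries above; the data, label values, and estimator choice here are illustrative only and are not part of the commit. With two target columns that have different numbers of classes, predict_proba now returns one (n_samples, n_classes) array per output, whereas the old 3d-array behaviour raised a ValueError when the per-output matrices were stacked.

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.multioutput import MultiOutputClassifier

    rng = np.random.RandomState(0)
    X = rng.normal(size=(20, 4))                       # toy features
    y1 = np.array(['a', 'b', 'a', 'b', 'a'] * 4)       # first output: 2 classes
    y2 = np.array(['d', 'e', 'f', 'e', 'd'] * 4)       # second output: 3 classes
    Y = np.column_stack([y1, y2])

    clf = MultiOutputClassifier(LogisticRegression()).fit(X, Y)
    proba = clf.predict_proba(X)

    # One (n_samples, n_classes) array per output; stacking these two arrays
    # into a single 3d array is impossible because their widths differ (2 vs 3).
    print(len(proba))        # 2
    print(proba[0].shape)    # (20, 2)
    print(proba[1].shape)    # (20, 3)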

sklearn/multioutput.py (+6 −4)

@@ -214,16 +214,18 @@ def predict_proba(self, X):
 
         Returns
         -------
-        T : (sparse) array-like, shape = (n_samples, n_classes, n_outputs)
-            The class probabilities of the samples for each of the outputs
+        p : array of shape = [n_samples, n_classes], or a list of n_outputs \
+            such arrays if n_outputs > 1.
+            The class probabilities of the input samples. The order of the
+            classes corresponds to that in the attribute `classes_`.
         """
         check_is_fitted(self, 'estimators_')
         if not hasattr(self.estimator, "predict_proba"):
             raise ValueError("The base estimator should implement"
                              "predict_proba method")
 
-        results = np.dstack([estimator.predict_proba(X) for estimator in
-                             self.estimators_])
+        results = [estimator.predict_proba(X) for estimator in
+                   self.estimators_]
         return results
 
     def score(self, X, y):
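
For code that indexed the old 3d return value, the migration is mechanical: proba[:, :, i] becomes proba[i]. A hedged sketch follows (toy data and variable names are illustrative, not from the commit); when every output happens to have the same number of classes, the old stacked layout can still be rebuilt explicitly with np.dstack, which is exactly what the updated test below does.

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.multioutput import MultiOutputClassifier

    X, y = make_classification(n_samples=50, n_features=6, n_informative=4,
                               n_classes=3, random_state=0)
    Y = np.column_stack([y, (y + 1) % 3])     # two outputs, 3 classes each

    clf = MultiOutputClassifier(RandomForestClassifier(random_state=0)).fit(X, Y)
    proba = clf.predict_proba(X)

    # Old access pattern:  proba[:, :, i]  (slice of the stacked 3d array)
    # New access pattern:  proba[i]        (element of the returned list)
    assert proba[1].shape == (50, 3)

    # Only when all outputs share the same number of classes can the old
    # 3d layout be recovered by stacking the list along a third axis.
    stacked = np.dstack(proba)                # shape (50, 3, 2)
    assert np.array_equal(stacked[:, :, 1], proba[1])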

sklearn/tests/test_multioutput.py (+43 −4)

@@ -10,7 +10,7 @@
 from sklearn import datasets
 from sklearn.base import clone
 from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
-from sklearn.linear_model import Lasso
+from sklearn.linear_model import Lasso, LogisticRegression
 from sklearn.svm import LinearSVC
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
@@ -118,17 +118,21 @@ def test_multi_output_classification():
     assert_equal((n_samples, n_outputs), predictions.shape)
 
     predict_proba = multi_target_forest.predict_proba(X)
-    assert_equal((n_samples, n_classes, n_outputs), predict_proba.shape)
 
-    assert_array_equal(np.argmax(predict_proba, axis=1), predictions)
+    assert len(predict_proba) == n_outputs
+    for class_probabilities in predict_proba:
+        assert_equal((n_samples, n_classes), class_probabilities.shape)
+
+    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
+                       predictions)
 
     # train the forest with each column and assert that predictions are equal
     for i in range(3):
         forest_ = clone(forest)  # create a clone with the same state
         forest_.fit(X, y[:, i])
         assert_equal(list(forest_.predict(X)), list(predictions[:, i]))
         assert_array_equal(list(forest_.predict_proba(X)),
-                           list(predict_proba[:, :, i]))
+                           list(predict_proba[i]))
 
 
 def test_multiclass_multioutput_estimator():
@@ -150,6 +154,41 @@ def test_multiclass_multioutput_estimator():
                      list(predictions[:, i]))
 
 
+def test_multiclass_multioutput_estimator_predict_proba():
+    seed = 542
+
+    # make test deterministic
+    rng = np.random.RandomState(seed)
+
+    # random features
+    X = rng.normal(size=(5, 5))
+
+    # random labels
+    y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1)  # 2 classes
+    y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1)  # 3 classes
+
+    Y = np.concatenate([y1, y2], axis=1)
+
+    clf = MultiOutputClassifier(LogisticRegression(random_state=seed))
+
+    clf.fit(X, Y)
+
+    y_result = clf.predict_proba(X)
+    y_actual = [np.array([[0.23481764, 0.76518236],
+                          [0.67196072, 0.32803928],
+                          [0.54681448, 0.45318552],
+                          [0.34883923, 0.65116077],
+                          [0.73687069, 0.26312931]]),
+                np.array([[0.5171785, 0.23878628, 0.24403522],
+                          [0.22141451, 0.64102704, 0.13755846],
+                          [0.16751315, 0.18256843, 0.64991843],
+                          [0.27357372, 0.55201592, 0.17441036],
+                          [0.65745193, 0.26062899, 0.08191907]])]
+
+    for i in range(len(y_actual)):
+        assert_almost_equal(y_result[i], y_actual[i])
+
+
 def test_multi_output_classification_sample_weights():
     # weighted classifier
     Xw = [[1, 2, 3], [4, 5, 6]]
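
One way to see where the hard-coded y_actual values in the new test come from: MultiOutputClassifier fits one cloned estimator per target column, so the expected arrays are simply the per-column predict_proba outputs of the base estimator. A rough sketch of regenerating them follows (the exact probabilities depend on the scikit-learn and NumPy versions in use):

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    seed = 542
    rng = np.random.RandomState(seed)
    X = rng.normal(size=(5, 5))                  # same features as the test above
    y1 = np.array(['b', 'a', 'a', 'b', 'a'])     # 2 classes
    y2 = np.array(['d', 'e', 'f', 'e', 'd'])     # 3 classes

    # Fit the base estimator on each column independently, mirroring what
    # MultiOutputClassifier does internally, and collect the probabilities.
    per_column = [LogisticRegression(random_state=seed).fit(X, y).predict_proba(X)
                  for y in (y1, y2)]
    print(per_column[0].shape, per_column[1].shape)   # (5, 2) (5, 3)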
