[WIP] Sparse output KNN by hamsal · Pull Request #3350 · scikit-learn/scikit-learn · GitHub

[WIP] Sparse output KNN #3350

Closed
wants to merge 35 commits into from

Commits (35)
40cb849
Included some preliminary scaffolding and testing for sparse output knn
hamsal Jul 8, 2014
7ffd392
Implemented a basic outline for predict with sparse support
hamsal Jul 9, 2014
a41a463
Saved _y as sparse matrix out of base fit if the input target data is …
hamsal Jul 9, 2014
e3afae1
Implemented a row-wise mode Cython function for CSC matrices
hamsal Jul 10, 2014
60c04b2
Completed implementation of csr_row_mode
hamsal Jul 10, 2014
7057858
Cleaned lines no longer used
hamsal Jul 12, 2014
2ed6bda
Include full path for imports, remove print in predict
hamsal Jul 16, 2014
3b1ac31
Use .tocsc instead of constructor, Eliminate zeros, format pep8
hamsal Jul 16, 2014
768a613
Format pep8
hamsal Jul 16, 2014
c14c771
Rename testing function to lower case, format pep8, correct comment in csr_row_mode
hamsal Jul 16, 2014
b07035e
Implement csr_row_mode (Reintroduction of .c)
hamsal Jul 16, 2014
6870b8d
Test kneighbors with sparse column
hamsal Jul 18, 2014
19f8434
Update comments in kneighbors predict
hamsal Jul 18, 2014
7c71da2
Make dense columns during prediction as a naive correctness implemen…
hamsal Aug 6, 2014
7c1fcdd
Use assert almost equal to replace assert equal in testing
hamsal Aug 6, 2014
2af4597
Revert to sp mode function with naive dense column
hamsal Aug 7, 2014
48efa6a
Manually extract column data, scipy 0.9 vs 0.14 give different .data…
hamsal Aug 7, 2014
c510547
Revert to naive advanced indexing by densifying column first
hamsal Aug 7, 2014
d18b579
Recompile sparsefuncs_fast with cython 0.20
hamsal Aug 7, 2014
3be5bde
Use advanced indexing support from scipy 0.13 up, this will break low…
hamsal Aug 11, 2014
99c5a31
Recreate scipy fancy indexing with numpy operations
hamsal Aug 12, 2014
e93896c
Separate the dense and sparse case in predict
hamsal Aug 13, 2014
bb1c0ae
Delete print statements and outdated comments
hamsal Aug 13, 2014
043ed2e
Manage corner case with empty column of labels in predict
hamsal Aug 13, 2014
844933e
Implement weighted mode in csr_mode
hamsal Aug 13, 2014
e82770e
Support row-wise weights mode in csr_row_mode
hamsal Aug 13, 2014
72f5cdd
Densify one column at a time to do slicing, revert changes to sparsef…
hamsal Aug 14, 2014
e61368f
Revert changes to sparsefuncs_fast (corrected)
hamsal Aug 14, 2014
cb04d96
Simplify nnz statement and the extraction of the non-zero modes
hamsal Aug 15, 2014
4d5d100
Revise imports, use np.unique to accomplish sparse case in base fit
hamsal Aug 19, 2014
d8deae5
Use array instead of np.array for data in base fit
hamsal Aug 19, 2014
ccae2ae
Remove redundant asserts
hamsal Aug 19, 2014
6b16ee7
Sample predict data from classes, Maintain dtype during prediction
hamsal Aug 20, 2014
2ba9f3f
Cover both target encoding cases in the test, First case classes_ lin…
hamsal Aug 20, 2014
e640807
Comment on target data construction for test, ensure integer index
hamsal Aug 21, 2014
43 changes: 35 additions & 8 deletions sklearn/neighbors/base.py
@@ -9,7 +9,9 @@
 import warnings
 from abc import ABCMeta, abstractmethod
 
+import array
 import numpy as np
+import scipy.sparse as sp
 from scipy.sparse import csr_matrix, issparse
 
 from .ball_tree import BallTree
@@ -615,15 +617,40 @@ def fit(self, X, y):
         else:
             self.outputs_2d_ = True
 
-        self.classes_ = []
-        self._y = np.empty(y.shape, dtype=np.int)
-        for k in range(self._y.shape[1]):
-            classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True)
-            self.classes_.append(classes)
+        self.sparse_target_input_ = sp.issparse(y)
 
-        if not self.outputs_2d_:
-            self.classes_ = self.classes_[0]
-            self._y = self._y.ravel()
+        if not sp.issparse(y):
Member:
This logic should probably be moved to LabelEncoding at some point; it currently does not handle multioutput, nor sparse (but the latter had only been used for binary targets until now). The sparse implementation is not explicitly tested, and some of its conditions are only being tested because of the random number generation happening to produce entirely dense and non-dense columns.

+            self.classes_ = []
+            self._y = np.empty(y.shape, dtype=np.int)
+            for k in range(self._y.shape[1]):
+                classes, self._y[:, k] = np.unique(y[:, k],
+                                                   return_inverse=True)
+                self.classes_.append(classes)
+
+            if not self.outputs_2d_:
+                self.classes_ = self.classes_[0]
+                self._y = self._y.ravel()
+        else:
+            y = y.tocsc()
+            y.eliminate_zeros()
+            nnz = np.diff(y.indptr)
+            data = array.array('i')
+            self.classes_ = []
+
+            for k in range(y.shape[1]):
+                k_col_data = y.data[y.indptr[k]:y.indptr[k + 1]]
+                classes, data_k = np.unique(k_col_data, return_inverse=True)
+
+                if not nnz[k] == y.shape[0]:
+                    classes = np.insert(classes, 0, 0)
+                    data_k += 1
+                self.classes_.append(classes)
+                data.extend(data_k)
+
+            _y = sp.csc_matrix((data, y.indices, y.indptr), shape=y.shape,
+                               dtype=int)
+
+            self._y = _y
 
         return self._fit(X)

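The sparse branch above encodes each CSC column of y independently: np.unique maps the column's stored values to integer codes, and whenever a column has implicit zeros (nnz[k] < n_samples) a zero class is prepended and the codes are shifted by one, reserving code 0 for the implicit zeros so the sparse structure survives the encoding. A minimal standalone sketch of that scheme on toy data (the matrix here is an assumption for illustration, not part of the diff):

import array

import numpy as np
import scipy.sparse as sp

# Toy multioutput target: three samples, three output columns.
y = sp.csc_matrix(np.array([[0., 2., 0.],
                            [3., 2., 0.],
                            [0., 1., 5.]]))
y.eliminate_zeros()
nnz = np.diff(y.indptr)           # stored entries per column
data = array.array('i')
classes_ = []

for k in range(y.shape[1]):
    col = y.data[y.indptr[k]:y.indptr[k + 1]]
    classes, codes = np.unique(col, return_inverse=True)
    if nnz[k] != y.shape[0]:      # column has implicit zeros
        classes = np.insert(classes, 0, 0)
        codes += 1                # reserve code 0 for the implicit zeros
    classes_.append(classes)
    data.extend(codes)

encoded = sp.csc_matrix((data, y.indices, y.indptr), shape=y.shape, dtype=int)
print([c.tolist() for c in classes_])  # [[0.0, 3.0], [1.0, 2.0], [0.0, 5.0]]
print(encoded.toarray())               # [[0 1 0], [1 1 0], [0 0 1]]

Note how a fully dense column (the middle one) keeps its codes unshifted, which is exactly why predict must decode through the per-column classes_ rather than reading codes directly.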
63 changes: 48 additions & 15 deletions sklearn/neighbors/classification.py
@@ -7,15 +7,20 @@
 # Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
 #
 # License: BSD 3 clause (C) INRIA, University of Amsterdam
 
+import array
 import numpy as np
+import scipy.sparse as sp
 
 from scipy import stats
 from ..utils.extmath import weighted_mode
 
-from .base import \
-    _check_weights, _get_weights, \
-    NeighborsBase, KNeighborsMixin,\
-    RadiusNeighborsMixin, SupervisedIntegerMixin
+from .base import _check_weights
+from .base import _get_weights
+from .base import NeighborsBase
+from .base import KNeighborsMixin
+from .base import RadiusNeighborsMixin
+from .base import SupervisedIntegerMixin
+
 from ..base import ClassifierMixin
 from ..utils import check_array
@@ -146,18 +151,42 @@ def predict(self, X):
         n_samples = X.shape[0]
         weights = _get_weights(neigh_dist, self.weights)
 
-        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
-        for k, classes_k in enumerate(classes_):
-            if weights is None:
-                mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
-            else:
-                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
+        if not self.sparse_target_input_:
+            y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
+            for k, classes_k in enumerate(classes_):
+                if weights is None:
+                    mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
+                else:
+                    mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
 
-            mode = np.asarray(mode.ravel(), dtype=np.intp)
-            y_pred[:, k] = classes_k.take(mode)
+                mode = np.asarray(mode.ravel(), dtype=np.intp)
+                y_pred[:, k] = classes_k.take(mode)
 
-        if not self.outputs_2d_:
-            y_pred = y_pred.ravel()
+            if not self.outputs_2d_:
+                y_pred = y_pred.ravel()
+
+        else:
+
+            data = []
+            indices = array.array('i')
+            indptr = array.array('i', [0])
+
+            for k, classes_k in enumerate(classes_):
+                neigh_lbls_k = _y.getcol(k).toarray().ravel()[neigh_ind]
+                neigh_lbls_k = classes_k[neigh_lbls_k]
+
+                if weights is None:
+                    mode, _ = stats.mode(neigh_lbls_k, axis=1)
+                else:
+                    mode, _ = weighted_mode(neigh_lbls_k, weights, axis=1)
+
+                data.extend(mode[mode != 0])
+                indices.extend(np.where(mode != 0)[0])
+                indptr.append(len(indices))
+
+            y_pred = sp.csc_matrix((data, indices, indptr),
Member:
You seem to be missing a classes_.take step. Please add a test where class numbers are not contiguous (or else explicitly reject such data).

I am also not sure that you should be requiring the data to be intps, rather than classes_[0].dtype as in the dense case (with the caveat that only a dtype that is supported for the sparse input will be produced as output). I could imagine a boolean array input, but otherwise this might merely be moot.

Contributor Author:
I have corrected this; the test was changed so that it fails if the sampling from classes is not used. The dtype is now also maintained when predicting, and there is an assert for this in the test.

+                                   (n_samples, n_outputs),
+                                   dtype=classes_[0].dtype)
 
         return y_pred
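
To make the assembly in the sparse branch concrete: for each output column the neighbors' encoded labels are decoded through classes_k, the row-wise (optionally weighted) mode is taken, and only non-zero modes are stored, so any row whose predicted class is zero stays an implicit zero of the CSC result. A toy sketch for a single output column (the label values are assumptions for illustration, not the PR's data):

import array

import numpy as np
import scipy.sparse as sp
from scipy import stats

# Decoded labels of each query row's neighbors (3 rows, 2 neighbors each).
neigh_lbls_k = np.array([[0., 0.],    # mode 0 -> stays an implicit zero
                         [3., 3.],    # mode 3 -> explicitly stored
                         [3., 0.]])   # tie -> stats.mode picks the smaller, 0
mode, _ = stats.mode(neigh_lbls_k, axis=1)

data = []
indices = array.array('i')
indptr = array.array('i', [0])

data.extend(mode[mode != 0])            # keep only the non-zero modes
indices.extend(np.where(mode != 0)[0])  # their row indices
indptr.append(len(indices))             # close this output's column

y_pred = sp.csc_matrix((data, indices, indptr), (3, 1))
print(y_pred.toarray().ravel())         # [0. 3. 0.]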

@@ -182,6 +211,10 @@ def predict_proba(self, X):
 
         classes_ = self.classes_
         _y = self._y
+
+        if self.sparse_target_input_:
+            _y = _y.toarray()
+
         if not self.outputs_2d_:
             _y = self._y.reshape((-1, 1))
             classes_ = [self.classes_]
53 changes: 52 additions & 1 deletion sklearn/neighbors/tests/test_neighbors.py
@@ -209,7 +209,6 @@ def test_kneighbors_classifier_predict_proba():
     assert_array_almost_equal(real_prob, y_prob)
 
 
-
 def test_radius_neighbors_classifier(n_samples=40,
                                      n_features=5,
                                      n_test_pts=10,
@@ -849,6 +848,58 @@ def test_callable_metric():
     assert_array_almost_equal(dist1, dist2)
 
 
+def test_kneighbors_classifier_sparse_target_multioutput():
+    """Test k-NN classifier on multioutput data with sparse target data"""
+    rng = check_random_state(0)
+    n_features = 5
+    n_samples = 50
+    n_output = 4
+
+    X = rng.rand(n_samples, n_features)
+
+    # Construct target data so that we cover two label encoding cases
+    # case 1: classes are not a 0 to n sequence
+    y_fst = rng.randint(1, 4, (n_samples, n_output // 2)).astype(float)
+    # case 2: classes line up with their integer encoding
+    y_snd = rng.randint(0, 3, (n_samples, n_output // 2)).astype(float)
+    y = np.hstack((y_fst, y_snd))
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+    y_train = csc_matrix(y_train)
+
+    weights = [None, 'uniform', 'distance', _weight_func]
+
+    for algorithm, weights in product(ALGORITHMS, weights):
+        # Stack single output prediction
+        y_pred_so = []
+        y_pred_proba_so = []
+        for o in range(n_output):
+            knn = neighbors.KNeighborsClassifier(weights=weights,
+                                                 algorithm=algorithm)
+            knn.fit(X_train, y_train.getcol(o).toarray().ravel())
+            y_pred_so.append(knn.predict(X_test))
+            y_pred_proba_so.append(knn.predict_proba(X_test))
+
+        y_pred_so = np.vstack(y_pred_so).T
+        assert_equal(y_pred_so.shape, y_test.shape)
+        assert_equal(len(y_pred_proba_so), n_output)
+
+        # Multioutput prediction
+        knn_mo = neighbors.KNeighborsClassifier(weights=weights,
+                                                algorithm=algorithm)
+        knn_mo.fit(X_train, y_train)
+        y_pred_mo = knn_mo.predict(X_test)
+
+        assert_equal(y_pred_mo.dtype, float)
+        assert_array_equal(y_pred_mo.toarray(), y_pred_so)
+
+        # Check proba
+        y_pred_proba_mo = knn_mo.predict_proba(X_test)
+        assert_equal(len(y_pred_proba_mo), n_output)
+
+        for proba_mo, proba_so in zip(y_pred_proba_mo, y_pred_proba_so):
+            assert_array_almost_equal(proba_mo, proba_so)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
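
For reference, a usage sketch of the feature under review (this assumes the PR branch is installed; released scikit-learn rejects a sparse y in KNeighborsClassifier):

import numpy as np
from scipy.sparse import csc_matrix
from sklearn import neighbors

rng = np.random.RandomState(0)
X = rng.rand(20, 5)
# Two-output target with many zeros, stored as CSC as fit expects.
y = csc_matrix(rng.binomial(1, 0.3, (20, 2)) * 3.)

knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)                # _y is kept as a sparse CSC matrix internally
y_pred = knn.predict(X[:5])  # a sparse CSC matrix on this branch
print(y_pred.toarray())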