[WIP] Sparse output KNN by hamsal · Pull Request #3350 · scikit-learn/scikit-learn · GitHub

[WIP] Sparse output KNN #3350

Closed
wants to merge 35 commits into from

Commits (35)
40cb849
Included some preliminary scaffolding and testing for sparse output knn
hamsal Jul 8, 2014
7ffd392
Implemented a basic outline for predict with sparse support
hamsal Jul 9, 2014
a41a463
Saved _y as sparse matrix out of base fit if the input target data is …
hamsal Jul 9, 2014
e3afae1
Implemented a row-wise mode Cython function for CSC matrices
hamsal Jul 10, 2014
60c04b2
Completed implementation of csr_row_mode
hamsal Jul 10, 2014
7057858
Cleaned lines no longer used
hamsal Jul 12, 2014
2ed6bda
Include full path for imports, remove print in predict
hamsal Jul 16, 2014
3b1ac31
Use .tocsc instead of constructor, Eliminate zeros, format pep8
hamsal Jul 16, 2014
768a613
Format pep8
hamsal Jul 16, 2014
c14c771
Rename testing function to lower case, format pep8, correct comment in csr_row_mode
hamsal Jul 16, 2014
b07035e
Implement csr_row_mode (Reintroduction of .c)
hamsal Jul 16, 2014
6870b8d
Test kneighbors with sparse column
hamsal Jul 18, 2014
19f8434
Update comments in kneighbors predict
hamsal Jul 18, 2014
7c71da2
Make dense columns during prediction as a naive correctness implemen…
hamsal Aug 6, 2014
7c1fcdd
Use assert almost equal to replace assert equal in testing
hamsal Aug 6, 2014
2af4597
Revert to sp mode function with naive dense column
hamsal Aug 7, 2014
48efa6a
Manually extract column data, scipy 0.9 vs 0.14 give different .data…
hamsal Aug 7, 2014
c510547
Revert to naive advanced indexing by densifying column first
hamsal Aug 7, 2014
d18b579
Recompile sparsefuncs_fast with cython 0.20
hamsal Aug 7, 2014
3be5bde
Use advanced indexing support from scipy 0.13 up, this will break low…
hamsal Aug 11, 2014
99c5a31
Recreate scipy fancy indexing with numpy operations
hamsal Aug 12, 2014
e93896c
Separate the dense and sparse case in predict
hamsal Aug 13, 2014
bb1c0ae
Delete print statements and outdated comments
hamsal Aug 13, 2014
043ed2e
Manage corner case with empty column of labels in predict
hamsal Aug 13, 2014
844933e
Implement weighted mode in csr_mode
hamsal Aug 13, 2014
e82770e
Support row-wise weights mode in csr_row_mode
hamsal Aug 13, 2014
72f5cdd
Densify one column at a time to do slicing, revert changes to sparsef…
hamsal Aug 14, 2014
e61368f
Revert changes to sparsefuncs_fast (corrected)
hamsal Aug 14, 2014
cb04d96
Simplify nnz statement and the extraction of the non-zero modes
hamsal Aug 15, 2014
4d5d100
Revise imports, use np.unique to accomplish sparse case in base fit
hamsal Aug 19, 2014
d8deae5
Use array instead of np.array for data in base fit
hamsal Aug 19, 2014
ccae2ae
Remove redundant asserts
hamsal Aug 19, 2014
6b16ee7
Sample predict data from classes, Maintain dtype during prediction
hamsal Aug 20, 2014
2ba9f3f
Cover both target encoding cases in the test, First case classes_ lin…
hamsal Aug 20, 2014
e640807
Comment on target data construction for test, ensure integer index
hamsal Aug 21, 2014
43 changes: 35 additions & 8 deletions sklearn/neighbors/base.py
@@ -9,7 +9,9 @@
 import warnings
 from abc import ABCMeta, abstractmethod
 
+import array
 import numpy as np
+import scipy.sparse as sp
 from scipy.sparse import csr_matrix, issparse
 
 from .ball_tree import BallTree
@@ -615,15 +617,40 @@ def fit(self, X, y):
         else:
             self.outputs_2d_ = True
 
-        self.classes_ = []
-        self._y = np.empty(y.shape, dtype=np.int)
-        for k in range(self._y.shape[1]):
-            classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True)
-            self.classes_.append(classes)
+        self.sparse_target_input_ = sp.issparse(y)
 
-        if not self.outputs_2d_:
-            self.classes_ = self.classes_[0]
-            self._y = self._y.ravel()
+        if not sp.issparse(y):
Member:
This logic should probably be moved to LabelEncoding at some point; it currently does not handle multioutput, nor sparse (but the latter had only been used for binary targets until now). The sparse implementation is not explicitly tested, and some of its conditions are only being tested because of the random number generation happening to produce entirely dense and non-dense columns.

+            self.classes_ = []
+            self._y = np.empty(y.shape, dtype=np.int)
+            for k in range(self._y.shape[1]):
+                classes, self._y[:, k] = np.unique(y[:, k],
+                                                   return_inverse=True)
+                self.classes_.append(classes)
+
+            if not self.outputs_2d_:
+                self.classes_ = self.classes_[0]
+                self._y = self._y.ravel()
+        else:
+            y = y.tocsc()
+            y.eliminate_zeros()
+            nnz = np.diff(y.indptr)
+            data = array.array('i')
+            self.classes_ = []
+
+            for k in range(y.shape[1]):
+                k_col_data = y.data[y.indptr[k]:y.indptr[k + 1]]
+                classes, data_k = np.unique(k_col_data, return_inverse=True)
+
+                if not nnz[k] == y.shape[0]:
+                    classes = np.insert(classes, 0, 0)
+                    data_k += 1
+                self.classes_.append(classes)
+                data.extend(data_k)
+
+            _y = sp.csc_matrix((data, y.indices, y.indptr), shape=y.shape,
+                               dtype=int)
+
+            self._y = _y
 
         return self._fit(X)

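The sparse branch above encodes each CSC column of y independently: np.unique maps the column's stored values to integer codes, and whenever a column has implicit zeros (nnz[k] < n_samples) a zero class is prepended and the codes are shifted by one, reserving code 0 for the implicit zeros so the sparse structure survives the encoding. A minimal standalone sketch of that scheme on toy data (the matrix here is an assumption for illustration, not part of the diff):

import array

import numpy as np
import scipy.sparse as sp

# Toy multioutput target: three samples, three output columns.
y = sp.csc_matrix(np.array([[0., 2., 0.],
                            [3., 2., 0.],
                            [0., 1., 5.]]))
y.eliminate_zeros()
nnz = np.diff(y.indptr)           # stored entries per column
data = array.array('i')
classes_ = []

for k in range(y.shape[1]):
    col = y.data[y.indptr[k]:y.indptr[k + 1]]
    classes, codes = np.unique(col, return_inverse=True)
    if nnz[k] != y.shape[0]:      # column has implicit zeros
        classes = np.insert(classes, 0, 0)
        codes += 1                # reserve code 0 for the implicit zeros
    classes_.append(classes)
    data.extend(codes)

encoded = sp.csc_matrix((data, y.indices, y.indptr), shape=y.shape, dtype=int)
print([c.tolist() for c in classes_])  # [[0.0, 3.0], [1.0, 2.0], [0.0, 5.0]]
print(encoded.toarray())               # [[0 1 0], [1 1 0], [0 0 1]]

Note how a fully dense column (the middle one) keeps its codes unshifted, which is exactly why predict must decode through the per-column classes_ rather than reading codes directly.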
63 changes: 48 additions & 15 deletions sklearn/neighbors/classification.py
@@ -7,15 +7,20 @@
 # Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
 #
 # License: BSD 3 clause (C) INRIA, University of Amsterdam
 
+import array
 import numpy as np
+import scipy.sparse as sp
 
 from scipy import stats
 from ..utils.extmath import weighted_mode
 
-from .base import \
-    _check_weights, _get_weights, \
-    NeighborsBase, KNeighborsMixin,\
-    RadiusNeighborsMixin, SupervisedIntegerMixin
+from .base import _check_weights
+from .base import _get_weights
+from .base import NeighborsBase
+from .base import KNeighborsMixin
+from .base import RadiusNeighborsMixin
+from .base import SupervisedIntegerMixin
+
 from ..base import ClassifierMixin
 from ..utils import check_array
@@ -146,18 +151,42 @@ def predict(self, X):
         n_samples = X.shape[0]
         weights = _get_weights(neigh_dist, self.weights)
 
-        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
-        for k, classes_k in enumerate(classes_):
-            if weights is None:
-                mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
-            else:
-                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
+        if not self.sparse_target_input_:
+            y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
+            for k, classes_k in enumerate(classes_):
+                if weights is None:
+                    mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
+                else:
+                    mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
 
-            mode = np.asarray(mode.ravel(), dtype=np.intp)
-            y_pred[:, k] = classes_k.take(mode)
+                mode = np.asarray(mode.ravel(), dtype=np.intp)
+                y_pred[:, k] = classes_k.take(mode)
 
-        if not self.outputs_2d_:
-            y_pred = y_pred.ravel()
+            if not self.outputs_2d_:
+                y_pred = y_pred.ravel()
+
+        else:
+
+            data = []
+            indices = array.array('i')
+            indptr = array.array('i', [0])
+
+            for k, classes_k in enumerate(classes_):
+                neigh_lbls_k = _y.getcol(k).toarray().ravel()[neigh_ind]
+                neigh_lbls_k = classes_k[neigh_lbls_k]
+
+                if weights is None:
+                    mode, _ = stats.mode(neigh_lbls_k, axis=1)
+                else:
+                    mode, _ = weighted_mode(neigh_lbls_k, weights, axis=1)
+
+                data.extend(mode[mode != 0])
+                indices.extend(np.where(mode != 0)[0])
+                indptr.append(len(indices))
+
+            y_pred = sp.csc_matrix((data, indices, indptr),
Member:
You seem to be missing a classes_.take step. Please add a test where class numbers are not contiguous (or else explicitly reject such data).

I am also not sure that you should be requiring the data to be intps, rather than classes_[0].dtype as in the dense case (with the caveat that only a dtype that is supported for the sparse input will be produced as output). I could imagine a boolean array input, but otherwise this might merely be moot.

Contributor Author:
I have corrected this; the test was changed so that it fails if the sampling from classes is not used. The dtype is now also maintained when predicting, and there is an assert for this in the test.

+                                   (n_samples, n_outputs),
+                                   dtype=classes_[0].dtype)
 
         return y_pred
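
To make the assembly in the sparse branch concrete: for each output column the neighbors' encoded labels are decoded through classes_k, the row-wise (optionally weighted) mode is taken, and only non-zero modes are stored, so any row whose predicted class is zero stays an implicit zero of the CSC result. A toy sketch for a single output column (the label values are assumptions for illustration, not the PR's data):

import array

import numpy as np
import scipy.sparse as sp
from scipy import stats

# Decoded labels of each query row's neighbors (3 rows, 2 neighbors each).
neigh_lbls_k = np.array([[0., 0.],    # mode 0 -> stays an implicit zero
                         [3., 3.],    # mode 3 -> explicitly stored
                         [3., 0.]])   # tie -> stats.mode picks the smaller, 0
mode, _ = stats.mode(neigh_lbls_k, axis=1)

data = []
indices = array.array('i')
indptr = array.array('i', [0])

data.extend(mode[mode != 0])            # keep only the non-zero modes
indices.extend(np.where(mode != 0)[0])  # their row indices
indptr.append(len(indices))             # close this output's column

y_pred = sp.csc_matrix((data, indices, indptr), (3, 1))
print(y_pred.toarray().ravel())         # [0. 3. 0.]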

@@ -182,6 +211,10 @@ def predict_proba(self, X):
 
         classes_ = self.classes_
         _y = self._y
+
+        if self.sparse_target_input_:
+            _y = _y.toarray()
+
         if not self.outputs_2d_:
             _y = self._y.reshape((-1, 1))
             classes_ = [self.classes_]
53 changes: 52 additions & 1 deletion sklearn/neighbors/tests/test_neighbors.py
@@ -209,7 +209,6 @@ def test_kneighbors_classifier_predict_proba():
     assert_array_almost_equal(real_prob, y_prob)
 
 
-
 def test_radius_neighbors_classifier(n_samples=40,
                                      n_features=5,
                                      n_test_pts=10,
@@ -849,6 +848,58 @@ def test_callable_metric():
     assert_array_almost_equal(dist1, dist2)
 
 
+def test_kneighbors_classifier_sparse_target_multioutput():
+    """Test k-NN classifier on multioutput data with sparse target data"""
+    rng = check_random_state(0)
+    n_features = 5
+    n_samples = 50
+    n_output = 4
+
+    X = rng.rand(n_samples, n_features)
+
+    # Construct target data so that we cover two label encoding cases
+    # case 1: classes are not a 0 to n sequence
+    y_fst = rng.randint(1, 4, (n_samples, n_output // 2)).astype(float)
+    # case 2: classes line up with their integer encoding
+    y_snd = rng.randint(0, 3, (n_samples, n_output // 2)).astype(float)
+    y = np.hstack((y_fst, y_snd))
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+    y_train = csc_matrix(y_train)
+
+    weights = [None, 'uniform', 'distance', _weight_func]
+
+    for algorithm, weights in product(ALGORITHMS, weights):
+        # Stack single output prediction
+        y_pred_so = []
+        y_pred_proba_so = []
+        for o in range(n_output):
+            knn = neighbors.KNeighborsClassifier(weights=weights,
+                                                 algorithm=algorithm)
+            knn.fit(X_train, y_train.getcol(o).toarray().ravel())
+            y_pred_so.append(knn.predict(X_test))
+            y_pred_proba_so.append(knn.predict_proba(X_test))
+
+        y_pred_so = np.vstack(y_pred_so).T
+        assert_equal(y_pred_so.shape, y_test.shape)
+        assert_equal(len(y_pred_proba_so), n_output)
+
+        # Multioutput prediction
+        knn_mo = neighbors.KNeighborsClassifier(weights=weights,
+                                                algorithm=algorithm)
+        knn_mo.fit(X_train, y_train)
+        y_pred_mo = knn_mo.predict(X_test)
+
+        assert_equal(y_pred_mo.dtype, float)
+        assert_array_equal(y_pred_mo.toarray(), y_pred_so)
+
+        # Check proba
+        y_pred_proba_mo = knn_mo.predict_proba(X_test)
+        assert_equal(len(y_pred_proba_mo), n_output)
+
+        for proba_mo, proba_so in zip(y_pred_proba_mo, y_pred_proba_so):
+            assert_array_almost_equal(proba_mo, proba_so)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
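
For reference, a usage sketch of the feature under review (this assumes the PR branch is installed; released scikit-learn rejects a sparse y in KNeighborsClassifier):

import numpy as np
from scipy.sparse import csc_matrix
from sklearn import neighbors

rng = np.random.RandomState(0)
X = rng.rand(20, 5)
# Two-output target with many zeros, stored as CSC as fit expects.
y = csc_matrix(rng.binomial(1, 0.3, (20, 2)) * 3.)

knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)                # _y is kept as a sparse CSC matrix internally
y_pred = knn.predict(X[:5])  # a sparse CSC matrix on this branch
print(y_pred.toarray())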