From 236c766fba06aa7d160e957e925c83666a17d459 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 21 Aug 2017 11:21:15 -0500 Subject: [PATCH 01/48] add predict_proba method for RadiusNeighborsClassifier --- sklearn/neighbors/classification.py | 59 +++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index fb0dc8ad15e3f..9d331fa7a2aff 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -388,3 +388,62 @@ def predict(self, X): y_pred = y_pred.ravel() return y_pred + + def predict_proba(self, X): + """Return probability estimates for the test data X. + + Parameters + ---------- + X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + Test samples. + + Returns + ------- + p : array of shape = [n_samples, n_classes], or a list of n_outputs + of such arrays if n_outputs > 1. + The class probabilities of the input samples. Classes are ordered + by lexicographic order. + Outliers will be assign 0s in all class probabilities. 
+ """ + + X = check_array(X, accept_sparse='csr') + n_samples = X.shape[0] + + neigh_dist, neigh_ind = self.radius_neighbors(X) + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + + weights = _get_weights(neigh_dist, self.weights) + + probabilities = [] + for k, classes_k in enumerate(classes_): + pred_labels = np.zeros(len(neigh_ind), dtype=object) + pred_labels[:] = [_y[ind, k] for ind in neigh_ind] + + proba_k = np.zeros((n_samples, classes_k.size)) + + #samples have different size of neighbors within the same radius + if weights is None: + for i, idx in enumerate(pred_labels): # loop is O(n_samples) + proba_k[i,:] += np.bincount(idx, minlength = classes_k.size) + else: + for i, idx in enumerate(pred_labels): # loop is O(n_samples) + proba_k[i,:] += np.bincount(idx, weights[i], + minlength = classes_k.size) + + # normalize 'votes' into real [0,1] probabilities + normalizer = proba_k.sum(axis=1)[:, np.newaxis] + normalizer[normalizer == 0.0] = 1.0 + proba_k /= normalizer + + probabilities.append(proba_k) + + if not self.outputs_2d_: + probabilities = probabilities[0] + + return probabilities \ No newline at end of file From 67a59cd2e262b9f9efd9d9e8a2954a2bda4ad760 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 21 Aug 2017 12:02:25 -0500 Subject: [PATCH 02/48] add warning --- sklearn/neighbors/classification.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 9d331fa7a2aff..38ee93254a208 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -8,6 +8,7 @@ # # License: BSD 3 clause (C) INRIA, University of Amsterdam +import warnings import numpy as np from scipy import stats from ..utils.extmath import weighted_mode @@ -411,6 +412,13 @@ def predict_proba(self, X): n_samples = X.shape[0] neigh_dist, neigh_ind = self.radius_neighbors(X) + + outliers = 
[i for i, nind in enumerate(neigh_ind) if len(nind) == 0] + if len(outliers) > 0: + warnings.warn('No neighbors found for test samples %r, ' + 'their probabilities will be assgined with 0.' + % outliers) + classes_ = self.classes_ _y = self._y if not self.outputs_2d_: From b69eef16b7198599987e04102721a836280a1c54 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 21 Aug 2017 13:10:24 -0500 Subject: [PATCH 03/48] DOC Add predict_proba in class description --- sklearn/neighbors/classification.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 38ee93254a208..dc364a399cf53 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -283,7 +283,9 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, outlier_label : int, optional (default = None) Label, which is given for outlier samples (samples with no neighbors on given radius). - If set to None, ValueError is raised, when outlier is detected. + If set to None and outlier is detected, ValueError is raised when + function predict(X) is called, UserWarning is raised when function + preduct_proba(X) is called. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. @@ -298,7 +300,9 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, RadiusNeighborsClassifier(...) 
>>> print(neigh.predict([[1.5]])) [0] - + >>> print(neigh.predict_proba([[1.0]])) + [[ 0.66666667 0.33333333]] + See also -------- KNeighborsClassifier From 9ac27e10c25638fcbc66320a1a125724f1fc55c8 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 21 Aug 2017 14:38:17 -0500 Subject: [PATCH 04/48] Finish formats --- sklearn/neighbors/classification.py | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index dc364a399cf53..1f272ccc0721f 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -283,7 +283,7 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, outlier_label : int, optional (default = None) Label, which is given for outlier samples (samples with no neighbors on given radius). - If set to None and outlier is detected, ValueError is raised when + If set to None and outlier is detected, ValueError is raised when function predict(X) is called, UserWarning is raised when function preduct_proba(X) is called. @@ -302,7 +302,7 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, [0] >>> print(neigh.predict_proba([[1.0]])) [[ 0.66666667 0.33333333]] - + See also -------- KNeighborsClassifier @@ -393,7 +393,7 @@ def predict(self, X): y_pred = y_pred.ravel() return y_pred - + def predict_proba(self, X): """Return probability estimates for the test data X. @@ -411,42 +411,42 @@ def predict_proba(self, X): by lexicographic order. Outliers will be assign 0s in all class probabilities. """ - + X = check_array(X, accept_sparse='csr') n_samples = X.shape[0] neigh_dist, neigh_ind = self.radius_neighbors(X) - + outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0] if len(outliers) > 0: warnings.warn('No neighbors found for test samples %r, ' - 'their probabilities will be assgined with 0.' - % outliers) - + 'their probabilities will be assgined with 0.' 
+ % outliers) + classes_ = self.classes_ _y = self._y if not self.outputs_2d_: _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] - weights = _get_weights(neigh_dist, self.weights) - + probabilities = [] for k, classes_k in enumerate(classes_): pred_labels = np.zeros(len(neigh_ind), dtype=object) pred_labels[:] = [_y[ind, k] for ind in neigh_ind] - + proba_k = np.zeros((n_samples, classes_k.size)) - #samples have different size of neighbors within the same radius + # samples have different size of neighbors within the same radius if weights is None: for i, idx in enumerate(pred_labels): # loop is O(n_samples) - proba_k[i,:] += np.bincount(idx, minlength = classes_k.size) + proba_k[i, :] += np.bincount(idx, + minlength=classes_k.size) else: for i, idx in enumerate(pred_labels): # loop is O(n_samples) - proba_k[i,:] += np.bincount(idx, weights[i], - minlength = classes_k.size) + proba_k[i, :] += np.bincount(idx, weights[i], + minlength=classes_k.size) # normalize 'votes' into real [0,1] probabilities normalizer = proba_k.sum(axis=1)[:, np.newaxis] @@ -458,4 +458,4 @@ def predict_proba(self, X): if not self.outputs_2d_: probabilities = probabilities[0] - return probabilities \ No newline at end of file + return probabilities From 9b46e2a914231771c016f7b728897b97a5719eeb Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 21 Aug 2017 18:10:27 -0500 Subject: [PATCH 05/48] Finish formats --- sklearn/neighbors/classification.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 1f272ccc0721f..ba049b6755eb8 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -442,11 +442,12 @@ def predict_proba(self, X): if weights is None: for i, idx in enumerate(pred_labels): # loop is O(n_samples) proba_k[i, :] += np.bincount(idx, - minlength=classes_k.size) + minlength=classes_k.size) else: for i, idx in enumerate(pred_labels): # 
loop is O(n_samples) - proba_k[i, :] += np.bincount(idx, weights[i], - minlength=classes_k.size) + proba_k[i, :] += np.bincount(idx, + weights[i], + minlength=classes_k.size) # normalize 'votes' into real [0,1] probabilities normalizer = proba_k.sum(axis=1)[:, np.newaxis] From dc9069c4f476799a6851a8089d6fcbade7647364 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Tue, 22 Aug 2017 10:54:54 -0500 Subject: [PATCH 06/48] Add test, improve warning --- sklearn/neighbors/classification.py | 5 ++++- sklearn/neighbors/tests/test_neighbors.py | 12 ++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index ba049b6755eb8..3e8fbbc0ca7e4 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -420,7 +420,10 @@ def predict_proba(self, X): outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0] if len(outliers) > 0: warnings.warn('No neighbors found for test samples %r, ' - 'their probabilities will be assgined with 0.' + 'their probabilities will be assgined with 0, ' + 'which may influence scoring. ' + 'You can try using larger radius, ' + 'or consider removing them from your dataset.' 
% outliers) classes_ = self.classes_ diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 052c83c71d2e7..dfadca7dfe97b 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1268,3 +1268,15 @@ def test_pairwise_boolean_distance(): nn1 = NN(metric="jaccard", algorithm='brute').fit(X) nn2 = NN(metric="jaccard", algorithm='ball_tree').fit(X) assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) + +def test_radius_neighbors_clf_predict_proba(): + # test for #9597 + # weight of uniform + # outlier warnings + def check_warn(): + clf = RadiusNeighborsClassifier(radius=1, weights='uniform') + X = [[0], [1], [2], [3]] + y = [0, 0, 1, 1] + clf.fit(X, y) + clf.predict_proba([[1],[5]]) + assert_warns(UserWarning, check_warn) From f30ed4faba4ffafcc16b581436cdabcc9f5d1480 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Tue, 22 Aug 2017 11:44:14 -0500 Subject: [PATCH 07/48] Add test --- sklearn/neighbors/tests/test_neighbors.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index dfadca7dfe97b..ceb5526c36faf 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1269,14 +1269,17 @@ def test_pairwise_boolean_distance(): nn2 = NN(metric="jaccard", algorithm='ball_tree').fit(X) assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) + def test_radius_neighbors_clf_predict_proba(): # test for #9597 # weight of uniform # outlier warnings - def check_warn(): - clf = RadiusNeighborsClassifier(radius=1, weights='uniform') + def check_warn(w='distance'): + RNC = neighbors.RadiusNeighborsClassifier + clf = RNC(radius=1, weights=w) X = [[0], [1], [2], [3]] y = [0, 0, 1, 1] clf.fit(X, y) - clf.predict_proba([[1],[5]]) - assert_warns(UserWarning, check_warn) + clf.predict_proba([[1], [5]]) + 
assert_warns(UserWarning, check_warn, w='distance') + assert_warns(UserWarning, check_warn, w='uniform') From 0982ee38414686a62c7582cfac1b9258e30162a2 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Wed, 23 Aug 2017 21:14:48 -0500 Subject: [PATCH 08/48] add outlier handler --- sklearn/neighbors/classification.py | 71 +++++++++++++++++------ sklearn/neighbors/tests/test_neighbors.py | 33 ++++++++++- 2 files changed, 84 insertions(+), 20 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 3e8fbbc0ca7e4..6054ba488b2e2 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -280,12 +280,14 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, metric. See the documentation of the DistanceMetric class for a list of available metrics. - outlier_label : int, optional (default = None) - Label, which is given for outlier samples (samples with no - neighbors on given radius). - If set to None and outlier is detected, ValueError is raised when - function predict(X) is called, UserWarning is raised when function - preduct_proba(X) is called. + outlier_label : int, 'uniform', 'prior', optional (default = None) + - int : manual label, which is given for outlier samples (samples with + no neighbors on given radius). + - 'uniform' : outlier samples have same probabilities to be assgined + into every label. + - 'prior' : outlier samples have the same probabilities to be assgined + into labels as label probabilities of 'y' in training data. + - None : when outlier is detected, ValueError is raised. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. 
@@ -386,7 +388,17 @@ def predict(self, X): y_pred[inliers, k] = classes_k.take(mode) - if outliers: + if outliers: + if self.outlier_label == 'uniform': + y_pred[outliers, k] = np.random.randint(classes_k.size, + size=len(outliers)) + elif self.outlier_label == 'prior': + prior = np.bincount(_y[:, k]) / _y.shape[0] + y_pred[outliers, k] = np.random.choice(classes_k, + p=prior, + size=len(outliers)) + + if outliers and isinstance(self.outlier_label, int): y_pred[outliers, :] = self.outlier_label if not self.outputs_2d_: @@ -417,14 +429,10 @@ def predict_proba(self, X): neigh_dist, neigh_ind = self.radius_neighbors(X) + inliers = [i for i, nind in enumerate(neigh_ind) if len(nind) != 0] + mask = np.ones(n_samples, np.bool) + mask[inliers] = 0 outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0] - if len(outliers) > 0: - warnings.warn('No neighbors found for test samples %r, ' - 'their probabilities will be assgined with 0, ' - 'which may influence scoring. ' - 'You can try using larger radius, ' - 'or consider removing them from your dataset.' - % outliers) classes_ = self.classes_ _y = self._y @@ -432,6 +440,15 @@ def predict_proba(self, X): _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] + if self.outlier_label is not None: + neigh_dist[outliers] = 1e-6 + elif outliers: + raise ValueError('No neighbors found for test samples %r, ' + 'you can try using larger radius, ' + 'give a label for outliers, ' + 'or consider removing them from your dataset.' 
+ % outliers) + weights = _get_weights(neigh_dist, self.weights) probabilities = [] @@ -440,17 +457,33 @@ def predict_proba(self, X): pred_labels[:] = [_y[ind, k] for ind in neigh_ind] proba_k = np.zeros((n_samples, classes_k.size)) - + proba_inliers = np.zeros((len(inliers), classes_k.size)) + # samples have different size of neighbors within the same radius if weights is None: - for i, idx in enumerate(pred_labels): # loop is O(n_samples) - proba_k[i, :] += np.bincount(idx, + for i, idx in enumerate(pred_labels[inliers]): # loop is O(n_samples) + proba_inliers[i, :] += np.bincount(idx, minlength=classes_k.size) else: - for i, idx in enumerate(pred_labels): # loop is O(n_samples) - proba_k[i, :] += np.bincount(idx, + for i, idx in enumerate(pred_labels[inliers]): # loop is O(n_samples) + proba_inliers[i, :] += np.bincount(idx, weights[i], minlength=classes_k.size) + proba_k[inliers, :] = proba_inliers + + if outliers: + if self.outlier_label == 'uniform': + proba_k[outliers, :] = 1.0 / classes_k.size + elif self.outlier_label == 'prior': + proba_k[outliers, :] = np.bincount(_y[:, k]) / _y.shape[0] + else: + proba_k[outliers, :] = 0.0 + warnings.warn('No neighbors found for test samples %r, ' + 'their probabilities will be assgined with 0., ' + 'which may influence scoring. ' + 'You can try using larger radius, ' + 'or consider removing them from your dataset.' 
+ % outliers) # normalize 'votes' into real [0,1] probabilities normalizer = proba_k.sum(axis=1)[:, np.newaxis] diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index ceb5526c36faf..8e6f64b277c48 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1276,10 +1276,41 @@ def test_radius_neighbors_clf_predict_proba(): # outlier warnings def check_warn(w='distance'): RNC = neighbors.RadiusNeighborsClassifier - clf = RNC(radius=1, weights=w) + clf = RNC(radius=1, weights=w, outlier_label = -1) X = [[0], [1], [2], [3]] y = [0, 0, 1, 1] clf.fit(X, y) clf.predict_proba([[1], [5]]) assert_warns(UserWarning, check_warn, w='distance') assert_warns(UserWarning, check_warn, w='uniform') + + +def test_radius_neighbors_outliers(): + # outler handlers: 1. uniform, prior, None + RNC = neighbors.RadiusNeighborsClassifier + X = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]] + y = [0, 2, 2, 1, 1, 1, 3, 3, 3, 3] + + def check_exception(): + clf = RNC(radius=1, outlier_label=None) + clf.fit(X, y) + clf.predict_proba([[1], [15]]) + assert_raises(ValueError, check_exception) + + clf = RNC(radius=1, outlier_label='uniform') + clf.fit(X, y) + predict = clf.predict_proba([[1], [15]]) + assert_equal(predict[1, 0], 0.25) + assert_equal(predict[1, 1], 0.25) + assert_equal(predict[1, 2], 0.25) + assert_equal(predict[1, 3], 0.25) + clf.predict([[1], [15]]) + + clf = RNC(radius=1, outlier_label='prior') + clf.fit(X, y) + predict = clf.predict_proba([[1], [15]]) + assert_equal(predict[1, 0], 0.1) + assert_equal(predict[1, 1], 0.3) + assert_equal(predict[1, 2], 0.2) + assert_equal(predict[1, 3], 0.4) + clf.predict([[1], [15]]) From 9565eac208f4e61a3b7018cdee1ab219e12c38d1 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Wed, 23 Aug 2017 22:45:41 -0500 Subject: [PATCH 09/48] format, 2.7 float divide --- sklearn/neighbors/classification.py | 40 +++++++++++------------ 
sklearn/neighbors/tests/test_neighbors.py | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 6054ba488b2e2..5fb1111342c50 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -8,7 +8,7 @@ # # License: BSD 3 clause (C) INRIA, University of Amsterdam -import warnings +import warnings as warns import numpy as np from scipy import stats from ..utils.extmath import weighted_mode @@ -393,10 +393,10 @@ def predict(self, X): y_pred[outliers, k] = np.random.randint(classes_k.size, size=len(outliers)) elif self.outlier_label == 'prior': - prior = np.bincount(_y[:, k]) / _y.shape[0] + prior = np.bincount(_y[:, k]) / float(_y.shape[0]) y_pred[outliers, k] = np.random.choice(classes_k, p=prior, - size=len(outliers)) + size=len(outliers)) if outliers and isinstance(self.outlier_label, int): y_pred[outliers, :] = self.outlier_label @@ -457,33 +457,33 @@ def predict_proba(self, X): pred_labels[:] = [_y[ind, k] for ind in neigh_ind] proba_k = np.zeros((n_samples, classes_k.size)) - proba_inliers = np.zeros((len(inliers), classes_k.size)) - + proba_inl = np.zeros((len(inliers), classes_k.size)) + # samples have different size of neighbors within the same radius if weights is None: - for i, idx in enumerate(pred_labels[inliers]): # loop is O(n_samples) - proba_inliers[i, :] += np.bincount(idx, - minlength=classes_k.size) + for i, idx in enumerate(pred_labels[inliers]): + proba_inl[i, :] += np.bincount(idx, + minlength=classes_k.size) else: - for i, idx in enumerate(pred_labels[inliers]): # loop is O(n_samples) - proba_inliers[i, :] += np.bincount(idx, - weights[i], - minlength=classes_k.size) - proba_k[inliers, :] = proba_inliers + for i, idx in enumerate(pred_labels[inliers]): + proba_inl[i, :] += np.bincount(idx, + weights[i], + minlength=classes_k.size) + proba_k[inliers, :] = proba_inl if outliers: if self.outlier_label == 'uniform': 
proba_k[outliers, :] = 1.0 / classes_k.size elif self.outlier_label == 'prior': - proba_k[outliers, :] = np.bincount(_y[:, k]) / _y.shape[0] + proba_k[outliers, :] = np.bincount(_y[:, k]) / float(_y.shape[0]) else: proba_k[outliers, :] = 0.0 - warnings.warn('No neighbors found for test samples %r, ' - 'their probabilities will be assgined with 0., ' - 'which may influence scoring. ' - 'You can try using larger radius, ' - 'or consider removing them from your dataset.' - % outliers) + warns.warn('No neighbors found for test samples %r, ' + 'their probabilities will be assgined with 0, ' + 'which may influence scoring. ' + 'You can try using larger radius, ' + 'or consider removing them from your dataset.' + % outliers) # normalize 'votes' into real [0,1] probabilities normalizer = proba_k.sum(axis=1)[:, np.newaxis] diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 8e6f64b277c48..b04fed17e8442 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1276,7 +1276,7 @@ def test_radius_neighbors_clf_predict_proba(): # outlier warnings def check_warn(w='distance'): RNC = neighbors.RadiusNeighborsClassifier - clf = RNC(radius=1, weights=w, outlier_label = -1) + clf = RNC(radius=1, weights=w, outlier_label=-1) X = [[0], [1], [2], [3]] y = [0, 0, 1, 1] clf.fit(X, y) From 5baa0707da23e26f7206e1091ecc48ec2d97d4c3 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Thu, 24 Aug 2017 09:02:15 -0500 Subject: [PATCH 10/48] modify code length --- sklearn/neighbors/classification.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 5fb1111342c50..f9d162cd6d5d7 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -281,12 +281,12 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, list of available metrics. 
outlier_label : int, 'uniform', 'prior', optional (default = None) - - int : manual label, which is given for outlier samples (samples with - no neighbors on given radius). + - int : manual label, which is given for outlier samples (samples + with no neighbors on given radius). - 'uniform' : outlier samples have same probabilities to be assgined into every label. - - 'prior' : outlier samples have the same probabilities to be assgined - into labels as label probabilities of 'y' in training data. + - 'prior' : outlier samples will be labeled according to the label + distributuin of 'y' in training data. - None : when outlier is detected, ValueError is raised. metric_params : dict, optional (default = None) @@ -475,7 +475,8 @@ def predict_proba(self, X): if self.outlier_label == 'uniform': proba_k[outliers, :] = 1.0 / classes_k.size elif self.outlier_label == 'prior': - proba_k[outliers, :] = np.bincount(_y[:, k]) / float(_y.shape[0]) + proba_k[outliers, :] = (np.bincount(_y[:, k]) + / float(_y.shape[0])) else: proba_k[outliers, :] = 0.0 warns.warn('No neighbors found for test samples %r, ' From 1ceab5ab8ab58d6e852e6a3361bf16a54a2b3767 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Thu, 24 Aug 2017 09:26:18 -0500 Subject: [PATCH 11/48] indent --- sklearn/neighbors/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index f9d162cd6d5d7..5d834eaca13ff 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -476,7 +476,7 @@ def predict_proba(self, X): proba_k[outliers, :] = 1.0 / classes_k.size elif self.outlier_label == 'prior': proba_k[outliers, :] = (np.bincount(_y[:, k]) - / float(_y.shape[0])) + / float(_y.shape[0])) else: proba_k[outliers, :] = 0.0 warns.warn('No neighbors found for test samples %r, ' From cd1c25f13c4e42bb2f7a009a81f39e979b9ab26b Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Fri, 25 Aug 2017 
17:16:20 -0500 Subject: [PATCH 12/48] add _check_outlier_handler, prepare for regressor --- sklearn/neighbors/base.py | 11 +++++++ sklearn/neighbors/classification.py | 39 +++++++++++++---------- sklearn/neighbors/regression.py | 6 ++-- sklearn/neighbors/tests/test_neighbors.py | 31 ++++++++++++++++++ 4 files changed, 69 insertions(+), 18 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index e14da8bbc2e97..fc345f9914e4a 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -7,6 +7,7 @@ # # License: BSD 3 clause (C) INRIA, University of Amsterdam import warnings +import numbers from abc import ABCMeta, abstractmethod import numpy as np @@ -98,6 +99,16 @@ def _get_weights(dist, weights): "'distance', or a callable function") +def _check_outlier_handler(outlier_handler, kind): + """Check to make sure outlier_handler is valid""" + if (outlier_handler in [None, 'uniform', 'prior'] + or isinstance(outlier_handler, (numbers.Integral, np.integer))): + return outlier_handler + else: + raise ValueError("outlier_%s not recognized, should be int " + "'uniform', 'prior' or None." 
% kind) + + class NeighborsBase(six.with_metaclass(ABCMeta, BaseEstimator)): """Base class for nearest neighbors estimators.""" diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 5d834eaca13ff..3309c41bccd56 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -8,6 +8,7 @@ # # License: BSD 3 clause (C) INRIA, University of Amsterdam +import numbers import warnings as warns import numpy as np from scipy import stats @@ -15,6 +16,7 @@ from .base import \ _check_weights, _get_weights, \ + _check_outlier_handler,\ NeighborsBase, KNeighborsMixin,\ RadiusNeighborsMixin, SupervisedIntegerMixin from ..base import ClassifierMixin @@ -329,7 +331,8 @@ def __init__(self, radius=1.0, weights='uniform', metric=metric, p=p, metric_params=metric_params, **kwargs) self.weights = _check_weights(weights) - self.outlier_label = outlier_label + self.outlier_label = _check_outlier_handler(outlier_label, + kind='label') def predict(self, X): """Predict the class labels for the provided data @@ -398,9 +401,12 @@ def predict(self, X): p=prior, size=len(outliers)) - if outliers and isinstance(self.outlier_label, int): + if outliers and isinstance(self.outlier_label, (numbers.Integral, + np.integer)): y_pred[outliers, :] = self.outlier_label - + warns.warn('No neighbors found for test samples %r, ' + 'their labels will be assgined with %d. 
' + % (outliers, self.outlier_label)) if not self.outputs_2d_: y_pred = y_pred.ravel() @@ -428,11 +434,10 @@ def predict_proba(self, X): n_samples = X.shape[0] neigh_dist, neigh_ind = self.radius_neighbors(X) - inliers = [i for i, nind in enumerate(neigh_ind) if len(nind) != 0] mask = np.ones(n_samples, np.bool) - mask[inliers] = 0 - outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0] + mask[inliers] = False + outliers = np.arange(n_samples)[mask] classes_ = self.classes_ _y = self._y @@ -445,8 +450,8 @@ def predict_proba(self, X): elif outliers: raise ValueError('No neighbors found for test samples %r, ' 'you can try using larger radius, ' - 'give a label for outliers, ' - 'or consider removing them from your dataset.' + 'consider removing them from your dataset ' + 'or change oulier_label parameter.' % outliers) weights = _get_weights(neigh_dist, self.weights) @@ -477,14 +482,16 @@ def predict_proba(self, X): elif self.outlier_label == 'prior': proba_k[outliers, :] = (np.bincount(_y[:, k]) / float(_y.shape[0])) - else: - proba_k[outliers, :] = 0.0 - warns.warn('No neighbors found for test samples %r, ' - 'their probabilities will be assgined with 0, ' - 'which may influence scoring. ' - 'You can try using larger radius, ' - 'or consider removing them from your dataset.' - % outliers) + elif isinstance(self.outlier_label, (numbers.Integral, + np.integer)): + if self.outlier_label in classes_k: + proba_k[outliers, self.outlier_label] = 1.0 + else: + proba_k[outliers, :] = 0.0 + warns.warn('No neighbors found for test samples %r, ' + 'their probabilities will be assgined ' + 'with 0., which may influence scoring.' 
+ % outliers) # normalize 'votes' into real [0,1] probabilities normalizer = proba_k.sum(axis=1)[:, np.newaxis] diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 1180850b8d21a..299c083773177 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -254,14 +254,16 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, """ def __init__(self, radius=1.0, weights='uniform', - algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, **kwargs): + algorithm='auto', leaf_size=30,p=2, metric='minkowski', + outlier_value=None, metric_params=None, **kwargs): self._init_params(radius=radius, algorithm=algorithm, leaf_size=leaf_size, p=p, metric=metric, metric_params=metric_params, **kwargs) self.weights = _check_weights(weights) + self.outlier_value = _check_outlier_handler(outlier_value, + kind='value') def predict(self, X): """Predict the target for the provided data diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index b04fed17e8442..7ce8c25e9a651 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -18,6 +18,7 @@ from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_in from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_warns from sklearn.utils.testing import ignore_warnings @@ -1291,12 +1292,14 @@ def test_radius_neighbors_outliers(): X = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]] y = [0, 2, 2, 1, 1, 1, 3, 3, 3, 3] + # test outiler Error rasing def check_exception(): clf = RNC(radius=1, outlier_label=None) clf.fit(X, y) clf.predict_proba([[1], [15]]) assert_raises(ValueError, check_exception) + # test uniform outlier clf = RNC(radius=1, outlier_label='uniform') clf.fit(X, y) predict = 
clf.predict_proba([[1], [15]]) @@ -1304,8 +1307,10 @@ def check_exception(): assert_equal(predict[1, 1], 0.25) assert_equal(predict[1, 2], 0.25) assert_equal(predict[1, 3], 0.25) + # test uniform outlier in predict method clf.predict([[1], [15]]) + # test prior outlier clf = RNC(radius=1, outlier_label='prior') clf.fit(X, y) predict = clf.predict_proba([[1], [15]]) @@ -1313,4 +1318,30 @@ def check_exception(): assert_equal(predict[1, 1], 0.3) assert_equal(predict[1, 2], 0.2) assert_equal(predict[1, 3], 0.4) + # test prior outlier in predcit method clf.predict([[1], [15]]) + + # est manual label in y + clf = RNC(radius=1, outlier_label=2) + clf.fit(X, y) + predict = clf.predict_proba([[1], [15]]) + assert_equal(predict[1, 2], 1.0) + + # test manual label out of y + clf = RNC(radius=1, outlier_label=4) + clf.fit(X, y) + predict = clf.predict_proba([[1], [15]]) + assert_equal(predict[1, 0], 0.0) + assert_equal(predict[1, 1], 0.0) + assert_equal(predict[1, 2], 0.0) + assert_equal(predict[1, 3], 0.0) + predict = clf.predict([[7], [15]]) + assert_equal(predict[0], 3.0) + assert_equal(predict[1], 4.0) + + # test check_outlier_handler error raise + assert_raise_message(ValueError, + 'outlier_label', + lambda:RNC(outlier_label='hello world')) + + neighbors.RadiusNeighborsRegressor() From be801b6a20a6ed4207371d8200504b6d4c77cf27 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Fri, 25 Aug 2017 18:18:01 -0500 Subject: [PATCH 13/48] format --- sklearn/neighbors/base.py | 5 +++-- sklearn/neighbors/regression.py | 9 ++++++--- sklearn/neighbors/tests/test_neighbors.py | 4 ++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index fc345f9914e4a..b9ac2afbeb0ab 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -101,8 +101,9 @@ def _get_weights(dist, weights): def _check_outlier_handler(outlier_handler, kind): """Check to make sure outlier_handler is valid""" - if (outlier_handler in 
[None, 'uniform', 'prior'] - or isinstance(outlier_handler, (numbers.Integral, np.integer))): + if outlier_handler in [None, 'uniform', 'prior']: + return outlier_handler + elif isinstance(outlier_handler, (numbers.Integral, np.integer)): return outlier_handler else: raise ValueError("outlier_%s not recognized, should be int " diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 299c083773177..627119208dd5f 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -10,8 +10,11 @@ import numpy as np -from .base import _get_weights, _check_weights, NeighborsBase, KNeighborsMixin -from .base import RadiusNeighborsMixin, SupervisedFloatMixin +from .base import \ + _check_weights, _get_weights, \ + _check_outlier_handler,\ + NeighborsBase, KNeighborsMixin,\ + RadiusNeighborsMixin, SupervisedIntegerMixin from ..base import RegressorMixin from ..utils import check_array @@ -254,7 +257,7 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, """ def __init__(self, radius=1.0, weights='uniform', - algorithm='auto', leaf_size=30,p=2, metric='minkowski', + algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_value=None, metric_params=None, **kwargs): self._init_params(radius=radius, algorithm=algorithm, diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 7ce8c25e9a651..79ff0b30baa85 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1341,7 +1341,7 @@ def check_exception(): # test check_outlier_handler error raise assert_raise_message(ValueError, - 'outlier_label', - lambda:RNC(outlier_label='hello world')) + 'outlier_label', + lambda: RNC(outlier_label='hello world')) neighbors.RadiusNeighborsRegressor() From d6a2ff8d362e802f04306d7247a2a9f11f0dc8b2 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Fri, 25 Aug 2017 18:30:14 -0500 Subject: [PATCH 14/48] bug --- 
sklearn/neighbors/regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 627119208dd5f..d852ed288dd9e 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -14,7 +14,7 @@ _check_weights, _get_weights, \ _check_outlier_handler,\ NeighborsBase, KNeighborsMixin,\ - RadiusNeighborsMixin, SupervisedIntegerMixin + RadiusNeighborsMixin, SupervisedFloatMixin from ..base import RegressorMixin from ..utils import check_array From 96cb19ef47f453eb53e8ff85c4aef9bdb36c2518 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 28 Aug 2017 19:36:28 -0500 Subject: [PATCH 15/48] outlier --- sklearn/neighbors/classification.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 3309c41bccd56..502126822656d 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -435,9 +435,7 @@ def predict_proba(self, X): neigh_dist, neigh_ind = self.radius_neighbors(X) inliers = [i for i, nind in enumerate(neigh_ind) if len(nind) != 0] - mask = np.ones(n_samples, np.bool) - mask[inliers] = False - outliers = np.arange(n_samples)[mask] + outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0] classes_ = self.classes_ _y = self._y From 4b480a5ec96791c0696548e74646ec7c00317199 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Wed, 30 Aug 2017 20:51:02 -0500 Subject: [PATCH 16/48] random int -> randomly choose class labels from y --- sklearn/neighbors/classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 502126822656d..cdb88cb881890 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -393,8 +393,8 @@ def predict(self, X): if outliers: if self.outlier_label == 'uniform': - 
y_pred[outliers, k] = np.random.randint(classes_k.size, - size=len(outliers)) + y_pred[outliers, k] = np.random.choice(classes_k, + size=len(outliers)) elif self.outlier_label == 'prior': prior = np.bincount(_y[:, k]) / float(_y.shape[0]) y_pred[outliers, k] = np.random.choice(classes_k, From 6f6c96c38359d178cd9ff7b5b8cd312cf9b05392 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 17 Sep 2017 00:44:34 -0500 Subject: [PATCH 17/48] fix weights index and vector scalar bug --- sklearn/neighbors/classification.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index cdb88cb881890..c08e35c9e6352 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -443,16 +443,14 @@ def predict_proba(self, X): _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] - if self.outlier_label is not None: - neigh_dist[outliers] = 1e-6 - elif outliers: + if outliers and self.outlier_label is None: raise ValueError('No neighbors found for test samples %r, ' 'you can try using larger radius, ' 'consider removing them from your dataset ' 'or change oulier_label parameter.' 
% outliers) - weights = _get_weights(neigh_dist, self.weights) + weights = _get_weights(neigh_dist, self.weights)[inliers] probabilities = [] for k, classes_k in enumerate(classes_): From 32e72070d727de27c0b73e5ebe3240fbdbbe8f24 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 17 Sep 2017 10:42:02 -0500 Subject: [PATCH 18/48] fix weights inlier index, change inlier addition to assign, get index of outlier lable --- sklearn/neighbors/classification.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index c08e35c9e6352..9ea2d4d0c95fd 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -450,7 +450,9 @@ def predict_proba(self, X): 'or change oulier_label parameter.' % outliers) - weights = _get_weights(neigh_dist, self.weights)[inliers] + weights = _get_weights(neigh_dist, self.weights) + if weights is not None: + weights = weights[inliers] probabilities = [] for k, classes_k in enumerate(classes_): @@ -463,13 +465,13 @@ def predict_proba(self, X): # samples have different size of neighbors within the same radius if weights is None: for i, idx in enumerate(pred_labels[inliers]): - proba_inl[i, :] += np.bincount(idx, - minlength=classes_k.size) + proba_inl[i, :] = np.bincount(idx, + minlength=classes_k.size) else: for i, idx in enumerate(pred_labels[inliers]): - proba_inl[i, :] += np.bincount(idx, - weights[i], - minlength=classes_k.size) + proba_inl[i, :] = np.bincount(idx, + weights[i], + minlength=classes_k.size) proba_k[inliers, :] = proba_inl if outliers: @@ -480,8 +482,10 @@ def predict_proba(self, X): / float(_y.shape[0])) elif isinstance(self.outlier_label, (numbers.Integral, np.integer)): + classes_k = classes_k.tolist() if self.outlier_label in classes_k: - proba_k[outliers, self.outlier_label] = 1.0 + proba_k[outliers, + classes_k.index(self.outlier_label)] = 1.0 else: proba_k[outliers, :] = 0.0 
warns.warn('No neighbors found for test samples %r, ' From 8bb0ecf8a74a3d4682580ea057dcc49e23f1bed1 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 14 Jan 2018 21:52:52 -0600 Subject: [PATCH 19/48] resolve regression conflict --- sklearn/neighbors/regression.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index d852ed288dd9e..cc62008477bda 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -257,16 +257,14 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, """ def __init__(self, radius=1.0, weights='uniform', - algorithm='auto', leaf_size=30, p=2, metric='minkowski', - outlier_value=None, metric_params=None, **kwargs): - self._init_params(radius=radius, - algorithm=algorithm, - leaf_size=leaf_size, - p=p, metric=metric, metric_params=metric_params, - **kwargs) + algorithm='auto', leaf_size=30, + p=2, metric='minkowski', metric_params=None, **kwargs): + super(RadiusNeighborsRegressor, self).__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + p=p, metric=metric, metric_params=metric_params, **kwargs) self.weights = _check_weights(weights) - self.outlier_value = _check_outlier_handler(outlier_value, - kind='value') def predict(self, X): """Predict the target for the provided data From 326fc2280930ea7f1bdb93de6aaf903df41013d5 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 25 Feb 2018 12:37:27 -0600 Subject: [PATCH 20/48] formatting regression, remove prior/uniform, add most_frequent, move check outlier parameter in fit(), update testing --- sklearn/neighbors/base.py | 14 ++--- sklearn/neighbors/classification.py | 74 +++++++++++++---------- sklearn/neighbors/regression.py | 9 ++- sklearn/neighbors/tests/test_neighbors.py | 65 ++++++++++---------- 4 files changed, 85 insertions(+), 77 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 
84979f393006b..a1606488bc501 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -100,15 +100,15 @@ def _get_weights(dist, weights): "'distance', or a callable function") -def _check_outlier_handler(outlier_handler, kind): +def _check_outlier_label(outlier_label): """Check to make sure outlier_handler is valid""" - if outlier_handler in [None, 'uniform', 'prior']: - return outlier_handler - elif isinstance(outlier_handler, (numbers.Integral, np.integer)): - return outlier_handler + if outlier_label in ['raise', 'most_frequent']: + return outlier_label + elif isinstance(outlier_label, (numbers.Integral, np.integer)): + return outlier_label else: - raise ValueError("outlier_%s not recognized, should be int " - "'uniform', 'prior' or None." % kind) + raise ValueError("outlier_label not recognized, should be int " + "'raise', or 'most_frequent'.") class NeighborsBase(six.with_metaclass(ABCMeta, BaseEstimator)): diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index f16ddf6de3f83..d79326a025284 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -16,7 +16,7 @@ from .base import \ _check_weights, _get_weights, \ - _check_outlier_handler,\ + _check_outlier_label,\ NeighborsBase, KNeighborsMixin,\ RadiusNeighborsMixin, SupervisedIntegerMixin from ..base import ClassifierMixin @@ -284,14 +284,11 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, metric. See the documentation of the DistanceMetric class for a list of available metrics. - outlier_label : int, 'uniform', 'prior', optional (default = None) + outlier_label : int, 'most_frequent', 'raise', optional (default = 'raise') - int : manual label, which is given for outlier samples (samples with no neighbors on given radius). - - 'uniform' : outlier samples have same probabilities to be assgined - into every label. 
- - 'prior' : outlier samples will be labeled according to the label - distributuin of 'y' in training data. - - None : when outlier is detected, ValueError is raised. + - 'most_frequent' : assign most frequent label to outlier. + - 'raise' : when outlier is detected, ValueError is raised. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. @@ -326,15 +323,32 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, def __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', - outlier_label=None, metric_params=None, **kwargs): + outlier_label='raise', metric_params=None, **kwargs): super(RadiusNeighborsClassifier, self).__init__( radius=radius, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, **kwargs) self.weights = _check_weights(weights) - self.outlier_label = _check_outlier_handler(outlier_label, - kind='label') + self.outlier_label = outlier_label + + + def fit(self, X, y): + """Fit the model using X as training data and y as target values + + Parameters + ---------- + X : {array-like, sparse matrix, BallTree, KDTree} + Training data. If array or matrix, shape [n_samples, n_features], + or [n_samples, n_samples] if metric='precomputed'. 
+ + y : {array-like, sparse matrix} + Target values of shape = [n_samples] or [n_samples, n_outputs] + + """ + self.outlier_label = _check_outlier_label(self.outlier_label) + super(SupervisedIntegerMixin, self).fit(X, y) + def predict(self, X): """Predict the class labels for the provided data @@ -355,8 +369,9 @@ def predict(self, X): n_samples = X.shape[0] neigh_dist, neigh_ind = self.radius_neighbors(X) - inliers = [i for i, nind in enumerate(neigh_ind) if len(nind) != 0] - outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0] + outlier_mask = [len(nind) == 0 for nind in neigh_ind] + outliers = np.arange(n_samples)[outlier_mask] + inliers = np.arange(n_samples)[~outlier_mask] classes_ = self.classes_ _y = self._y @@ -365,7 +380,7 @@ def predict(self, X): classes_ = [self.classes_] n_outputs = len(classes_) - if self.outlier_label is not None: + if self.outlier_label is not 'raise': neigh_dist[outliers] = 1e-6 elif outliers: raise ValueError('No neighbors found for test samples %r, ' @@ -394,14 +409,10 @@ def predict(self, X): y_pred[inliers, k] = classes_k.take(mode) if outliers: - if self.outlier_label == 'uniform': - y_pred[outliers, k] = np.random.choice(classes_k, - size=len(outliers)) - elif self.outlier_label == 'prior': - prior = np.bincount(_y[:, k]) / float(_y.shape[0]) - y_pred[outliers, k] = np.random.choice(classes_k, - p=prior, - size=len(outliers)) + if self.outlier_label == 'most_frequent': + prior = np.bincount(_y[:, k]) + frequent_label_index = prior.argmax() + y_pred[outliers, k] = classes_k[frequent_label_index] if outliers and isinstance(self.outlier_label, (numbers.Integral, np.integer)): @@ -436,8 +447,9 @@ def predict_proba(self, X): n_samples = X.shape[0] neigh_dist, neigh_ind = self.radius_neighbors(X) - inliers = [i for i, nind in enumerate(neigh_ind) if len(nind) != 0] - outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0] + outlier_mask = [len(nind) == 0 for nind in neigh_ind] + outliers = 
np.arange(n_samples)[outlier_mask] + inliers = np.arange(n_samples)[~outlier_mask] classes_ = self.classes_ _y = self._y @@ -445,7 +457,7 @@ def predict_proba(self, X): _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] - if outliers and self.outlier_label is None: + if outliers and self.outlier_label == 'raise': raise ValueError('No neighbors found for test samples %r, ' 'you can try using larger radius, ' 'consider removing them from your dataset ' @@ -477,19 +489,17 @@ def predict_proba(self, X): proba_k[inliers, :] = proba_inl if outliers: - if self.outlier_label == 'uniform': - proba_k[outliers, :] = 1.0 / classes_k.size - elif self.outlier_label == 'prior': - proba_k[outliers, :] = (np.bincount(_y[:, k]) - / float(_y.shape[0])) + if self.outlier_label == 'most_frequent': + prior = np.bincount(_y[:, k]) + frequent_label_index = prior.argmax() + proba_k[outliers, frequent_label_index] = 1.0 elif isinstance(self.outlier_label, (numbers.Integral, np.integer)): - classes_k = classes_k.tolist() if self.outlier_label in classes_k: + label_index = np.where(classes_k == self.outlier_label) proba_k[outliers, - classes_k.index(self.outlier_label)] = 1.0 + label_index[0][0]] = 1.0 else: - proba_k[outliers, :] = 0.0 warns.warn('No neighbors found for test samples %r, ' 'their probabilities will be assgined ' 'with 0., which may influence scoring.' 
diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 610da97857615..ddbc42e88ad19 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -17,7 +17,6 @@ from .base import \ _check_weights, _get_weights, \ - _check_outlier_handler,\ NeighborsBase, KNeighborsMixin,\ RadiusNeighborsMixin, SupervisedFloatMixin from ..base import RegressorMixin @@ -271,10 +270,10 @@ def __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, **kwargs): super(RadiusNeighborsRegressor, self).__init__( - radius=radius, - algorithm=algorithm, - leaf_size=leaf_size, - p=p, metric=metric, metric_params=metric_params, **kwargs) + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + p=p, metric=metric, metric_params=metric_params, **kwargs) self.weights = _check_weights(weights) def predict(self, X): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 1455cc83bf496..57acc9b69aadb 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1349,56 +1349,55 @@ def test_radius_neighbors_outliers(): X = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]] y = [0, 2, 2, 1, 1, 1, 3, 3, 3, 3] + # test invalid outlier_label parameter + def check_exception(): + clf = RNC(radius=1, outlier_label='invalid') + clf.fit(X, y) + assert_raises(ValueError, check_exception) + # test outiler Error rasing def check_exception(): - clf = RNC(radius=1, outlier_label=None) + clf = RNC(radius=1, outlier_label='raise') clf.fit(X, y) clf.predict_proba([[1], [15]]) assert_raises(ValueError, check_exception) - # test uniform outlier - clf = RNC(radius=1, outlier_label='uniform') - clf.fit(X, y) - predict = clf.predict_proba([[1], [15]]) - assert_equal(predict[1, 0], 0.25) - assert_equal(predict[1, 1], 0.25) - assert_equal(predict[1, 2], 0.25) - assert_equal(predict[1, 3], 0.25) - # test 
uniform outlier in predict method - clf.predict([[1], [15]]) - - # test prior outlier - clf = RNC(radius=1, outlier_label='prior') + # test most frequent + clf = RNC(radius=1, outlier_label='most_frequent') clf.fit(X, y) predict = clf.predict_proba([[1], [15]]) - assert_equal(predict[1, 0], 0.1) - assert_equal(predict[1, 1], 0.3) - assert_equal(predict[1, 2], 0.2) - assert_equal(predict[1, 3], 0.4) - # test prior outlier in predcit method - clf.predict([[1], [15]]) - - # est manual label in y + assert_equal(predict[1, 0], 0) + assert_equal(predict[1, 1], 0) + assert_equal(predict[1, 2], 0) + assert_equal(predict[1, 3], 1) + + # test manual label in y clf = RNC(radius=1, outlier_label=2) clf.fit(X, y) - predict = clf.predict_proba([[1], [15]]) - assert_equal(predict[1, 2], 1.0) + proba = clf.predict_proba([[1], [15]]) + assert_equal(proba[1, 0], 0.0) + assert_equal(proba[1, 1], 0.0) + assert_equal(proba[1, 2], 1.0) + assert_equal(proba[1, 3], 0.0) + pred = clf.predict([[1], [15]]) + assert_equal(pred[0], 3.0) + assert_equal(pred[1], 2.0) # test manual label out of y clf = RNC(radius=1, outlier_label=4) clf.fit(X, y) - predict = clf.predict_proba([[1], [15]]) - assert_equal(predict[1, 0], 0.0) - assert_equal(predict[1, 1], 0.0) - assert_equal(predict[1, 2], 0.0) - assert_equal(predict[1, 3], 0.0) - predict = clf.predict([[7], [15]]) - assert_equal(predict[0], 3.0) - assert_equal(predict[1], 4.0) + proba = clf.predict_proba([[1], [15]]) + assert_equal(proba[1, 0], 0.0) + assert_equal(proba[1, 1], 0.0) + assert_equal(proba[1, 2], 0.0) + assert_equal(proba[1, 3], 0.0) + pred = clf.predict([[7], [15]]) + + assert_equal(pred[1], 4.0) # test check_outlier_handler error raise assert_raise_message(ValueError, 'outlier_label', lambda: RNC(outlier_label='hello world')) - neighbors.RadiusNeighborsRegressor() + From 0804952aacb9d4c4ce2689380d982871e0ed1458 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 25 Feb 2018 13:53:03 -0600 Subject: [PATCH 21/48] fix inheritance 
--- sklearn/neighbors/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index d79326a025284..f33bed6886017 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -347,7 +347,7 @@ def fit(self, X, y): """ self.outlier_label = _check_outlier_label(self.outlier_label) - super(SupervisedIntegerMixin, self).fit(X, y) + RadiusNeighborsClassifier.fit(self, X, y) def predict(self, X): From 3964fa58ee9b74266ae931bd9b7302a4681cebcb Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 25 Feb 2018 14:13:17 -0600 Subject: [PATCH 22/48] fix typo --- sklearn/neighbors/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index f33bed6886017..1cebce8b8988b 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -347,7 +347,7 @@ def fit(self, X, y): """ self.outlier_label = _check_outlier_label(self.outlier_label) - RadiusNeighborsClassifier.fit(self, X, y) + SupervisedIntegerMixin.fit(self, X, y) def predict(self, X): From 6a53c9f5ce6dd8ed4f28ba3e75f890e809b5c7da Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sat, 10 Mar 2018 23:19:12 -0600 Subject: [PATCH 23/48] move oultier handler into fit(), add outlier label lists for multi outputs, remove check outlier_handler() in base.py --- sklearn/neighbors/base.py | 12 -- sklearn/neighbors/classification.py | 137 ++++++++++++---------- sklearn/neighbors/regression.py | 2 +- sklearn/neighbors/tests/test_neighbors.py | 73 +++++++----- 4 files changed, 122 insertions(+), 102 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index a1606488bc501..e390860d13463 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -7,7 +7,6 @@ # # License: BSD 3 clause (C) INRIA, University of Amsterdam import 
warnings -import numbers from abc import ABCMeta, abstractmethod import numpy as np @@ -100,17 +99,6 @@ def _get_weights(dist, weights): "'distance', or a callable function") -def _check_outlier_label(outlier_label): - """Check to make sure outlier_handler is valid""" - if outlier_label in ['raise', 'most_frequent']: - return outlier_label - elif isinstance(outlier_label, (numbers.Integral, np.integer)): - return outlier_label - else: - raise ValueError("outlier_label not recognized, should be int " - "'raise', or 'most_frequent'.") - - class NeighborsBase(six.with_metaclass(ABCMeta, BaseEstimator)): """Base class for nearest neighbors estimators.""" diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 1cebce8b8988b..8dcd9ae3e0514 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -13,10 +13,11 @@ import numpy as np from scipy import stats from ..utils.extmath import weighted_mode +from ..utils.validation import _is_arraylike, _num_samples +from ..externals.six import string_types from .base import \ _check_weights, _get_weights, \ - _check_outlier_label,\ NeighborsBase, KNeighborsMixin,\ RadiusNeighborsMixin, SupervisedIntegerMixin from ..base import ClassifierMixin @@ -156,7 +157,7 @@ def predict(self, X): classes_ = [self.classes_] n_outputs = len(classes_) - n_samples = X.shape[0] + n_samples = _num_samples(X) weights = _get_weights(neigh_dist, self.weights) y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype) @@ -200,7 +201,7 @@ def predict_proba(self, X): _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] - n_samples = X.shape[0] + n_samples = _num_samples(X) weights = _get_weights(neigh_dist, self.weights) if weights is None: @@ -284,10 +285,13 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, metric. See the documentation of the DistanceMetric class for a list of available metrics. 
- outlier_label : int, 'most_frequent', 'raise', optional (default = 'raise') - - int : manual label, which is given for outlier samples (samples - with no neighbors on given radius). - - 'most_frequent' : assign most frequent label to outlier. + outlier_label : manual label, 'most_frequent', 'raise', optional + (default = 'raise') + - manual label: str or int label (should be the same type as y) + or list of manual labels if multi ouputs are used + label given for outlier samples (samples with no neighbors in + given radius). + - 'most_frequent' : assign the most frequent label to outliers. - 'raise' : when outlier is detected, ValueError is raised. metric_params : dict, optional (default = None) @@ -324,11 +328,11 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, def __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label='raise', metric_params=None, **kwargs): - super(RadiusNeighborsClassifier, self).__init__( - radius=radius, - algorithm=algorithm, - leaf_size=leaf_size, - metric=metric, p=p, metric_params=metric_params, **kwargs) + self._init_params(radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, p=p, metric_params=metric_params, + **kwargs) self.weights = _check_weights(weights) self.outlier_label = outlier_label @@ -346,9 +350,44 @@ def fit(self, X, y): Target values of shape = [n_samples] or [n_samples, n_outputs] """ - self.outlier_label = _check_outlier_label(self.outlier_label) + SupervisedIntegerMixin.fit(self, X, y) + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + if self.outlier_label == 'most_frequent': + outlier_label_ = [] + for k, classes_k in enumerate(classes_): + label_count = np.bincount(_y[:, k]) + outlier_label_.append(classes_k[label_count.argmax()]) + + elif self.outlier_label == 'raise': + outlier_label_ = 'raise' + + else: + if 
(_is_arraylike(self.outlier_label) and + not isinstance(self.outlier_label, string_types)): + if len(self.outlier_label) != len(classes_): + raise ValueError('The length of outlier_label: {} is ' + 'inconsistent with output ' + 'length: {}'.format(self.outlier_label, + len(classes_))) + outlier_label_ = self.outlier_label + else: + outlier_label_ = [self.outlier_label] * len(classes_) + # ensure the dtype of outlier label is consistent with y + if any(np.append(classes, label).dtype != classes.dtype + for classes, label in zip(classes_, outlier_label_)): + raise TypeError('The dtype of outlier_label is' + 'inconsistent with y') + + self.outlier_label_ = outlier_label_ + return self + def predict(self, X): """Predict the class labels for the provided data @@ -366,12 +405,13 @@ def predict(self, X): """ X = check_array(X, accept_sparse='csr') - n_samples = X.shape[0] + n_samples = _num_samples(X) neigh_dist, neigh_ind = self.radius_neighbors(X) - outlier_mask = [len(nind) == 0 for nind in neigh_ind] - outliers = np.arange(n_samples)[outlier_mask] - inliers = np.arange(n_samples)[~outlier_mask] + outlier_mask = np.array([len(nind) == 0 for nind in neigh_ind]) + indecies = np.arange(n_samples) + outliers = indecies[outlier_mask] + inliers = indecies[~outlier_mask] classes_ = self.classes_ _y = self._y @@ -380,11 +420,9 @@ def predict(self, X): classes_ = [self.classes_] n_outputs = len(classes_) - if self.outlier_label is not 'raise': - neigh_dist[outliers] = 1e-6 - elif outliers: + if self.outlier_label_ == 'raise' and outliers.size > 0: raise ValueError('No neighbors found for test samples %r, ' - 'you can try using larger radius, ' + 'you can try to use larger radius, ' 'give a label for outliers, ' 'or consider removing them from your dataset.' 
% outliers) @@ -401,25 +439,15 @@ def predict(self, X): else: mode = np.array([weighted_mode(pl, w)[0] for (pl, w) - in zip(pred_labels[inliers], weights[inliers])], + in zip(pred_labels[inliers], + weights[inliers])], dtype=np.int) - mode = mode.ravel() y_pred[inliers, k] = classes_k.take(mode) + if outliers.size > 0: + y_pred[outliers, k] = self.outlier_label_[k] - if outliers: - if self.outlier_label == 'most_frequent': - prior = np.bincount(_y[:, k]) - frequent_label_index = prior.argmax() - y_pred[outliers, k] = classes_k[frequent_label_index] - - if outliers and isinstance(self.outlier_label, (numbers.Integral, - np.integer)): - y_pred[outliers, :] = self.outlier_label - warns.warn('No neighbors found for test samples %r, ' - 'their labels will be assgined with %d. ' - % (outliers, self.outlier_label)) if not self.outputs_2d_: y_pred = y_pred.ravel() @@ -440,16 +468,16 @@ def predict_proba(self, X): of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. - Outliers will be assign 0s in all class probabilities. 
""" X = check_array(X, accept_sparse='csr') - n_samples = X.shape[0] + n_samples = _num_samples(X) neigh_dist, neigh_ind = self.radius_neighbors(X) - outlier_mask = [len(nind) == 0 for nind in neigh_ind] - outliers = np.arange(n_samples)[outlier_mask] - inliers = np.arange(n_samples)[~outlier_mask] + outlier_mask = np.array([len(nind) == 0 for nind in neigh_ind]) + indecies = np.arange(n_samples) + outliers = indecies[outlier_mask] + inliers = indecies[~outlier_mask] classes_ = self.classes_ _y = self._y @@ -457,11 +485,11 @@ def predict_proba(self, X): _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] - if outliers and self.outlier_label == 'raise': + if self.outlier_label_ == 'raise' and outliers.size > 0: raise ValueError('No neighbors found for test samples %r, ' - 'you can try using larger radius, ' - 'consider removing them from your dataset ' - 'or change oulier_label parameter.' + 'you can try to use larger radius, ' + 'give a label for outliers, ' + 'or consider removing them from your dataset.' % outliers) weights = _get_weights(neigh_dist, self.weights) @@ -488,22 +516,11 @@ def predict_proba(self, X): minlength=classes_k.size) proba_k[inliers, :] = proba_inl - if outliers: - if self.outlier_label == 'most_frequent': - prior = np.bincount(_y[:, k]) - frequent_label_index = prior.argmax() - proba_k[outliers, frequent_label_index] = 1.0 - elif isinstance(self.outlier_label, (numbers.Integral, - np.integer)): - if self.outlier_label in classes_k: - label_index = np.where(classes_k == self.outlier_label) - proba_k[outliers, - label_index[0][0]] = 1.0 - else: - warns.warn('No neighbors found for test samples %r, ' - 'their probabilities will be assgined ' - 'with 0., which may influence scoring.' 
- % outliers) + if outliers.size > 0: + label_index = np.where(classes_k == self.outlier_label_[k]) + if label_index[0].size != 0: + proba_k[outliers, + label_index[0][0]] = 1.0 # normalize 'votes' into real [0,1] probabilities normalizer = proba_k.sum(axis=1)[:, np.newaxis] diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index ddbc42e88ad19..4b678ba30b23a 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -313,7 +313,7 @@ def predict(self, X): if len(ind) else empty_obs for (i, ind) in enumerate(neigh_ind)]) - if np.max(np.isnan(y_pred)): + if np.any(np.isnan(y_pred)): empty_warning_msg = ("One or more samples have no neighbors " "within specified radius; predicting NaN.") warnings.warn(empty_warning_msg) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 57acc9b69aadb..d7478cadd96b1 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1328,32 +1328,17 @@ def test_pairwise_boolean_distance(): assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) -def test_radius_neighbors_clf_predict_proba(): - # test for #9597 - # weight of uniform - # outlier warnings - def check_warn(w='distance'): - RNC = neighbors.RadiusNeighborsClassifier - clf = RNC(radius=1, weights=w, outlier_label=-1) - X = [[0], [1], [2], [3]] - y = [0, 0, 1, 1] - clf.fit(X, y) - clf.predict_proba([[1], [5]]) - assert_warns(UserWarning, check_warn, w='distance') - assert_warns(UserWarning, check_warn, w='uniform') - - def test_radius_neighbors_outliers(): # outler handlers: 1. 
uniform, prior, None RNC = neighbors.RadiusNeighborsClassifier X = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]] y = [0, 2, 2, 1, 1, 1, 3, 3, 3, 3] - # test invalid outlier_label parameter + # test invalid outlier_label dtype def check_exception(): - clf = RNC(radius=1, outlier_label='invalid') + clf = RNC(radius=1, outlier_label='a') clf.fit(X, y) - assert_raises(ValueError, check_exception) + assert_raises(TypeError, check_exception) # test outiler Error rasing def check_exception(): @@ -1372,16 +1357,16 @@ def check_exception(): assert_equal(predict[1, 3], 1) # test manual label in y - clf = RNC(radius=1, outlier_label=2) + clf = RNC(radius=1, outlier_label=1) clf.fit(X, y) proba = clf.predict_proba([[1], [15]]) assert_equal(proba[1, 0], 0.0) - assert_equal(proba[1, 1], 0.0) - assert_equal(proba[1, 2], 1.0) + assert_equal(proba[1, 1], 1.0) + assert_equal(proba[1, 2], 0.0) assert_equal(proba[1, 3], 0.0) pred = clf.predict([[1], [15]]) - assert_equal(pred[0], 3.0) - assert_equal(pred[1], 2.0) + assert_equal(pred[0], 2) + assert_equal(pred[1], 1) # test manual label out of y clf = RNC(radius=1, outlier_label=4) @@ -1392,12 +1377,42 @@ def check_exception(): assert_equal(proba[1, 2], 0.0) assert_equal(proba[1, 3], 0.0) pred = clf.predict([[7], [15]]) - assert_equal(pred[1], 4.0) - # test check_outlier_handler error raise - assert_raise_message(ValueError, - 'outlier_label', - lambda: RNC(outlier_label='hello world')) - + # test multi output same outlier label + y_multi = [[0, 1], [2, 1], [2, 2], [1, 2], [1, 2], + [1, 3], [3, 3], [3, 3], [3, 0], [3, 0]] + clf = RNC(radius=1, outlier_label=1) + clf.fit(X, y_multi) + proba = clf.predict_proba([[7], [15]]) + assert_equal(proba[1][1, 0], 0.0) + assert_equal(proba[1][1, 1], 1.0) + assert_equal(proba[1][1, 2], 0.0) + assert_equal(proba[1][1, 3], 0.0) + pred = clf.predict([[7], [15]]) + assert_equal(pred[1, 0], 1) + assert_equal(pred[1, 1], 1) + + # test multi output different outlier label + y_multi = [[0, 0], [2, 2], 
[2, 2], [1, 1], [1, 1], + [1, 1], [3, 3], [3, 3], [3, 3], [3, 3]] + clf = RNC(radius=1, outlier_label=[0, 1]) + clf.fit(X, y_multi) + proba = clf.predict_proba([[7], [15]]) + assert_equal(proba[0][1, 0], 1.0) + assert_equal(proba[0][1, 1], 0.0) + assert_equal(proba[0][1, 2], 0.0) + assert_equal(proba[0][1, 3], 0.0) + assert_equal(proba[1][1, 0], 0.0) + assert_equal(proba[1][1, 1], 1.0) + assert_equal(proba[1][1, 2], 0.0) + assert_equal(proba[1][1, 3], 0.0) + pred = clf.predict([[7], [15]]) + assert_equal(pred[1, 0], 0) + assert_equal(pred[1, 1], 1) + # test inconsistent outlier label list legth + def check_exception(): + clf = RNC(radius=1, outlier_label=[0, 1, 2]) + clf.fit(X, y_multi) + assert_raises(ValueError, check_exception) \ No newline at end of file From 610a722a61a234c41418afde15fad3a6840db085 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sat, 10 Mar 2018 23:32:30 -0600 Subject: [PATCH 24/48] _init_param to super().__init__() --- sklearn/neighbors/classification.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 8dcd9ae3e0514..34f73e93e2481 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -288,7 +288,7 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, outlier_label : manual label, 'most_frequent', 'raise', optional (default = 'raise') - manual label: str or int label (should be the same type as y) - or list of manual labels if multi ouputs are used + or list of manual labels if multi ouputs are used. label given for outlier samples (samples with no neighbors in given radius). - 'most_frequent' : assign the most frequent label to outliers. 
@@ -328,11 +328,11 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, def __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label='raise', metric_params=None, **kwargs): - self._init_params(radius=radius, - algorithm=algorithm, - leaf_size=leaf_size, - metric=metric, p=p, metric_params=metric_params, - **kwargs) + super(RadiusNeighborsClassifier, self).__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, p=p, metric_params=metric_params, **kwargs) self.weights = _check_weights(weights) self.outlier_label = outlier_label From db5a29ccdc1d74f6bf228208966c9fb93e3b7b41 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sat, 10 Mar 2018 23:50:25 -0600 Subject: [PATCH 25/48] change None to "raise" in testing --- sklearn/neighbors/tests/test_neighbors.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index d7478cadd96b1..51a57605bfb40 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -335,7 +335,7 @@ def test_radius_neighbors_classifier_when_no_neighbors(): weight_func = _weight_func - for outlier_label in [0, -1, None]: + for outlier_label in [0, -1, 'raise']: for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: rnc = neighbors.RadiusNeighborsClassifier @@ -344,7 +344,7 @@ def test_radius_neighbors_classifier_when_no_neighbors(): clf.fit(X, y) assert_array_equal(np.array([1, 2]), clf.predict(z1)) - if outlier_label is None: + if outlier_label == 'raise': assert_raises(ValueError, clf.predict, z2) elif False: assert_array_equal(np.array([1, outlier_label]), @@ -1340,13 +1340,6 @@ def check_exception(): clf.fit(X, y) assert_raises(TypeError, check_exception) - # test outiler Error rasing - def check_exception(): - clf = RNC(radius=1, outlier_label='raise') 
- clf.fit(X, y) - clf.predict_proba([[1], [15]]) - assert_raises(ValueError, check_exception) - # test most frequent clf = RNC(radius=1, outlier_label='most_frequent') clf.fit(X, y) From 7f491dbf39613d25f7e1deefdcf423cf79c65f8d Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 11 Mar 2018 11:24:17 -0500 Subject: [PATCH 26/48] format --- sklearn/neighbors/classification.py | 8 ++------ sklearn/neighbors/tests/test_neighbors.py | 3 +-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 34f73e93e2481..b46dcab410348 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -8,8 +8,6 @@ # # License: BSD 3 clause (C) INRIA, University of Amsterdam -import numbers -import warnings as warns import numpy as np from scipy import stats from ..utils.extmath import weighted_mode @@ -336,7 +334,6 @@ def __init__(self, radius=1.0, weights='uniform', self.weights = _check_weights(weights) self.outlier_label = outlier_label - def fit(self, X, y): """Fit the model using X as training data and y as target values @@ -370,7 +367,7 @@ def fit(self, X, y): else: if (_is_arraylike(self.outlier_label) and - not isinstance(self.outlier_label, string_types)): + not isinstance(self.outlier_label, string_types)): if len(self.outlier_label) != len(classes_): raise ValueError('The length of outlier_label: {} is ' 'inconsistent with output ' @@ -388,7 +385,6 @@ def fit(self, X, y): self.outlier_label_ = outlier_label_ return self - def predict(self, X): """Predict the class labels for the provided data @@ -518,7 +514,7 @@ def predict_proba(self, X): if outliers.size > 0: label_index = np.where(classes_k == self.outlier_label_[k]) - if label_index[0].size != 0: + if label_index[0].size != 0: proba_k[outliers, label_index[0][0]] = 1.0 diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 51a57605bfb40..52282cce3fc44 
100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -18,7 +18,6 @@ from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_in from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_warns from sklearn.utils.testing import assert_warns_message @@ -1408,4 +1407,4 @@ def check_exception(): def check_exception(): clf = RNC(radius=1, outlier_label=[0, 1, 2]) clf.fit(X, y_multi) - assert_raises(ValueError, check_exception) \ No newline at end of file + assert_raises(ValueError, check_exception) From 9880d51dee56b237d7371d7201e1078b8e54f567 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 11 Mar 2018 12:15:45 -0500 Subject: [PATCH 27/48] format --- sklearn/neighbors/classification.py | 4 ++-- sklearn/neighbors/regression.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index b46dcab410348..565e124cf6412 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -367,7 +367,7 @@ def fit(self, X, y): else: if (_is_arraylike(self.outlier_label) and - not isinstance(self.outlier_label, string_types)): + not isinstance(self.outlier_label, string_types)): if len(self.outlier_label) != len(classes_): raise ValueError('The length of outlier_label: {} is ' 'inconsistent with output ' @@ -381,7 +381,7 @@ def fit(self, X, y): for classes, label in zip(classes_, outlier_label_)): raise TypeError('The dtype of outlier_label is' 'inconsistent with y') - + self.outlier_label_ = outlier_label_ return self diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 4b678ba30b23a..805ad2898b318 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -318,7 +318,6 @@ def 
predict(self, X): "within specified radius; predicting NaN.") warnings.warn(empty_warning_msg) - if self._y.ndim == 1: y_pred = y_pred.ravel() From 8dcb4f31bfb88978b3d013c660959c4871706e97 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 23 Apr 2018 23:40:21 -0500 Subject: [PATCH 28/48] change back from "raise" to None, fix some typos --- sklearn/neighbors/classification.py | 28 +++++++++++------------ sklearn/neighbors/regression.py | 6 ++--- sklearn/neighbors/tests/test_neighbors.py | 6 ++--- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 565e124cf6412..8bee53ec9b713 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -283,14 +283,14 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, metric. See the documentation of the DistanceMetric class for a list of available metrics. - outlier_label : manual label, 'most_frequent', 'raise', optional - (default = 'raise') + outlier_label : manual label, 'most_frequent', None, optional + (default = None) - manual label: str or int label (should be the same type as y) or list of manual labels if multi ouputs are used. label given for outlier samples (samples with no neighbors in given radius). - 'most_frequent' : assign the most frequent label to outliers. - - 'raise' : when outlier is detected, ValueError is raised. + - None : when outlier is detected, ValueError is raised. metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. 
@@ -325,7 +325,7 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, def __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', - outlier_label='raise', metric_params=None, **kwargs): + outlier_label=None, metric_params=None, **kwargs): super(RadiusNeighborsClassifier, self).__init__( radius=radius, algorithm=algorithm, @@ -362,8 +362,8 @@ def fit(self, X, y): label_count = np.bincount(_y[:, k]) outlier_label_.append(classes_k[label_count.argmax()]) - elif self.outlier_label == 'raise': - outlier_label_ = 'raise' + elif self.outlier_label is None: + outlier_label_ = None else: if (_is_arraylike(self.outlier_label) and @@ -405,9 +405,9 @@ def predict(self, X): neigh_dist, neigh_ind = self.radius_neighbors(X) outlier_mask = np.array([len(nind) == 0 for nind in neigh_ind]) - indecies = np.arange(n_samples) - outliers = indecies[outlier_mask] - inliers = indecies[~outlier_mask] + indices = np.arange(n_samples) + outliers = indices[outlier_mask] + inliers = indices[~outlier_mask] classes_ = self.classes_ _y = self._y @@ -416,7 +416,7 @@ def predict(self, X): classes_ = [self.classes_] n_outputs = len(classes_) - if self.outlier_label_ == 'raise' and outliers.size > 0: + if self.outlier_label_ is None and outliers.size > 0: raise ValueError('No neighbors found for test samples %r, ' 'you can try to use larger radius, ' 'give a label for outliers, ' @@ -471,9 +471,9 @@ def predict_proba(self, X): neigh_dist, neigh_ind = self.radius_neighbors(X) outlier_mask = np.array([len(nind) == 0 for nind in neigh_ind]) - indecies = np.arange(n_samples) - outliers = indecies[outlier_mask] - inliers = indecies[~outlier_mask] + indices = np.arange(n_samples) + outliers = indices[outlier_mask] + inliers = indices[~outlier_mask] classes_ = self.classes_ _y = self._y @@ -481,7 +481,7 @@ def predict_proba(self, X): _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] - if self.outlier_label_ == 'raise' and outliers.size 
> 0: + if self.outlier_label_ is None and outliers.size > 0: raise ValueError('No neighbors found for test samples %r, ' 'you can try to use larger radius, ' 'give a label for outliers, ' diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 805ad2898b318..ae1ceb2df32f1 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -15,10 +15,8 @@ import numpy as np from scipy.sparse import issparse -from .base import \ - _check_weights, _get_weights, \ - NeighborsBase, KNeighborsMixin,\ - RadiusNeighborsMixin, SupervisedFloatMixin +from .base import _check_weights, _get_weights, NeighborsBase, KNeighborsMixin +from .base import RadiusNeighborsMixin, SupervisedFloatMixin from ..base import RegressorMixin from ..utils import check_array diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 52282cce3fc44..3aa1284722ab7 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -334,7 +334,7 @@ def test_radius_neighbors_classifier_when_no_neighbors(): weight_func = _weight_func - for outlier_label in [0, -1, 'raise']: + for outlier_label in [0, -1, None]: for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: rnc = neighbors.RadiusNeighborsClassifier @@ -343,7 +343,7 @@ def test_radius_neighbors_classifier_when_no_neighbors(): clf.fit(X, y) assert_array_equal(np.array([1, 2]), clf.predict(z1)) - if outlier_label == 'raise': + if outlier_label is None: assert_raises(ValueError, clf.predict, z2) elif False: assert_array_equal(np.array([1, outlier_label]), @@ -1403,7 +1403,7 @@ def check_exception(): assert_equal(pred[1, 0], 0) assert_equal(pred[1, 1], 1) - # test inconsistent outlier label list legth + # test inconsistent outlier label list length def check_exception(): clf = RNC(radius=1, outlier_label=[0, 1, 2]) clf.fit(X, y_multi) From e4e28fa23344880a81382d34cc9af84aaeb6c620 Mon Sep 17 
00:00:00 2001 From: Webber_Windows Date: Mon, 23 Apr 2018 23:53:06 -0500 Subject: [PATCH 29/48] fix indent conflicts --- sklearn/neighbors/classification.py | 18 +++++++++--------- sklearn/neighbors/regression.py | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 8bee53ec9b713..ea35a386fccbd 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -123,11 +123,11 @@ def __init__(self, n_neighbors=5, **kwargs): super(KNeighborsClassifier, self).__init__( - n_neighbors=n_neighbors, - algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, metric=metric, p=p, + metric_params=metric_params, + n_jobs=n_jobs, **kwargs) self.weights = _check_weights(weights) def predict(self, X): @@ -327,10 +327,10 @@ def __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, **kwargs): super(RadiusNeighborsClassifier, self).__init__( - radius=radius, - algorithm=algorithm, - leaf_size=leaf_size, - metric=metric, p=p, metric_params=metric_params, **kwargs) + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, p=p, metric_params=metric_params, **kwargs) self.weights = _check_weights(weights) self.outlier_label = outlier_label diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index ae1ceb2df32f1..bc5872925b967 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -268,10 +268,10 @@ def __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, **kwargs): super(RadiusNeighborsRegressor, self).__init__( - radius=radius, - algorithm=algorithm, - leaf_size=leaf_size, - p=p, 
metric=metric, metric_params=metric_params, **kwargs) + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + p=p, metric=metric, metric_params=metric_params, **kwargs) self.weights = _check_weights(weights) def predict(self, X): From 135f0c4f02a559284c65e61d4566ea1ff7d53af1 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sat, 5 May 2018 14:57:23 -0500 Subject: [PATCH 30/48] fix documentation, indices, add predict_proba tests --- sklearn/neighbors/classification.py | 28 +++++------ sklearn/neighbors/tests/test_neighbors.py | 60 +++++++++++------------ 2 files changed, 41 insertions(+), 47 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index ea35a386fccbd..744925c4dbe92 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -283,15 +283,14 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, metric. See the documentation of the DistanceMetric class for a list of available metrics. - outlier_label : manual label, 'most_frequent', None, optional - (default = None) + outlier_label : {manual label, 'most_frequent'}, optional (default = None) + label for outlier samples (samples with no neighbors in given radius). - manual label: str or int label (should be the same type as y) or list of manual labels if multi ouputs are used. - label given for outlier samples (samples with no neighbors in - given radius). - 'most_frequent' : assign the most frequent label to outliers. - None : when outlier is detected, ValueError is raised. + metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. 
@@ -306,7 +305,7 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, >>> print(neigh.predict([[1.5]])) [0] >>> print(neigh.predict_proba([[1.0]])) - [[ 0.66666667 0.33333333]] + [[0.66666667 0.33333333]] See also -------- @@ -356,21 +355,19 @@ def fit(self, X, y): _y = self._y.reshape((-1, 1)) classes_ = [self.classes_] - if self.outlier_label == 'most_frequent': + if self.outlier_label is None: + outlier_label_ = None + elif self.outlier_label == 'most_frequent': outlier_label_ = [] for k, classes_k in enumerate(classes_): label_count = np.bincount(_y[:, k]) outlier_label_.append(classes_k[label_count.argmax()]) - - elif self.outlier_label is None: - outlier_label_ = None - else: if (_is_arraylike(self.outlier_label) and not isinstance(self.outlier_label, string_types)): if len(self.outlier_label) != len(classes_): raise ValueError('The length of outlier_label: {} is ' - 'inconsistent with output ' + 'inconsistent with the output ' 'length: {}'.format(self.outlier_label, len(classes_))) outlier_label_ = self.outlier_label @@ -404,7 +401,8 @@ def predict(self, X): n_samples = _num_samples(X) neigh_dist, neigh_ind = self.radius_neighbors(X) - outlier_mask = np.array([len(nind) == 0 for nind in neigh_ind]) + outlier_mask = np.zeros(n_samples, dtype=np.bool) + outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] indices = np.arange(n_samples) outliers = indices[outlier_mask] inliers = indices[~outlier_mask] @@ -470,7 +468,8 @@ def predict_proba(self, X): n_samples = _num_samples(X) neigh_dist, neigh_ind = self.radius_neighbors(X) - outlier_mask = np.array([len(nind) == 0 for nind in neigh_ind]) + outlier_mask = np.zeros(n_samples, dtype=np.bool) + outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] indices = np.arange(n_samples) outliers = indices[outlier_mask] inliers = indices[~outlier_mask] @@ -515,8 +514,7 @@ def predict_proba(self, X): if outliers.size > 0: label_index = np.where(classes_k == self.outlier_label_[k]) if 
label_index[0].size != 0: - proba_k[outliers, - label_index[0][0]] = 1.0 + proba_k[outliers, label_index[0][0]] = 1.0 # normalize 'votes' into real [0,1] probabilities normalizer = proba_k.sum(axis=1)[:, np.newaxis] diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 3aa1284722ab7..fe542893a6c82 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1327,8 +1327,25 @@ def test_pairwise_boolean_distance(): assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) +def test_radius_neighbors_predidct_proba(): + for i in range(5): + X, y = datasets.make_classification(n_samples=50, n_features=5, + n_informative=3, n_redundant=0, + n_classes=3, random_state=i) + X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0) + random_label = np.random.randint(-5, 5) + clf = neighbors.RadiusNeighborsClassifier(radius=2, + outlier_label=random_label) + clf.fit(X_tr, y_tr) + pred = clf.predict(X_te) + proba = clf.predict_proba(X_te) + proba_label = proba.argmax(axis=1) + proba_label = np.where(proba.sum(axis=1) == 0, + random_label, proba_label) + assert_array_equal(pred, proba_label) + + def test_radius_neighbors_outliers(): - # outler handlers: 1. 
uniform, prior, None RNC = neighbors.RadiusNeighborsClassifier X = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]] y = [0, 2, 2, 1, 1, 1, 3, 3, 3, 3] @@ -1342,32 +1359,22 @@ def check_exception(): # test most frequent clf = RNC(radius=1, outlier_label='most_frequent') clf.fit(X, y) - predict = clf.predict_proba([[1], [15]]) - assert_equal(predict[1, 0], 0) - assert_equal(predict[1, 1], 0) - assert_equal(predict[1, 2], 0) - assert_equal(predict[1, 3], 1) + proba = clf.predict_proba([[1], [15]]) + assert_array_equal(proba[1, :], [0, 0, 0, 1]) # test manual label in y clf = RNC(radius=1, outlier_label=1) clf.fit(X, y) proba = clf.predict_proba([[1], [15]]) - assert_equal(proba[1, 0], 0.0) - assert_equal(proba[1, 1], 1.0) - assert_equal(proba[1, 2], 0.0) - assert_equal(proba[1, 3], 0.0) + assert_array_equal(proba[1, :], [0, 1, 0, 0]) pred = clf.predict([[1], [15]]) - assert_equal(pred[0], 2) - assert_equal(pred[1], 1) + assert_array_equal(pred, [2, 1]) # test manual label out of y clf = RNC(radius=1, outlier_label=4) clf.fit(X, y) proba = clf.predict_proba([[1], [15]]) - assert_equal(proba[1, 0], 0.0) - assert_equal(proba[1, 1], 0.0) - assert_equal(proba[1, 2], 0.0) - assert_equal(proba[1, 3], 0.0) + assert_array_equal(proba[1, :], [0, 0, 0, 0]) pred = clf.predict([[7], [15]]) assert_equal(pred[1], 4.0) @@ -1377,13 +1384,9 @@ def check_exception(): clf = RNC(radius=1, outlier_label=1) clf.fit(X, y_multi) proba = clf.predict_proba([[7], [15]]) - assert_equal(proba[1][1, 0], 0.0) - assert_equal(proba[1][1, 1], 1.0) - assert_equal(proba[1][1, 2], 0.0) - assert_equal(proba[1][1, 3], 0.0) + assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) pred = clf.predict([[7], [15]]) - assert_equal(pred[1, 0], 1) - assert_equal(pred[1, 1], 1) + assert_array_equal(pred[1, :], [1, 1]) # test multi output different outlier label y_multi = [[0, 0], [2, 2], [2, 2], [1, 1], [1, 1], @@ -1391,17 +1394,10 @@ def check_exception(): clf = RNC(radius=1, outlier_label=[0, 1]) clf.fit(X, y_multi) 
proba = clf.predict_proba([[7], [15]]) - assert_equal(proba[0][1, 0], 1.0) - assert_equal(proba[0][1, 1], 0.0) - assert_equal(proba[0][1, 2], 0.0) - assert_equal(proba[0][1, 3], 0.0) - assert_equal(proba[1][1, 0], 0.0) - assert_equal(proba[1][1, 1], 1.0) - assert_equal(proba[1][1, 2], 0.0) - assert_equal(proba[1][1, 3], 0.0) + assert_array_equal(proba[0][1, :], [1, 0, 0, 0]) + assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) pred = clf.predict([[7], [15]]) - assert_equal(pred[1, 0], 0) - assert_equal(pred[1, 1], 1) + assert_array_equal(pred[1, :], [0, 1]) # test inconsistent outlier label list length def check_exception(): From d86e1dd01d7baeef315e64288e656acc0afee2ce Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sat, 5 May 2018 16:44:57 -0500 Subject: [PATCH 31/48] add a space in doc --- sklearn/neighbors/classification.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 744925c4dbe92..642b72fb9141a 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -285,6 +285,7 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, outlier_label : {manual label, 'most_frequent'}, optional (default = None) label for outlier samples (samples with no neighbors in given radius). + - manual label: str or int label (should be the same type as y) or list of manual labels if multi ouputs are used. - 'most_frequent' : assign the most frequent label to outliers. 
From 7f7a8406e7a2ea98480d41cada136e1b34bc52fc Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 7 May 2018 21:09:25 -0500 Subject: [PATCH 32/48] improve documentation --- sklearn/neighbors/classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 642b72fb9141a..fd6f3a0c3efde 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -287,9 +287,9 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, label for outlier samples (samples with no neighbors in given radius). - manual label: str or int label (should be the same type as y) - or list of manual labels if multi ouputs are used. - - 'most_frequent' : assign the most frequent label to outliers. - - None : when outlier is detected, ValueError is raised. + or list of manual labels if multi-output is used. + - 'most_frequent' : assign the most frequent label of y to outliers. + - None : when any outlier is detected, ValueError will be raised. metric_params : dict, optional (default = None) From 40c41b4a3e606d7c343045930f2dcd15a5095748 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sat, 12 May 2018 11:25:13 -0500 Subject: [PATCH 33/48] add warning for zero probas, simplify outlier indexing, merge testing, add whats new, improve doc --- doc/whats_new/v0.20.rst | 6 + sklearn/neighbors/classification.py | 20 ++-- sklearn/neighbors/tests/test_neighbors.py | 128 +++++++++++----------- 3 files changed, 82 insertions(+), 72 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 7e7d39dbf1759..a3bc5abb23901 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -125,6 +125,12 @@ Classifiers and regressors only require X to be an object with finite length or shape. :issue:`9832` by :user:`Vrishank Bhardwaj `. 
+- :class:`neighbors.RadiusNeighborsClassifier` now supports + predicting probabilities by using predict_proba() and supports more + outlier_label options: 'most_frequent', different oulier_labels + for multi-outputs. + :issue:`9629` by :user:`Wenbo Zhao `. + Preprocessing - :class:`preprocessing.PolynomialFeatures` now supports sparse input. diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index fd6f3a0c3efde..fc9ce1daeb436 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -14,6 +14,7 @@ from ..utils.validation import _is_arraylike, _num_samples from ..externals.six import string_types +import warnings from .base import \ _check_weights, _get_weights, \ NeighborsBase, KNeighborsMixin,\ @@ -184,7 +185,7 @@ def predict_proba(self, X): Returns ------- - p : array of shape = [n_samples, n_classes], or a list of n_outputs + p : array of shape = [n_samples, n_classes], or a list with n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. @@ -404,9 +405,8 @@ def predict(self, X): neigh_dist, neigh_ind = self.radius_neighbors(X) outlier_mask = np.zeros(n_samples, dtype=np.bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] - indices = np.arange(n_samples) - outliers = indices[outlier_mask] - inliers = indices[~outlier_mask] + outliers = np.flatnonzero(outlier_mask) + inliers = np.flatnonzero(~outlier_mask) classes_ = self.classes_ _y = self._y @@ -459,7 +459,7 @@ def predict_proba(self, X): Returns ------- - p : array of shape = [n_samples, n_classes], or a list of n_outputs + p : array of shape = [n_samples, n_classes], or a list with n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. 
@@ -471,9 +471,8 @@ def predict_proba(self, X): neigh_dist, neigh_ind = self.radius_neighbors(X) outlier_mask = np.zeros(n_samples, dtype=np.bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] - indices = np.arange(n_samples) - outliers = indices[outlier_mask] - inliers = indices[~outlier_mask] + outliers = np.flatnonzero(outlier_mask) + inliers = np.flatnonzero(~outlier_mask) classes_ = self.classes_ _y = self._y @@ -516,6 +515,11 @@ def predict_proba(self, X): label_index = np.where(classes_k == self.outlier_label_[k]) if label_index[0].size != 0: proba_k[outliers, label_index[0][0]] = 1.0 + else: + warnings.warn('Outlier label {} is not in training ' + 'classes. All class probabilities of ' + 'outliers will be assigned with 0.' + ''.format(self.outlier_label_[k])) # normalize 'votes' into real [0,1] probabilities normalizer = proba_k.sum(axis=1)[:, np.newaxis] diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index fe542893a6c82..eb8bc1002ae90 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -363,6 +363,7 @@ def test_radius_neighbors_classifier_outlier_labeling(): z2 = np.array([[1.4, 1.4], [1.01, 1.01], [2.01, 2.01]]) # one outlier correct_labels1 = np.array([1, 2]) correct_labels2 = np.array([-1, 1, 2]) + outlier_proba = np.array([0, 0]) weight_func = _weight_func @@ -375,6 +376,66 @@ def test_radius_neighbors_classifier_outlier_labeling(): clf.fit(X, y) assert_array_equal(correct_labels1, clf.predict(z1)) assert_array_equal(correct_labels2, clf.predict(z2)) + assert_array_equal(outlier_proba, clf.predict_proba(z2)[0]) + + # test outlier_labeling of using predict_proba() + RNC = neighbors.RadiusNeighborsClassifier + X = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]) + y = np.array([0, 2, 2, 1, 1, 1, 3, 3, 3, 3]) + + # test invalid outlier_label dtype + def check_exception(): + clf = RNC(radius=1, outlier_label='a') + clf.fit(X, y) 
+ assert_raises(TypeError, check_exception) + + # test most frequent + clf = RNC(radius=1, outlier_label='most_frequent') + clf.fit(X, y) + proba = clf.predict_proba([[1], [15]]) + assert_array_equal(proba[1, :], [0, 0, 0, 1]) + + # test manual label in y + clf = RNC(radius=1, outlier_label=1) + clf.fit(X, y) + proba = clf.predict_proba([[1], [15]]) + assert_array_equal(proba[1, :], [0, 1, 0, 0]) + pred = clf.predict([[1], [15]]) + assert_array_equal(pred, [2, 1]) + + # test manual label out of y warning + def check_warning(): + clf = RNC(radius=1, outlier_label=4) + clf.fit(X, y) + clf.predict_proba([[1], [15]]) + assert_warns(check_warning) + + # test multi output same outlier label + y_multi = [[0, 1], [2, 1], [2, 2], [1, 2], [1, 2], + [1, 3], [3, 3], [3, 3], [3, 0], [3, 0]] + clf = RNC(radius=1, outlier_label=1) + clf.fit(X, y_multi) + proba = clf.predict_proba([[7], [15]]) + assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) + pred = clf.predict([[7], [15]]) + assert_array_equal(pred[1, :], [1, 1]) + + # test multi output different outlier label + y_multi = [[0, 0], [2, 2], [2, 2], [1, 1], [1, 1], + [1, 1], [3, 3], [3, 3], [3, 3], [3, 3]] + clf = RNC(radius=1, outlier_label=[0, 1]) + clf.fit(X, y_multi) + proba = clf.predict_proba([[7], [15]]) + assert_array_equal(proba[0][1, :], [1, 0, 0, 0]) + assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) + pred = clf.predict([[7], [15]]) + assert_array_equal(pred[1, :], [0, 1]) + + # test inconsistent outlier label list length + def check_exception(): + clf = RNC(radius=1, outlier_label=[0, 1, 2]) + clf.fit(X, y_multi) + assert_raises(ValueError, check_exception) def test_radius_neighbors_classifier_zero_distance(): @@ -1333,74 +1394,13 @@ def test_radius_neighbors_predidct_proba(): n_informative=3, n_redundant=0, n_classes=3, random_state=i) X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0) - random_label = np.random.randint(-5, 5) + outlier_label = int(2 - i) clf = 
neighbors.RadiusNeighborsClassifier(radius=2, - outlier_label=random_label) + outlier_label=outlier_label) clf.fit(X_tr, y_tr) pred = clf.predict(X_te) proba = clf.predict_proba(X_te) proba_label = proba.argmax(axis=1) proba_label = np.where(proba.sum(axis=1) == 0, - random_label, proba_label) + outlier_label, proba_label) assert_array_equal(pred, proba_label) - - -def test_radius_neighbors_outliers(): - RNC = neighbors.RadiusNeighborsClassifier - X = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]] - y = [0, 2, 2, 1, 1, 1, 3, 3, 3, 3] - - # test invalid outlier_label dtype - def check_exception(): - clf = RNC(radius=1, outlier_label='a') - clf.fit(X, y) - assert_raises(TypeError, check_exception) - - # test most frequent - clf = RNC(radius=1, outlier_label='most_frequent') - clf.fit(X, y) - proba = clf.predict_proba([[1], [15]]) - assert_array_equal(proba[1, :], [0, 0, 0, 1]) - - # test manual label in y - clf = RNC(radius=1, outlier_label=1) - clf.fit(X, y) - proba = clf.predict_proba([[1], [15]]) - assert_array_equal(proba[1, :], [0, 1, 0, 0]) - pred = clf.predict([[1], [15]]) - assert_array_equal(pred, [2, 1]) - - # test manual label out of y - clf = RNC(radius=1, outlier_label=4) - clf.fit(X, y) - proba = clf.predict_proba([[1], [15]]) - assert_array_equal(proba[1, :], [0, 0, 0, 0]) - pred = clf.predict([[7], [15]]) - assert_equal(pred[1], 4.0) - - # test multi output same outlier label - y_multi = [[0, 1], [2, 1], [2, 2], [1, 2], [1, 2], - [1, 3], [3, 3], [3, 3], [3, 0], [3, 0]] - clf = RNC(radius=1, outlier_label=1) - clf.fit(X, y_multi) - proba = clf.predict_proba([[7], [15]]) - assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) - pred = clf.predict([[7], [15]]) - assert_array_equal(pred[1, :], [1, 1]) - - # test multi output different outlier label - y_multi = [[0, 0], [2, 2], [2, 2], [1, 1], [1, 1], - [1, 1], [3, 3], [3, 3], [3, 3], [3, 3]] - clf = RNC(radius=1, outlier_label=[0, 1]) - clf.fit(X, y_multi) - proba = clf.predict_proba([[7], [15]]) - 
assert_array_equal(proba[0][1, :], [1, 0, 0, 0]) - assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) - pred = clf.predict([[7], [15]]) - assert_array_equal(pred[1, :], [0, 1]) - - # test inconsistent outlier label list length - def check_exception(): - clf = RNC(radius=1, outlier_label=[0, 1, 2]) - clf.fit(X, y_multi) - assert_raises(ValueError, check_exception) From f7e58915946cf23eaaf88bc4fe360f38d1381c6d Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sat, 12 May 2018 12:01:03 -0500 Subject: [PATCH 34/48] fix assert_warns --- sklearn/neighbors/tests/test_neighbors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index eb8bc1002ae90..7b57a27b521d7 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -408,7 +408,7 @@ def check_warning(): clf = RNC(radius=1, outlier_label=4) clf.fit(X, y) clf.predict_proba([[1], [15]]) - assert_warns(check_warning) + assert_warns(UserWarning, check_warning) # test multi output same outlier label y_multi = [[0, 1], [2, 1], [2, 2], [1, 2], [1, 2], From b32b45c4cedf51de0e504f750db6e33e2eb5cfdc Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sat, 20 Jul 2019 22:30:37 -0500 Subject: [PATCH 35/48] change predict() implementaion, add docs in whats_new --- doc/whats_new/v0.22.rst | 15 ++++ sklearn/neighbors/classification.py | 104 ++++++++-------------- sklearn/neighbors/tests/test_neighbors.py | 6 +- 3 files changed, 57 insertions(+), 68 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index f0670e1293369..ec9ef189d20f0 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -117,6 +117,7 @@ Changelog :mod:`sklearn.feature_selection` ................................ 
+ - |Fix| Fixed a bug where :class:`VarianceThreshold` with `threshold=0` did not remove constant features due to numerical instability, by using range rather than variance in this case. @@ -130,6 +131,20 @@ Changelog NumPy array, SciPy sparse matrix, and Pandas DataFrame. :pr:`14035` by `Guillaume Lemaitre `. +:mod:`sklearn.neighbors` +.................... + +- |Feature| :class:`neighbors.RadiusNeighborsClassifier` now supports + predicting probabilities by using predict_proba() and supports more + outlier_label options: 'most_frequent', different oulier_labels + for multi-outputs. + :issue:`9629` by :user:`Wenbo Zhao `. + +- |Efficiency| Efficiency improvements for + :func:`neighbors.RadiusNeighborsClassifier.prdict` by changing + implementation from scipy.stats.mode to numpy.bincount. + :pr:`9597` by :user:`Wenbo Zhao `. + :mod:`sklearn.neural_network` ............................. diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index df4bdaa1ca529..7999380e0ac8b 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -292,7 +292,6 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, - 'most_frequent' : assign the most frequent label of y to outliers. - None : when any outlier is detected, ValueError will be raised. - metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. @@ -392,70 +391,6 @@ def fit(self, X, y): self.outlier_label_ = outlier_label_ return self - def predict(self, X): - """Predict the class labels for the provided data - - Parameters - ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' - Test samples. - - Returns - ------- - y : array of shape [n_samples] or [n_samples, n_outputs] - Class labels for each data sample. 
- - """ - X = check_array(X, accept_sparse='csr') - n_samples = _num_samples(X) - - neigh_dist, neigh_ind = self.radius_neighbors(X) - outlier_mask = np.zeros(n_samples, dtype=np.bool) - outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] - outliers = np.flatnonzero(outlier_mask) - inliers = np.flatnonzero(~outlier_mask) - - classes_ = self.classes_ - _y = self._y - if not self.outputs_2d_: - _y = self._y.reshape((-1, 1)) - classes_ = [self.classes_] - n_outputs = len(classes_) - - if self.outlier_label_ is None and outliers.size > 0: - raise ValueError('No neighbors found for test samples %r, ' - 'you can try to use larger radius, ' - 'give a label for outliers, ' - 'or consider removing them from your dataset.' - % outliers) - - weights = _get_weights(neigh_dist, self.weights) - - y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype) - for k, classes_k in enumerate(classes_): - pred_labels = np.zeros(len(neigh_ind), dtype=object) - pred_labels[:] = [_y[ind, k] for ind in neigh_ind] - if weights is None: - mode = np.array([stats.mode(pl)[0] - for pl in pred_labels[inliers]], dtype=np.int) - else: - mode = np.array( - [weighted_mode(pl, w)[0] - for (pl, w) in zip(pred_labels[inliers], weights[inliers]) - ], dtype=np.int) - - mode = mode.ravel() - - y_pred[inliers, k] = classes_k.take(mode) - if outliers.size > 0: - y_pred[outliers, k] = self.outlier_label_[k] - - if not self.outputs_2d_: - y_pred = y_pred.ravel() - - return y_pred - def predict_proba(self, X): """Return probability estimates for the test data X. @@ -540,3 +475,42 @@ def predict_proba(self, X): probabilities = probabilities[0] return probabilities + + def predict(self, X): + """Predict the class labels for the provided data + Parameters + ---------- + X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + Test samples. + Returns + ------- + y : array of shape [n_samples] or [n_samples, n_outputs] + Class labels for each data sample. 
+ """ + + probs = self.predict_proba(X) + classes_ = self.classes_ + + if not self.outputs_2d_: + probs = [probs] + classes_ = [self.classes_] + + n_outputs = len(classes_) + n_samples = probs[0].shape[0] + y_pred = np.empty((n_samples, n_outputs), + dtype=classes_[0].dtype) + + for k, prob in enumerate(probs): + max_prob_index = prob.argmax(axis=1) + y_pred[:, k] = classes_[k].take(max_prob_index) + + outlier_zero_probs = (prob == 0).all(axis=1) + if outlier_zero_probs.any(): + zero_prob_index = np.flatnonzero(outlier_zero_probs) + y_pred[zero_prob_index, k] = self.outlier_label_[k] + + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + return y_pred diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 1996d7dbf5e0b..770a8c2f195a3 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1477,12 +1477,12 @@ def test_pairwise_boolean_distance(): def test_radius_neighbors_predidct_proba(): - for i in range(5): + for seed in range(5): X, y = datasets.make_classification(n_samples=50, n_features=5, n_informative=3, n_redundant=0, - n_classes=3, random_state=i) + n_classes=3, random_state=seed) X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0) - outlier_label = int(2 - i) + outlier_label = int(2 - seed) clf = neighbors.RadiusNeighborsClassifier(radius=2, outlier_label=outlier_label) clf.fit(X_tr, y_tr) From 19d95e5c3530b51a95c9b0b771499c75789b7274 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 21 Jul 2019 00:23:47 -0500 Subject: [PATCH 36/48] remove changes in 0.20 docs --- doc/whats_new/v0.20.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index b73d300742cf5..d48f0d3b01485 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -1224,12 +1224,6 @@ Support for Python 3.3 has been officially dropped. memory efficient when ``algorithm='brute'``. 
:issue:`11136` by `Joel Nothman`_ and :user:`Aman Dalmia `. -- |Feature| :class:`neighbors.RadiusNeighborsClassifier` now supports - predicting probabilities by using predict_proba() and supports more - outlier_label options: 'most_frequent', different oulier_labels - for multi-outputs. - :issue:`9629` by :user:`Wenbo Zhao `. - - |Feature| Add ``sample_weight`` parameter to the fit method of :class:`neighbors.KernelDensity` to enable weighting in kernel density estimation. From 69b88e61d470ba3b4b723e0a2a544a9a67f419e0 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 21 Jul 2019 00:39:06 -0500 Subject: [PATCH 37/48] change external.six to six --- sklearn/neighbors/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 7999380e0ac8b..a29ee75fb6b89 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -10,9 +10,9 @@ import numpy as np from scipy import stats +from six import string_types from ..utils.extmath import weighted_mode from ..utils.validation import _is_arraylike, _num_samples -from ..externals.six import string_types import warnings from .base import \ From 8c8163bd047a2349bc3491c0c36b0b85191e67cc Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 21 Jul 2019 08:58:44 -0500 Subject: [PATCH 38/48] format doc strings --- sklearn/neighbors/classification.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index a29ee75fb6b89..c32c3af07ac33 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -133,7 +133,7 @@ def __init__(self, n_neighbors=5, self.weights = _check_weights(weights) def predict(self, X): - """Predict the class labels for the provided data + """Predict the class labels for the provided data. 
Parameters ---------- @@ -477,12 +477,14 @@ def predict_proba(self, X): return probabilities def predict(self, X): - """Predict the class labels for the provided data + """Predict the class labels for the provided data. + Parameters ---------- X : array-like, shape (n_query, n_features), \ or (n_query, n_indexed) if metric == 'precomputed' Test samples. + Returns ------- y : array of shape [n_samples] or [n_samples, n_outputs] From 543e1ed0e242155b4d755b1c80f8b6770204dd53 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Sun, 21 Jul 2019 10:43:25 -0500 Subject: [PATCH 39/48] fix grammer in docs --- sklearn/neighbors/classification.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index c32c3af07ac33..87a71abaf9bd7 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -185,7 +185,7 @@ def predict_proba(self, X): Returns ------- - p : array of shape = [n_samples, n_classes], or a list with n_outputs + p : array of shape = [n_samples, n_classes], or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. @@ -402,7 +402,7 @@ def predict_proba(self, X): Returns ------- - p : array of shape = [n_samples, n_classes], or a list with n_outputs + p : array of shape = [n_samples, n_classes], or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. @@ -425,9 +425,9 @@ def predict_proba(self, X): if self.outlier_label_ is None and outliers.size > 0: raise ValueError('No neighbors found for test samples %r, ' - 'you can try to use larger radius, ' - 'give a label for outliers, ' - 'or consider removing them from your dataset.' + 'you can try using larger radius, ' + 'giving a label for outliers, ' + 'or considering removing them from your dataset.' 
% outliers) weights = _get_weights(neigh_dist, self.weights) From e73a20a7cc1a11082f589404da9a355a2ad6d1b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Mon, 22 Jul 2019 10:04:01 -0700 Subject: [PATCH 40/48] Move predict function for cleaner diff --- sklearn/neighbors/classification.py | 82 ++++++++++++++--------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 87a71abaf9bd7..89133a2f90629 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -391,6 +391,47 @@ def fit(self, X, y): self.outlier_label_ = outlier_label_ return self + def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + Test samples. + + Returns + ------- + y : array of shape [n_samples] or [n_samples, n_outputs] + Class labels for each data sample. + """ + + probs = self.predict_proba(X) + classes_ = self.classes_ + + if not self.outputs_2d_: + probs = [probs] + classes_ = [self.classes_] + + n_outputs = len(classes_) + n_samples = probs[0].shape[0] + y_pred = np.empty((n_samples, n_outputs), + dtype=classes_[0].dtype) + + for k, prob in enumerate(probs): + max_prob_index = prob.argmax(axis=1) + y_pred[:, k] = classes_[k].take(max_prob_index) + + outlier_zero_probs = (prob == 0).all(axis=1) + if outlier_zero_probs.any(): + zero_prob_index = np.flatnonzero(outlier_zero_probs) + y_pred[zero_prob_index, k] = self.outlier_label_[k] + + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + return y_pred + def predict_proba(self, X): """Return probability estimates for the test data X. @@ -475,44 +516,3 @@ def predict_proba(self, X): probabilities = probabilities[0] return probabilities - - def predict(self, X): - """Predict the class labels for the provided data. 
- - Parameters - ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' - Test samples. - - Returns - ------- - y : array of shape [n_samples] or [n_samples, n_outputs] - Class labels for each data sample. - """ - - probs = self.predict_proba(X) - classes_ = self.classes_ - - if not self.outputs_2d_: - probs = [probs] - classes_ = [self.classes_] - - n_outputs = len(classes_) - n_samples = probs[0].shape[0] - y_pred = np.empty((n_samples, n_outputs), - dtype=classes_[0].dtype) - - for k, prob in enumerate(probs): - max_prob_index = prob.argmax(axis=1) - y_pred[:, k] = classes_[k].take(max_prob_index) - - outlier_zero_probs = (prob == 0).all(axis=1) - if outlier_zero_probs.any(): - zero_prob_index = np.flatnonzero(outlier_zero_probs) - y_pred[zero_prob_index, k] = self.outlier_label_[k] - - if not self.outputs_2d_: - y_pred = y_pred.ravel() - - return y_pred From 1638ee55459276c337f6a8f8bb8d7c75d6a34bfa Mon Sep 17 00:00:00 2001 From: Wenbo Zhao Date: Sun, 4 Aug 2019 19:08:40 -0500 Subject: [PATCH 41/48] Update doc/whats_new/v0.22.rst fix typo Co-Authored-By: Joel Nothman --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index eb348321ebf4b..8d11258966a03 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -285,7 +285,7 @@ Changelog :issue:`9629` by :user:`Wenbo Zhao `. - |Efficiency| Efficiency improvements for - :func:`neighbors.RadiusNeighborsClassifier.prdict` by changing + :func:`neighbors.RadiusNeighborsClassifier.predict` by changing implementation from scipy.stats.mode to numpy.bincount. :pr:`9597` by :user:`Wenbo Zhao `. 
From 39180ac0d1edce4792baa4ba100e349b845e4849 Mon Sep 17 00:00:00 2001 From: Wenbo Zhao Date: Mon, 5 Aug 2019 21:22:45 -0500 Subject: [PATCH 42/48] Update sklearn/neighbors/tests/test_neighbors.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit typo fix Co-Authored-By: Tom Dupré la Tour --- sklearn/neighbors/tests/test_neighbors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 770a8c2f195a3..ef97718ba53c7 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1476,7 +1476,7 @@ def test_pairwise_boolean_distance(): assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) -def test_radius_neighbors_predidct_proba(): +def test_radius_neighbors_predict_proba(): for seed in range(5): X, y = datasets.make_classification(n_samples=50, n_features=5, n_informative=3, n_redundant=0, From 644a9f1dc90a6d29058d7280416a2cdd400bc67b Mon Sep 17 00:00:00 2001 From: Wenbo Zhao Date: Mon, 5 Aug 2019 21:23:05 -0500 Subject: [PATCH 43/48] Update doc/whats_new/v0.22.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Tom Dupré la Tour --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 8d11258966a03..983b766edf699 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -279,7 +279,7 @@ Changelog .................... - |Feature| :class:`neighbors.RadiusNeighborsClassifier` now supports - predicting probabilities by using predict_proba() and supports more + predicting probabilities by using `predict_proba` and supports more outlier_label options: 'most_frequent', different oulier_labels for multi-outputs. :issue:`9629` by :user:`Wenbo Zhao `. 
From a21eb5c709fd8c17c02d44f4d6782252e2d6aa9a Mon Sep 17 00:00:00 2001 From: Wenbo Zhao Date: Mon, 5 Aug 2019 21:23:33 -0500 Subject: [PATCH 44/48] Update doc/whats_new/v0.22.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix typo Co-Authored-By: Tom Dupré la Tour --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 983b766edf699..81afc995f848b 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -280,7 +280,7 @@ Changelog - |Feature| :class:`neighbors.RadiusNeighborsClassifier` now supports predicting probabilities by using `predict_proba` and supports more - outlier_label options: 'most_frequent', different oulier_labels + outlier_label options: 'most_frequent', or different outlier_labels for multi-outputs. :issue:`9629` by :user:`Wenbo Zhao `. From 87b52886fc3dc7bada2c3173e14f01eac15b12c8 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 5 Aug 2019 21:51:36 -0500 Subject: [PATCH 45/48] add doc for iterations over multi-outputs, use np.flatnonzero instead of np.where in outlier_label checking --- sklearn/neighbors/classification.py | 12 +++++++++--- sklearn/neighbors/regression.py | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 89133a2f90629..c1888542f54ac 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -366,11 +366,15 @@ def fit(self, X, y): if self.outlier_label is None: outlier_label_ = None + elif self.outlier_label == 'most_frequent': outlier_label_ = [] + # iterate over multi-outputs to get the most frequest label + # for each output. 
for k, classes_k in enumerate(classes_): label_count = np.bincount(_y[:, k]) outlier_label_.append(classes_k[label_count.argmax()]) + else: if (_is_arraylike(self.outlier_label) and not isinstance(self.outlier_label, string_types)): @@ -382,6 +386,7 @@ def fit(self, X, y): outlier_label_ = self.outlier_label else: outlier_label_ = [self.outlier_label] * len(classes_) + # ensure the dtype of outlier label is consistent with y if any(np.append(classes, label).dtype != classes.dtype for classes, label in zip(classes_, outlier_label_)): @@ -476,6 +481,7 @@ def predict_proba(self, X): weights = weights[inliers] probabilities = [] + # iterate over multi-outputs for k, classes_k in enumerate(classes_): pred_labels = np.zeros(len(neigh_ind), dtype=object) pred_labels[:] = [_y[ind, k] for ind in neigh_ind] @@ -496,9 +502,9 @@ def predict_proba(self, X): proba_k[inliers, :] = proba_inl if outliers.size > 0: - label_index = np.where(classes_k == self.outlier_label_[k]) - if label_index[0].size != 0: - proba_k[outliers, label_index[0][0]] = 1.0 + label_index = np.flatnonzero(classes_k == self.outlier_label_[k]) + if label_index.size == 1: + proba_k[outliers, label_index[0]] = 1.0 else: warnings.warn('Outlier label {} is not in training ' 'classes. 
All class probabilities of ' diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py index 392bfd76cb7c2..67d9467f14556 100644 --- a/sklearn/neighbors/regression.py +++ b/sklearn/neighbors/regression.py @@ -15,7 +15,7 @@ import numpy as np from scipy.sparse import issparse -from .base import _check_weights, _get_weights, NeighborsBase, KNeighborsMixin +from .base import _get_weights, _check_weights, NeighborsBase, KNeighborsMixin from .base import RadiusNeighborsMixin, SupervisedFloatMixin from ..base import RegressorMixin from ..utils import check_array From ab7344029dedd1cdfbbf73b99e9474e9c8cd9ca9 Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 5 Aug 2019 22:55:02 -0500 Subject: [PATCH 46/48] add outlier_label scalar verification and unit test, fix format, add better comments --- sklearn/neighbors/classification.py | 32 ++++++++++++++--------- sklearn/neighbors/tests/test_neighbors.py | 10 +++++-- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 418965cfa148a..589198739796a 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -409,8 +409,8 @@ def fit(self, X, y): elif self.outlier_label == 'most_frequent': outlier_label_ = [] - # iterate over multi-outputs to get the most frequest label - # for each output. + # iterate over multi-output, get the most frequent label for each + # output. 
for k, classes_k in enumerate(classes_): label_count = np.bincount(_y[:, k]) outlier_label_.append(classes_k[label_count.argmax()]) @@ -419,19 +419,26 @@ def fit(self, X, y): if (_is_arraylike(self.outlier_label) and not isinstance(self.outlier_label, string_types)): if len(self.outlier_label) != len(classes_): - raise ValueError('The length of outlier_label: {} is ' - 'inconsistent with the output ' - 'length: {}'.format(self.outlier_label, + raise ValueError("The length of outlier_label: {} is " + "inconsistent with the output " + "length: {}".format(self.outlier_label, len(classes_))) outlier_label_ = self.outlier_label else: outlier_label_ = [self.outlier_label] * len(classes_) - # ensure the dtype of outlier label is consistent with y - if any(np.append(classes, label).dtype != classes.dtype - for classes, label in zip(classes_, outlier_label_)): - raise TypeError('The dtype of outlier_label is' - 'inconsistent with y') + for classes, label in zip(classes_, outlier_label_): + if (_is_arraylike(label) and + not isinstance(label, string_types)): + # ensure the outlier label for each output is a scalar. + raise TypeError("The outlier_label of classes {} is " + "supposed to be a scalar, got " + "{}.".format(classes, label)) + if np.append(classes, label).dtype != classes.dtype: + # ensure the dtype of outlier label is consistent with y. + raise TypeError("The dtype of outlier_label {} is " + "inconsistent with classes {} in " + "y.".format(label, classes)) self.outlier_label_ = outlier_label_ return self @@ -521,7 +528,7 @@ def predict_proba(self, X): weights = weights[inliers] probabilities = [] - # iterate over multi-outputs + # iterate over multi-output, measure probabilities of the k-th output. 
for k, classes_k in enumerate(classes_): pred_labels = np.zeros(len(neigh_ind), dtype=object) pred_labels[:] = [_y[ind, k] for ind in neigh_ind] @@ -542,7 +549,8 @@ def predict_proba(self, X): proba_k[inliers, :] = proba_inl if outliers.size > 0: - label_index = np.flatnonzero(classes_k == self.outlier_label_[k]) + _outlier_label = self.outlier_label_[k] + label_index = np.flatnonzero(classes_k == _outlier_label) if label_index.size == 1: proba_k[outliers, label_index[0]] = 1.0 else: diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index ef97718ba53c7..3da1c2579700f 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -405,11 +405,17 @@ def test_radius_neighbors_classifier_outlier_labeling(): X = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]) y = np.array([0, 2, 2, 1, 1, 1, 3, 3, 3, 3]) + # test outlier_label scalar verification + def check_array_exception(): + clf = RNC(radius=1, outlier_label=[[5]]) + clf.fit(X, y) + assert_raises(TypeError, check_array_exception) + # test invalid outlier_label dtype - def check_exception(): + def check_dtype_exception(): clf = RNC(radius=1, outlier_label='a') clf.fit(X, y) - assert_raises(TypeError, check_exception) + assert_raises(TypeError, check_dtype_exception) # test most frequent clf = RNC(radius=1, outlier_label='most_frequent') From 5f2d33e7eb766a062bdad607784c0099b8580afc Mon Sep 17 00:00:00 2001 From: Webber_Windows Date: Mon, 5 Aug 2019 23:16:20 -0500 Subject: [PATCH 47/48] better docs --- sklearn/neighbors/classification.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 589198739796a..a72f710ae57ea 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -471,6 +471,8 @@ def predict(self, X): dtype=classes_[0].dtype) for k, prob in enumerate(probs): + # iterate over multi-output, 
assign labels based on probabilities + # of each output. max_prob_index = prob.argmax(axis=1) y_pred[:, k] = classes_[k].take(max_prob_index) From 6ac2ffb7f045a699ccadcf8837be047d49f01d65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Wed, 7 Aug 2019 10:29:48 -0700 Subject: [PATCH 48/48] Update v0.22.rst --- doc/whats_new/v0.22.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 81afc995f848b..918696cbc83d2 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -265,7 +265,7 @@ Changelog - |Fix| Fixed a bug where :class:`VarianceThreshold` with `threshold=0` did not remove constant features due to numerical instability, by using range rather than variance in this case. - :pr:`13704` by `Roddy MacSween `. + :pr:`13704` by :user:`Roddy MacSween `. :mod:`sklearn.utils` .................... @@ -273,7 +273,7 @@ Changelog - |Enhancement| :func:`utils.safe_indexing` accepts an ``axis`` parameter to index array-like across rows and columns. The column indexing can be done on NumPy array, SciPy sparse matrix, and Pandas DataFrame. - :pr:`14035` by `Guillaume Lemaitre `. + :pr:`14035` by :user:`Guillaume Lemaitre `. :mod:`sklearn.neighbors` .................... @@ -282,11 +282,10 @@ Changelog predicting probabilities by using `predict_proba` and supports more outlier_label options: 'most_frequent', or different outlier_labels for multi-outputs. - :issue:`9629` by :user:`Wenbo Zhao `. + :pr:`9597` by :user:`Wenbo Zhao `. - |Efficiency| Efficiency improvements for - :func:`neighbors.RadiusNeighborsClassifier.predict` by changing - implementation from scipy.stats.mode to numpy.bincount. + :func:`neighbors.RadiusNeighborsClassifier.predict`. :pr:`9597` by :user:`Wenbo Zhao `. - |Fix| KNearestRegressor now throws error when fit on non-square data and