diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index c28231adbc1cd..918696cbc83d2 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -261,10 +261,11 @@ Changelog
 
 :mod:`sklearn.feature_selection`
 ................................
+
 - |Fix| Fixed a bug where :class:`VarianceThreshold` with `threshold=0` did not
   remove constant features due to numerical instability, by using range
   rather than variance in this case.
-  :pr:`13704` by `Roddy MacSween `.
+  :pr:`13704` by :user:`Roddy MacSween `.
 
 :mod:`sklearn.utils`
 ....................
@@ -272,10 +273,20 @@ Changelog
 - |Enhancement| :func:`utils.safe_indexing` accepts an ``axis`` parameter to
   index array-like across rows and columns. The column indexing can be done on
   NumPy array, SciPy sparse matrix, and Pandas DataFrame.
-  :pr:`14035` by `Guillaume Lemaitre `.
+  :pr:`14035` by :user:`Guillaume Lemaitre `.
 
 :mod:`sklearn.neighbors`
-.............................
+........................
+
+- |Feature| :class:`neighbors.RadiusNeighborsClassifier` now supports
+  predicting probabilities by using `predict_proba` and supports more
+  outlier_label options: 'most_frequent', or different outlier_labels
+  for multi-output problems.
+  :pr:`9597` by :user:`Wenbo Zhao `.
+
+- |Efficiency| Efficiency improvements for
+  :func:`neighbors.RadiusNeighborsClassifier.predict`.
+  :pr:`9597` by :user:`Wenbo Zhao `.
 
 - |Fix| KNearestRegressor now throws error when fit on non-square data and
   metric = precomputed. :class:`neighbors.NeighborsBase`
diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py
index f0fd0b084365a..a72f710ae57ea 100644
--- a/sklearn/neighbors/classification.py
+++ b/sklearn/neighbors/classification.py
@@ -10,8 +10,10 @@
 import numpy as np
 from scipy import stats
 
 from ..utils.extmath import weighted_mode
+from ..utils.validation import _is_arraylike, _num_samples
 
+import warnings
 from .base import \
     _check_weights, _get_weights, \
     NeighborsBase, KNeighborsMixin,\
@@ -141,7 +143,6 @@ def __init__(self, n_neighbors=5,
                  weights='uniform', algorithm='auto', leaf_size=30,
                  p=2, metric='minkowski', metric_params=None, n_jobs=None,
                  **kwargs):
-
         super().__init__(
             n_neighbors=n_neighbors,
             algorithm=algorithm,
@@ -151,7 +152,7 @@ def __init__(self, n_neighbors=5,
         self.weights = _check_weights(weights)
 
     def predict(self, X):
-        """Predict the class labels for the provided data
+        """Predict the class labels for the provided data.
 
         Parameters
         ----------
@@ -174,7 +175,7 @@ def predict(self, X):
             classes_ = [self.classes_]
         n_outputs = len(classes_)
 
-        n_samples = X.shape[0]
+        n_samples = _num_samples(X)
         weights = _get_weights(neigh_dist, self.weights)
 
         y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
@@ -218,7 +219,7 @@ def predict_proba(self, X):
             _y = self._y.reshape((-1, 1))
             classes_ = [self.classes_]
 
-        n_samples = X.shape[0]
+        n_samples = _num_samples(X)
 
         weights = _get_weights(neigh_dist, self.weights)
         if weights is None:
@@ -302,10 +303,13 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,
         metric. See the documentation of the DistanceMetric class for a
         list of available metrics.
 
-    outlier_label : int, optional (default = None)
-        Label, which is given for outlier samples (samples with no
-        neighbors on given radius).
-        If set to None, ValueError is raised, when outlier is detected.
+    outlier_label : {manual label, 'most_frequent'}, optional (default = None)
+        Label for outlier samples (samples with no neighbors in given radius).
+
+        - manual label: str or int label (should be the same type as y)
+          or a list of manual labels if multi-output is used.
+        - 'most_frequent' : assign the most frequent label of y to outliers.
+        - None : when any outlier is detected, ValueError will be raised.
 
     metric_params : dict, optional (default = None)
         Additional keyword arguments for the metric function.
@@ -346,6 +350,8 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,
     RadiusNeighborsClassifier(...)
     >>> print(neigh.predict([[1.5]]))
     [0]
+    >>> print(neigh.predict_proba([[1.0]]))
+    [[0.66666667 0.33333333]]
 
     See also
     --------
@@ -375,8 +381,69 @@ def __init__(self, radius=1.0, weights='uniform',
         self.weights = _check_weights(weights)
         self.outlier_label = outlier_label
 
+    def fit(self, X, y):
+        """Fit the model using X as training data and y as target values.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix, BallTree, KDTree}
+            Training data. If array or matrix, shape [n_samples, n_features],
+            or [n_samples, n_samples] if metric='precomputed'.
+
+        y : {array-like, sparse matrix}
+            Target values of shape = [n_samples] or [n_samples, n_outputs]
+
+        """
+
+        SupervisedIntegerMixin.fit(self, X, y)
+
+        classes_ = self.classes_
+        _y = self._y
+        if not self.outputs_2d_:
+            _y = self._y.reshape((-1, 1))
+            classes_ = [self.classes_]
+
+        if self.outlier_label is None:
+            outlier_label_ = None
+
+        elif self.outlier_label == 'most_frequent':
+            outlier_label_ = []
+            # iterate over multi-output, get the most frequent label for each
+            # output.
+            for k, classes_k in enumerate(classes_):
+                label_count = np.bincount(_y[:, k])
+                outlier_label_.append(classes_k[label_count.argmax()])
+
+        else:
+            if (_is_arraylike(self.outlier_label) and
+                    not isinstance(self.outlier_label, str)):
+                if len(self.outlier_label) != len(classes_):
+                    raise ValueError("The length of outlier_label: {} is "
+                                     "inconsistent with the output "
+                                     "length: {}".format(self.outlier_label,
+                                                         len(classes_)))
+                outlier_label_ = self.outlier_label
+            else:
+                outlier_label_ = [self.outlier_label] * len(classes_)
+
+            for classes, label in zip(classes_, outlier_label_):
+                if (_is_arraylike(label) and
+                        not isinstance(label, str)):
+                    # ensure the outlier label for each output is a scalar.
+                    raise TypeError("The outlier_label of classes {} is "
+                                    "supposed to be a scalar, got "
+                                    "{}.".format(classes, label))
+                if np.append(classes, label).dtype != classes.dtype:
+                    # ensure the dtype of outlier label is consistent with y.
+                    raise TypeError("The dtype of outlier_label {} is "
+                                    "inconsistent with classes {} in "
+                                    "y.".format(label, classes))
+
+        self.outlier_label_ = outlier_label_
+        return self
+
     def predict(self, X):
-        """Predict the class labels for the provided data
+        """Predict the class labels for the provided data.
 
         Parameters
         ----------
@@ -388,54 +455,119 @@ def predict(self, X):
         -------
         y : array of shape [n_samples] or [n_samples, n_outputs]
             Class labels for each data sample.
+
         """
+
+        probs = self.predict_proba(X)
+        classes_ = self.classes_
+
+        if not self.outputs_2d_:
+            probs = [probs]
+            classes_ = [self.classes_]
+
+        n_outputs = len(classes_)
+        n_samples = probs[0].shape[0]
+        y_pred = np.empty((n_samples, n_outputs),
+                          dtype=classes_[0].dtype)
+
+        for k, prob in enumerate(probs):
+            # iterate over multi-output, assign labels based on probabilities
+            # of each output.
+            max_prob_index = prob.argmax(axis=1)
+            y_pred[:, k] = classes_[k].take(max_prob_index)
+
+            outlier_zero_probs = (prob == 0).all(axis=1)
+            if outlier_zero_probs.any():
+                zero_prob_index = np.flatnonzero(outlier_zero_probs)
+                y_pred[zero_prob_index, k] = self.outlier_label_[k]
+
+        if not self.outputs_2d_:
+            y_pred = y_pred.ravel()
+
+        return y_pred
+
+    def predict_proba(self, X):
+        """Return probability estimates for the test data X.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_query, n_features), \
+                or (n_query, n_indexed) if metric == 'precomputed'
+            Test samples.
+
+        Returns
+        -------
+        p : array of shape = [n_samples, n_classes], or a list of n_outputs
+            of such arrays if n_outputs > 1.
+            The class probabilities of the input samples. Classes are ordered
+            by lexicographic order.
         """
+
         X = check_array(X, accept_sparse='csr')
-        n_samples = X.shape[0]
+        n_samples = _num_samples(X)
 
         neigh_dist, neigh_ind = self.radius_neighbors(X)
-        inliers = [i for i, nind in enumerate(neigh_ind) if len(nind) != 0]
-        outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0]
+        outlier_mask = np.zeros(n_samples, dtype=bool)
+        outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]
+        outliers = np.flatnonzero(outlier_mask)
+        inliers = np.flatnonzero(~outlier_mask)
 
         classes_ = self.classes_
         _y = self._y
         if not self.outputs_2d_:
             _y = self._y.reshape((-1, 1))
             classes_ = [self.classes_]
-        n_outputs = len(classes_)
 
-        if self.outlier_label is not None:
-            neigh_dist[outliers] = 1e-6
-        elif outliers:
+        if self.outlier_label_ is None and outliers.size > 0:
             raise ValueError('No neighbors found for test samples %r, '
                              'you can try using larger radius, '
-                             'give a label for outliers, '
-                             'or consider removing them from your dataset.'
+                             'giving a label for outliers, '
+                             'or removing them from your dataset.'
                              % outliers)
 
         weights = _get_weights(neigh_dist, self.weights)
+        if weights is not None:
+            weights = weights[inliers]
 
-        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
+        probabilities = []
+        # iterate over multi-output, measure probabilities of the k-th output.
         for k, classes_k in enumerate(classes_):
             pred_labels = np.zeros(len(neigh_ind), dtype=object)
             pred_labels[:] = [_y[ind, k] for ind in neigh_ind]
+
+            proba_k = np.zeros((n_samples, classes_k.size))
+            proba_inl = np.zeros((len(inliers), classes_k.size))
+
+            # samples may have different numbers of neighbors in the same radius
             if weights is None:
-                mode = np.array([stats.mode(pl)[0]
-                                 for pl in pred_labels[inliers]], dtype=np.int)
+                for i, idx in enumerate(pred_labels[inliers]):
+                    proba_inl[i, :] = np.bincount(idx,
+                                                  minlength=classes_k.size)
             else:
-                mode = np.array(
-                    [weighted_mode(pl, w)[0]
-                     for (pl, w) in zip(pred_labels[inliers], weights[inliers])
-                     ], dtype=np.int)
+                for i, idx in enumerate(pred_labels[inliers]):
+                    proba_inl[i, :] = np.bincount(idx,
+                                                  weights[i],
+                                                  minlength=classes_k.size)
+            proba_k[inliers, :] = proba_inl
+
+            if outliers.size > 0:
+                _outlier_label = self.outlier_label_[k]
+                label_index = np.flatnonzero(classes_k == _outlier_label)
+                if label_index.size == 1:
+                    proba_k[outliers, label_index[0]] = 1.0
+                else:
+                    warnings.warn('Outlier label {} is not in training '
+                                  'classes. All class probabilities of '
+                                  'outliers will be assigned 0.'
+                                  ''.format(self.outlier_label_[k]))
 
-            mode = mode.ravel()
-
-            y_pred[inliers, k] = classes_k.take(mode)
+            # normalize 'votes' into real [0, 1] probabilities
+            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
+            normalizer[normalizer == 0.0] = 1.0
+            proba_k /= normalizer
 
-        if outliers:
-            y_pred[outliers, :] = self.outlier_label
+            probabilities.append(proba_k)
 
         if not self.outputs_2d_:
-            y_pred = y_pred.ravel()
+            probabilities = probabilities[0]
 
-        return y_pred
+        return probabilities
diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py
index 09143ad28b1bc..006f98171a95a 100644
--- a/sklearn/neighbors/regression.py
+++ b/sklearn/neighbors/regression.py
@@ -347,12 +347,11 @@ def predict(self, X):
                            if len(ind) else empty_obs
                            for (i, ind) in enumerate(neigh_ind)])
 
-        if np.max(np.isnan(y_pred)):
+        if np.any(np.isnan(y_pred)):
             empty_warning_msg = ("One or more samples have no neighbors "
                                  "within specified radius; predicting NaN.")
             warnings.warn(empty_warning_msg)
 
-
         if self._y.ndim == 1:
             y_pred = y_pred.ravel()
 
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index d22c0d1d9acac..3da1c2579700f 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -385,6 +385,7 @@ def test_radius_neighbors_classifier_outlier_labeling():
     z2 = np.array([[1.4, 1.4], [1.01, 1.01], [2.01, 2.01]])  # one outlier
     correct_labels1 = np.array([1, 2])
     correct_labels2 = np.array([-1, 1, 2])
+    outlier_proba = np.array([0, 0])
 
     weight_func = _weight_func
 
@@ -397,6 +398,72 @@ def test_radius_neighbors_classifier_outlier_labeling():
         clf.fit(X, y)
         assert_array_equal(correct_labels1, clf.predict(z1))
         assert_array_equal(correct_labels2, clf.predict(z2))
+        assert_array_equal(outlier_proba, clf.predict_proba(z2)[0])
+
+    # test outlier labeling logic used by predict_proba()
+    RNC = neighbors.RadiusNeighborsClassifier
+    X = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]])
+    y = np.array([0, 2, 2, 1, 1, 1, 3, 3, 3, 3])
+
+    # test that a non-scalar outlier_label raises TypeError
+    def check_array_exception():
+        clf = RNC(radius=1, outlier_label=[[5]])
+        clf.fit(X, y)
+    assert_raises(TypeError, check_array_exception)
+
+    # test invalid outlier_label dtype
+    def check_dtype_exception():
+        clf = RNC(radius=1, outlier_label='a')
+        clf.fit(X, y)
+    assert_raises(TypeError, check_dtype_exception)
+
+    # test most frequent
+    clf = RNC(radius=1, outlier_label='most_frequent')
+    clf.fit(X, y)
+    proba = clf.predict_proba([[1], [15]])
+    assert_array_equal(proba[1, :], [0, 0, 0, 1])
+
+    # test manual label in y
+    clf = RNC(radius=1, outlier_label=1)
+    clf.fit(X, y)
+    proba = clf.predict_proba([[1], [15]])
+    assert_array_equal(proba[1, :], [0, 1, 0, 0])
+    pred = clf.predict([[1], [15]])
+    assert_array_equal(pred, [2, 1])
+
+    # test manual label not in y, which should raise a warning
+    def check_warning():
+        clf = RNC(radius=1, outlier_label=4)
+        clf.fit(X, y)
+        clf.predict_proba([[1], [15]])
+    assert_warns(UserWarning, check_warning)
+
+    # test multi-output with the same outlier label for every output
+    y_multi = [[0, 1], [2, 1], [2, 2], [1, 2], [1, 2],
+               [1, 3], [3, 3], [3, 3], [3, 0], [3, 0]]
+    clf = RNC(radius=1, outlier_label=1)
+    clf.fit(X, y_multi)
+    proba = clf.predict_proba([[7], [15]])
+    assert_array_equal(proba[1][1, :], [0, 1, 0, 0])
+    pred = clf.predict([[7], [15]])
+    assert_array_equal(pred[1, :], [1, 1])
+
+    # test multi-output with a different outlier label per output
+    y_multi = [[0, 0], [2, 2], [2, 2], [1, 1], [1, 1],
+               [1, 1], [3, 3], [3, 3], [3, 3], [3, 3]]
+    clf = RNC(radius=1, outlier_label=[0, 1])
+    clf.fit(X, y_multi)
+    proba = clf.predict_proba([[7], [15]])
+    assert_array_equal(proba[0][1, :], [1, 0, 0, 0])
+    assert_array_equal(proba[1][1, :], [0, 1, 0, 0])
+    pred = clf.predict([[7], [15]])
+    assert_array_equal(pred[1, :], [0, 1])
+
+    # test inconsistent outlier label list length
+    def check_exception():
+        clf = RNC(radius=1, outlier_label=[0, 1, 2])
+        clf.fit(X, y_multi)
+    assert_raises(ValueError, check_exception)
 
 
 def test_radius_neighbors_classifier_zero_distance():
@@ -1413,3 +1480,21 @@ def test_pairwise_boolean_distance():
     nn1 = NN(metric="jaccard", algorithm='brute').fit(X)
     nn2 = NN(metric="jaccard", algorithm='ball_tree').fit(X)
     assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0])
+
+
+def test_radius_neighbors_predict_proba():
+    for seed in range(5):
+        X, y = datasets.make_classification(n_samples=50, n_features=5,
+                                            n_informative=3, n_redundant=0,
+                                            n_classes=3, random_state=seed)
+        X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
+        outlier_label = int(2 - seed)
+        clf = neighbors.RadiusNeighborsClassifier(radius=2,
+                                                  outlier_label=outlier_label)
+        clf.fit(X_tr, y_tr)
+        pred = clf.predict(X_te)
+        proba = clf.predict_proba(X_te)
+        proba_label = proba.argmax(axis=1)
+        proba_label = np.where(proba.sum(axis=1) == 0,
+                               outlier_label, proba_label)
+        assert_array_equal(pred, proba_label)
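
Usage sketch (not part of the patch): the snippet below exercises the behavior
this diff adds and assumes scikit-learn >= 0.22, where RadiusNeighborsClassifier
gains predict_proba and the extended outlier_label options; the toy data and
variable names are illustrative only.

    import numpy as np
    from sklearn.neighbors import RadiusNeighborsClassifier

    X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0]])
    y = np.array([0, 0, 1, 1, 1])

    # With outlier_label='most_frequent', a query with no neighbors inside
    # the radius is assigned the majority class of y instead of raising a
    # ValueError (which outlier_label=None still does).
    clf = RadiusNeighborsClassifier(radius=1.0, outlier_label='most_frequent')
    clf.fit(X, y)

    # [1.0] has neighbors 0.0, 1.0 and 2.0 (labels 0, 0, 1), so its class
    # counts normalize to [2/3, 1/3]; [10.0] has no neighbors within the
    # radius, so it gets the most frequent class (1) with probability 1.
    print(clf.predict([[1.0], [10.0]]))        # [0 1]
    print(clf.predict_proba([[1.0], [10.0]]))  # approx. [[0.667 0.333] [0. 1.]]

    # For multi-output y, outlier_label may also be a list with one label
    # per output (validated against each output's classes during fit).
    y_multi = np.c_[y, 1 - y]
    clf = RadiusNeighborsClassifier(radius=1.0, outlier_label=[0, 1])
    clf.fit(X, y_multi)
    print(clf.predict([[10.0]]))               # [[0 1]]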