diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py
index 0c726cdc0a62c..de24120718401 100644
--- a/sklearn/neighbors/_nearest_centroid.py
+++ b/sklearn/neighbors/_nearest_centroid.py
@@ -15,23 +15,22 @@
 from ..base import BaseEstimator, ClassifierMixin
 from ..metrics.pairwise import pairwise_distances
 from ..preprocessing import LabelEncoder
-from ..utils.validation import check_is_fitted
-from ..utils.validation import _deprecate_positional_args
+from ..utils.validation import check_X_y, check_is_fitted, column_or_1d
 from ..utils.sparsefuncs import csc_median_axis_0
-from ..utils.multiclass import check_classification_targets
+from ..utils.multiclass import (check_classification_targets,
+                                _check_partial_fit_first_call)
 
 
-class NearestCentroid(ClassifierMixin, BaseEstimator):
+class NearestCentroid(BaseEstimator, ClassifierMixin):
     """Nearest centroid classifier.
 
     Each class is represented by its centroid, with test samples classified to
     the class with the nearest centroid.
-
-    Read more in the :ref:`User Guide <nearest_centroid_classifier>`.
+    Read more in the :ref:`User Guide <nearest_centroid_classifier>`.
 
     Parameters
     ----------
-    metric : str or callable
+    metric : string or callable
         The metric to use when calculating distance between instances in a
         feature array. If metric is a string or callable, it must be one of
         the options allowed by metrics.pairwise.pairwise_distances for its
@@ -42,23 +41,21 @@ class NearestCentroid(ClassifierMixin, BaseEstimator):
         If the "manhattan" metric is provided, this centroid is the median
         and for all other metrics, the centroid is now set to be the mean.
 
-    .. versionchanged:: 0.19
-       ``metric='precomputed'`` was deprecated and now raises an error
-
-    shrink_threshold : float, default=None
+    shrink_threshold : float, optional (default = None)
         Threshold for shrinking centroids to remove features.
 
     Attributes
     ----------
-    centroids_ : array-like of shape (n_classes, n_features)
-        Centroid of each class.
-
-    classes_ : array of shape (n_classes,)
-        The unique classes labels.
+    centroids_ : array-like, shape = [n_classes, n_features]
+        Centroid of each class.
+
+    classes_ : array, shape = [n_classes]
+        The unique class labels for which at least one sample has been seen.
+
+    true_classes_ : array-like, shape = [n_classes]
+        All the classes that can possibly appear, remembered between calls
+        to partial_fit; must be supplied via ``classes`` on the first call.
 
     Examples
     --------
     >>> from sklearn.neighbors import NearestCentroid
     >>> import numpy as np
     >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
     >>> y = np.array([1, 1, 1, 2, 2, 2])
@@ -68,9 +65,9 @@ class NearestCentroid(ClassifierMixin, BaseEstimator):
     >>> print(clf.predict([[-0.8, -1]]))
     [1]
 
-    See Also
+    See also
     --------
-    KNeighborsClassifier : Nearest neighbors classifier.
+    sklearn.neighbors.KNeighborsClassifier : nearest neighbors classifier
 
     Notes
     -----
@@ -83,11 +80,9 @@ class NearestCentroid(ClassifierMixin, BaseEstimator):
     multiple cancer types by shrunken centroids of gene expression.
     Proceedings of the National Academy of Sciences of the United States of
     America, 99(10), 6567-6572. The National Academy of Sciences.
-
     """
 
-    @_deprecate_positional_args
-    def __init__(self, metric='euclidean', *, shrink_threshold=None):
+    def __init__(self, metric='euclidean', shrink_threshold=None):
         self.metric = metric
         self.shrink_threshold = shrink_threshold
 
@@ -97,21 +92,76 @@ def fit(self, X, y):
 
         Parameters
         ----------
-        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+            Note that centroid shrinking cannot be used with sparse matrices.
+
+        y : array, shape = [n_samples]
+            Target values (integers)
+        """
+        y = column_or_1d(y, warn=True)
+        # Input validation is left to _partial_fit, which accepts sparse X.
+        return self._partial_fit(X, y, np.unique(y), _refit=True)
+
+    def partial_fit(self, X, y, classes=None):
+        """Incremental fit on a batch of samples.
+
+        This method is expected to be called several times consecutively
+        on different chunks of a dataset so as to implement out-of-core
+        or online learning.
+        This is especially useful when the whole dataset is too big to fit
+        in memory at once.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+            Note that centroid shrinking cannot be used with sparse matrices.
+
+        y : array, shape = [n_samples]
+            Target values (integers)
+
+        classes : array-like, shape = [n_classes], optional (default=None)
+            List of all the classes that can possibly appear in the y vector.
+            Must be provided at the first call to partial_fit; can be omitted
+            in subsequent calls.
+        """
+        if self.metric == 'manhattan':
+            raise ValueError("Partial fitting with the manhattan metric is "
+                             "not supported.")
+        return self._partial_fit(X, y, classes, _refit=False)
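
As an illustration of the out-of-core workflow the new method enables: a
minimal sketch on synthetic data, assuming this patch is applied (the data
and batch sizes are made up).

    import numpy as np
    from sklearn.neighbors import NearestCentroid

    rng = np.random.RandomState(0)
    batches = [(rng.randn(20, 5) + 2, np.zeros(20)),   # class 0.0, around +2
               (rng.randn(20, 5) - 2, np.ones(20)),    # class 1.0, around -2
               (rng.randn(20, 5) + 2, np.zeros(20))]

    clf = NearestCentroid()
    # Every possible label must be declared on the first call so that the
    # running per-class statistics can be allocated up front.
    clf.partial_fit(batches[0][0], batches[0][1], classes=[0., 1.])
    for X_batch, y_batch in batches[1:]:
        clf.partial_fit(X_batch, y_batch)

    print(clf.predict([[2, 2, 2, 2, 2], [-2, -2, -2, -2, -2]]))  # -> [0. 1.]

Stock releases of scikit-learn do not expose partial_fit on NearestCentroid;
the sketch only runs against this branch.
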
- """ - @_deprecate_positional_args - def __init__(self, metric='euclidean', *, shrink_threshold=None): + def __init__(self, metric='euclidean', shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold @@ -97,21 +92,76 @@ def fit(self, X, y): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vector, where n_samples is the number of samples and + n_features is the number of features. + Note that centroid shrinking cannot be used with sparse matrices. + + y : array, shape = [n_samples] + Target values (integers) + """ + X, y = self._validate_data(X, y) + y = column_or_1d(y, warn=True) + return self._partial_fit(X, y, np.unique(y), _refit=True) + + def partial_fit(self, X, y, classes=None): + """Incremental fit on a batch of samples. + + This method is expected to be called several times consecutively + on different chunks of a dataset so as to implement out-of-core + or online learning. + This is especially useful when the whole dataset is too big to fit in + memory at once. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vector, where n_samples is the number of samples and + n_features is the number of features. + Note that centroid shrinking cannot be used with sparse matrices. + + y : array, shape = [n_samples] + Target values (integers) + + classes : array-like, shape = [n_classes], optional (default=None) + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + """ + if self.metric == 'manhattan': + raise ValueError("Partial fitting with manhattan not supported.") + return self._partial_fit(X, y, classes, _refit=False) + + def _partial_fit(self, X, y, classes=None, _refit=False): + """Actual implementation of the Nearest Centroid fitting. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] Training vector, where n_samples is the number of samples and n_features is the number of features. Note that centroid shrinking cannot be used with sparse matrices. - y : array-like of shape (n_samples,) + + y : array, shape = [n_samples] Target values (integers) + + classes : array-like, shape = [n_classes], optional (default=None) + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + _refit : bool, optional (default=False) + If true, act as though this were the first time we called + _partial_fit (ie, throw away any past fitting and start over). """ if self.metric == 'precomputed': raise ValueError("Precomputed is not supported.") # If X is sparse and the metric is "manhattan", store it in a csc # format is easier to calculate the median. 
 
         if self.shrink_threshold:
-            if np.all(np.ptp(X, axis=0) == 0):
-                raise ValueError("All features have zero variance. "
-                                 "Division by zero.")
-            dataset_centroid_ = np.mean(X, axis=0)
+            n_total = np.sum(self.nk_)
+            seen = self.nk_ != 0
+            self.dataset_centroid_ = (self.dataset_centroid_ * old_nk.sum() +
+                                      np.sum(X, axis=0)) / n_total
+
+            # Update the sum of squared deviations of each class with the
+            # pairwise combination formula (Chan et al.); see the check
+            # after this file's diff.
+            for cl in range(n_classes):
+                n_old = old_nk[cl]
+                n_new = self.nk_[cl] - n_old
+                if n_new == 0:
+                    continue
+                center_mask = y_ind == cl
+                batch_mean = X[center_mask].mean(axis=0)
+                new_ssd = ((X[center_mask] - batch_mean) ** 2).sum(axis=0)
+                self.ssd_[cl] = (self.ssd_[cl] + new_ssd +
+                                 (n_old * n_new / (n_old + n_new)) *
+                                 (old_centroids[cl] - batch_mean) ** 2)
 
             # m parameter for determining deviation
-            m = np.sqrt((1. / nk) - (1. / n_samples))
+            m = np.sqrt((1. / self.nk_[seen]) - (1. / n_total))
+
             # Calculate deviation using the standard deviation of centroids.
-            variance = (X - self.centroids_[y_ind]) ** 2
-            variance = variance.sum(axis=0)
-            s = np.sqrt(variance / (n_samples - n_classes))
+            ssd = self.ssd_.sum(axis=0)
+            s = np.sqrt(ssd / (n_total - n_classes))
             s += np.median(s)  # To deter outliers from affecting the results.
             mm = m.reshape(len(m), 1)  # Reshape to allow broadcasting.
             ms = mm * s
-            deviation = ((self.centroids_ - dataset_centroid_) / ms)
+            # Shrink only centroids of classes that have received data, so
+            # that centroids_ stays aligned with classes_.
+            deviation = ((self.true_centroids_[seen] -
+                          self.dataset_centroid_) / ms)
+
             # Soft thresholding: if the deviation crosses 0 during shrinking,
             # it becomes zero.
             signs = np.sign(deviation)
@@ -177,7 +269,7 @@ def fit(self, X, y):
             deviation *= signs
             # Now adjust the centroids using the deviation
             msd = ms * deviation
-            self.centroids_ = dataset_centroid_[np.newaxis, :] + msd
+            self.centroids_ = self.dataset_centroid_[np.newaxis, :] + msd
 
         return self
 
     def predict(self, X):
@@ -187,11 +279,11 @@ def predict(self, X):
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        X : array-like, shape = [n_samples, n_features]
 
         Returns
         -------
-        C : ndarray of shape (n_samples,)
+        C : array, shape = [n_samples]
 
         Notes
         -----
@@ -199,7 +291,7 @@ def predict(self, X):
             be the distance matrix between the data to be predicted and
             ``self.centroids_``.
         """
-        check_is_fitted(self)
+        check_is_fitted(self, 'centroids_')
 
         X = self._validate_data(X, accept_sparse='csr', reset=False)
         return self.classes_[pairwise_distances(
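
The ssd_ update in _partial_fit is the standard pairwise rule for combining
sums of squared deviations from two sample sets (Chan, Golub and LeVeque):
add both partial SSDs plus a correction proportional to the squared gap
between the two means. A quick numerical check of that identity (names
illustrative):

    import numpy as np


    def ssd(a):
        # Per-feature sum of squared deviations from the column mean.
        return ((a - a.mean(axis=0)) ** 2).sum(axis=0)


    rng = np.random.RandomState(0)
    old, batch = rng.randn(40, 3), rng.randn(25, 3)
    n_old, n_batch = len(old), len(batch)

    combined = (ssd(old) + ssd(batch) +
                (n_old * n_batch / (n_old + n_batch)) *
                (old.mean(axis=0) - batch.mean(axis=0)) ** 2)

    # Equals the SSD of the concatenated data.
    np.testing.assert_allclose(combined, ssd(np.vstack([old, batch])))
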
" - "Division by zero.") - dataset_centroid_ = np.mean(X, axis=0) + n_total = np.sum(self.nk_) + self.dataset_centroid_ = (self.dataset_centroid_ * + old_nk.sum(axis=0) + + np.sum(X, axis=0)) / n_total + + # Update sum of square distances of each class + for cl in range(n_classes): + n_old = old_nk[cl] + n_new = self.nk_[cl] - n_old + if n_new == 0: + continue + center_mask = y_ind == cl + old_ssd = self.ssd_[cl] + new_ssd = ((X[center_mask] - X[center_mask].mean(axis=0))**2) + new_ssd = new_ssd.sum(axis=0) + self.ssd_[cl] = (old_ssd + new_ssd + + (n_old / float(n_new * (n_new + n_old))) * + (n_new * old_centroids[cl] - n_new * + X[center_mask].mean(axis=0)) ** 2) # m parameter for determining deviation - m = np.sqrt((1. / nk) - (1. / n_samples)) + m = np.sqrt((1. / self.nk_) - (1. / np.sum(self.nk_))) + # Calculate deviation using the standard deviation of centroids. - variance = (X - self.centroids_[y_ind]) ** 2 - variance = variance.sum(axis=0) - s = np.sqrt(variance / (n_samples - n_classes)) + ssd = self.ssd_.sum(axis=0) + s = np.sqrt(ssd / (n_total - n_classes)) s += np.median(s) # To deter outliers from affecting the results. mm = m.reshape(len(m), 1) # Reshape to allow broadcasting. ms = mm * s - deviation = ((self.centroids_ - dataset_centroid_) / ms) + deviation = ((self.true_centroids_ - self.dataset_centroid_) / ms) + # Soft thresholding: if the deviation crosses 0 during shrinking, # it becomes zero. signs = np.sign(deviation) @@ -177,7 +269,7 @@ def fit(self, X, y): deviation *= signs # Now adjust the centroids using the deviation msd = ms * deviation - self.centroids_ = dataset_centroid_[np.newaxis, :] + msd + self.centroids_ = self.dataset_centroid_[np.newaxis, :] + msd return self def predict(self, X): @@ -187,11 +279,11 @@ def predict(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : array-like, shape = [n_samples, n_features] Returns ------- - C : ndarray of shape (n_samples,) + C : array, shape = [n_samples] Notes ----- @@ -199,7 +291,7 @@ def predict(self, X): be the distance matrix between the data to be predicted and ``self.centroids_``. """ - check_is_fitted(self) + check_is_fitted(self, 'centroids_') X = self._validate_data(X, accept_sparse='csr', reset=False) return self.classes_[pairwise_distances( diff --git a/sklearn/neighbors/tests/test_nearest_centroid.py b/sklearn/neighbors/tests/test_nearest_centroid.py index 451aeff377e19..1dc0420e7a3af 100644 --- a/sklearn/neighbors/tests/test_nearest_centroid.py +++ b/sklearn/neighbors/tests/test_nearest_centroid.py @@ -5,6 +5,7 @@ import numpy as np from scipy import sparse as sp from numpy.testing import assert_array_equal +from numpy.testing import assert_equal from sklearn.neighbors import NearestCentroid from sklearn import datasets @@ -89,7 +90,7 @@ def test_pickle(): s = pickle.dumps(obj) obj2 = pickle.loads(s) - assert type(obj2) == obj.__class__ + assert_equal(type(obj2), obj.__class__) score2 = obj2.score(iris.data, iris.target) assert_array_equal(score, score2, "Failed to generate same score" @@ -101,7 +102,6 @@ def test_shrinkage_correct(): # The expected result is calculated by R (pamr), # which is implemented by the author of the original paper. 
     # (One need to modify the code to output the new centroid in pamr.predict)
-
     X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
     y = np.array([1, 1, 2, 2, 2])
     clf = NearestCentroid(shrink_threshold=0.1)
@@ -148,15 +148,38 @@ def test_manhattan_metric():
     assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
 
 
-def test_features_zero_var():
-    # Test that features with 0 variance throw error
+def test_partial_fit():
+    # Test that fitting in several batches matches a single fit on the
+    # same data.
+    clf = NearestCentroid()
+
+    clf.partial_fit(X[:3], y[:3], classes=[-1, 1])
+    clf_partial_pred = clf.predict(T)
+    assert not np.array_equal(clf_partial_pred, true_result)
+
+    clf.partial_fit(X[3:], y[3:])
+    clf_complete_pred = clf.predict(T)
+    assert_array_equal(clf_complete_pred, true_result)
+    assert not np.array_equal(clf_complete_pred, clf_partial_pred)
 
-    X = np.empty((10, 2))
-    X[:, 0] = -0.13725701
-    X[:, 1] = -0.9853293
-    y = np.zeros((10))
-    y[0] = 1
+    clf_fit = NearestCentroid()
+    clf_fit = clf_fit.fit(X, y)
+    assert_array_equal(clf_fit.predict(T), clf_complete_pred)
+
+
+def test_partial_shrinkage_correct():
+    # Ensure that the shrinking is correct.
+    # The expected result is calculated by R (pamr),
+    # which is implemented by the author of the original paper.
+    # (One needs to modify the code to output the new centroid in
+    # pamr.predict.)
+    X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
+    y = np.array([1, 1, 2, 2, 2])
     clf = NearestCentroid(shrink_threshold=0.1)
-    with assert_raises(ValueError):
-        clf.fit(X, y)
+    clf.partial_fit(X[:3], y[:3], classes=[1, 2])
+    expected_result = np.array([[0.55773503, 0.55773503],
+                                [0.88452995, 0.88452995]])
+    np.testing.assert_array_almost_equal(clf.centroids_, expected_result)
+
+    clf.partial_fit(X[3:], y[3:])
+    expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])
+    np.testing.assert_array_almost_equal(clf.centroids_, expected_result)
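
For reference, the shrinkage that test_partial_shrinkage_correct exercises
ends with a soft-thresholding step: each standardized deviation d becomes
sign(d) * max(|d| - shrink_threshold, 0), so small per-feature deviations are
zeroed. The thresholding in isolation, with made-up numbers:

    import numpy as np

    deviation = np.array([[1.5, -0.3, 0.05],
                          [-2.0, 0.8, -0.02]])
    shrink_threshold = 0.1

    # Same soft-thresholding steps as in _partial_fit.
    signs = np.sign(deviation)
    deviation = np.abs(deviation) - shrink_threshold
    np.clip(deviation, 0, None, out=deviation)
    deviation *= signs

    print(deviation)
    # [[ 1.4 -0.2  0. ]
    #  [-1.9  0.7 -0. ]]

Features whose deviation is zeroed for every class (the third column above)
stop influencing the classification entirely.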