diff --git a/.gitignore b/.gitignore index 76f8a60158209..199c2bd85d997 100644 --- a/.gitignore +++ b/.gitignore @@ -99,6 +99,7 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx +sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx sklearn/neighbors/_ball_tree.pyx sklearn/neighbors/_binary_tree.pxi sklearn/neighbors/_kd_tree.pyx diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 0778681308b6a..49d05828eb237 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -263,6 +263,11 @@ Changelog :class:`metric.DistanceMetric` objects. :pr:`26267` by :user:`Meekail Zain ` +- |Efficiency| The performance of :meth:`neighbors.RadiusNeighborsClassifier.predict` + and of :meth:`neighbors.RadiusNeighborsClassifier.predict_proba` has been improved + when `radius` is large and `algorithm="brute"` with non-Euclidean metrics. + :pr:`26828` by :user:`Omar Salman `. + :mod:`sklearn.preprocessing` ............................ diff --git a/setup.cfg b/setup.cfg index b7705781dbb7d..28c28fc049fb9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,6 +53,7 @@ ignore = sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx + sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx sklearn/neighbors/_ball_tree.pyx sklearn/neighbors/_binary_tree.pxi sklearn/neighbors/_kd_tree.pyx diff --git a/setup.py b/setup.py index f9ae13c94502b..1ba7c10321fa1 100755 --- a/setup.py +++ b/setup.py @@ -295,6 +295,12 @@ def check_package_status(package, min_version): "include_np": True, "extra_compile_args": ["-std=c++11"], }, + { + "sources": ["_radius_neighbors_classmode.pyx.tp"], + "language": "c++", + "include_np": True, + "extra_compile_args": ["-std=c++11"], + }, ], "preprocessing": [ {"sources": ["_csr_polynomial_expansion.pyx"]}, diff --git a/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/sklearn/metrics/_pairwise_distances_reduction/__init__.py index 68972de0a1a51..5c9366945e8cc 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/__init__.py +++ b/sklearn/metrics/_pairwise_distances_reduction/__init__.py @@ -91,6 +91,7 @@ ArgKminClassMode, BaseDistancesReductionDispatcher, RadiusNeighbors, + RadiusNeighborsClassMode, sqeuclidean_row_norms, ) @@ -99,5 +100,6 @@ "ArgKmin", "RadiusNeighbors", "ArgKminClassMode", + "RadiusNeighborsClassMode", "sqeuclidean_row_norms", ] diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 937a6fd083260..1088fa86e7c9c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -23,6 +23,10 @@ RadiusNeighbors32, RadiusNeighbors64, ) +from ._radius_neighbors_classmode import ( + RadiusNeighborsClassMode32, + RadiusNeighborsClassMode64, +) def sqeuclidean_row_norms(X, num_threads): @@ -597,3 +601,153 @@ def compute( "Only float64 or float32 datasets pairs are supported at this time, " f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." ) + + +class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher): + """Compute radius-based class modes of row vectors of X using the + those of Y. + + For each row-vector X[i] of the queries X, find all the indices j of + row-vectors in Y such that: + + dist(X[i], Y[j]) <= radius + + RadiusNeighborsClassMode is typically used to perform bruteforce + radius neighbors queries when the weighted mode of the labels for + the nearest neighbors within the specified radius are required, + such as in `predict` methods. + + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + # Euclidean is technically usable for RadiusNeighborsClassMode + # but it would not be competitive. + # TODO: implement Euclidean specialization using GEMM. + "euclidean", + "sqeuclidean", + } + return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded) + + @classmethod + def compute( + cls, + X, + Y, + radius, + weights, + Y_labels, + unique_Y_labels, + outlier_label, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + ): + """Return the results of the reduction for the given arguments. + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + The input array to be labelled. + Y : ndarray of shape (n_samples_Y, n_features) + The input array whose class membership is provided through + the `Y_labels` parameter. + radius : float + The radius defining the neighborhood. + weights : ndarray + The weights applied to the `Y_labels` when computing the + weighted mode of the labels. + Y_labels : ndarray + An array containing the index of the class membership of the + associated samples in `Y`. This is used in labeling `X`. + unique_Y_labels : ndarray + An array containing all unique class labels. + outlier_label : int, default=None + Label for outlier samples (samples with no neighbors in given + radius). In the default case when the value is None if any + outlier is detected, a ValueError will be raised. The outlier + label should be selected from among the unique 'Y' labels. If + it is specified with a different value a warning will be raised + and all class probabilities of outliers will be assigned to be 0. + metric : str, default='euclidean' + The distance metric to use. For a list of available metrics, see + the documentation of :class:`~sklearn.metrics.DistanceMetric`. + Currently does not support `'precomputed'`. + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + despite the synchronization step at each iteration of the outer loop + on chunks of `X`. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + Returns + ------- + probabilities : ndarray of shape (n_samples_X, n_classes) + An array containing the class probabilities for each sample. + """ + if weights not in {"uniform", "distance"}: + raise ValueError( + "Only the 'uniform' or 'distance' weights options are supported" + f" at this time. Got: {weights=}." + ) + if X.dtype == Y.dtype == np.float64: + return RadiusNeighborsClassMode64.compute( + X=X, + Y=Y, + radius=radius, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + outlier_label=outlier_label, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + if X.dtype == Y.dtype == np.float32: + return RadiusNeighborsClassMode32.compute( + X=X, + Y=Y, + radius=radius, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + outlier_label=outlier_label, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp new file mode 100644 index 0000000000000..25067b43cd20c --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp @@ -0,0 +1,217 @@ +import warnings + +from cython cimport floating, final, integral +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange +from ._classmode cimport WeightingStrategy +from ...utils._typedefs cimport intp_t, float64_t + +import numpy as np +from scipy.sparse import issparse +from ...utils.fixes import threadpool_limits + + +{{for name_suffix in ["32", "64"]}} +from ._radius_neighbors cimport RadiusNeighbors{{name_suffix}} +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + +cdef class RadiusNeighborsClassMode{{name_suffix}}(RadiusNeighbors{{name_suffix}}): + """ + {{name_suffix}}bit implementation of RadiusNeighborsClassMode. + """ + cdef: + const intp_t[::1] Y_labels + const intp_t[::1] unique_Y_labels + intp_t outlier_label_index + bint outlier_label_exists + bint outliers_exist + unsigned char[::1] outliers + object outlier_label + float64_t[:, ::1] class_scores + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + float64_t radius, + weights, + Y_labels, + unique_Y_labels, + outlier_label=None, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + # Use a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = RadiusNeighborsClassMode{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + radius=radius, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=outlier_label, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + const intp_t[::1] Y_labels, + const intp_t[::1] unique_Y_labels, + float64_t radius, + chunk_size=None, + strategy=None, + weights=None, + outlier_label=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + radius=radius, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.callable + + self.Y_labels = Y_labels + self.unique_Y_labels = unique_Y_labels + self.outlier_label_index = -1 + self.outliers_exist = False + self.outlier_label = outlier_label + self.outliers = np.zeros(self.n_samples_X, dtype=np.bool_) + + cdef intp_t idx + if self.outlier_label is not None: + for idx in range(self.unique_Y_labels.shape[0]): + if self.unique_Y_labels[idx] == outlier_label: + self.outlier_label_index = idx + + # Map from set of unique labels to their indices in `class_scores` + # Buffer used in building a histogram for one-pass weighted mode + self.class_scores = np.zeros( + (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64, + ) + + + cdef inline void weighted_histogram_mode( + self, + intp_t sample_index, + intp_t sample_n_neighbors, + intp_t* indices, + float64_t* distances, + ) noexcept nogil: + cdef: + intp_t neighbor_idx, neighbor_class_idx, label_index + float64_t score_incr = 1 + bint use_distance_weighting = ( + self.weight_type == WeightingStrategy.distance + ) + + if sample_n_neighbors == 0: + self.outliers_exist = True + self.outliers[sample_index] = True + if self.outlier_label_index >= 0: + self.class_scores[sample_index][self.outlier_label_index] = score_incr + + return + + # Iterate over the neighbors. This can be different for + # each of the samples as they are based on the radius. + for neighbor_rank in range(sample_n_neighbors): + if use_distance_weighting: + score_incr = 1 / distances[neighbor_rank] + + neighbor_idx = indices[neighbor_rank] + neighbor_class_idx = self.Y_labels[neighbor_idx] + self.class_scores[sample_index][neighbor_class_idx] += score_incr + + return + + @final + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx + + for idx in range(X_start, X_end): + self.weighted_histogram_mode( + sample_index=idx, + sample_n_neighbors=deref(self.neigh_indices)[idx].size(), + indices=deref(self.neigh_indices)[idx].data(), + distances=deref(self.neigh_distances)[idx].data(), + ) + + return + + @final + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + cdef: + intp_t idx + + with nogil, parallel(num_threads=self.effective_n_threads): + # Merge vectors used in threads into the main ones. + # This is done in parallel sample-wise (no need for locks). + for idx in prange(self.n_samples_X, schedule='static'): + self._merge_vectors(idx, self.chunks_n_threads) + + for idx in prange(self.n_samples_X, schedule='static'): + self.weighted_histogram_mode( + sample_index=idx, + sample_n_neighbors=deref(self.neigh_indices)[idx].size(), + indices=deref(self.neigh_indices)[idx].data(), + distances=deref(self.neigh_distances)[idx].data(), + ) + + return + + def _finalize_results(self): + if self.outliers_exist and self.outlier_label is None: + raise ValueError( + "No neighbors found for test samples %r, " + "you can try using larger radius, " + "giving a label for outliers, " + "or considering removing them from your dataset." + % np.where(self.outliers)[0] + ) + + if self.outliers_exist and self.outlier_label_index < 0: + warnings.warn( + "Outlier label %s is not in training " + "classes. All class probabilities of " + "outliers will be assigned with 0." + % self.outlier_label + ) + + probabilities = np.asarray(self.class_scores) + normalizer = probabilities.sum(axis=1, keepdims=True) + normalizer[normalizer == 0.0] = 1.0 + probabilities /= normalizer + return probabilities + +{{endfor}} diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 67d25adeba73d..f9b4b9fb242fe 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -15,6 +15,7 @@ ArgKminClassMode, BaseDistancesReductionDispatcher, RadiusNeighbors, + RadiusNeighborsClassMode, sqeuclidean_row_norms, ) from sklearn.utils._testing import ( @@ -852,6 +853,116 @@ def test_radius_neighbors_factory_method_wrong_usages(): ) +def test_radius_neighbors_classmode_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "manhattan" + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises(ValueError, match=msg): + RadiusNeighborsClassMode.compute( + X=X.astype(np.float32), + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises(ValueError, match=msg): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y.astype(np.int32), + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=-1, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises(ValueError, match="Unrecognized metric"): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=-1, + metric="wrong_metric", + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + RadiusNeighborsClassMode.compute( + X=np.array([1.0, 2.0]), + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + RadiusNeighborsClassMode.compute( + X=np.asfortranarray(X), + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + non_existent_weights_strategy = "non_existent_weights_strategy" + msg = ( + "Only the 'uniform' or 'distance' weights options are supported at this time. " + f"Got: weights='{non_existent_weights_strategy}'." + ) + with pytest.raises(ValueError, match=msg): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=radius, + metric="wrong_metric", + weights=non_existent_weights_strategy, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + @pytest.mark.parametrize( "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)] ) @@ -1362,3 +1473,39 @@ def test_argkmin_classmode_strategy_consistent(): strategy="parallel_on_Y", ) assert_array_equal(results_X, results_Y) + + +@pytest.mark.parametrize("outlier_label", [None, 0, 3, 6, 9]) +def test_radius_neighbors_classmode_strategy_consistent(outlier_label): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "manhattan" + + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + results_X = RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=outlier_label, + strategy="parallel_on_X", + ) + results_Y = RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=outlier_label, + strategy="parallel_on_Y", + ) + assert_allclose(results_X, results_Y) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index be55b9b923400..e921ec3a9d165 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -15,7 +15,10 @@ from sklearn.neighbors._base import _check_precomputed from ..base import ClassifierMixin, _fit_context -from ..metrics._pairwise_distances_reduction import ArgKminClassMode +from ..metrics._pairwise_distances_reduction import ( + ArgKminClassMode, + RadiusNeighborsClassMode, +) from ..utils._param_validation import StrOptions from ..utils.extmath import weighted_mode from ..utils.fixes import _mode @@ -471,6 +474,10 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors - 'most_frequent' : assign the most frequent label of y to outliers. - None : when any outlier is detected, ValueError will be raised. + The outlier label should be selected from among the unique 'Y' labels. + If it is specified with a different value a warning will be raised and + all class probabilities of outliers will be assigned to be 0. + metric_params : dict, default=None Additional keyword arguments for the metric function. @@ -714,9 +721,39 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ - + check_is_fitted(self, "_fit_method") n_queries = _num_samples(X) + metric, metric_kwargs = _adjusted_metric( + metric=self.metric, metric_kwargs=self.metric_params, p=self.p + ) + + if ( + self.weights == "uniform" + and self._fit_method == "brute" + and not self.outputs_2d_ + and RadiusNeighborsClassMode.is_usable_for(X, self._fit_X, metric) + ): + probabilities = RadiusNeighborsClassMode.compute( + X=X, + Y=self._fit_X, + radius=self.radius, + weights=self.weights, + Y_labels=self._y, + unique_Y_labels=self.classes_, + outlier_label=self.outlier_label, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="parallel_on_X", + # `strategy="parallel_on_X"` has in practice be shown + # to be more efficient than `strategy="parallel_on_Y`` + # on many combination of datasets. + # Hence, we choose to enforce it here. + # For more information, see: + # https://github.com/scikit-learn/scikit-learn/pull/26828/files#r1282398471 # noqa + ) + return probabilities + neigh_dist, neigh_ind = self.radius_neighbors(X) outlier_mask = np.zeros(n_queries, dtype=bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]