diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index e6911d90a7d77..59071717fb8c8 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -128,6 +128,12 @@ Changelog when the provided `sample_weight` reduces the problem to a single class in `fit`. :pr:`24140` by :user:`Jonathan Ohayon <Johayon>` and :user:`Chiara Marmo <cmarmo>`. +:mod:`sklearn.neighbors` +........................ +- |Fix| Remove support for `"KulsinskiDistance"` in :class:`neighbors.BallTree`. This + dissimilarity is not a proper metric and therefore cannot be supported by the BallTree. + :pr:`25212` by :user:`Guillaume Lemaitre <glemaitre>`. + :mod:`sklearn.neural_network` ............................. diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index d2d8e35d9acb0..7852461082f4b 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -81,7 +81,7 @@ class OPTICS(ClusterMixin, BaseEstimator): 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', - 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'kulczynski1', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] @@ -90,6 +90,10 @@ class OPTICS(ClusterMixin, BaseEstimator): See the documentation for scipy.spatial.distance for details on these metrics. + .. note:: + `'kulsinski'` is deprecated since SciPy 1.9 and removed in SciPy 1.11. Use + `'kulczynski1'` instead; note that the two metrics are not identical. + p : float, default=2 Parameter for the Minkowski metric from :class:`~sklearn.metrics.pairwise_distances`. 
When p = 1, this is @@ -481,7 +485,7 @@ def compute_optics_graph( 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', - 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'kulczynski1', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] @@ -489,6 +493,10 @@ def compute_optics_graph( See the documentation for scipy.spatial.distance for details on these metrics. + .. note:: + `'kulsinski'` is deprecated since SciPy 1.9 and removed in SciPy 1.11. Use + `'kulczynski1'` instead; note that the two metrics are not identical. + p : int, default=2 Parameter for the Minkowski metric from :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 1e4a9429af03f..dbfb241aa2799 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -42,6 +42,7 @@ from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DTYPECODE from ..utils._typedefs import DTYPE, ITYPE from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper from ..utils import check_array +from ..utils.fixes import parse_version, sp_version cdef inline double fmax(double a, double b) nogil: return max(a, b) @@ -59,12 +60,17 @@ BOOL_METRICS = [ "matching", "jaccard", "dice", - "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath", ] +if sp_version >= parse_version("1.8"): + # Introduced in SciPy 1.8 + BOOL_METRICS += ["kulczynski1"] +if sp_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + BOOL_METRICS += ["kulsinski"] def get_valid_metric_ids(L): """Given an iterable of metric class names or class identifiers, @@ -105,6 +111,7 @@ METRIC_MAPPING{{name_suffix}} = { 'jaccard': JaccardDistance{{name_suffix}}, 'dice': DiceDistance{{name_suffix}}, 'kulsinski': 
KulsinskiDistance{{name_suffix}}, + 'kulczynski1': Kulczynski1Distance{{name_suffix}}, 'rogerstanimoto': RogersTanimotoDistance{{name_suffix}}, 'russellrao': RussellRaoDistance{{name_suffix}}, 'sokalmichener': SokalMichenerDistance{{name_suffix}}, @@ -213,6 +220,7 @@ cdef class DistanceMetric{{name_suffix}}: "matching" MatchingDistance NNEQ / N "dice" DiceDistance NNEQ / (NTT + NNZ) "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "kulczynski1" Kulczynski1Distance NTT / NNEQ "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) "russellrao" RussellRaoDistance (N - NTT) / N "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) @@ -2312,6 +2320,102 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (n_neq - n_tt + size) * 1.0 / (n_neq + size) + +#------------------------------------------------------------ +# Kulczynski Distance (boolean) +# D(x, y) = c_11 / (c_01 + c_10) +cdef class Kulczynski1Distance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Kulczynski Distance + + Kulczynski Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. 
+ + D(x, y) = c_11 / (c_01 + c_10) + + """ + cdef inline DTYPE_t dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: + cdef int tf1, tf2, c_11 = 0, c_01 = 0, c_10 = 0 + cdef cnp.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + if tf1 == 0 and tf2 == 0: + continue + elif tf1 == tf2: + c_11 += (tf1 and tf2) + elif tf1 == 0: + c_01 += tf2 + else: + c_10 += tf1 + return c_11 / (c_01 + c_10) + + cdef inline DTYPE_t dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, c_11 = 0, c_01 = 0, c_10 = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + if tf1 == 0 and tf2 == 0: + pass + elif tf1 == tf2: + c_11 += (tf1 and tf2) + elif tf1 == 0: + c_01 += tf2 + else: + c_10 += tf1 + i1 += 1 + i2 += 1 + elif ix1 < ix2: + # non-zero value in x1 but not in x2 + c_10 += tf1 + i1 += 1 + else: + # non-zero value in x2 but not in x1 + c_01 += tf2 + i2 += 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + c_01 += tf2 + i2 += 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + c_10 += tf1 + i1 += 1 + + return c_11 / (c_10 + c_01) + + #------------------------------------------------------------ # Rogers-Tanimoto Distance (boolean) # D(x, y) = 2 * n_neq / (n + n_neq) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b34722dc25df7..4e2a22d163f02 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -635,7 
+635,7 @@ def pairwise_distances_argmin_min( 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', - 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'kulczynski1', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] @@ -643,6 +643,10 @@ def pairwise_distances_argmin_min( See the documentation for scipy.spatial.distance for details on these metrics. + .. note:: + `'kulsinski'` is deprecated since SciPy 1.9 and removed in SciPy 1.11. Use + `'kulczynski1'` instead; note that the two metrics are not identical. + metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. @@ -752,7 +756,7 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', - 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'kulczynski1', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] @@ -760,6 +764,10 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs See the documentation for scipy.spatial.distance for details on these metrics. + .. note:: + `'kulsinski'` is deprecated since SciPy 1.9 and removed in SciPy 1.11. Use + `'kulczynski1'` instead; note that the two metrics are not identical. + metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. 
@@ -1639,7 +1647,6 @@ def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds): "dice", "hamming", "jaccard", - "kulsinski", "mahalanobis", "matching", "minkowski", @@ -1654,6 +1661,12 @@ def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds): "nan_euclidean", "haversine", ] +if sp_version >= parse_version("1.8"): + # Introduced in SciPy 1.8 + _VALID_METRICS += ["kulczynski1"] +if sp_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + _VALID_METRICS += ["kulsinski"] _NAN_METRICS = ["nan_euclidean"] @@ -1902,12 +1915,16 @@ def pairwise_distances( ['nan_euclidean'] but it does not yet support sparse matrices. - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', - 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', - 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'kulczynski1', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] See the documentation for scipy.spatial.distance for details on these metrics. These metrics do not support sparse matrix inputs. + .. note:: + `'kulsinski'` is deprecated since SciPy 1.9 and removed in SciPy 1.11. Use + `'kulczynski1'` instead; note that the two metrics are not identical. 
+ Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are valid scipy.spatial.distance metrics), the scikit-learn implementation will be used, which is faster and has support for sparse matrices (except @@ -2043,7 +2060,6 @@ def pairwise_distances( PAIRWISE_BOOLEAN_FUNCTIONS = [ "dice", "jaccard", - "kulsinski", "matching", "rogerstanimoto", "russellrao", @@ -2051,6 +2067,12 @@ def pairwise_distances( "sokalsneath", "yule", ] +if sp_version >= parse_version("1.8"): + # Introduced in SciPy 1.8 + PAIRWISE_BOOLEAN_FUNCTIONS += ["kulczynski1"] +if sp_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + PAIRWISE_BOOLEAN_FUNCTIONS += ["kulsinski"] # Helper functions - distance PAIRWISE_KERNEL_FUNCTIONS = { diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 3624983c4c481..a9800ea772feb 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -225,7 +225,7 @@ def test_pairwise_boolean_distance(metric): with ignore_warnings(category=DataConversionWarning): for Z in [Y, None]: res = pairwise_distances(X, Z, metric=metric) - res[np.isnan(res)] = 0 + np.nan_to_num(res, nan=0, posinf=0, neginf=0, copy=False) assert np.sum(res != 0) == 0 # non-boolean arrays are converted to boolean for boolean diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx index 094a8826acfb9..6dcc6afa2127d 100644 --- a/sklearn/neighbors/_ball_tree.pyx +++ b/sklearn/neighbors/_ball_tree.pyx @@ -11,7 +11,7 @@ VALID_METRICS = ['EuclideanDistance', 'SEuclideanDistance', 'MahalanobisDistance', 'HammingDistance', 'CanberraDistance', 'BrayCurtisDistance', 'JaccardDistance', 'MatchingDistance', - 'DiceDistance', 'KulsinskiDistance', + 'DiceDistance', 'RogersTanimotoDistance', 'RussellRaoDistance', 'SokalMichenerDistance', 'SokalSneathDistance', 'PyFuncDistance', 'HaversineDistance'] diff --git a/sklearn/neighbors/_base.py 
b/sklearn/neighbors/_base.py index 3b01824a3a73a..dc8c450f61c3d 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -57,7 +57,7 @@ "dice", "hamming", "jaccard", - "kulsinski", + "kulsinski" if sp_version < parse_version("1.8") else "kulczynski1", "mahalanobis", "matching", "minkowski", diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index d5046afd2da2a..8d665f799e9d8 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -30,7 +30,6 @@ "matching", "jaccard", "dice", - "kulsinski", "rogerstanimoto", "russellrao", "sokalmichener",