From de5d7dcc74634ee90f82cf8a5ffd4812e2a4065c Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Fri, 14 Jun 2024 16:24:29 +0200 Subject: [PATCH 01/20] array API support for cosine_distances --- doc/modules/array_api.rst | 1 + sklearn/metrics/pairwise.py | 17 +++++++++++++++-- sklearn/metrics/tests/test_common.py | 2 ++ sklearn/utils/_array_api.py | 14 ++++++++++++++ 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index a51ee60e47e04..d884a631fc408 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -123,6 +123,7 @@ Metrics - :func:`sklearn.metrics.pairwise.additive_chi2_kernel` - :func:`sklearn.metrics.pairwise.chi2_kernel` - :func:`sklearn.metrics.pairwise.cosine_similarity` +- :func:`sklearn.metrics.pairwise.cosine_distances` - :func:`sklearn.metrics.pairwise.paired_cosine_distances` - :func:`sklearn.metrics.r2_score` - :func:`sklearn.metrics.zero_one_loss` diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9382d585a5fe7..6fff79e251635 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -13,6 +13,7 @@ from scipy.sparse import csr_matrix, issparse from scipy.spatial import distance +from sklearn.utils import _array_api from .. import config_context from ..exceptions import DataConversionWarning from ..preprocessing import normalize @@ -1120,15 +1121,27 @@ def cosine_distances(X, Y=None): array([[1. , 1. ], [0.42..., 0.18...]]) """ + xp, _ = get_namespace(X, Y) + # 1.0 - cosine_similarity(X, Y) without copy S = cosine_similarity(X, Y) S *= -1 S += 1 - np.clip(S, 0, 2, out=S) + S = _array_api._clip(X, S, 0, 2, xp) if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. - np.fill_diagonal(S, 0.0) + S = _fill_diagonal(S, 0.0, xp) + return S + + +def _fill_diagonal(S, val, xp): + S = xp.asarray(S) + shape = S.shape + diagonal_length = min(shape) + indices = xp.arange(diagonal_length) + S[tuple(indices for _ in shape)] = val + return S diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 6110cbd3d1d13..904540258f2db 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -56,6 +56,7 @@ chi2_kernel, cosine_similarity, paired_cosine_distances, + cosine_distances, ) from sklearn.preprocessing import LabelBinarizer from sklearn.utils import shuffle @@ -2014,6 +2015,7 @@ def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name) mean_gamma_deviance: [check_array_api_regression_metric], max_error: [check_array_api_regression_metric], chi2_kernel: [check_array_api_metric_pairwise], + cosine_distances: [check_array_api_metric_pairwise], } diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 542a8136da661..786de95f85c2b 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -752,6 +752,20 @@ def _nanmax(X, axis=None, xp=None): return X +def _clip(X, S, min_val, max_val, xp): + # TODO: remove this method and change all usage once we move to array api 2023.12 version + # https://data-apis.org/array-api/2023.12/API_specification/generated/array_api.clip.html#clip + xp, _ = get_namespace(X, xp=xp) + if _is_numpy_namespace(xp): + return numpy.clip(S, min_val, max_val) + else: + min_arr = xp.asarray(min_val, dtype=S.dtype) + max_arr = xp.asarray(max_val, dtype=S.dtype) + S = xp.where(S < min_arr, min_arr, S) + S = xp.where(S > max_arr, max_arr, S) + return S + + def _asarray_with_order( array, dtype=None, order=None, copy=None, *, xp=None, device=None ): From 97764a7b37b78e2aa8ddd9ed8fd16e7d9c8219b3 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Fri, 14 Jun 2024 16:36:40 +0200 Subject: [PATCH 02/20] added PR number --- doc/whats_new/v1.6.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index c98314d5ca1de..0acb7cf716429 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -43,6 +43,7 @@ See :ref:`array_api` for more details. - :func:`sklearn.metrics.pairwise.additive_chi2_kernel` :pr:`29144` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.pairwise.chi2_kernel` :pr:`29267` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.pairwise.cosine_similarity` :pr:`29014` by :user:`Edoardo Abati `; +- :func:`sklearn.metrics.pairwise.cosine_distances` :pr:`29265` by :user:`Emily Chen ` - :func:`sklearn.metrics.pairwise.paired_cosine_distances` :pr:`29112` by :user:`Edoardo Abati `. **Classes:** From b0ee75191fb832b877c3a8f216d20e03299aedae Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Fri, 14 Jun 2024 16:40:54 +0200 Subject: [PATCH 03/20] fixing linting issues --- sklearn/metrics/pairwise.py | 1 + sklearn/metrics/tests/test_common.py | 2 +- sklearn/utils/_array_api.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 6fff79e251635..9e0ea92a4e6e7 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -14,6 +14,7 @@ from scipy.spatial import distance from sklearn.utils import _array_api + from .. import config_context from ..exceptions import DataConversionWarning from ..preprocessing import normalize diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 904540258f2db..c60edefbc86c8 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -54,9 +54,9 @@ from sklearn.metrics.pairwise import ( additive_chi2_kernel, chi2_kernel, + cosine_distances, cosine_similarity, paired_cosine_distances, - cosine_distances, ) from sklearn.preprocessing import LabelBinarizer from sklearn.utils import shuffle diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 786de95f85c2b..8a8d6f9a9e6d4 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -753,7 +753,7 @@ def _nanmax(X, axis=None, xp=None): def _clip(X, S, min_val, max_val, xp): - # TODO: remove this method and change all usage once we move to array api 2023.12 version + # TODO: remove this method and change all usage once we move to array api 2023.12 # https://data-apis.org/array-api/2023.12/API_specification/generated/array_api.clip.html#clip xp, _ = get_namespace(X, xp=xp) if _is_numpy_namespace(xp): From d8e955716bdcf0fe6a44d867406b59e56a6bae52 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 17 Jun 2024 13:41:56 +0200 Subject: [PATCH 04/20] addressing review comments --- sklearn/metrics/pairwise.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9e0ea92a4e6e7..b779fad7706da 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -13,8 +13,6 @@ from scipy.sparse import csr_matrix, issparse from scipy.spatial import distance -from sklearn.utils import _array_api - from .. import config_context from ..exceptions import DataConversionWarning from ..preprocessing import normalize @@ -27,6 +25,7 @@ _find_matching_floating_dtype, _is_numpy_namespace, get_namespace, + _clip, ) from ..utils._chunking import get_chunk_n_rows from ..utils._mask import _get_mask @@ -1128,22 +1127,20 @@ def cosine_distances(X, Y=None): S = cosine_similarity(X, Y) S *= -1 S += 1 - S = _array_api._clip(X, S, 0, 2, xp) + S = _clip(X, S, 0, 2, xp) if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. - S = _fill_diagonal(S, 0.0, xp) + S = _fill_diagonal_2d(S, 0.0, xp) return S -def _fill_diagonal(S, val, xp): - S = xp.asarray(S) - shape = S.shape - diagonal_length = min(shape) - indices = xp.arange(diagonal_length) - S[tuple(indices for _ in shape)] = val +def _fill_diagonal_2d(S, val, xp): + assert S.ndim == 2, "_fill_diagonal_2d supports 2D arrays only" + n, m = S.shape + S_flat = xp.reshape(S, (-1,)) + S_flat[::m + 1] = val - return S # Paired distances From f81119b79f676c71f4b4830a34b2619ff45b8c98 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 17 Jun 2024 13:52:02 +0200 Subject: [PATCH 05/20] fixing linting issues --- sklearn/metrics/pairwise.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b779fad7706da..d276fe6f89e3c 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -22,10 +22,10 @@ gen_even_slices, ) from ..utils._array_api import ( + _clip, _find_matching_floating_dtype, _is_numpy_namespace, get_namespace, - _clip, ) from ..utils._chunking import get_chunk_n_rows from ..utils._mask import _get_mask @@ -1139,8 +1139,7 @@ def _fill_diagonal_2d(S, val, xp): assert S.ndim == 2, "_fill_diagonal_2d supports 2D arrays only" n, m = S.shape S_flat = xp.reshape(S, (-1,)) - S_flat[::m + 1] = val - + S_flat[:: m + 1] = val # Paired distances From d8d4c3c8880a68200bcfac6b968b65fcb522a4cd Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 17 Jun 2024 16:43:12 +0200 Subject: [PATCH 06/20] remove assignment for _fill_diagonal_2d --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index d276fe6f89e3c..59e16e6e4253b 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1131,7 +1131,7 @@ def cosine_distances(X, Y=None): if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. - S = _fill_diagonal_2d(S, 0.0, xp) + _fill_diagonal_2d(S, 0.0, xp) return S From c7330603fb5f686270d83ddd065141c19cf728ca Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Wed, 26 Jun 2024 15:21:39 +0200 Subject: [PATCH 07/20] address review comments --- sklearn/metrics/pairwise.py | 2 +- sklearn/utils/_array_api.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 59e16e6e4253b..b2f9c5f3e9c04 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1137,7 +1137,7 @@ def cosine_distances(X, Y=None): def _fill_diagonal_2d(S, val, xp): assert S.ndim == 2, "_fill_diagonal_2d supports 2D arrays only" - n, m = S.shape + _, m = S.shape S_flat = xp.reshape(S, (-1,)) S_flat[:: m + 1] = val diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 8a8d6f9a9e6d4..1253f22bbf6dd 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -755,7 +755,6 @@ def _nanmax(X, axis=None, xp=None): def _clip(X, S, min_val, max_val, xp): # TODO: remove this method and change all usage once we move to array api 2023.12 # https://data-apis.org/array-api/2023.12/API_specification/generated/array_api.clip.html#clip - xp, _ = get_namespace(X, xp=xp) if _is_numpy_namespace(xp): return numpy.clip(S, min_val, max_val) else: From 12d55693b3e8dbd18627e5ac0efef282626eebc5 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 8 Jul 2024 14:44:06 +0200 Subject: [PATCH 08/20] removing unnecessary input in _clip() --- sklearn/utils/_array_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 1253f22bbf6dd..4f140e0f31dbd 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -752,7 +752,7 @@ def _nanmax(X, axis=None, xp=None): return X -def _clip(X, S, min_val, max_val, xp): +def _clip(S, min_val, max_val, xp): # TODO: remove this method and change all usage once we move to array api 2023.12 # https://data-apis.org/array-api/2023.12/API_specification/generated/array_api.clip.html#clip if _is_numpy_namespace(xp): From 8c773c8d0fe345ffdba6b7f1889a6cb740ebd79e Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 8 Jul 2024 15:57:23 +0200 Subject: [PATCH 09/20] changing _clip() usage inputs --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b2f9c5f3e9c04..152f288106525 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1127,7 +1127,7 @@ def cosine_distances(X, Y=None): S = cosine_similarity(X, Y) S *= -1 S += 1 - S = _clip(X, S, 0, 2, xp) + S = _clip(S, 0, 2, xp) if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. From b23651dfc263e7b76d247b50a3fcff351576dc63 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Fri, 14 Jun 2024 16:24:29 +0200 Subject: [PATCH 10/20] array API support for cosine_distances --- doc/modules/array_api.rst | 1 + sklearn/metrics/pairwise.py | 17 +++++++++++++++-- sklearn/metrics/tests/test_common.py | 2 ++ sklearn/utils/_array_api.py | 14 ++++++++++++++ 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index a51ee60e47e04..d884a631fc408 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -123,6 +123,7 @@ Metrics - :func:`sklearn.metrics.pairwise.additive_chi2_kernel` - :func:`sklearn.metrics.pairwise.chi2_kernel` - :func:`sklearn.metrics.pairwise.cosine_similarity` +- :func:`sklearn.metrics.pairwise.cosine_distances` - :func:`sklearn.metrics.pairwise.paired_cosine_distances` - :func:`sklearn.metrics.r2_score` - :func:`sklearn.metrics.zero_one_loss` diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9382d585a5fe7..6fff79e251635 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -13,6 +13,7 @@ from scipy.sparse import csr_matrix, issparse from scipy.spatial import distance +from sklearn.utils import _array_api from .. import config_context from ..exceptions import DataConversionWarning from ..preprocessing import normalize @@ -1120,15 +1121,27 @@ def cosine_distances(X, Y=None): array([[1. , 1. ], [0.42..., 0.18...]]) """ + xp, _ = get_namespace(X, Y) + # 1.0 - cosine_similarity(X, Y) without copy S = cosine_similarity(X, Y) S *= -1 S += 1 - np.clip(S, 0, 2, out=S) + S = _array_api._clip(X, S, 0, 2, xp) if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. - np.fill_diagonal(S, 0.0) + S = _fill_diagonal(S, 0.0, xp) + return S + + +def _fill_diagonal(S, val, xp): + S = xp.asarray(S) + shape = S.shape + diagonal_length = min(shape) + indices = xp.arange(diagonal_length) + S[tuple(indices for _ in shape)] = val + return S diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 6110cbd3d1d13..904540258f2db 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -56,6 +56,7 @@ chi2_kernel, cosine_similarity, paired_cosine_distances, + cosine_distances, ) from sklearn.preprocessing import LabelBinarizer from sklearn.utils import shuffle @@ -2014,6 +2015,7 @@ def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name) mean_gamma_deviance: [check_array_api_regression_metric], max_error: [check_array_api_regression_metric], chi2_kernel: [check_array_api_metric_pairwise], + cosine_distances: [check_array_api_metric_pairwise], } diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index a00d250ab31d2..852493590909e 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -746,6 +746,20 @@ def _nanmax(X, axis=None, xp=None): return X +def _clip(X, S, min_val, max_val, xp): + # TODO: remove this method and change all usage once we move to array api 2023.12 version + # https://data-apis.org/array-api/2023.12/API_specification/generated/array_api.clip.html#clip + xp, _ = get_namespace(X, xp=xp) + if _is_numpy_namespace(xp): + return numpy.clip(S, min_val, max_val) + else: + min_arr = xp.asarray(min_val, dtype=S.dtype) + max_arr = xp.asarray(max_val, dtype=S.dtype) + S = xp.where(S < min_arr, min_arr, S) + S = xp.where(S > max_arr, max_arr, S) + return S + + def _asarray_with_order( array, dtype=None, order=None, copy=None, *, xp=None, device=None ): From e90d498ad19d3c7599de5aacbbddce85fdfd0908 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Fri, 14 Jun 2024 16:36:40 +0200 Subject: [PATCH 11/20] added PR number --- doc/whats_new/v1.6.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index d4f1d14fabe23..0497c1e28e8a4 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -43,6 +43,7 @@ See :ref:`array_api` for more details. - :func:`sklearn.metrics.pairwise.additive_chi2_kernel` :pr:`29144` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.pairwise.chi2_kernel` :pr:`29267` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.pairwise.cosine_similarity` :pr:`29014` by :user:`Edoardo Abati `; +- :func:`sklearn.metrics.pairwise.cosine_distances` :pr:`29265` by :user:`Emily Chen ` - :func:`sklearn.metrics.pairwise.paired_cosine_distances` :pr:`29112` by :user:`Edoardo Abati `. **Classes:** From dd5ec7f169e82faa793c9034d2f8b687a9f86f1c Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Fri, 14 Jun 2024 16:40:54 +0200 Subject: [PATCH 12/20] fixing linting issues --- sklearn/metrics/pairwise.py | 1 + sklearn/metrics/tests/test_common.py | 2 +- sklearn/utils/_array_api.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 6fff79e251635..9e0ea92a4e6e7 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -14,6 +14,7 @@ from scipy.spatial import distance from sklearn.utils import _array_api + from .. import config_context from ..exceptions import DataConversionWarning from ..preprocessing import normalize diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 904540258f2db..c60edefbc86c8 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -54,9 +54,9 @@ from sklearn.metrics.pairwise import ( additive_chi2_kernel, chi2_kernel, + cosine_distances, cosine_similarity, paired_cosine_distances, - cosine_distances, ) from sklearn.preprocessing import LabelBinarizer from sklearn.utils import shuffle diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 852493590909e..7f27275f146c2 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -747,7 +747,7 @@ def _nanmax(X, axis=None, xp=None): def _clip(X, S, min_val, max_val, xp): - # TODO: remove this method and change all usage once we move to array api 2023.12 version + # TODO: remove this method and change all usage once we move to array api 2023.12 # https://data-apis.org/array-api/2023.12/API_specification/generated/array_api.clip.html#clip xp, _ = get_namespace(X, xp=xp) if _is_numpy_namespace(xp): From 917de8763fc1f4dd3288fcd96e307dbcba9cd439 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 17 Jun 2024 13:41:56 +0200 Subject: [PATCH 13/20] addressing review comments --- sklearn/metrics/pairwise.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9e0ea92a4e6e7..b779fad7706da 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -13,8 +13,6 @@ from scipy.sparse import csr_matrix, issparse from scipy.spatial import distance -from sklearn.utils import _array_api - from .. import config_context from ..exceptions import DataConversionWarning from ..preprocessing import normalize @@ -27,6 +25,7 @@ _find_matching_floating_dtype, _is_numpy_namespace, get_namespace, + _clip, ) from ..utils._chunking import get_chunk_n_rows from ..utils._mask import _get_mask @@ -1128,22 +1127,20 @@ def cosine_distances(X, Y=None): S = cosine_similarity(X, Y) S *= -1 S += 1 - S = _array_api._clip(X, S, 0, 2, xp) + S = _clip(X, S, 0, 2, xp) if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. - S = _fill_diagonal(S, 0.0, xp) + S = _fill_diagonal_2d(S, 0.0, xp) return S -def _fill_diagonal(S, val, xp): - S = xp.asarray(S) - shape = S.shape - diagonal_length = min(shape) - indices = xp.arange(diagonal_length) - S[tuple(indices for _ in shape)] = val +def _fill_diagonal_2d(S, val, xp): + assert S.ndim == 2, "_fill_diagonal_2d supports 2D arrays only" + n, m = S.shape + S_flat = xp.reshape(S, (-1,)) + S_flat[::m + 1] = val - return S # Paired distances From 9f431af237df57cfe3fd4a936f05e77359fa00e4 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 17 Jun 2024 13:52:02 +0200 Subject: [PATCH 14/20] fixing linting issues --- sklearn/metrics/pairwise.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b779fad7706da..d276fe6f89e3c 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -22,10 +22,10 @@ gen_even_slices, ) from ..utils._array_api import ( + _clip, _find_matching_floating_dtype, _is_numpy_namespace, get_namespace, - _clip, ) from ..utils._chunking import get_chunk_n_rows from ..utils._mask import _get_mask @@ -1139,8 +1139,7 @@ def _fill_diagonal_2d(S, val, xp): assert S.ndim == 2, "_fill_diagonal_2d supports 2D arrays only" n, m = S.shape S_flat = xp.reshape(S, (-1,)) - S_flat[::m + 1] = val - + S_flat[:: m + 1] = val # Paired distances From 73707482f1e66bf2193900105b5984f48a55bf34 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 17 Jun 2024 16:43:12 +0200 Subject: [PATCH 15/20] remove assignment for _fill_diagonal_2d --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index d276fe6f89e3c..59e16e6e4253b 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1131,7 +1131,7 @@ def cosine_distances(X, Y=None): if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. - S = _fill_diagonal_2d(S, 0.0, xp) + _fill_diagonal_2d(S, 0.0, xp) return S From c655e74c22302f90bd96fac1016cd763e5e554a1 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Wed, 26 Jun 2024 15:21:39 +0200 Subject: [PATCH 16/20] address review comments --- sklearn/metrics/pairwise.py | 2 +- sklearn/utils/_array_api.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 59e16e6e4253b..b2f9c5f3e9c04 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1137,7 +1137,7 @@ def cosine_distances(X, Y=None): def _fill_diagonal_2d(S, val, xp): assert S.ndim == 2, "_fill_diagonal_2d supports 2D arrays only" - n, m = S.shape + _, m = S.shape S_flat = xp.reshape(S, (-1,)) S_flat[:: m + 1] = val diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 7f27275f146c2..0dd683f2911bc 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -749,7 +749,6 @@ def _nanmax(X, axis=None, xp=None): def _clip(X, S, min_val, max_val, xp): # TODO: remove this method and change all usage once we move to array api 2023.12 # https://data-apis.org/array-api/2023.12/API_specification/generated/array_api.clip.html#clip - xp, _ = get_namespace(X, xp=xp) if _is_numpy_namespace(xp): return numpy.clip(S, min_val, max_val) else: From 3cb52c735e60e0358e2d0b262f08eda701ddc175 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 8 Jul 2024 14:44:06 +0200 Subject: [PATCH 17/20] removing unnecessary input in _clip() --- sklearn/utils/_array_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 0dd683f2911bc..8ec4115240cf8 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -746,7 +746,7 @@ def _nanmax(X, axis=None, xp=None): return X -def _clip(X, S, min_val, max_val, xp): +def _clip(S, min_val, max_val, xp): # TODO: remove this method and change all usage once we move to array api 2023.12 # https://data-apis.org/array-api/2023.12/API_specification/generated/array_api.clip.html#clip if _is_numpy_namespace(xp): From 8236c3bd1ab5eebe4d1b6a138d3ab65b69d34f94 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 8 Jul 2024 15:57:23 +0200 Subject: [PATCH 18/20] changing _clip() usage inputs --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b2f9c5f3e9c04..152f288106525 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1127,7 +1127,7 @@ def cosine_distances(X, Y=None): S = cosine_similarity(X, Y) S *= -1 S += 1 - S = _clip(X, S, 0, 2, xp) + S = _clip(S, 0, 2, xp) if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. From 7b898ce89d6ed3f76ba85844ba74322e8292f2db Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Fri, 12 Jul 2024 11:54:59 +0200 Subject: [PATCH 19/20] addressing review comments --- sklearn/metrics/pairwise.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 2d51f9a12b1b0..3d571a1dda8a2 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1150,15 +1150,15 @@ def cosine_distances(X, Y=None): if X is Y or Y is None: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. - _fill_diagonal_2d(S, 0.0, xp) + _fill_or_add_to_diagonal(S, 0.0, xp, add_value=False) return S -def _fill_diagonal_2d(S, val, xp): - assert S.ndim == 2, "_fill_diagonal_2d supports 2D arrays only" - _, m = S.shape - S_flat = xp.reshape(S, (-1,)) - S_flat[:: m + 1] = val +# def _fill_diagonal_2d(S, val, xp): +# assert S.ndim == 2, "_fill_diagonal_2d supports 2D arrays only" +# _, m = S.shape +# S_flat = xp.reshape(S, (-1,)) +# S_flat[:: m + 1] = val # Paired distances From e7d583950ef26acc0069a3ac1f60a2888f08cda2 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Fri, 12 Jul 2024 14:35:15 +0200 Subject: [PATCH 20/20] address review comments --- sklearn/metrics/pairwise.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 3d571a1dda8a2..f8b163813d6d6 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1154,13 +1154,6 @@ def cosine_distances(X, Y=None): return S -# def _fill_diagonal_2d(S, val, xp): -# assert S.ndim == 2, "_fill_diagonal_2d supports 2D arrays only" -# _, m = S.shape -# S_flat = xp.reshape(S, (-1,)) -# S_flat[:: m + 1] = val - - # Paired distances @validate_params( {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]},