From 93d0ffe154870934a027d0b249e694cda5d5e5f7 Mon Sep 17 00:00:00 2001 From: greyisbetter Date: Fri, 7 Jul 2023 12:21:46 +0530 Subject: [PATCH 1/6] [doc] DistanceMetric class docstring enhancement --- sklearn/metrics/_dist_metrics.pyx.tp | 125 ++++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index bc54e51a7511a..e84793e6b680a 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -65,20 +65,137 @@ def get_valid_metric_ids(L): if (val.__name__ in L) or (val in L)] cdef class DistanceMetric: + """DistanceMetric class + + This class provides a uniform interface to fast distance metric + functions. The various metrics can be accessed via the :meth:`get_metric` + class method and the metric string identifier (see below). + + Examples + -------- + >>> from sklearn.metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[1, 2], [3, 4], [5, 6]] + >>> Y = [[7, 8], [9, 10]] + >>> dist.pairwise(X,Y) + array([[7.81024968, 10.63014581] + [5.65685425, 8.48528137] + [1.41421356, 4.24264069]]) + + Available Metrics + + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``max(|x - y|)`` + "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== =============================================================== + identifier class name distance function + ------------ ------------------ --------------------------------------------------------------- + "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` + ============ ================== =============================================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. + + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". In the listings below, the following + abbreviations are used: + + - N : number of dimensions + - NTT : number of dims in which both values are True + - NTF : number of dims in which the first value is True, second is False + - NFT : number of dims in which the first value is False, second is True + - NFF : number of dims in which both values are False + - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "matching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance (N - NTT) / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ @classmethod def get_metric(cls, metric, dtype=np.float64, **kwargs): """Get the given distance metric from the string identifier. - See the docstring of DistanceMetric for a list of available metrics. + This method returns an instance of the requested distance metric given its string identifier or class name. + It provides a convenient way to obtain a distance metric object for use in pairwise distance computations. Parameters ---------- metric : str or class name - The distance metric to use + The string identifier or class name of the desired distance metric. + See the documentation of the `DistanceMetric` class for a list of available metrics. + dtype : {np.float32, np.float64}, default=np.float64 - The dtype of the data on which the metric will be applied + The data type of the input on which the metric will be applied. + This affects the precision of the computed distances. + By default, it is set to `np.float64` for higher precision. + **kwargs - additional arguments will be passed to the requested metric + Additional keyword arguments that will be passed to the requested metric. + These arguments can be used to customize the behavior of the specific metric. + + Returns + ------- + metric_obj : instance of the requested metric + An instance of the requested distance metric class. """ if dtype == np.float32: specialized_class = DistanceMetric32 From f53a165ac1db76fb0d7936489b9703e42cf61efd Mon Sep 17 00:00:00 2001 From: greyisbetter Date: Mon, 10 Jul 2023 18:47:12 +0530 Subject: [PATCH 2/6] [doc] add some notes on methods --- sklearn/metrics/_dist_metrics.pyx.tp | 92 +++------------------------- 1 file changed, 7 insertions(+), 85 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index e84793e6b680a..077a389aec229 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -82,93 +82,15 @@ cdef class DistanceMetric: [5.65685425, 8.48528137] [1.41421356, 4.24264069]]) - Available Metrics - - The following lists the string metric identifiers and the associated - distance metric classes: - - **Metrics intended for real-valued vector spaces:** - - ============== ==================== ======== =============================== - identifier class name args distance function - -------------- -------------------- -------- ------------------------------- - "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` - "manhattan" ManhattanDistance - ``sum(|x - y|)`` - "chebyshev" ChebyshevDistance - ``max(|x - y|)`` - "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` - "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` - "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` - ============== ==================== ======== =============================== + Notes + ----- + The `DistanceMetric` class provides a convenient way to compute pairwise distances between samples. + It supports various distance metrics, such as Euclidean distance, Manhattan distance, and more. - **Metrics intended for two-dimensional vector spaces:** Note that the haversine - distance metric requires data in the form of [latitude, longitude] and both - inputs and outputs are in units of radians. - - ============ ================== =============================================================== - identifier class name distance function - ------------ ------------------ --------------------------------------------------------------- - "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` - ============ ================== =============================================================== - - - **Metrics intended for integer-valued vector spaces:** Though intended - for integer-valued vectors, these are also valid metrics in the case of - real-valued vectors. - - ============= ==================== ======================================== - identifier class name distance function - ------------- -------------------- ---------------------------------------- - "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` - "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` - "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` - ============= ==================== ======================================== - - **Metrics intended for boolean-valued vector spaces:** Any nonzero entry - is evaluated to "True". In the listings below, the following - abbreviations are used: + The `pairwise` method can be used to compute pairwise distances between samples in the input arrays. + It returns a distance matrix representing the distances between all pairs of samples. - - N : number of dimensions - - NTT : number of dims in which both values are True - - NTF : number of dims in which the first value is True, second is False - - NFT : number of dims in which the first value is False, second is True - - NFF : number of dims in which both values are False - - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT - - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT - - ================= ======================= =============================== - identifier class name distance function - ----------------- ----------------------- ------------------------------- - "jaccard" JaccardDistance NNEQ / NNZ - "matching" MatchingDistance NNEQ / N - "dice" DiceDistance NNEQ / (NTT + NNZ) - "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) - "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) - "russellrao" RussellRaoDistance (N - NTT) / N - "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) - "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) - ================= ======================= =============================== - - **User-defined distance:** - - =========== =============== ======= - identifier class name args - ----------- --------------- ------- - "pyfunc" PyFuncDistance func - =========== =============== ======= - - Here ``func`` is a function which takes two one-dimensional numpy - arrays, and returns a distance. Note that in order to be used within - the BallTree, the distance must be a true metric: - i.e. it must satisfy the following properties - - 1) Non-negativity: d(x, y) >= 0 - 2) Identity: d(x, y) = 0 if and only if x == y - 3) Symmetry: d(x, y) = d(y, x) - 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) - - Because of the Python object overhead involved in calling the python - function, this will be fairly slow, but it will have the same - scaling as other distances. + The `get_metric` class method allows obtaining an instance of a specific distance metric by providing its name or class. """ @classmethod def get_metric(cls, metric, dtype=np.float64, **kwargs): From ea631d68b34844dccf10af1a69dffbc984607270 Mon Sep 17 00:00:00 2001 From: greyisbetter Date: Tue, 11 Jul 2023 16:14:58 +0530 Subject: [PATCH 3/6] [doc] suggested updates --- sklearn/metrics/_dist_metrics.pyx.tp | 43 +++++++++++++--------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 077a389aec229..7dfc2dbe73ca4 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -65,11 +65,19 @@ def get_valid_metric_ids(L): if (val.__name__ in L) or (val in L)] cdef class DistanceMetric: - """DistanceMetric class + """Uniform interface for fast distance metric functions. - This class provides a uniform interface to fast distance metric - functions. The various metrics can be accessed via the :meth:`get_metric` - class method and the metric string identifier (see below). + The `DistanceMetric` class provides a convenient way to compute pairwise distances + between samples. It supports various distance metrics, such as Euclidean distance, + Manhattan distance, and more. + + The `pairwise` method can be used to compute pairwise distances between samples in + the input arrays. It returns a distance matrix representing the distances between + all pairs of samples. + + The `get_metric` class method allows obtaining an instance of a specific distance + metric by providing its name or class. The various metrics can be accessed via the + :meth:`get_metric` class method and the metric string identifiers. Examples -------- @@ -78,41 +86,30 @@ cdef class DistanceMetric: >>> X = [[1, 2], [3, 4], [5, 6]] >>> Y = [[7, 8], [9, 10]] >>> dist.pairwise(X,Y) - array([[7.81024968, 10.63014581] - [5.65685425, 8.48528137] - [1.41421356, 4.24264069]]) - - Notes - ----- - The `DistanceMetric` class provides a convenient way to compute pairwise distances between samples. - It supports various distance metrics, such as Euclidean distance, Manhattan distance, and more. - - The `pairwise` method can be used to compute pairwise distances between samples in the input arrays. - It returns a distance matrix representing the distances between all pairs of samples. - - The `get_metric` class method allows obtaining an instance of a specific distance metric by providing its name or class. + array([[7.81..., 10.63...] + [5.65..., 8.48...] + [1.41..., 4.24...]]) """ @classmethod def get_metric(cls, metric, dtype=np.float64, **kwargs): """Get the given distance metric from the string identifier. - This method returns an instance of the requested distance metric given its string identifier or class name. - It provides a convenient way to obtain a distance metric object for use in pairwise distance computations. - Parameters ---------- metric : str or class name The string identifier or class name of the desired distance metric. - See the documentation of the `DistanceMetric` class for a list of available metrics. + See the documentation of the `DistanceMetric` class for a list of + available metrics. dtype : {np.float32, np.float64}, default=np.float64 The data type of the input on which the metric will be applied. This affects the precision of the computed distances. - By default, it is set to `np.float64` for higher precision. + By default, it is set to `np.float64`. **kwargs Additional keyword arguments that will be passed to the requested metric. - These arguments can be used to customize the behavior of the specific metric. + These arguments can be used to customize the behavior of the specific + metric. Returns ------- From 62a339b6994e3808626ede0dbc815840a7e9bdf3 Mon Sep 17 00:00:00 2001 From: greyisbetter Date: Wed, 12 Jul 2023 23:40:58 +0530 Subject: [PATCH 4/6] [doc] update get_metric desc in class docstring --- sklearn/metrics/_dist_metrics.pyx.tp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 7dfc2dbe73ca4..2bbdbc0e4fd50 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -75,9 +75,8 @@ cdef class DistanceMetric: the input arrays. It returns a distance matrix representing the distances between all pairs of samples. - The `get_metric` class method allows obtaining an instance of a specific distance - metric by providing its name or class. The various metrics can be accessed via the - :meth:`get_metric` class method and the metric string identifiers. + The :meth:`get_metric` method allows you to obtain an instance of a specific + distance metric based on its name or class. Examples -------- From d9dab6f0bce2ad67181a0be416538074242db12f Mon Sep 17 00:00:00 2001 From: greyisbetter Date: Thu, 13 Jul 2023 14:35:07 +0530 Subject: [PATCH 5/6] [doc] add info regarding how to find valid metrics --- sklearn/metrics/_dist_metrics.pyx.tp | 90 ++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 2bbdbc0e4fd50..ac8d39583dcce 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -88,11 +88,101 @@ cdef class DistanceMetric: array([[7.81..., 10.63...] [5.65..., 8.48...] [1.41..., 4.24...]]) + + Available Metrics + + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``max(|x - y|)`` + "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== =============================================================== + identifier class name distance function + ------------ ------------------ --------------------------------------------------------------- + "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` + ============ ================== =============================================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. + + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". In the listings below, the following + abbreviations are used: + + - N : number of dimensions + - NTT : number of dims in which both values are True + - NTF : number of dims in which the first value is True, second is False + - NFT : number of dims in which the first value is False, second is True + - NFF : number of dims in which both values are False + - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "matching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance (N - NTT) / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. """ @classmethod def get_metric(cls, metric, dtype=np.float64, **kwargs): """Get the given distance metric from the string identifier. + See the docstring of DistanceMetric for a list of available metrics. + Parameters ---------- metric : str or class name From 89843a0894e1c1bc0be287af490c66c7c77aa134 Mon Sep 17 00:00:00 2001 From: greyisbetter Date: Thu, 13 Jul 2023 15:33:09 +0530 Subject: [PATCH 6/6] [doc] get_metric desc update --- sklearn/metrics/_dist_metrics.pyx.tp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index ac8d39583dcce..539eef70ec4e6 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -75,8 +75,8 @@ cdef class DistanceMetric: the input arrays. It returns a distance matrix representing the distances between all pairs of samples. - The :meth:`get_metric` method allows you to obtain an instance of a specific - distance metric based on its name or class. + The :meth:`get_metric` method allows you to retrieve a specific metric using its + string identifier. Examples --------