@@ -65,6 +65,118 @@ def get_valid_metric_ids(L):
65
65
if (val.__name__ in L) or (val in L)]
66
66
67
67
cdef class DistanceMetric:
68
+ """Uniform interface for fast distance metric functions.
69
+
70
+ The `DistanceMetric` class provides a convenient way to compute pairwise distances
71
+ between samples. It supports various distance metrics, such as Euclidean distance,
72
+ Manhattan distance, and more.
73
+
74
+ The :meth:`pairwise` method can be used to compute pairwise distances between samples in
75
+ the input arrays. It returns a distance matrix representing the distances between
76
+ all pairs of samples.
77
+
78
+ The :meth:`get_metric` method allows you to retrieve a specific metric using its
79
+ string identifier.
80
+
81
+ Examples
82
+ --------
83
+ >>> from sklearn.metrics import DistanceMetric
84
+ >>> dist = DistanceMetric.get_metric('euclidean')
85
+ >>> X = [[1, 2], [3, 4], [5, 6]]
86
+ >>> Y = [[7, 8], [9, 10]]
87
+ >>> dist.pairwise(X, Y)
88
+ array([[8.48..., 11.31...],
89
+ [5.65..., 8.48...],
90
+ [2.82..., 5.65...]])
91
+
92
+ Available Metrics
93
+
94
+ The following lists the string metric identifiers and the associated
95
+ distance metric classes:
96
+
97
+ **Metrics intended for real-valued vector spaces:**
98
+
99
+ ============== ==================== ======== ===============================
100
+ identifier class name args distance function
101
+ -------------- -------------------- -------- -------------------------------
102
+ "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))``
103
+ "manhattan" ManhattanDistance - ``sum(|x - y|)``
104
+ "chebyshev" ChebyshevDistance - ``max(|x - y|)``
105
+ "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)``
106
+ "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))``
107
+ "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))``
108
+ ============== ==================== ======== ===============================
109
+
110
+ **Metrics intended for two-dimensional vector spaces:** Note that the haversine
111
+ distance metric requires data in the form of [latitude, longitude] and both
112
+ inputs and outputs are in units of radians.
113
+
114
+ ============ ================== ===============================================================
115
+ identifier class name distance function
116
+ ------------ ------------------ ---------------------------------------------------------------
117
+ "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))``
118
+ ============ ================== ===============================================================
119
+
120
+
121
+ **Metrics intended for integer-valued vector spaces:** Though intended
122
+ for integer-valued vectors, these are also valid metrics in the case of
123
+ real-valued vectors.
124
+
125
+ ============= ==================== ========================================
126
+ identifier class name distance function
127
+ ------------- -------------------- ----------------------------------------
128
+ "hamming" HammingDistance ``N_unequal(x, y) / N_tot``
129
+ "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))``
130
+ "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))``
131
+ ============= ==================== ========================================
132
+
133
+ **Metrics intended for boolean-valued vector spaces:** Any nonzero entry
134
+ is evaluated to "True". In the listings below, the following
135
+ abbreviations are used:
136
+
137
+ - N : number of dimensions
138
+ - NTT : number of dims in which both values are True
139
+ - NTF : number of dims in which the first value is True, second is False
140
+ - NFT : number of dims in which the first value is False, second is True
141
+ - NFF : number of dims in which both values are False
142
+ - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT
143
+ - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT
144
+
145
+ ================= ======================= ===============================
146
+ identifier class name distance function
147
+ ----------------- ----------------------- -------------------------------
148
+ "jaccard" JaccardDistance NNEQ / NNZ
149
+ "matching" MatchingDistance NNEQ / N
150
+ "dice" DiceDistance NNEQ / (NTT + NNZ)
151
+ "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N)
152
+ "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ)
153
+ "russellrao" RussellRaoDistance (N - NTT) / N
154
+ "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ)
155
+ "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT)
156
+ ================= ======================= ===============================
157
+
158
+ **User-defined distance:**
159
+
160
+ =========== =============== =======
161
+ identifier class name args
162
+ ----------- --------------- -------
163
+ "pyfunc" PyFuncDistance func
164
+ =========== =============== =======
165
+
166
+ Here ``func`` is a function which takes two one-dimensional numpy
167
+ arrays, and returns a distance. Note that in order to be used within
168
+ the BallTree, the distance must be a true metric:
169
+ i.e. it must satisfy the following properties
170
+
171
+ 1) Non-negativity: d(x, y) >= 0
172
+ 2) Identity: d(x, y) = 0 if and only if x == y
173
+ 3) Symmetry: d(x, y) = d(y, x)
174
+ 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z)
175
+
176
+ Because of the Python object overhead involved in calling the Python
177
+ function, this will be fairly slow, but it will have the same
178
+ scaling as other distances.
179
+ """
68
180
@classmethod
69
181
def get_metric(cls, metric, dtype=np.float64, **kwargs):
70
182
"""Get the given distance metric from the string identifier.
@@ -74,11 +186,24 @@ cdef class DistanceMetric:
74
186
Parameters
75
187
----------
76
188
metric : str or class name
77
- The distance metric to use
189
+ The string identifier or class name of the desired distance metric.
190
+ See the documentation of the `DistanceMetric` class for a list of
191
+ available metrics.
192
+
78
193
dtype : {np.float32, np.float64}, default=np.float64
79
- The dtype of the data on which the metric will be applied
194
+ The data type of the input on which the metric will be applied.
195
+ This affects the precision of the computed distances.
196
+ By default, it is set to `np.float64`.
197
+
80
198
**kwargs
81
- additional arguments will be passed to the requested metric
199
+ Additional keyword arguments that will be passed to the requested metric.
200
+ These arguments can be used to customize the behavior of the specific
201
+ metric.
202
+
203
+ Returns
204
+ -------
205
+ metric_obj : instance of the requested metric
206
+ An instance of the requested distance metric class.
82
207
"""
83
208
if dtype == np.float32:
84
209
specialized_class = DistanceMetric32
0 commit comments