@@ -65,6 +65,118 @@ def get_valid_metric_ids(L):
6565 if (val.__name__ in L) or (val in L)]
6666
6767cdef class DistanceMetric:
68+ """Uniform interface for fast distance metric functions.
69+
70+ The `DistanceMetric` class provides a convenient way to compute pairwise distances
71+ between samples. It supports various distance metrics, such as Euclidean distance,
72+ Manhattan distance, and more.
73+
74+ The `pairwise` method can be used to compute pairwise distances between samples in
75+ the input arrays. It returns a distance matrix representing the distances between
76+ all pairs of samples.
77+
78+ The :meth:`get_metric` method allows you to retrieve a specific metric using its
79+ string identifier.
80+
81+ Examples
82+ --------
83+ >>> from sklearn.metrics import DistanceMetric
84+ >>> dist = DistanceMetric.get_metric('euclidean')
85+ >>> X = [[1, 2], [3, 4], [5, 6]]
86+ >>> Y = [[7, 8], [9, 10]]
87+ >>> dist.pairwise(X, Y)
88+ array([[ 8.48..., 11.31...],
89+        [ 5.65...,  8.48...],
90+        [ 2.82...,  5.65...]])
91+
92+ Available Metrics
93+
94+ The following lists the string metric identifiers and the associated
95+ distance metric classes:
96+
97+ **Metrics intended for real-valued vector spaces:**
98+
99+ ============== ==================== ======== ===============================
100+ identifier class name args distance function
101+ -------------- -------------------- -------- -------------------------------
102+ "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))``
103+ "manhattan" ManhattanDistance - ``sum(|x - y|)``
104+ "chebyshev" ChebyshevDistance - ``max(|x - y|)``
105+ "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)``
106+ "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))``
107+ "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))``
108+ ============== ==================== ======== ===============================
109+
110+ **Metrics intended for two-dimensional vector spaces:** Note that the haversine
111+ distance metric requires data in the form of [latitude, longitude] and both
112+ inputs and outputs are in units of radians.
113+
114+ ============ ================== ===============================================================
115+ identifier class name distance function
116+ ------------ ------------------ ---------------------------------------------------------------
117+ "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))``
118+ ============ ================== ===============================================================
119+
120+
121+ **Metrics intended for integer-valued vector spaces:** Though intended
122+ for integer-valued vectors, these are also valid metrics in the case of
123+ real-valued vectors.
124+
125+ ============= ==================== ========================================
126+ identifier class name distance function
127+ ------------- -------------------- ----------------------------------------
128+ "hamming" HammingDistance ``N_unequal(x, y) / N_tot``
129+ "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))``
130+ "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))``
131+ ============= ==================== ========================================
132+
133+ **Metrics intended for boolean-valued vector spaces:** Any nonzero entry
134+ is evaluated to "True". In the listings below, the following
135+ abbreviations are used:
136+
137+ - N : number of dimensions
138+ - NTT : number of dims in which both values are True
139+ - NTF : number of dims in which the first value is True, second is False
140+ - NFT : number of dims in which the first value is False, second is True
141+ - NFF : number of dims in which both values are False
142+ - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT
143+ - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT
144+
145+ ================= ======================= ===============================
146+ identifier class name distance function
147+ ----------------- ----------------------- -------------------------------
148+ "jaccard" JaccardDistance NNEQ / NNZ
149+ "matching" MatchingDistance NNEQ / N
150+ "dice" DiceDistance NNEQ / (NTT + NNZ)
151+ "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N)
152+ "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ)
153+ "russellrao" RussellRaoDistance (N - NTT) / N
154+ "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ)
155+ "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT)
156+ ================= ======================= ===============================
157+
158+ **User-defined distance:**
159+
160+ =========== =============== =======
161+ identifier class name args
162+ ----------- --------------- -------
163+ "pyfunc" PyFuncDistance func
164+ =========== =============== =======
165+
166+ Here ``func`` is a function which takes two one-dimensional numpy
167+ arrays, and returns a distance. Note that in order to be used within
168+ the BallTree, the distance must be a true metric,
169+ i.e. it must satisfy the following properties:
170+
171+ 1) Non-negativity: d(x, y) >= 0
172+ 2) Identity: d(x, y) = 0 if and only if x == y
173+ 3) Symmetry: d(x, y) = d(y, x)
174+ 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z)
175+
176+ Because of the Python object overhead involved in calling the Python
177+ function, this will be fairly slow, but it will have the same
178+ scaling as other distances.
179+ """
68180 @classmethod
69181 def get_metric(cls, metric, dtype=np.float64, **kwargs):
70182 """Get the given distance metric from the string identifier.
@@ -74,11 +186,24 @@ cdef class DistanceMetric:
74186 Parameters
75187 ----------
76188 metric : str or class name
77- The distance metric to use
189+ The string identifier or class name of the desired distance metric.
190+ See the documentation of the `DistanceMetric` class for a list of
191+ available metrics.
192+
78193 dtype : {np.float32, np.float64}, default=np.float64
79- The dtype of the data on which the metric will be applied
194+ The data type of the input on which the metric will be applied.
195+ This affects the precision of the computed distances.
196+ By default, it is set to `np.float64`.
197+
80198 **kwargs
81- additional arguments will be passed to the requested metric
199+ Additional keyword arguments that will be passed to the requested metric.
200+ These arguments can be used to customize the behavior of the specific
201+ metric.
202+
203+ Returns
204+ -------
205+ metric_obj : instance of the requested metric
206+ An instance of the requested distance metric class.
82207 """
83208 if dtype == np.float32:
84209 specialized_class = DistanceMetric32
0 commit comments