MNT Move `DistanceMetric` under `metrics` (#21177) · scikit-learn/scikit-learn@6b332d9 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6b332d9

Browse files
authored
MNT Move DistanceMetric under metrics (#21177)
1 parent 1cab25c commit 6b332d9

27 files changed

+152
-107
lines changed

doc/glossary.rst

+3-4
Original file line numberDiff line numberDiff line change
@@ -644,9 +644,8 @@ General Concepts
644644

645645
Note that for most distance metrics, we rely on implementations from
646646
:mod:`scipy.spatial.distance`, but may reimplement for efficiency in
647-
our context. The :mod:`neighbors` module also duplicates some metric
648-
implementations for integration with efficient binary tree search data
649-
structures.
647+
our context. The :class:`metrics.DistanceMetric` interface is used to implement
648+
distance metrics for integration with efficient neighbors search.
650649

651650
pd
652651
A shorthand for `Pandas <https://pandas.pydata.org>`_ due to the
@@ -1023,7 +1022,7 @@ such as:
10231022

10241023
Further examples:
10251024

1026-
* :class:`neighbors.DistanceMetric`
1025+
* :class:`metrics.DistanceMetric`
10271026
* :class:`gaussian_process.kernels.Kernel`
10281027
* ``tree.Criterion``
10291028

doc/modules/classes.rst

+10-1
Original file line numberDiff line numberDiff line change
@@ -1058,6 +1058,16 @@ further details.
10581058

10591059
metrics.consensus_score
10601060

1061+
Distance metrics
1062+
----------------
1063+
1064+
.. currentmodule:: sklearn
1065+
1066+
.. autosummary::
1067+
:toctree: generated/
1068+
:template: class.rst
1069+
1070+
metrics.DistanceMetric
10611071

10621072
Pairwise metrics
10631073
----------------
@@ -1317,7 +1327,6 @@ Model validation
13171327
:template: class.rst
13181328

13191329
neighbors.BallTree
1320-
neighbors.DistanceMetric
13211330
neighbors.KDTree
13221331
neighbors.KernelDensity
13231332
neighbors.KNeighborsClassifier

doc/modules/density.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,9 @@ The form of these kernels is as follows:
136136
:math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h`
137137

138138
The kernel density estimator can be used with any of the valid distance
139-
metrics (see :class:`~sklearn.neighbors.DistanceMetric` for a list of available metrics), though
140-
the results are properly normalized only for the Euclidean metric. One
141-
particularly useful metric is the
139+
metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of
140+
available metrics), though the results are properly normalized only
141+
for the Euclidean metric. One particularly useful metric is the
142142
`Haversine distance <https://en.wikipedia.org/wiki/Haversine_formula>`_
143143
which measures the angular distance between points on a sphere. Here
144144
is an example of using a kernel density estimate for a visualization

doc/whats_new/v1.1.rst

+9
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,15 @@ Changelog
6060
message when the solver does not support sparse matrices with int64 indices.
6161
:pr:`21093` by `Tom Dupre la Tour`_.
6262

63+
:mod:`sklearn.metrics`
64+
......................
65+
66+
- |API| :class:`metrics.DistanceMetric` has been moved from
67+
:mod:`sklearn.neighbors` to :mod:`sklearn.metrics`.
68+
Using `neighbors.DistanceMetric` for imports is still valid for
69+
backward compatibility, but this alias will be removed in 1.3.
70+
:pr:`21177` by :user:`Julien Jerphanion <jjerphan>`.
71+
6372
:mod:`sklearn.model_selection`
6473
..............................
6574

sklearn/cluster/_agglomerative.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616

1717
from ..base import BaseEstimator, ClusterMixin
1818
from ..metrics.pairwise import paired_distances
19-
from ..neighbors import DistanceMetric
20-
from ..neighbors._dist_metrics import METRIC_MAPPING
19+
from ..metrics import DistanceMetric
20+
from ..metrics._dist_metrics import METRIC_MAPPING
2121
from ..utils import check_array
2222
from ..utils._fast_dict import IntFloatDict
2323
from ..utils.fixes import _astype_copy_false

sklearn/cluster/_hierarchical_fast.pyx

+7-8
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ ctypedef np.int8_t INT8
1313

1414
np.import_array()
1515

16-
from ..neighbors._dist_metrics cimport DistanceMetric
16+
from ..metrics._dist_metrics cimport DistanceMetric
1717
from ..utils._fast_dict cimport IntFloatDict
1818

1919
# C++
@@ -236,8 +236,8 @@ def max_merge(IntFloatDict a, IntFloatDict b,
236236
def average_merge(IntFloatDict a, IntFloatDict b,
237237
np.ndarray[ITYPE_t, ndim=1] mask,
238238
ITYPE_t n_a, ITYPE_t n_b):
239-
"""Merge two IntFloatDicts with the average strategy: when the
240-
same key is present in the two dicts, the weighted average of the two
239+
"""Merge two IntFloatDicts with the average strategy: when the
240+
same key is present in the two dicts, the weighted average of the two
241241
values is used.
242242
243243
Parameters
@@ -290,13 +290,13 @@ def average_merge(IntFloatDict a, IntFloatDict b,
290290

291291

292292
###############################################################################
293-
# An edge object for fast comparisons
293+
# An edge object for fast comparisons
294294

295295
cdef class WeightedEdge:
296296
cdef public ITYPE_t a
297297
cdef public ITYPE_t b
298298
cdef public DTYPE_t weight
299-
299+
300300
def __init__(self, DTYPE_t weight, ITYPE_t a, ITYPE_t b):
301301
self.weight = weight
302302
self.a = a
@@ -326,7 +326,7 @@ cdef class WeightedEdge:
326326
return self.weight > other.weight
327327
elif op == 5:
328328
return self.weight >= other.weight
329-
329+
330330
def __repr__(self):
331331
return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__,
332332
self.weight,
@@ -475,7 +475,7 @@ def mst_linkage_core(
475475
476476
dist_metric: DistanceMetric
477477
A DistanceMetric object conforming to the API from
478-
``sklearn.neighbors._dist_metrics.pxd`` that will be
478+
``sklearn.metrics._dist_metrics.pxd`` that will be
479479
used to compute distances.
480480
481481
Returns
@@ -534,4 +534,3 @@ def mst_linkage_core(
534534
current_node = new_node
535535

536536
return np.array(result)
537-

sklearn/cluster/tests/test_hierarchical.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from scipy.sparse.csgraph import connected_components
1818

1919
from sklearn.metrics.cluster import adjusted_rand_score
20-
from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
20+
from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
2121
from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data
2222
from sklearn.utils._testing import assert_array_almost_equal
2323
from sklearn.utils._testing import ignore_warnings
@@ -31,14 +31,15 @@
3131
_fix_connectivity,
3232
)
3333
from sklearn.feature_extraction.image import grid_to_graph
34+
from sklearn.metrics import DistanceMetric
3435
from sklearn.metrics.pairwise import (
3536
PAIRED_DISTANCES,
3637
cosine_distances,
3738
manhattan_distances,
3839
pairwise_distances,
3940
)
4041
from sklearn.metrics.cluster import normalized_mutual_info_score
41-
from sklearn.neighbors import kneighbors_graph, DistanceMetric
42+
from sklearn.neighbors import kneighbors_graph
4243
from sklearn.cluster._hierarchical_fast import (
4344
average_merge,
4445
max_merge,

sklearn/metrics/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
from ._classification import brier_score_loss
3737
from ._classification import multilabel_confusion_matrix
3838

39+
from ._dist_metrics import DistanceMetric
40+
3941
from . import cluster
4042
from .cluster import adjusted_mutual_info_score
4143
from .cluster import adjusted_rand_score
@@ -115,6 +117,7 @@
115117
"davies_bouldin_score",
116118
"DetCurveDisplay",
117119
"det_curve",
120+
"DistanceMetric",
118121
"euclidean_distances",
119122
"explained_variance_score",
120123
"f1_score",

sklearn/neighbors/_dist_metrics.pxd renamed to sklearn/metrics/_dist_metrics.pxd

+7-9
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
1-
#!python
2-
#cython: boundscheck=False
3-
#cython: wraparound=False
4-
#cython: cdivision=True
1+
# cython: boundscheck=False
2+
# cython: cdivision=True
3+
# cython: initializedcheck=False
4+
# cython: wraparound=False
55

6-
cimport cython
76
cimport numpy as np
8-
from libc.math cimport fabs, sqrt, exp, cos, pow
7+
from libc.math cimport sqrt, exp
98

10-
from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t
11-
from ._typedefs import DTYPE, ITYPE
9+
from ..utils._typedefs cimport DTYPE_t, ITYPE_t
1210

1311
######################################################################
1412
# Inline distance functions
@@ -60,7 +58,7 @@ cdef class DistanceMetric:
6058
cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
6159
ITYPE_t size) nogil except -1
6260

63-
cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
61+
cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
6462
ITYPE_t size) nogil except -1
6563

6664
cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1

sklearn/neighbors/_dist_metrics.pyx renamed to sklearn/metrics/_dist_metrics.pyx

+25-29
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
#!python
2-
#cython: boundscheck=False
3-
#cython: wraparound=False
4-
#cython: initializedcheck=False
5-
#cython: cdivision=True
1+
# cython: boundscheck=False
2+
# cython: cdivision=True
3+
# cython: initializedcheck=False
4+
# cython: wraparound=False
65

76
# By Jake Vanderplas (2013) <jakevdp@cs.washington.edu>
87
# written for the scikit-learn project
@@ -19,7 +18,7 @@ cdef extern from "arrayobject.h":
1918
int typenum, void* data)
2019

2120

22-
cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n):
21+
cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n):
2322
# Wrap a memory buffer with an ndarray. Warning: this is not robust.
2423
# In particular, if x is deallocated before the returned array goes
2524
# out of scope, this could cause memory errors. Since there is not
@@ -33,8 +32,8 @@ cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n):
3332
from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
3433
cdef DTYPE_t INF = np.inf
3534

36-
from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
37-
from ._typedefs import DTYPE, ITYPE
35+
from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
36+
from ..utils._typedefs import DTYPE, ITYPE
3837

3938

4039
######################################################################
@@ -98,7 +97,7 @@ cdef class DistanceMetric:
9897
9998
Examples
10099
--------
101-
>>> from sklearn.neighbors import DistanceMetric
100+
>>> from sklearn.metrics import DistanceMetric
102101
>>> dist = DistanceMetric.get_metric('euclidean')
103102
>>> X = [[0, 1, 2],
104103
[3, 4, 5]]
@@ -291,14 +290,13 @@ cdef class DistanceMetric:
291290

292291
cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
293292
ITYPE_t size) nogil except -1:
294-
"""Compute the reduced distance between vectors x1 and x2.
293+
"""Compute the rank-preserving surrogate distance between vectors x1 and x2.
295294
296295
This can optionally be overridden in a subclass.
297296
298-
The reduced distance is any measure that yields the same rank as the
299-
distance, but is more efficient to compute. For example, for the
300-
Euclidean metric, the reduced distance is the squared-euclidean
301-
distance.
297+
The rank-preserving surrogate distance is any measure that yields the same
298+
rank as the distance, but is more efficient to compute. For example, for the
299+
Euclidean metric, the surrogate distance is the squared-euclidean distance.
302300
"""
303301
return self.dist(x1, x2, size)
304302

@@ -323,25 +321,24 @@ cdef class DistanceMetric:
323321
return 0
324322

325323
cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
326-
"""Convert the reduced distance to the distance"""
324+
"""Convert the rank-preserving surrogate distance to the distance"""
327325
return rdist
328326

329327
cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
330-
"""Convert the distance to the reduced distance"""
328+
"""Convert the distance to the rank-preserving surrogate distance"""
331329
return dist
332330

333331
def rdist_to_dist(self, rdist):
334-
"""Convert the Reduced distance to the true distance.
332+
"""Convert the rank-preserving surrogate distance to the distance.
335333
336-
The reduced distance, defined for some metrics, is a computationally
337-
more efficient measure which preserves the rank of the true distance.
338-
For example, in the Euclidean distance metric, the reduced distance
339-
is the squared-euclidean distance.
334+
The surrogate distance is any measure that yields the same rank as the
335+
distance, but is more efficient to compute. For example, for the
336+
Euclidean metric, the surrogate distance is the squared-euclidean distance.
340337
341338
Parameters
342339
----------
343340
rdist : double
344-
Reduced distance.
341+
Surrogate distance.
345342
346343
Returns
347344
-------
@@ -351,12 +348,11 @@ cdef class DistanceMetric:
351348
return rdist
352349

353350
def dist_to_rdist(self, dist):
354-
"""Convert the true distance to the reduced distance.
351+
"""Convert the true distance to the rank-preserving surrogate distance.
355352
356-
The reduced distance, defined for some metrics, is a computationally
357-
more efficient measure which preserves the rank of the true distance.
358-
For example, in the Euclidean distance metric, the reduced distance
359-
is the squared-euclidean distance.
353+
The surrogate distance is any measure that yields the same rank as the
354+
distance, but is more efficient to compute. For example, for the
355+
Euclidean metric, the surrogate distance is the squared-euclidean distance.
360356
361357
Parameters
362358
----------
@@ -366,7 +362,7 @@ cdef class DistanceMetric:
366362
Returns
367363
-------
368364
double
369-
Reduced distance.
365+
Surrogate distance.
370366
"""
371367
return dist
372368

@@ -519,7 +515,7 @@ cdef class ChebyshevDistance(DistanceMetric):
519515
520516
Examples
521517
--------
522-
>>> from sklearn.neighbors.dist_metrics import DistanceMetric
518+
>>> from sklearn.metrics import DistanceMetric
523519
>>> dist = DistanceMetric.get_metric('chebyshev')
524520
>>> X = [[0, 1, 2],
525521
... [3, 4, 5]]

sklearn/metrics/pairwise.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -780,7 +780,7 @@ def haversine_distances(X, Y=None):
780780
array([[ 0. , 11099.54035582],
781781
[11099.54035582, 0. ]])
782782
"""
783-
from ..neighbors import DistanceMetric
783+
from ..metrics import DistanceMetric
784784

785785
return DistanceMetric.get_metric("haversine").pairwise(X, Y)
786786

sklearn/metrics/setup.py

+8
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import numpy as np
23

34
from numpy.distutils.misc_util import Configuration
45

@@ -18,6 +19,13 @@ def configuration(parent_package="", top_path=None):
1819
"_pairwise_fast", sources=["_pairwise_fast.pyx"], libraries=libraries
1920
)
2021

22+
config.add_extension(
23+
"_dist_metrics",
24+
sources=["_dist_metrics.pyx"],
25+
include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")],
26+
libraries=libraries,
27+
)
28+
2129
config.add_subpackage("tests")
2230

2331
return config

0 commit comments

Comments
 (0)
0