From 831da32936691420d38e203eb393ca2d9a773163 Mon Sep 17 00:00:00 2001 From: kyrajeep <26465428+kyrajeep@users.noreply.github.com> Date: Thu, 20 Jun 2024 15:10:04 -0400 Subject: [PATCH 1/3] Draft to optimize computation for the distance between two points --- sklearn/neighbors/_graph.pyx | 103 +++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 sklearn/neighbors/_graph.pyx diff --git a/sklearn/neighbors/_graph.pyx b/sklearn/neighbors/_graph.pyx new file mode 100644 index 0000000000000..d8bd12fd93c92 --- /dev/null +++ b/sklearn/neighbors/_graph.pyx @@ -0,0 +1,103 @@ +# Author: Jee Won (Kyra) Park, partly generated by gemini & copilot. +cimport cython +from libc.math cimport sqrt + +''' +This function takes two 1D numpy arrays as input, representing +the coordinates of two points. It computes the square of the +distance between these points in a loop, which is faster +than using numpy's built-in functions due to the overhead +of function calls in Python. The square root of the total +square distance is then returned as the Euclidean distance. + +The decorators @cython.boundscheck(False) and @cython.wraparound(False) +are used to disable bounds checking and negative indexing, +respectively, which can speed up the code when you're sure +that your indices are always valid. TODO: this should probably +be changed to use check_array(). The nogil keyword is used +to allow this function to be called in a multi-threaded context. +''' + +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. +cdef double compute_distance(double[:] point1, double[:] point2) nogil: + cdef: + int dim = point1.shape[0] + int i + double sqdist = 0.0 + + for i in range(dim): + sqdist += (point1[i] - point2[i]) ** 2 + + return sqrt(sqdist) + + + + + + +import numpy as np +cimport cython + +cdef extern from "math": + double sqrt(double x) + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.initialized + +def cdist(double[:, ::1] X, double[:, ::1] Y): + """ + Cython optimized distance calculation between two sets of points. + + Args: + X: A 2D numpy array of shape (n_samples1, n_features) representing the first set of points. + Y: A 2D numpy array of shape (n_samples2, n_features) representing the second set of points. + + Returns: + A 2D numpy array of shape (n_samples1, n_samples2) containing the pairwise distances between points in X and Y. + """ + cdef int n_samples1, n_samples2, n_features + cdef double[:] distances + cdef int i, j, f + + n_samples1 = X.shape[0] + n_samples2 = Y.shape[0] + n_features = X.shape[1] + + distances = np.zeros(shape=(n_samples1, n_samples2), dtype=np.float64) + + # Loop through each point in X + for i in range(n_samples1): + # Loop through each point in Y + for j in range(n_samples2): + # Calculate squared distance for efficiency + for f in range(n_features): + distances[i, j] += (X[i, f] - Y[j, f]) ** 2 + + # Take the square root only if necessary (e.g., Euclidean distance) + distances[i, j] = sqrt(distances[i, j]) + + return distances + +def radius_neighbors_graph_cython(X, radius, *, mode="connectivity", + metric="minkowski", p=2, + metric_params=None, include_self=False, + n_jobs=None): + """ + Compute the (weighted) graph of Neighbors for points in X using Cython optimized distance calculation. + + This function replicates the functionality of scikit-learn's radius_neighbors_graph + but uses Cython for faster distance computations. + + Refer to the original radius_neighbors_graph documentation for details on parameters. + + Returns: + A sparse matrix of shape (n_samples, n_samples) + """ + # Delegate remaining logic (neighbor search, graph construction) to original implementation + # but replace the distance calculation with the optimized Cython function + knn = NearestNeighbors(radius=radius, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs) + knn.fit(X) + query = _query_include_self(knn._fit_X, include_self, mode) + return knn.radius_neighbors_graph(query, radius, mode, distance_func=cdist) From f1e801c3a8ff30c94d245cfb9b985cc3b2cccac2 Mon Sep 17 00:00:00 2001 From: kyrajeep <26465428+kyrajeep@users.noreply.github.com> Date: Tue, 2 Jul 2024 13:14:35 -0400 Subject: [PATCH 2/3] Add the option to bypass test for precomputed --- sklearn/neighbors/tests/test_ball_tree.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 5263f201f320b..5f244c272d773 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -20,6 +20,7 @@ "manhattan": {}, "minkowski": dict(p=3), "chebyshev": {}, + "precomputed": {}, } DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] @@ -41,6 +42,9 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): from sklearn.metrics import DistanceMetric + + if metric == "precomputed": + return Y, np.argsort(Y, axis=1)[:, :k] X, Y = check_array(X), check_array(Y) D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) @@ -73,7 +77,8 @@ def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) - + + @pytest.mark.parametrize( "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) From a91295435f415ae793c7e89429c216f801fd1227 Mon Sep 17 00:00:00 2001 From: kyrajeep <26465428+kyrajeep@users.noreply.github.com> Date: Tue, 2 Jul 2024 13:15:55 -0400 Subject: [PATCH 3/3] Delete unrelated --- sklearn/neighbors/_graph.pyx | 103 ----------------------------------- 1 file changed, 103 deletions(-) delete mode 100644 sklearn/neighbors/_graph.pyx diff --git a/sklearn/neighbors/_graph.pyx b/sklearn/neighbors/_graph.pyx deleted file mode 100644 index d8bd12fd93c92..0000000000000 --- a/sklearn/neighbors/_graph.pyx +++ /dev/null @@ -1,103 +0,0 @@ -# Author: Jee Won (Kyra) Park, partly generated by gemini & copilot. -cimport cython -from libc.math cimport sqrt - -''' -This function takes two 1D numpy arrays as input, representing -the coordinates of two points. It computes the square of the -distance between these points in a loop, which is faster -than using numpy's built-in functions due to the overhead -of function calls in Python. The square root of the total -square distance is then returned as the Euclidean distance. - -The decorators @cython.boundscheck(False) and @cython.wraparound(False) -are used to disable bounds checking and negative indexing, -respectively, which can speed up the code when you're sure -that your indices are always valid. TODO: this should probably -be changed to use check_array(). The nogil keyword is used -to allow this function to be called in a multi-threaded context. -''' - -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. -cdef double compute_distance(double[:] point1, double[:] point2) nogil: - cdef: - int dim = point1.shape[0] - int i - double sqdist = 0.0 - - for i in range(dim): - sqdist += (point1[i] - point2[i]) ** 2 - - return sqrt(sqdist) - - - - - - -import numpy as np -cimport cython - -cdef extern from "math": - double sqrt(double x) - -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initialized - -def cdist(double[:, ::1] X, double[:, ::1] Y): - """ - Cython optimized distance calculation between two sets of points. - - Args: - X: A 2D numpy array of shape (n_samples1, n_features) representing the first set of points. - Y: A 2D numpy array of shape (n_samples2, n_features) representing the second set of points. - - Returns: - A 2D numpy array of shape (n_samples1, n_samples2) containing the pairwise distances between points in X and Y. - """ - cdef int n_samples1, n_samples2, n_features - cdef double[:] distances - cdef int i, j, f - - n_samples1 = X.shape[0] - n_samples2 = Y.shape[0] - n_features = X.shape[1] - - distances = np.zeros(shape=(n_samples1, n_samples2), dtype=np.float64) - - # Loop through each point in X - for i in range(n_samples1): - # Loop through each point in Y - for j in range(n_samples2): - # Calculate squared distance for efficiency - for f in range(n_features): - distances[i, j] += (X[i, f] - Y[j, f]) ** 2 - - # Take the square root only if necessary (e.g., Euclidean distance) - distances[i, j] = sqrt(distances[i, j]) - - return distances - -def radius_neighbors_graph_cython(X, radius, *, mode="connectivity", - metric="minkowski", p=2, - metric_params=None, include_self=False, - n_jobs=None): - """ - Compute the (weighted) graph of Neighbors for points in X using Cython optimized distance calculation. - - This function replicates the functionality of scikit-learn's radius_neighbors_graph - but uses Cython for faster distance computations. - - Refer to the original radius_neighbors_graph documentation for details on parameters. - - Returns: - A sparse matrix of shape (n_samples, n_samples) - """ - # Delegate remaining logic (neighbor search, graph construction) to original implementation - # but replace the distance calculation with the optimized Cython function - knn = NearestNeighbors(radius=radius, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs) - knn.fit(X) - query = _query_include_self(knn._fit_X, include_self, mode) - return knn.radius_neighbors_graph(query, radius, mode, distance_func=cdist)