diff --git a/.gitignore b/.gitignore index 76f8a60158209..3e5e4c9795358 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,6 @@ sklearn/neighbors/_kd_tree.pyx # Default JupyterLite content jupyterlite_contents + +#SIMD +!sklearn/metrics/_simd/* diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..6efb282cbe34f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "xsimd"] + path = xsimd + url = https://github.com/xtensor-stack/xsimd diff --git a/setup.py b/setup.py index f9ae13c94502b..474ed7b5a2f5d 100755 --- a/setup.py +++ b/setup.py @@ -11,6 +11,8 @@ import sys import traceback from os.path import join +from pathlib import Path +from textwrap import dedent from setuptools import Command, Extension, setup from setuptools.command.build_ext import build_ext @@ -28,7 +30,6 @@ # away from numpy.distutils? builtins.__SKLEARN_SETUP__ = True - DISTNAME = "scikit-learn" DESCRIPTION = "A set of python modules for machine learning and data mining" with open("README.rst") as f: @@ -43,6 +44,7 @@ "Documentation": "https://scikit-learn.org/stable/documentation.html", "Source Code": "https://github.com/scikit-learn/scikit-learn", } +SIMD_DIRECTORY = join("sklearn", "metrics", "_simd") # We can actually import a restricted version of sklearn that # does not need the compiled code @@ -50,6 +52,27 @@ import sklearn._min_dependencies as min_deps # noqa from sklearn._build_utils import _check_cython_version # noqa from sklearn.externals._packaging.version import parse as parse_version # noqa +from sklearn._build_utils.pre_build_helpers import compile_test_program # noqa + +runtime_check_program = dedent(""" + #include "xsimd.hpp" + #include + int main(){ + std::cout << xsimd::available_architectures().avx; + return 0; + } + """) +# XXX: Can we do this without an absolute path? +dir_path = os.path.dirname(os.path.realpath(__file__)) +xsimd_include_path = Path(dir_path, "xsimd", "include", "xsimd") +HAS_AVX_RUNTIME = int( + compile_test_program( + runtime_check_program, + extra_preargs=["-lstdc++"], + extra_postargs=[f"-I{xsimd_include_path}"], + extension="cpp", + )[0] +) VERSION = sklearn.__version__ @@ -252,8 +275,23 @@ def check_package_status(package, min_version): "metrics": [ {"sources": ["_pairwise_fast.pyx"], "include_np": True}, { - "sources": ["_dist_metrics.pyx.tp", "_dist_metrics.pxd.tp"], + "sources": [ + "_dist_metrics.pyx.tp", + "_dist_metrics.pxd.tp", + join("_simd", "_dist_optim.cpp"), + ], "include_np": True, + "language": "c++", + "extra_compile_args": ["-std=c++11"], + "define_macros": [("DIST_METRICS", None)], + "include_dirs": [join("..", "..", "xsimd", "include", "xsimd")], + }, + { + "sources": ["_runtime_check.pyx"], + "language": "c++", + "include_np": True, + "extra_compile_args": ["-std=c++11"], + "include_dirs": [join("..", "..", "xsimd", "include", "xsimd")], }, ], "metrics.cluster": [ @@ -471,6 +509,23 @@ def configure_extension_modules(): build_with_debug_symbols = ( os.environ.get("SKLEARN_BUILD_ENABLE_DEBUG_SYMBOLS", "0") != "0" ) + BUILD_WITH_SIMD = os.environ.get("SKLEARN_NO_SIMD", "0") == "0" and HAS_AVX_RUNTIME + if BUILD_WITH_SIMD: + libraries.append( + ( + "avx_dist_metrics", + { + "language": "c++", + "sources": [join(SIMD_DIRECTORY, "simd.cpp")], + "cflags": ["-std=c++14", "-mavx"], + "extra_link_args": ["-std=c++14"], + "include_dirs": [join("xsimd", "include", "xsimd")], + }, + ), + ) + extension_config["metrics"][1]["define_macros"].append( + ("WITH_SIMD", BUILD_WITH_SIMD) + ) if os.name == "posix": if build_with_debug_symbols: default_extra_compile_args.append("-g") @@ -538,13 +593,13 @@ def configure_extension_modules(): optimization_level = extension.get( "optimization_level", default_optimization_level ) + define_macros = extension.get("define_macros", []) if os.name == "posix": extra_compile_args.append(f"-{optimization_level}") else: extra_compile_args.append(f"/{optimization_level}") libraries_ext = extension.get("libraries", []) + default_libraries - new_ext = Extension( name=name, sources=sources, @@ -554,6 +609,7 @@ def configure_extension_modules(): depends=depends, extra_link_args=extension.get("extra_link_args", None), extra_compile_args=extra_compile_args, + define_macros=define_macros, ) cython_exts.append(new_ext) diff --git a/sklearn/_build_utils/pre_build_helpers.py b/sklearn/_build_utils/pre_build_helpers.py index f3eb054bb037e..89f9b7af94aa9 100644 --- a/sklearn/_build_utils/pre_build_helpers.py +++ b/sklearn/_build_utils/pre_build_helpers.py @@ -10,7 +10,7 @@ from setuptools.command.build_ext import customize_compiler, new_compiler -def compile_test_program(code, extra_preargs=None, extra_postargs=None): +def compile_test_program(code, extra_preargs=None, extra_postargs=None, extension="c"): """Check that some C code can be compiled and run""" ccompiler = new_compiler() customize_compiler(ccompiler) @@ -22,14 +22,16 @@ def compile_test_program(code, extra_preargs=None, extra_postargs=None): os.chdir(tmp_dir) # Write test program - with open("test_program.c", "w") as f: + with open(f"test_program.{extension}", "w") as f: f.write(code) os.mkdir("objects") # Compile, test program ccompiler.compile( - ["test_program.c"], output_dir="objects", extra_postargs=extra_postargs + [f"test_program.{extension}"], + output_dir="objects", + extra_postargs=extra_postargs, ) # Link test program diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 488c776ae9a86..3ed7a792fd737 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -63,6 +63,7 @@ median_absolute_error, r2_score, ) +from ._runtime_check import get_available_architectures from ._scorer import check_scoring, get_scorer, get_scorer_names, make_scorer from .cluster import ( adjusted_mutual_info_score, @@ -123,6 +124,7 @@ "f1_score", "fbeta_score", "fowlkes_mallows_score", + "get_available_architectures", "get_scorer", "hamming_loss", "hinge_loss", diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 313225088c776..c25e21a588e74 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -16,6 +16,11 @@ from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t cdef class DistanceMetric: pass +cdef extern from "_simd/_dist_optim.cpp": + cdef Type xsimd_manhattan_dist[Type](Type * x, Type * y, intp_t size) nogil + cdef Type xsimd_manhattan_dist_scalar[Type](Type * x, Type * y, intp_t size) nogil + bint WITH_SIMD + {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} ###################################################################### @@ -70,6 +75,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef intp_t size cdef object func cdef object kwargs + cdef bint simd_runtime cdef {{INPUT_DTYPE_t}} dist( self, diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 6b5ea300f038b..7c0db241ec624 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -24,7 +24,7 @@ from scipy.sparse import csr_matrix, issparse from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t from ..utils import check_array from ..utils.fixes import parse_version, sp_base_version - +from ._runtime_check import get_available_architectures cdef inline double fmax(double a, double b) noexcept nogil: return max(a, b) @@ -837,7 +837,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): intp_t i1, i2 intp_t x1_start, x1_end - {{INPUT_DTYPE_t}} * x2_data + const {{INPUT_DTYPE_t}} * x2_data with nogil: # Use the exact same adaptation for CSR than in SparseDenseDatasetsPair @@ -901,7 +901,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') intp_t i1, i2 - {{INPUT_DTYPE_t}} * x1_data + const {{INPUT_DTYPE_t}} * x1_data intp_t x2_start, x2_end @@ -1083,7 +1083,8 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const intp_t size, ) except -1 nogil: return sqrt( - self.rdist_csr( + EuclideanDistance{{name_suffix}}.rdist_csr( + self, x1_data, x1_indices, x2_data, @@ -1132,7 +1133,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: - return sqrt(self.rdist(x1, x2, size)) + return sqrt(SEuclideanDistance{{name_suffix}}.rdist(self, x1, x2, size)) cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) @@ -1212,7 +1213,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const intp_t size, ) except -1 nogil: return sqrt( - self.rdist_csr( + SEuclideanDistance{{name_suffix}}.rdist_csr( + self, x1_data, x1_indices, x2_data, @@ -1235,6 +1237,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ def __init__(self): self.p = 1 + self.simd_runtime = get_available_architectures()["avx"] and WITH_SIMD cdef inline {{INPUT_DTYPE_t}} dist( self, @@ -1242,11 +1245,10 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: - cdef float64_t d = 0 - cdef intp_t j - for j in range(size): - d += fabs(x1[j] - x2[j]) - return d + if self.simd_runtime: + return xsimd_manhattan_dist[{{INPUT_DTYPE_t}}](x1, x2, size) + else: + return xsimd_manhattan_dist_scalar[{{INPUT_DTYPE_t}}](x1, x2, size) cdef inline {{INPUT_DTYPE_t}} dist_csr( self, @@ -1463,7 +1465,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: - return pow(self.rdist(x1, x2, size), 1. / self.p) + return pow(MinkowskiDistance{{name_suffix}}.rdist(self, x1, x2, size), 1. / self.p) cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return pow(rdist, 1. / self.p) @@ -1570,7 +1572,8 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const intp_t size, ) except -1 nogil: return pow( - self.rdist_csr( + MinkowskiDistance{{name_suffix}}.rdist_csr( + self, x1_data, x1_indices, x2_data, @@ -1655,7 +1658,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: - return sqrt(self.rdist(x1, x2, size)) + return sqrt(MahalanobisDistance{{name_suffix}}.rdist(self, x1, x2, size)) cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) @@ -1736,7 +1739,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const intp_t size, ) except -1 nogil: return sqrt( - self.rdist_csr( + MahalanobisDistance{{name_suffix}}.rdist_csr( + self, x1_data, x1_indices, x2_data, @@ -2641,7 +2645,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: - return 2 * asin(sqrt(self.rdist(x1, x2, size))) + return 2 * asin(sqrt(HaversineDistance{{name_suffix}}.rdist(self, x1, x2, size))) cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return 2 * asin(sqrt(rdist)) @@ -2669,7 +2673,8 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const int32_t x2_end, const intp_t size, ) except -1 nogil: - return 2 * asin(sqrt(self.rdist_csr( + return 2 * asin(sqrt(HaversineDistance{{name_suffix}}.rdist_csr( + self, x1_data, x1_indices, x2_data, @@ -2779,7 +2784,7 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: - return self._dist(x1, x2, size) + return PyFuncDistance{{name_suffix}}._dist(self, x1, x2, size) cdef inline {{INPUT_DTYPE_t}} _dist( self, diff --git a/sklearn/metrics/_runtime_check.pyx b/sklearn/metrics/_runtime_check.pyx new file mode 100644 index 0000000000000..5e6557e92fa6c --- /dev/null +++ b/sklearn/metrics/_runtime_check.pyx @@ -0,0 +1,28 @@ +cdef extern from "xsimd.hpp" namespace "xsimd::detail": + ctypedef struct supported_arch: + unsigned int sse2 + unsigned int sse3 + unsigned int ssse3 + unsigned int sse4_1 + unsigned int sse4_2 + unsigned int sse4a + unsigned int fma3_sse + unsigned int fma4 + unsigned int xop + unsigned int avx + unsigned int fma3_avx + unsigned int avx2 + unsigned int fma3_avx2 + unsigned int avx512f + unsigned int avx512cd + unsigned int avx512dq + unsigned int avx512bw + unsigned int neon + unsigned int neon64 + unsigned int sve + +cdef extern from "xsimd.hpp" namespace "xsimd": + cdef supported_arch available_architectures() noexcept nogil + +cpdef get_available_architectures(): + return available_architectures() diff --git a/sklearn/metrics/_simd/_dist_optim.cpp b/sklearn/metrics/_simd/_dist_optim.cpp new file mode 100644 index 0000000000000..80f8e766081ab --- /dev/null +++ b/sklearn/metrics/_simd/_dist_optim.cpp @@ -0,0 +1,35 @@ +/* +Only provide a body upon inclusion if we are being included in _dist_metrics.pyx +otherwise we remain empty so as to bypass Cython's forced inclusion of +this file due to cimporting _dist_metrics +*/ +#ifdef DIST_METRICS + +/* If built with SIMD support, include the compiled library code */ +#if WITH_SIMD == 1 +#include "simd.hpp" +#else +#include + +/* Else, we provide trivial functions for compilation */ +template +Type xsimd_manhattan_dist(const Type* a, const Type* b, const std::size_t size){return -1;} +#endif + +/* +In case of a runtime machine without AVX, we need to +provide alternative scalar implementations. +*/ +template +Type xsimd_manhattan_dist_scalar(const Type* a, const Type* b, const std::size_t size){ + double scalar_sum = 0; + + for(std::size_t idx = 0; idx < size; ++idx) { + scalar_sum += fabs(a[idx] - b[idx]); + } + return (Type) scalar_sum; +} + +#else +/* Empty body */ +#endif diff --git a/sklearn/metrics/_simd/simd.cpp b/sklearn/metrics/_simd/simd.cpp new file mode 100644 index 0000000000000..64d386dd15abe --- /dev/null +++ b/sklearn/metrics/_simd/simd.cpp @@ -0,0 +1,7 @@ +#include "simd.hpp" +/* +Externed declarations to provide separate translation unit for compilation +with specific feature flags (in this case -mavx) +*/ +template float xsimd_manhattan_dist(const float* a, const float* b, const std::size_t size); +template double xsimd_manhattan_dist(const double* a, const double* b, const std::size_t size); diff --git a/sklearn/metrics/_simd/simd.hpp b/sklearn/metrics/_simd/simd.hpp new file mode 100644 index 0000000000000..d99a946948b07 --- /dev/null +++ b/sklearn/metrics/_simd/simd.hpp @@ -0,0 +1,57 @@ +#include "xsimd.hpp" + +namespace xs = xsimd; + +/*Manhattan Distance*/ +template +Type xsimd_manhattan_dist(const Type* a, const Type* b, const std::size_t size){ + using batch_type = xs::batch; + + // Unroll explicitly for pipeline optimization + batch_type simd_x_0; + batch_type simd_y_0; + batch_type sum_0 = batch_type::broadcast((Type) 0.); + + batch_type simd_x_1; + batch_type simd_y_1; + batch_type sum_1 = batch_type::broadcast((Type) 0.); + // End unrolled + + // Begin VECTOR LOOP + std::size_t inc = batch_type::size; + std::size_t loop_iter = inc * 2; + std::size_t vec_size = size - size % loop_iter; + std::size_t vec_remainder_size = size - size % inc; + for(std::size_t idx = 0; idx < vec_size; idx += loop_iter) { + // Begin unrolled + simd_x_0 = batch_type::load_unaligned(&a[idx + inc * 0]); + simd_y_0 = batch_type::load_unaligned(&b[idx + inc * 0]); + sum_0 += xs::fabs(simd_x_0 - simd_y_0); + + simd_x_1 = batch_type::load_unaligned(&a[idx + inc * 1]); + simd_y_1 = batch_type::load_unaligned(&b[idx + inc * 1]); + sum_1 += xs::fabs(simd_x_1 - simd_y_1); + // End unrolled + + } + for(std::size_t idx = vec_size; idx < vec_remainder_size; idx += inc) { + simd_x_0 = batch_type::load_unaligned(&a[idx + inc * 0]); + simd_y_0 = batch_type::load_unaligned(&b[idx + inc * 0]); + sum_0 += xs::fabs(simd_x_0 - simd_y_0); + } + // End VECTOR LOOP + // Reduction + sum_0 += sum_1; + batch_type batch_sum = xs::reduce_add(sum_0); + double scalar_sum = *(Type*)&batch_sum; + + // Remaining scalar loop that cannot be vectorized + for(std::size_t idx = vec_remainder_size; idx < size; ++idx) { + scalar_sum += fabs(a[idx] - b[idx]); + } + return (Type) scalar_sum; +} + +/*Extern declarations to later be mapped to simd.cpp declarations at link time*/ +extern template float xsimd_manhattan_dist(const float* a, const float* b, const std::size_t size); +extern template double xsimd_manhattan_dist(const double* a, const double* b, const std::size_t size); diff --git a/xsimd b/xsimd new file mode 160000 index 0000000000000..80626b9d4a2a9 --- /dev/null +++ b/xsimd @@ -0,0 +1 @@ +Subproject commit 80626b9d4a2a9b22a57103405d142c63375d91a2