diff --git a/doc/developers/performance.rst b/doc/developers/performance.rst
index bf6a1a24efd8e..6b9f4389a354d 100644
--- a/doc/developers/performance.rst
+++ b/doc/developers/performance.rst
@@ -344,27 +344,18 @@ Using OpenMP
 Since scikit-learn can be built without OpenMP, it's necessary to protect each
 direct call to OpenMP.
 
-There are some helpers in
+The `_openmp_helpers` module, available in
 `sklearn/utils/_openmp_helpers.pyx
 <https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/utils/_openmp_helpers.pyx>`_
-that you can reuse for the main useful functionalities and already have the
-necessary protection to be built without OpenMP.
+provides protected versions of the OpenMP routines. To use OpenMP routines, they
+must be cimported from this module and not from the OpenMP library directly::
 
-If the helpers are not enough, you need to protect your OpenMP code using the
-following syntax::
-
-    # importing OpenMP
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        cimport openmp
-
-    # calling OpenMP
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        max_threads = openmp.omp_get_max_threads()
-    ELSE:
-        max_threads = 1
+    from sklearn.utils._openmp_helpers cimport omp_get_max_threads
+    max_threads = omp_get_max_threads()
 
 .. note::
 
-    Protecting the parallel loop, ``prange``, is already done by cython.
+    The parallel loop, `prange`, is already protected by cython and can be used directly
+    from `cython.parallel`.
 
 .. _profiling-compiled-extension:
diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py
index 33af1a5128ad7..d539c0c06ecc1 100644
--- a/sklearn/_build_utils/__init__.py
+++ b/sklearn/_build_utils/__init__.py
@@ -52,11 +52,9 @@ def cythonize_extensions(extension):
     # compilers are properly configured to build with OpenMP. This is expensive
     # and we only want to call this function once.
     # The result of this check is cached as a private attribute on the sklearn
-    # module (only at build-time) to be used twice:
-    # - First to set the value of SKLEARN_OPENMP_PARALLELISM_ENABLED, the
-    #   cython build-time variable passed to the cythonize() call.
-    # - Then in the build_ext subclass defined in the top-level setup.py file
-    #   to actually build the compiled extensions with OpenMP flags if needed.
+    # module (only at build-time) to be used in the build_ext subclass defined
+    # in the top-level setup.py file to actually build the compiled extensions
+    # with OpenMP flags if needed.
     sklearn._OPENMP_SUPPORTED = check_openmp_support()
 
     n_jobs = 1
@@ -82,9 +80,6 @@ def cythonize_extensions(extension):
 
     return cythonize(
         extension,
         nthreads=n_jobs,
-        compile_time_env={
-            "SKLEARN_OPENMP_PARALLELISM_ENABLED": sklearn._OPENMP_SUPPORTED
-        },
         compiler_directives=compiler_directives,
     )
diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx
index abde900903c50..c2919ac1d0012 100644
--- a/sklearn/cluster/_k_means_elkan.pyx
+++ b/sklearn/cluster/_k_means_elkan.pyx
@@ -6,13 +6,16 @@
 # fused types and when the array may be read-only (for instance when it's
 # provided by the user). This is fixed in cython > 0.3.
 
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    cimport openmp
 from cython cimport floating
 from cython.parallel import prange, parallel
 from libc.stdlib cimport calloc, free
 from libc.string cimport memset
 
+from ..utils._openmp_helpers cimport omp_lock_t
+from ..utils._openmp_helpers cimport omp_init_lock
+from ..utils._openmp_helpers cimport omp_destroy_lock
+from ..utils._openmp_helpers cimport omp_set_lock
+from ..utils._openmp_helpers cimport omp_unset_lock
 from ..utils.extmath import row_norms
 from ._k_means_common import CHUNK_SIZE
 from ._k_means_common cimport _relocate_empty_clusters_dense
@@ -274,8 +277,7 @@ def elkan_iter_chunked_dense(
         floating *centers_new_chunk
         floating *weight_in_clusters_chunk
 
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_lock_t lock
+        omp_lock_t lock
 
     # count remainder chunk in total number of chunks
     n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -286,8 +288,7 @@ def elkan_iter_chunked_dense(
     if update_centers:
         memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
         memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_init_lock(&lock)
+        omp_init_lock(&lock)
 
     with nogil, parallel(num_threads=n_threads):
         # thread local buffers
@@ -316,23 +317,20 @@ def elkan_iter_chunked_dense(
 
         # reduction from local buffers.
         if update_centers:
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                # The lock is necessary to avoid race conditions when aggregating
-                # info from different thread-local buffers.
-                openmp.omp_set_lock(&lock)
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
             for j in range(n_clusters):
                 weight_in_clusters[j] += weight_in_clusters_chunk[j]
                 for k in range(n_features):
                     centers_new[j, k] += centers_new_chunk[j * n_features + k]
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                openmp.omp_unset_lock(&lock)
+            omp_unset_lock(&lock)
 
         free(centers_new_chunk)
         free(weight_in_clusters_chunk)
 
     if update_centers:
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_destroy_lock(&lock)
+        omp_destroy_lock(&lock)
         _relocate_empty_clusters_dense(X, sample_weight, centers_old,
                                        centers_new, weight_in_clusters, labels)
 
@@ -516,8 +514,7 @@ def elkan_iter_chunked_sparse(
         floating *centers_new_chunk
         floating *weight_in_clusters_chunk
 
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_lock_t lock
+        omp_lock_t lock
 
     # count remainder chunk in total number of chunks
     n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -528,8 +525,7 @@ def elkan_iter_chunked_sparse(
     if update_centers:
         memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
         memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_init_lock(&lock)
+        omp_init_lock(&lock)
 
     with nogil, parallel(num_threads=n_threads):
         # thread local buffers
@@ -561,23 +557,20 @@ def elkan_iter_chunked_sparse(
 
         # reduction from local buffers.
         if update_centers:
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                # The lock is necessary to avoid race conditions when aggregating
-                # info from different thread-local buffers.
-                openmp.omp_set_lock(&lock)
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
             for j in range(n_clusters):
                 weight_in_clusters[j] += weight_in_clusters_chunk[j]
                 for k in range(n_features):
                     centers_new[j, k] += centers_new_chunk[j * n_features + k]
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                openmp.omp_unset_lock(&lock)
+            omp_unset_lock(&lock)
 
         free(centers_new_chunk)
         free(weight_in_clusters_chunk)
 
     if update_centers:
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_destroy_lock(&lock)
+        omp_destroy_lock(&lock)
         _relocate_empty_clusters_sparse(
             X_data, X_indices, X_indptr, sample_weight,
             centers_old, centers_new, weight_in_clusters, labels)
diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx
index 63e5ee3530d6e..6ca50b2a1d0d9 100644
--- a/sklearn/cluster/_k_means_lloyd.pyx
+++ b/sklearn/cluster/_k_means_lloyd.pyx
@@ -4,14 +4,17 @@
 # fused types and when the array may be read-only (for instance when it's
 # provided by the user). This is fixed in cython > 0.3.
 
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    cimport openmp
 from cython cimport floating
 from cython.parallel import prange, parallel
 from libc.stdlib cimport malloc, calloc, free
 from libc.string cimport memset
 from libc.float cimport DBL_MAX, FLT_MAX
 
+from ..utils._openmp_helpers cimport omp_lock_t
+from ..utils._openmp_helpers cimport omp_init_lock
+from ..utils._openmp_helpers cimport omp_destroy_lock
+from ..utils._openmp_helpers cimport omp_set_lock
+from ..utils._openmp_helpers cimport omp_unset_lock
 from ..utils.extmath import row_norms
 from ..utils._cython_blas cimport _gemm
 from ..utils._cython_blas cimport RowMajor, Trans, NoTrans
@@ -94,8 +97,8 @@ def lloyd_iter_chunked_dense(
         floating *centers_new_chunk
         floating *weight_in_clusters_chunk
         floating *pairwise_distances_chunk
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_lock_t lock
+
+        omp_lock_t lock
 
     # count remainder chunk in total number of chunks
     n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -106,8 +109,7 @@ def lloyd_iter_chunked_dense(
     if update_centers:
         memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
         memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_init_lock(&lock)
+        omp_init_lock(&lock)
 
     with nogil, parallel(num_threads=n_threads):
         # thread local buffers
@@ -135,24 +137,22 @@ def lloyd_iter_chunked_dense(
 
         # reduction from local buffers.
        if update_centers:
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                # The lock is necessary to avoid race conditions when aggregating
-                # info from different thread-local buffers.
-                openmp.omp_set_lock(&lock)
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
             for j in range(n_clusters):
                 weight_in_clusters[j] += weight_in_clusters_chunk[j]
                 for k in range(n_features):
                     centers_new[j, k] += centers_new_chunk[j * n_features + k]
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                openmp.omp_unset_lock(&lock)
+
+            omp_unset_lock(&lock)
 
         free(centers_new_chunk)
         free(weight_in_clusters_chunk)
         free(pairwise_distances_chunk)
 
     if update_centers:
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_destroy_lock(&lock)
+        omp_destroy_lock(&lock)
         _relocate_empty_clusters_dense(X, sample_weight, centers_old,
                                        centers_new, weight_in_clusters, labels)
 
@@ -292,8 +292,7 @@ def lloyd_iter_chunked_sparse(
         floating *centers_new_chunk
         floating *weight_in_clusters_chunk
 
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_lock_t lock
+        omp_lock_t lock
 
     # count remainder chunk in total number of chunks
     n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -304,8 +303,7 @@ def lloyd_iter_chunked_sparse(
     if update_centers:
         memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
         memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_init_lock(&lock)
+        omp_init_lock(&lock)
 
     with nogil, parallel(num_threads=n_threads):
         # thread local buffers
@@ -333,23 +331,20 @@ def lloyd_iter_chunked_sparse(
 
         # reduction from local buffers.
         if update_centers:
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                # The lock is necessary to avoid race conditions when aggregating
-                # info from different thread-local buffers.
-                openmp.omp_set_lock(&lock)
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
             for j in range(n_clusters):
                 weight_in_clusters[j] += weight_in_clusters_chunk[j]
                 for k in range(n_features):
                     centers_new[j, k] += centers_new_chunk[j * n_features + k]
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                openmp.omp_unset_lock(&lock)
+            omp_unset_lock(&lock)
 
         free(centers_new_chunk)
         free(weight_in_clusters_chunk)
 
     if update_centers:
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_destroy_lock(&lock)
+        omp_destroy_lock(&lock)
         _relocate_empty_clusters_sparse(
             X_data, X_indices, X_indptr, sample_weight,
             centers_old, centers_new, weight_in_clusters, labels)
diff --git a/sklearn/linear_model/_sgd_fast.pyx b/sklearn/linear_model/_sgd_fast.pyx
index 182635bfb5136..96f13733c8892 100644
--- a/sklearn/linear_model/_sgd_fast.pyx
+++ b/sklearn/linear_model/_sgd_fast.pyx
@@ -20,20 +20,33 @@ from ..utils._seq_dataset cimport SequentialDataset64 as SequentialDataset
 
 cnp.import_array()
 
-# Penalty constants
-DEF NO_PENALTY = 0
-DEF L1 = 1
-DEF L2 = 2
-DEF ELASTICNET = 3
-
-# Learning rate constants
-DEF CONSTANT = 1
-DEF OPTIMAL = 2
-DEF INVSCALING = 3
-DEF ADAPTIVE = 4
-DEF PA1 = 5
-DEF PA2 = 6
-
+cdef extern from *:
+    """
+    /* Penalty constants */
+    #define NO_PENALTY 0
+    #define L1 1
+    #define L2 2
+    #define ELASTICNET 3
+
+    /* Learning rate constants */
+    #define CONSTANT 1
+    #define OPTIMAL 2
+    #define INVSCALING 3
+    #define ADAPTIVE 4
+    #define PA1 5
+    #define PA2 6
+    """
+    int NO_PENALTY = 0
+    int L1 = 1
+    int L2 = 2
+    int ELASTICNET = 3
+
+    int CONSTANT = 1
+    int OPTIMAL = 2
+    int INVSCALING = 3
+    int ADAPTIVE = 4
+    int PA1 = 5
+    int PA2 = 6
 
 
 # ----------------------------------------
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index dec1e96dbbb9f..6a4d879667d3a 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -21,7 +21,7 @@ from cython.parallel cimport parallel, prange
 from libcpp.vector cimport vector
 
 from ...utils._cython_blas cimport _dot
-from ...utils._openmp_helpers cimport _openmp_thread_num
+from ...utils._openmp_helpers cimport omp_get_thread_num
 from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 
 import numpy as np
@@ -88,7 +88,7 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
     )
 
     with nogil, parallel(num_threads=num_threads):
-        thread_num = _openmp_thread_num()
+        thread_num = omp_get_thread_num()
 
         for i in prange(n, schedule='static'):
             # Upcasting the i-th row of X from float32 to float64
@@ -245,7 +245,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
             ITYPE_t thread_num
 
         with nogil, parallel(num_threads=self.chunks_n_threads):
-            thread_num = _openmp_thread_num()
+            thread_num = omp_get_thread_num()
 
             # Allocating thread datastructures
             self._parallel_on_X_parallel_init(thread_num)
@@ -324,7 +324,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
             X_end = X_start + self.X_n_samples_chunk
 
             with nogil, parallel(num_threads=self.chunks_n_threads):
-                thread_num = _openmp_thread_num()
+                thread_num = omp_get_thread_num()
 
                 # Initializing datastructures used in this thread
                 self._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
diff --git a/sklearn/utils/_openmp_helpers.pxd b/sklearn/utils/_openmp_helpers.pxd
index 6819824785424..a7694d0be2d93 100644
--- a/sklearn/utils/_openmp_helpers.pxd
+++ b/sklearn/utils/_openmp_helpers.pxd
@@ -1,6 +1,33 @@
-# Helpers to access OpenMP threads information
+# Helpers to safely access OpenMP routines
 #
-# Those interfaces act as indirections which allows the non-support of OpenMP
-# for implementations which have been written for it.
+# no-op implementations are provided for the case where OpenMP is not available.
+#
+# All calls to OpenMP routines should be cimported from this module.
+
+cdef extern from *:
+    """
+    #ifdef _OPENMP
+        #include <omp.h>
+        #define SKLEARN_OPENMP_PARALLELISM_ENABLED 1
+    #else
+        #define SKLEARN_OPENMP_PARALLELISM_ENABLED 0
+        #define omp_lock_t int
+        #define omp_init_lock(l) (void)0
+        #define omp_destroy_lock(l) (void)0
+        #define omp_set_lock(l) (void)0
+        #define omp_unset_lock(l) (void)0
+        #define omp_get_thread_num() 0
+        #define omp_get_max_threads() 1
+    #endif
+    """
+    bint SKLEARN_OPENMP_PARALLELISM_ENABLED
+
+    ctypedef struct omp_lock_t:
+        pass
 
-cdef int _openmp_thread_num() noexcept nogil
+    void omp_init_lock(omp_lock_t*) noexcept nogil
+    void omp_destroy_lock(omp_lock_t*) noexcept nogil
+    void omp_set_lock(omp_lock_t*) noexcept nogil
+    void omp_unset_lock(omp_lock_t*) noexcept nogil
+    int omp_get_thread_num() noexcept nogil
+    int omp_get_max_threads() noexcept nogil
diff --git a/sklearn/utils/_openmp_helpers.pyx b/sklearn/utils/_openmp_helpers.pyx
index 25f57021c9fa9..f2b2a421e4fae 100644
--- a/sklearn/utils/_openmp_helpers.pyx
+++ b/sklearn/utils/_openmp_helpers.pyx
@@ -1,7 +1,5 @@
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    import os
-    cimport openmp
-    from joblib import cpu_count
+import os
+from joblib import cpu_count
 
 
 def _openmp_parallelism_enabled():
@@ -9,9 +7,8 @@ def _openmp_parallelism_enabled():
 
     It allows to retrieve at runtime the information gathered at compile time.
     """
-    # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time during
-    # cythonization. It is defined via the `compile_time_env` kwarg of the
-    # `cythonize` call and behaves like the `-D` option of the C preprocessor.
+    # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time and defined
+    # in _openmp_helpers.pxd as a boolean. This function exposes it to Python.
     return SKLEARN_OPENMP_PARALLELISM_ENABLED
 
 
@@ -41,31 +38,20 @@ cpdef _openmp_effective_n_threads(n_threads=None):
     if n_threads == 0:
         raise ValueError("n_threads = 0 is invalid")
 
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        if os.getenv("OMP_NUM_THREADS"):
-            # Fall back to user provided number of threads making it possible
-            # to exceed the number of cpus.
-            max_n_threads = openmp.omp_get_max_threads()
-        else:
-            max_n_threads = min(openmp.omp_get_max_threads(), cpu_count())
-
-        if n_threads is None:
-            return max_n_threads
-        elif n_threads < 0:
-            return max(1, max_n_threads + n_threads + 1)
-
-        return n_threads
-    ELSE:
+    if not SKLEARN_OPENMP_PARALLELISM_ENABLED:
         # OpenMP disabled at build-time => sequential mode
         return 1
 
+    if os.getenv("OMP_NUM_THREADS"):
+        # Fall back to user provided number of threads making it possible
+        # to exceed the number of cpus.
+        max_n_threads = omp_get_max_threads()
+    else:
+        max_n_threads = min(omp_get_max_threads(), cpu_count())
 
-cdef inline int _openmp_thread_num() noexcept nogil:
-    """Return the number of the thread calling this function.
+    if n_threads is None:
+        return max_n_threads
+    elif n_threads < 0:
+        return max(1, max_n_threads + n_threads + 1)
 
-    If scikit-learn is built without OpenMP support, always return 0.
-    """
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        return openmp.omp_get_thread_num()
-    ELSE:
-        return 0
+    return n_threads
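With the helpers above, a new Cython extension no longer needs any `IF`/`DEF` guards: it cimports the OpenMP routines from `sklearn.utils._openmp_helpers` and uses `prange`/`parallel` from `cython.parallel` directly. The sketch below is illustrative only (the file and function names `_example_lock.pyx` and `group_sums` are hypothetical, not part of this changeset); it mirrors the thread-local buffer plus lock reduction used in the k-means kernels::

    # _example_lock.pyx (hypothetical module, for illustration only)
    # cython: boundscheck=False, wraparound=False
    import numpy as np

    from cython.parallel import prange, parallel
    from libc.stdlib cimport calloc, free

    from sklearn.utils._openmp_helpers cimport (
        omp_lock_t, omp_init_lock, omp_destroy_lock, omp_set_lock, omp_unset_lock)


    def group_sums(const double[::1] values, const int[::1] groups,
                   int n_groups, int n_threads):
        """Sum ``values`` per group, merging thread-local buffers under a lock."""
        cdef:
            Py_ssize_t i, j
            Py_ssize_t n_samples = values.shape[0]
            double[::1] sums = np.zeros(n_groups)
            double *sums_chunk
            omp_lock_t lock

        omp_init_lock(&lock)

        with nogil, parallel(num_threads=n_threads):
            # Thread-local accumulation buffer, private to each thread.
            sums_chunk = <double*> calloc(n_groups, sizeof(double))

            for i in prange(n_samples, schedule='static'):
                sums_chunk[groups[i]] += values[i]

            # The lock avoids a race when merging the thread-local buffers
            # into the shared output.
            omp_set_lock(&lock)
            for j in range(n_groups):
                sums[j] += sums_chunk[j]
            omp_unset_lock(&lock)

            free(sums_chunk)

        omp_destroy_lock(&lock)
        return np.asarray(sums)

When scikit-learn is built without OpenMP, the cimported lock routines resolve to the no-op stubs from `_openmp_helpers.pxd` and `prange` degrades to a serial loop, so the same source compiles and runs in both configurations; the thread count would typically come from `_openmp_effective_n_threads()`.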