diff --git a/doc/developers/performance.rst b/doc/developers/performance.rst
index bf6a1a24efd8e..6b9f4389a354d 100644
--- a/doc/developers/performance.rst
+++ b/doc/developers/performance.rst
@@ -344,27 +344,18 @@ Using OpenMP
 Since scikit-learn can be built without OpenMP, it's necessary to protect each
 direct call to OpenMP.
 
-There are some helpers in
+The `_openmp_helpers` module, available in
 `sklearn/utils/_openmp_helpers.pyx
 <https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/utils/_openmp_helpers.pyx>`_
-that you can reuse for the main useful functionalities and already have the
-necessary protection to be built without OpenMP.
+provides protected versions of the OpenMP routines. To use OpenMP routines, they
+must be cimported from this module and not from the OpenMP library directly::
 
-If the helpers are not enough, you need to protect your OpenMP code using the
-following syntax::
-
-    # importing OpenMP
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        cimport openmp
-
-    # calling OpenMP
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        max_threads = openmp.omp_get_max_threads()
-    ELSE:
-        max_threads = 1
+    from sklearn.utils._openmp_helpers cimport omp_get_max_threads
+    max_threads = omp_get_max_threads()
 
 .. note::
 
-    Protecting the parallel loop, ``prange``, is already done by cython.
+    The parallel loop, `prange`, is already protected by cython and can be used directly
+    from `cython.parallel`.
 
 .. _profiling-compiled-extension:
diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py
index 33af1a5128ad7..d539c0c06ecc1 100644
--- a/sklearn/_build_utils/__init__.py
+++ b/sklearn/_build_utils/__init__.py
@@ -52,11 +52,9 @@ def cythonize_extensions(extension):
     # compilers are properly configured to build with OpenMP. This is expensive
     # and we only want to call this function once.
     # The result of this check is cached as a private attribute on the sklearn
-    # module (only at build-time) to be used twice:
-    # - First to set the value of SKLEARN_OPENMP_PARALLELISM_ENABLED, the
-    #   cython build-time variable passed to the cythonize() call.
-    # - Then in the build_ext subclass defined in the top-level setup.py file
-    #   to actually build the compiled extensions with OpenMP flags if needed.
+    # module (only at build-time) to be used in the build_ext subclass defined
+    # in the top-level setup.py file to actually build the compiled extensions
+    # with OpenMP flags if needed.
     sklearn._OPENMP_SUPPORTED = check_openmp_support()
 
     n_jobs = 1
@@ -82,9 +80,6 @@ def cythonize_extensions(extension):
 
     return cythonize(
         extension,
         nthreads=n_jobs,
-        compile_time_env={
-            "SKLEARN_OPENMP_PARALLELISM_ENABLED": sklearn._OPENMP_SUPPORTED
-        },
         compiler_directives=compiler_directives,
     )
diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx
index abde900903c50..c2919ac1d0012 100644
--- a/sklearn/cluster/_k_means_elkan.pyx
+++ b/sklearn/cluster/_k_means_elkan.pyx
@@ -6,13 +6,16 @@
 # fused types and when the array may be read-only (for instance when it's
 # provided by the user). This is fixed in cython > 0.3.
 
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    cimport openmp
 from cython cimport floating
 from cython.parallel import prange, parallel
 from libc.stdlib cimport calloc, free
 from libc.string cimport memset
 
+from ..utils._openmp_helpers cimport omp_lock_t
+from ..utils._openmp_helpers cimport omp_init_lock
+from ..utils._openmp_helpers cimport omp_destroy_lock
+from ..utils._openmp_helpers cimport omp_set_lock
+from ..utils._openmp_helpers cimport omp_unset_lock
 from ..utils.extmath import row_norms
 from ._k_means_common import CHUNK_SIZE
 from ._k_means_common cimport _relocate_empty_clusters_dense
@@ -274,8 +277,7 @@ def elkan_iter_chunked_dense(
         floating *centers_new_chunk
         floating *weight_in_clusters_chunk
 
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_lock_t lock
+        omp_lock_t lock
 
     # count remainder chunk in total number of chunks
     n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -286,8 +288,7 @@ def elkan_iter_chunked_dense(
     if update_centers:
         memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
         memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_init_lock(&lock)
+        omp_init_lock(&lock)
 
     with nogil, parallel(num_threads=n_threads):
         # thread local buffers
@@ -316,23 +317,20 @@ def elkan_iter_chunked_dense(
 
         # reduction from local buffers.
         if update_centers:
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                # The lock is necessary to avoid race conditions when aggregating
-                # info from different thread-local buffers.
-                openmp.omp_set_lock(&lock)
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
             for j in range(n_clusters):
                 weight_in_clusters[j] += weight_in_clusters_chunk[j]
                 for k in range(n_features):
                     centers_new[j, k] += centers_new_chunk[j * n_features + k]
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                openmp.omp_unset_lock(&lock)
+            omp_unset_lock(&lock)
 
         free(centers_new_chunk)
         free(weight_in_clusters_chunk)
 
     if update_centers:
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_destroy_lock(&lock)
+        omp_destroy_lock(&lock)
         _relocate_empty_clusters_dense(X, sample_weight, centers_old,
                                        centers_new, weight_in_clusters, labels)
 
@@ -516,8 +514,7 @@ def elkan_iter_chunked_sparse(
         floating *centers_new_chunk
         floating *weight_in_clusters_chunk
 
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_lock_t lock
+        omp_lock_t lock
 
     # count remainder chunk in total number of chunks
     n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -528,8 +525,7 @@ def elkan_iter_chunked_sparse(
     if update_centers:
         memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
         memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_init_lock(&lock)
+        omp_init_lock(&lock)
 
     with nogil, parallel(num_threads=n_threads):
         # thread local buffers
@@ -561,23 +557,20 @@ def elkan_iter_chunked_sparse(
 
         # reduction from local buffers.
         if update_centers:
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                # The lock is necessary to avoid race conditions when aggregating
-                # info from different thread-local buffers.
-                openmp.omp_set_lock(&lock)
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
             for j in range(n_clusters):
                 weight_in_clusters[j] += weight_in_clusters_chunk[j]
                 for k in range(n_features):
                     centers_new[j, k] += centers_new_chunk[j * n_features + k]
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                openmp.omp_unset_lock(&lock)
+            omp_unset_lock(&lock)
 
         free(centers_new_chunk)
         free(weight_in_clusters_chunk)
 
     if update_centers:
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_destroy_lock(&lock)
+        omp_destroy_lock(&lock)
         _relocate_empty_clusters_sparse(
             X_data, X_indices, X_indptr, sample_weight,
             centers_old, centers_new, weight_in_clusters, labels)
diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx
index 63e5ee3530d6e..6ca50b2a1d0d9 100644
--- a/sklearn/cluster/_k_means_lloyd.pyx
+++ b/sklearn/cluster/_k_means_lloyd.pyx
@@ -4,14 +4,17 @@
 # fused types and when the array may be read-only (for instance when it's
 # provided by the user). This is fixed in cython > 0.3.
 
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    cimport openmp
 from cython cimport floating
 from cython.parallel import prange, parallel
 from libc.stdlib cimport malloc, calloc, free
 from libc.string cimport memset
 from libc.float cimport DBL_MAX, FLT_MAX
 
+from ..utils._openmp_helpers cimport omp_lock_t
+from ..utils._openmp_helpers cimport omp_init_lock
+from ..utils._openmp_helpers cimport omp_destroy_lock
+from ..utils._openmp_helpers cimport omp_set_lock
+from ..utils._openmp_helpers cimport omp_unset_lock
 from ..utils.extmath import row_norms
 from ..utils._cython_blas cimport _gemm
 from ..utils._cython_blas cimport RowMajor, Trans, NoTrans
@@ -94,8 +97,8 @@ def lloyd_iter_chunked_dense(
         floating *centers_new_chunk
         floating *weight_in_clusters_chunk
         floating *pairwise_distances_chunk
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_lock_t lock
+
+        omp_lock_t lock
 
     # count remainder chunk in total number of chunks
     n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -106,8 +109,7 @@ def lloyd_iter_chunked_dense(
     if update_centers:
         memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
         memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_init_lock(&lock)
+        omp_init_lock(&lock)
 
     with nogil, parallel(num_threads=n_threads):
         # thread local buffers
@@ -135,24 +137,22 @@ def lloyd_iter_chunked_dense(
 
         # reduction from local buffers.
        if update_centers:
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                # The lock is necessary to avoid race conditions when aggregating
-                # info from different thread-local buffers.
-                openmp.omp_set_lock(&lock)
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
             for j in range(n_clusters):
                 weight_in_clusters[j] += weight_in_clusters_chunk[j]
                 for k in range(n_features):
                     centers_new[j, k] += centers_new_chunk[j * n_features + k]
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                openmp.omp_unset_lock(&lock)
+
+            omp_unset_lock(&lock)
 
         free(centers_new_chunk)
         free(weight_in_clusters_chunk)
         free(pairwise_distances_chunk)
 
     if update_centers:
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_destroy_lock(&lock)
+        omp_destroy_lock(&lock)
         _relocate_empty_clusters_dense(X, sample_weight, centers_old,
                                        centers_new, weight_in_clusters, labels)
 
@@ -292,8 +292,7 @@ def lloyd_iter_chunked_sparse(
         floating *centers_new_chunk
         floating *weight_in_clusters_chunk
 
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_lock_t lock
+        omp_lock_t lock
 
     # count remainder chunk in total number of chunks
     n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -304,8 +303,7 @@ def lloyd_iter_chunked_sparse(
     if update_centers:
         memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
         memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_init_lock(&lock)
+        omp_init_lock(&lock)
 
     with nogil, parallel(num_threads=n_threads):
         # thread local buffers
@@ -333,23 +331,20 @@ def lloyd_iter_chunked_sparse(
 
         # reduction from local buffers.
         if update_centers:
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                # The lock is necessary to avoid race conditions when aggregating
-                # info from different thread-local buffers.
-                openmp.omp_set_lock(&lock)
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
             for j in range(n_clusters):
                 weight_in_clusters[j] += weight_in_clusters_chunk[j]
                 for k in range(n_features):
                     centers_new[j, k] += centers_new_chunk[j * n_features + k]
-            IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-                openmp.omp_unset_lock(&lock)
+            omp_unset_lock(&lock)
 
         free(centers_new_chunk)
         free(weight_in_clusters_chunk)
 
     if update_centers:
-        IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-            openmp.omp_destroy_lock(&lock)
+        omp_destroy_lock(&lock)
         _relocate_empty_clusters_sparse(
             X_data, X_indices, X_indptr, sample_weight,
             centers_old, centers_new, weight_in_clusters, labels)
diff --git a/sklearn/linear_model/_sgd_fast.pyx b/sklearn/linear_model/_sgd_fast.pyx
index 182635bfb5136..96f13733c8892 100644
--- a/sklearn/linear_model/_sgd_fast.pyx
+++ b/sklearn/linear_model/_sgd_fast.pyx
@@ -20,20 +20,33 @@ from ..utils._seq_dataset cimport SequentialDataset64 as SequentialDataset
 
 cnp.import_array()
 
-# Penalty constants
-DEF NO_PENALTY = 0
-DEF L1 = 1
-DEF L2 = 2
-DEF ELASTICNET = 3
-
-# Learning rate constants
-DEF CONSTANT = 1
-DEF OPTIMAL = 2
-DEF INVSCALING = 3
-DEF ADAPTIVE = 4
-DEF PA1 = 5
-DEF PA2 = 6
-
+cdef extern from *:
+    """
+    /* Penalty constants */
+    #define NO_PENALTY 0
+    #define L1 1
+    #define L2 2
+    #define ELASTICNET 3
+
+    /* Learning rate constants */
+    #define CONSTANT 1
+    #define OPTIMAL 2
+    #define INVSCALING 3
+    #define ADAPTIVE 4
+    #define PA1 5
+    #define PA2 6
+    """
+    int NO_PENALTY = 0
+    int L1 = 1
+    int L2 = 2
+    int ELASTICNET = 3
+
+    int CONSTANT = 1
+    int OPTIMAL = 2
+    int INVSCALING = 3
+    int ADAPTIVE = 4
+    int PA1 = 5
+    int PA2 = 6
 
 
 # ----------------------------------------
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index dec1e96dbbb9f..6a4d879667d3a 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -21,7 +21,7 @@ from cython.parallel cimport parallel, prange
 from libcpp.vector cimport vector
 
 from ...utils._cython_blas cimport _dot
-from ...utils._openmp_helpers cimport _openmp_thread_num
+from ...utils._openmp_helpers cimport omp_get_thread_num
 from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 
 import numpy as np
@@ -88,7 +88,7 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
     )
 
     with nogil, parallel(num_threads=num_threads):
-        thread_num = _openmp_thread_num()
+        thread_num = omp_get_thread_num()
 
         for i in prange(n, schedule='static'):
             # Upcasting the i-th row of X from float32 to float64
@@ -245,7 +245,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
             ITYPE_t thread_num
 
         with nogil, parallel(num_threads=self.chunks_n_threads):
-            thread_num = _openmp_thread_num()
+            thread_num = omp_get_thread_num()
 
             # Allocating thread datastructures
             self._parallel_on_X_parallel_init(thread_num)
@@ -324,7 +324,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
             X_end = X_start + self.X_n_samples_chunk
 
             with nogil, parallel(num_threads=self.chunks_n_threads):
-                thread_num = _openmp_thread_num()
+                thread_num = omp_get_thread_num()
 
                 # Initializing datastructures used in this thread
                 self._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
diff --git a/sklearn/utils/_openmp_helpers.pxd b/sklearn/utils/_openmp_helpers.pxd
index 6819824785424..a7694d0be2d93 100644
--- a/sklearn/utils/_openmp_helpers.pxd
+++ b/sklearn/utils/_openmp_helpers.pxd
@@ -1,6 +1,33 @@
-# Helpers to access OpenMP threads information
+# Helpers to safely access OpenMP routines
 #
-# Those interfaces act as indirections which allows the non-support of OpenMP
-# for implementations which have been written for it.
+# no-op implementations are provided for the case where OpenMP is not available.
+#
+# All calls to OpenMP routines should be cimported from this module.
+
+cdef extern from *:
+    """
+    #ifdef _OPENMP
+        #include <omp.h>
+        #define SKLEARN_OPENMP_PARALLELISM_ENABLED 1
+    #else
+        #define SKLEARN_OPENMP_PARALLELISM_ENABLED 0
+        #define omp_lock_t int
+        #define omp_init_lock(l) (void)0
+        #define omp_destroy_lock(l) (void)0
+        #define omp_set_lock(l) (void)0
+        #define omp_unset_lock(l) (void)0
+        #define omp_get_thread_num() 0
+        #define omp_get_max_threads() 1
+    #endif
+    """
+    bint SKLEARN_OPENMP_PARALLELISM_ENABLED
+
+    ctypedef struct omp_lock_t:
+        pass
 
-cdef int _openmp_thread_num() noexcept nogil
+    void omp_init_lock(omp_lock_t*) noexcept nogil
+    void omp_destroy_lock(omp_lock_t*) noexcept nogil
+    void omp_set_lock(omp_lock_t*) noexcept nogil
+    void omp_unset_lock(omp_lock_t*) noexcept nogil
+    int omp_get_thread_num() noexcept nogil
+    int omp_get_max_threads() noexcept nogil
diff --git a/sklearn/utils/_openmp_helpers.pyx b/sklearn/utils/_openmp_helpers.pyx
index 25f57021c9fa9..f2b2a421e4fae 100644
--- a/sklearn/utils/_openmp_helpers.pyx
+++ b/sklearn/utils/_openmp_helpers.pyx
@@ -1,7 +1,5 @@
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    import os
-    cimport openmp
-    from joblib import cpu_count
+import os
+from joblib import cpu_count
 
 
 def _openmp_parallelism_enabled():
@@ -9,9 +7,8 @@ def _openmp_parallelism_enabled():
 
     It allows to retrieve at runtime the information gathered at compile time.
     """
-    # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time during
-    # cythonization. It is defined via the `compile_time_env` kwarg of the
-    # `cythonize` call and behaves like the `-D` option of the C preprocessor.
+    # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time and defined
+    # in _openmp_helpers.pxd as a boolean. This function exposes it to Python.
     return SKLEARN_OPENMP_PARALLELISM_ENABLED
 
 
@@ -41,31 +38,20 @@ cpdef _openmp_effective_n_threads(n_threads=None):
     if n_threads == 0:
         raise ValueError("n_threads = 0 is invalid")
 
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        if os.getenv("OMP_NUM_THREADS"):
-            # Fall back to user provided number of threads making it possible
-            # to exceed the number of cpus.
-            max_n_threads = openmp.omp_get_max_threads()
-        else:
-            max_n_threads = min(openmp.omp_get_max_threads(), cpu_count())
-
-        if n_threads is None:
-            return max_n_threads
-        elif n_threads < 0:
-            return max(1, max_n_threads + n_threads + 1)
-
-        return n_threads
-    ELSE:
+    if not SKLEARN_OPENMP_PARALLELISM_ENABLED:
         # OpenMP disabled at build-time => sequential mode
         return 1
 
+    if os.getenv("OMP_NUM_THREADS"):
+        # Fall back to user provided number of threads making it possible
+        # to exceed the number of cpus.
+        max_n_threads = omp_get_max_threads()
+    else:
+        max_n_threads = min(omp_get_max_threads(), cpu_count())
 
-cdef inline int _openmp_thread_num() noexcept nogil:
-    """Return the number of the thread calling this function.
+    if n_threads is None:
+        return max_n_threads
+    elif n_threads < 0:
+        return max(1, max_n_threads + n_threads + 1)
 
-    If scikit-learn is built without OpenMP support, always return 0.
-    """
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        return openmp.omp_get_thread_num()
-    ELSE:
-        return 0
+    return n_threads
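With the helpers above, a new Cython extension no longer needs any `IF`/`DEF` guards: it cimports the OpenMP routines from `sklearn.utils._openmp_helpers` and uses `prange`/`parallel` from `cython.parallel` directly. The sketch below is illustrative only (the file and function names `_example_lock.pyx` and `group_sums` are hypothetical, not part of this changeset); it mirrors the thread-local buffer plus lock reduction used in the k-means kernels::

    # _example_lock.pyx (hypothetical module, for illustration only)
    # cython: boundscheck=False, wraparound=False
    import numpy as np

    from cython.parallel import prange, parallel
    from libc.stdlib cimport calloc, free

    from sklearn.utils._openmp_helpers cimport (
        omp_lock_t, omp_init_lock, omp_destroy_lock, omp_set_lock, omp_unset_lock)


    def group_sums(const double[::1] values, const int[::1] groups,
                   int n_groups, int n_threads):
        """Sum ``values`` per group, merging thread-local buffers under a lock."""
        cdef:
            Py_ssize_t i, j
            Py_ssize_t n_samples = values.shape[0]
            double[::1] sums = np.zeros(n_groups)
            double *sums_chunk
            omp_lock_t lock

        omp_init_lock(&lock)

        with nogil, parallel(num_threads=n_threads):
            # Thread-local accumulation buffer, private to each thread.
            sums_chunk = <double*> calloc(n_groups, sizeof(double))

            for i in prange(n_samples, schedule='static'):
                sums_chunk[groups[i]] += values[i]

            # The lock avoids a race when merging the thread-local buffers
            # into the shared output.
            omp_set_lock(&lock)
            for j in range(n_groups):
                sums[j] += sums_chunk[j]
            omp_unset_lock(&lock)

            free(sums_chunk)

        omp_destroy_lock(&lock)
        return np.asarray(sums)

When scikit-learn is built without OpenMP, the cimported lock routines resolve to the no-op stubs from `_openmp_helpers.pxd` and `prange` degrades to a serial loop, so the same source compiles and runs in both configurations; the thread count would typically come from `_openmp_effective_n_threads()`.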