8000 SIMD: Introduce tracer for enabled CPU targets on each optimized func… · Python-Repository-Hub/numpy@5ffeef1 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5ffeef1

Browse files
authored
SIMD: Introduce tracer for enabled CPU targets on each optimized function (numpy#24420)
* SIMD: Cleanup the cpu dispatcher by moving distutils helper macros into a separated header This should be removed once we drop the support of distutils * SIMD: Introduce tracer for enabled CPU targets on each optimized function This update introduces a tracer mechanism that enables tracking of the enabled targets for each optimized function in the NumPy library. With this enhancement, it becomes possible to precisely monitor the enabled CPU dispatch targets for the dispatched functions. A new function named `opt_func_info` has been added to the new `numpy.lib.introspect` module, offering this tracing capability. This function allows you to retrieve information about the enabled targets based on function names and data type signatures. Here's an example of how to use it: ```python >> func_info = numpy.lib.introspect.opt_func_info(func_name='add|abs', signature='float64|complex64') >> print(json.dumps(func_info, indent=2)) { "absolute": { "dd": { "current": "SSE41", "available": "SSE41 baseline(SSE SSE2 SSE3)" }, "Ff": { "current": "FMA3__AVX2", "available": "AVX512F FMA3__AVX2 baseline(SSE SSE2 SSE3)" }, "Dd": { "current": "FMA3__AVX2", "available": "AVX512F FMA3__AVX2 baseline(SSE SSE2 SSE3)" } }, "add": { "ddd": { "current": "FMA3__AVX2", "available": "FMA3__AVX2 baseline(SSE SSE2 SSE3)" }, "FFF": { "current": "FMA3__AVX2", "available": "FMA3__AVX2 baseline(SSE SSE2 SSE3)" } } } ``` For tracer utilization, remember to invoke the new `NPY_CPU_DISPATCH_TRACE()` macro either before or after employing `NPY_CPU_DISPATCH_CALL()` for dispatching. For more clarification, please refer to the header `numpy/core/src/common/npy_cpu_dispatch.h`. As part of this solution, a new dictionary, `__cpu_targets_info__`, has been introduced within the `numpy.core._multiarray_umath` module. This dictionary contains relevant data about enabled targets for each optimized function. 
As of now, the tracing mechanism covers ufunc-based functions, `argmax`, and `argmin`. However, functions like sorting operations may require refactoring due to the tracer's associated cost. It's noteworthy that the tracer should be called only once during the initialization of Python C functions to avoid performance regressions. * DOC: Add a release note for CPU Optimization Tracking
1 parent b9c4023 commit 5ffeef1

File tree

16 files changed

+506
-256
lines changed

16 files changed

+506
-256
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Improved CPU Optimization Tracking
2+
----------------------------------
3+
4+
Introduces a tracer mechanism that enables tracking of the enabled targets
5+
for each optimized function in the NumPy library. With this enhancement,
6+
it becomes possible to precisely monitor the enabled CPU dispatch
7+
targets for the dispatched functions.
8+
9+
A new function named `opt_func_info` has been added to the new namespace `numpy.lib.introspect`,
10+
offering this tracing capability. This function allows you to retrieve information
11+
about the enabled targets based on function names and data type signatures.
12+

doc/source/reference/simd/build-options.rst

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,3 +385,42 @@ this will disable ``AVX2`` and ``FMA3``::
385385
NPY_DISABLE_CPU_FEATURES="AVX2,FMA3"
386386

387387
If the feature is not available, a warning will be emitted.
388+
389+
Tracking dispatched functions
390+
-----------------------------
391+
Discovering which CPU targets are enabled for different optimized functions is achievable
392+
through the Python function ``numpy.lib.introspect.opt_func_info``.
393+
This function offers the flexibility of applying filters using two optional arguments:
394+
one for refining function names and the other for specifying data types in the signatures.
395+
396+
For example::
397+
398+
>> func_info = numpy.lib.introspect.opt_func_info(func_name='add|abs', signature='float64|complex64')
399+
>> print(json.dumps(func_info, indent=2))
400+
{
401+
"absolute": {
402+
"dd": {
403+
"current": "SSE41",
404+
"available": "SSE41 baseline(SSE SSE2 SSE3)"
405+
},
406+
"Ff": {
407+
"current": "FMA3__AVX2",
408+
"available": "AVX512F FMA3__AVX2 baseline(SSE SSE2 SSE3)"
409+
},
410+
"Dd": {
411+
"current": "FMA3__AVX2",
412+
"available": "AVX512F FMA3__AVX2 baseline(SSE SSE2 SSE3)"
413+
}
414+
},
415+
"add": {
416+
"ddd": {
417+
"current": "FMA3__AVX2",
418+
"available": "FMA3__AVX2 baseline(SSE SSE2 SSE3)"
419+
},
420+
"FFF": {
421+
"current": "FMA3__AVX2",
422+
"available": "FMA3__AVX2 baseline(SSE SSE2 SSE3)"
423+
}
424+
}
425+
}
426+

meson_cpu/main_config.h.in

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
#define @P@_CPU_CAT__(a, b) a ## b
4141
#define @P@_CPU_CAT_(a, b) @P@_CPU_CAT__(a, b)
4242
#define @P@_CPU_CAT(a, b) @P@_CPU_CAT_(a, b)
43+
#define @P@_CPU_STRINGIFY(x) #x
44+
#define @P@_CPU_TOSTRING(x) @P@_CPU_STRINGIFY(x)
4345

4446
/**
4547
* @def @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...)
@@ -73,7 +75,6 @@
7375
*/
7476
#define @P@WITH_CPU_DISPATCH_CALL(EXEC_CB, ...) \
7577
@WITH_CPU_DISPATCH_CALL@
76-
7778
/*
7879
* Defines the default behavior for the configurable macros derived from the configuration header
7980
* that is generated by the meson function `mod_features.multi_targets()`.
@@ -127,7 +128,7 @@
127128
* #elif defined(@P@HAVE_SSE3)
128129
* // CODE
129130
* #else
130-
* // Fallback code in case of features enabled
131+
* // Fallback code in case of no features enabled
131132
* #endif
132133
* }
133134
* @endif
@@ -145,12 +146,12 @@
145146
* @param NAME The base name of the dispatched function or variable.
146147
*/
147148
#ifdef @P@MTARGETS_CURRENT
148-
// '@P@MTARGETS_CURRENT': only defined by the dispatchable sources
149+
// '@P@MTARGETS_CURRENT': only defined by the dispatch targets
150+
// within the meson function `mod_features.multi_targets()`
149151
#define @P@CPU_DISPATCH_CURFX(NAME) @P@_CPU_CAT(@P@_CPU_CAT(NAME, _), @P@MTARGETS_CURRENT)
150152
#else
151153
#define @P@CPU_DISPATCH_CURFX(NAME) @P@_CPU_EXPAND(NAME)
152154
#endif
153-
154155
/**
155156
* @def @P@CPU_DISPATCH_DECLARE(...)
156157
*
@@ -290,6 +291,42 @@
290291
@P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_XB_CB_, __VA_ARGS__) \
291292
((void) 0 /* discarded expression value */)
292293

294+
/**
295+
* @def @P@CPU_DISPATCH_INFO(...)
296+
*
297+
* Returns an array of two strings containing the enabled target names
298+
* in each multi-target source.
299+
*
300+
* The first item represents the currently dispatched target,
301+
* while the second item contains the available targets that
302+
* can potentially be dispatched based on CPU capabilities.
303+
*
304+
* @code
305+
* #include "arithmetic.dispatch.h" // generated config file
306+
* const char *enabled_targets[] = @P@CPU_DISPATCH_INFO();
307+
*
308+
* printf("Current dispatched target: %s\n", enabled_targets[0]);
309+
* printf("Available targets: %s\n", enabled_targets[1]);
310+
* @endcode
311+
*/
312+
#define @P@CPU_DISPATCH_INFO() \
313+
{ \
314+
@P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_INFO_HIGH_CB_, DUMMY) \
315+
@P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_INFO_BASE_HIGH_CB_, DUMMY) \
316+
"", \
317+
@P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_INFO_CB_, DUMMY) \
318+
@P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_INFO_BASE_CB_, DUMMY) \
319+
""\
320+
}
321+
#define @P@CPU_DISPATCH_INFO_HIGH_CB_(TESTED_FEATURES, TARGET_NAME, ...) \
322+
(TESTED_FEATURES) ? @P@_CPU_TOSTRING(TARGET_NAME) :
323+
#define @P@CPU_DISPATCH_INFO_BASE_HIGH_CB_(...) \
324+
(1) ? "baseline(" @P@WITH_CPU_BASELINE ")" :
325+
// Preprocessor callbacks
326+
#define @P@CPU_DISPATCH_INFO_CB_(TESTED_FEATURES, TARGET_NAME, ...) \
327+
@P@_CPU_TOSTRING(TARGET_NAME) " "
328+
#define @P@CPU_DISPATCH_INFO_BASE_CB_(...) \
329+
"baseline(" @P@WITH_CPU_BASELINE ")"
293330
/**
294331
* Macro @P@CPU_DISPATCH_CALL_ALL(...)
295332
*

numpy/core/code_generators/generate_umath.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,7 +1240,9 @@ def make_arrays(funcdict):
12401240
if cfunc_fname:
12411241
funclist.append(cfunc_fname)
12421242
if t.dispatch:
1243-
dispdict.setdefault(t.dispatch, []).append((name, k, cfunc_fname))
1243+
dispdict.setdefault(t.dispatch, []).append(
1244+
(name, k, cfunc_fname, t.in_ + t.out)
1245+
)
12441246
else:
12451247
funclist.append('NULL')
12461248

@@ -1265,8 +1267,9 @@ def make_arrays(funcdict):
12651267
#include "{dname}.dispatch.h"
12661268
#endif
12671269
"""))
1268-
for (ufunc_name, func_idx, cfunc_name) in funcs:
1270+
for (ufunc_name, func_idx, cfunc_name, inout) in funcs:
12691271
code2list.append(textwrap.dedent(f"""\
1272+
NPY_CPU_DISPATCH_TRACE("{ufunc_name}", "{''.join(inout)}");
12701273
NPY_CPU_DISPATCH_CALL_XB({ufunc_name}_functions[{func_idx}] = {cfunc_name});
12711274
"""))
12721275
return "\n".join(code1list), "\n".join(code2list)

numpy/core/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -982,6 +982,7 @@ src_multiarray_umath_common = [
982982
'src/common/ufunc_override.c',
983983
'src/common/numpyos.c',
984984
'src/common/npy_cpu_features.c',
985+
'src/common/npy_cpu_dispatch.c',
985986
src_file.process('src/common/templ_common.h.src')
986987
]
987988
if have_blas

numpy/core/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,7 @@ def get_mathlib_info(*args):
746746
join('src', 'common', 'ufunc_override.c'),
747747
join('src', 'common', 'numpyos.c'),
748748
join('src', 'common', 'npy_cpu_features.c'),
749+
join('src', 'common', 'npy_cpu_dispatch.c'),
749750
]
750751

751752
if os.environ.get('NPY_USE_BLAS_ILP64', "0") != "0":
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#include "npy_cpu_dispatch.h"
2+
3+
static PyObject *npy__cpu_dispatch_registery = NULL;
4+
5+
/*
 * Creates the tracer registry: an empty dict that is inserted into the dict
 * of `mod` under the key "__cpu_targets_info__" and remembered in the
 * file-static `npy__cpu_dispatch_registery`.
 *
 * Returns 0 on success. Returns -1 when the registry pointer is already set
 * (a RuntimeError is set in that case), or when fetching the module dict,
 * allocating the registry dict, or inserting it into the module dict fails.
 */
NPY_VISIBILITY_HIDDEN int
6+
npy_cpu_dispatch_tracer_init(PyObject *mod)
7+
{
8+
// refuse a second initialization: the registry pointer is set only once
if (npy__cpu_dispatch_registery != NULL) {
9+
PyErr_Format(PyExc_RuntimeError, "CPU dispatcher tracer already initlized");
10+
return -1;
11+
}
12+
PyObject *mod_dict = PyModule_GetDict(mod);
13+
if (mod_dict == NULL) {
14+
return -1;
15+
}
16+
PyObject *reg_dict = PyDict_New();
17+
if (reg_dict == NULL) {
18+
return -1;
19+
}
20+
int err = PyDict_SetItemString(mod_dict, "__cpu_targets_info__", reg_dict);
21+
// the local reference is dropped whether or not the insertion succeeded
Py_DECREF(reg_dict);
22+
if (err != 0) {
23+
return -1;
24+
}
25+
// NOTE(review): the pointer is stored after the DECREF above, so it
// presumably relies on the module dict's reference to keep the registry
// alive -- confirm that "__cpu_targets_info__" is never removed or replaced.
npy__cpu_dispatch_registery = reg_dict;
26+
return 0;
27+
}
28+
29+
/*
 * Records the dispatch information of one optimized function in the registry.
 *
 * Looks up the per-function dict stored under `fname` (creating and inserting
 * it when missing), stores a freshly created dict under `signature` inside
 * it, and fills that dict with two string entries:
 *   "current"   <- dispatch_info[0]
 *   "available" <- dispatch_info[1]
 *
 * Every failure path simply returns; the function is void, so nothing is
 * reported to the caller from here.
 *
 * NOTE(review): `npy__cpu_dispatch_registery` is used without a NULL check,
 * so this presumably must only run after npy_cpu_dispatch_tracer_init()
 * succeeded -- confirm against the callers.
 * NOTE(review): a failed Python C-API call may leave an exception set when
 * this function returns early -- confirm how callers deal with that.
 */
NPY_VISIBILITY_HIDDEN void
30+
npy_cpu_dispatch_trace(const char *fname, const char *signature,
31+
const char **dispatch_info)
32+
{
33+
PyObject *func_dict = PyDict_GetItemString(npy__cpu_dispatch_registery, fname);
34+
// no entry for this function name yet: create one and register it
if (func_dict == NULL) {
35+
func_dict = PyDict_New();
36+
if (func_dict == NULL) {
37+
return;
38+
}
39+
int err = PyDict_SetItemString(npy__cpu_dispatch_registery, fname, func_dict);
40+
// `func_dict` is still used below after this DECREF; presumably the
// registry's reference keeps it alive -- TODO confirm
Py_DECREF(func_dict);
41+
if (err != 0) {
42+
return;
43+
}
44+
}
45+
// target info for each signature
46+
// a new dict is created on every call and stored under `signature`
PyObject *sig_dict = PyDict_New();
47+
if (sig_dict == NULL) {
48+
return;
49+
}
50+
int err = PyDict_SetItemString(func_dict, signature, sig_dict);
51+
// `sig_dict` is still used below after this DECREF; presumably the
// reference held by `func_dict` keeps it alive -- TODO confirm
Py_DECREF(sig_dict);
52+
if (err != 0) {
53+
return;
54+
}
55+
// current dispatched target
56+
PyObject *current_target = PyUnicode_FromString(dispatch_info[0]);
57+
if (current_target == NULL) {
58+
return;
59+
}
60+
err = PyDict_SetItemString(sig_dict, "current", current_target);
61+
Py_DECREF(current_target);
62+
if (err != 0) {
63+
return;
64+
}
65+
// available targets
66+
PyObject *available = PyUnicode_FromString(dispatch_info[1]);
67+
if (available == NULL) {
68+
return;
69+
}
70+
err = PyDict_SetItemString(sig_dict, "available", available);
71+
Py_DECREF(available);
72+
if (err != 0) {
73+
return;
74+
}
75+
}

0 commit comments

Comments
 (0)
0