diff --git a/benchmarks/misc/benchin_ufunc.py b/benchmarks/misc/benchin_ufunc.py
new file mode 100755
index 000000000000..9d01a358854c
--- /dev/null
+++ b/benchmarks/misc/benchin_ufunc.py
@@ -0,0 +1,570 @@
+#!/usr/bin/env python3
+"""
+benchin_ufunc.py [OPTIONS] [-- ARGS]
+
+Standalone benchmark script for the inner loops of ufuncs.
+
+    This script measures only the performance of ufunc inner loops;
+    the idea behind it is to take umath object-call overhead out of
+    the equation, in order to reduce noise and provide stable ratios.
+
+Examples::
+    $ benchin_ufunc.py --filter "square.*f" --export opt_square.json
+    $ benchin_ufunc.py --filter "square.*f" --compare opt_square.json --output current.md
+    $ benchin_ufunc.py --filter "square.*f" --compare opt_square.json --only-changed 0.05
+"""
+
+import os, sys, re, itertools, functools, json, argparse, multiprocessing, time
+import numpy as np
+import numpy.core._umath_tests as utests
+
+class Colored:
+    # FG codes
+    RED = 31
+    GREEN = 32
+    YELLOW = 33
+    IS_TTYOUT = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
+    IS_TTYERR = hasattr(sys.stderr, 'isatty') and sys.stderr.isatty()
+
+    if os.name != "nt":
+        _colored_ansi = lambda txt, fg: "\033[%dm%s\033[0m" % (fg, txt)
+    else:
+        # ANSI escape codes are not guaranteed on Windows consoles
+        _colored_ansi = lambda txt, fg: txt
+
+    _colored_out = _colored_ansi if IS_TTYOUT \
+        else lambda txt, fg: txt
+
+    _colored_err = _colored_ansi if IS_TTYERR \
+        else lambda txt, fg: txt
+
+    @staticmethod
+    def text(txt, color_id):
+        return Colored._colored_ansi(txt, color_id)
+
+    @staticmethod
+    def text_tty(txt, color_id):
+        return Colored._colored_out(txt, color_id)
+
+    @staticmethod
+    def ok(txt, **kwargs):
+        print(Colored._colored_out(txt, Colored.GREEN), **kwargs)
+
+    @staticmethod
+    def notify(txt, **kwargs):
+        print(Colored._colored_out(txt, Colored.YELLOW), **kwargs)
+
+    @staticmethod
+    def fatal(txt):
+        raise SystemExit(Colored._colored_err(txt, Colored.RED))
+
+class Table:
+    FORMATS = ('md', 'txt', 'ansi')
+    ALIGN_TXT = {
+        'left': '<',
+        'right': '>',
+        'center': '^'
+    }
+    ALIGN_MD = {
+        'left': '---',
+        'right': '---',
+        'center': ':-:'
+    }
+    def __init__(self, tformat, field_names, ratios_indexes=[], align_indexes={}):
+        assert(tformat in self.FORMATS)
+        assert(all([a in self.ALIGN_TXT for a in align_indexes.values()]))
+        if tformat == "md":
+            self._highlight = self._highlight_md
+            self.get_string = self._md_str
+        elif tformat == "ansi":
+            self._highlight = self._highlight_ansi
+            self.get_string = self._pretty_str
+        else:
+            self._highlight = lambda f: f
+            self.get_string = self._pretty_str
+
+        self._rindexes = ratios_indexes
+        self._aindexes = align_indexes
+        self._fields_len = len(field_names)
+        self._fields_mwidth = [0 for _ in range(self._fields_len)]
+        self._rows = []
+        self.add_row(field_names)
+
+    def __str__(self):
+        return self.get_string()
+
+    def add_row(self, fields):
+        assert(len(fields) == self._fields_len)
+        final = []
+        mwidth = self._fields_mwidth
+        for c, f in enumerate(fields):
+            if f is None:
+                f = "N/A"
+            elif not isinstance(f, str):
+                if c in self._rindexes:
+                    f = str(self._highlight(round(f, 2)))
+                else:
+                    f = "{0:6.4f}".format(f)
+            flen = len(f)
+            if flen > mwidth[c]:
+                mwidth[c] = flen
+            final.append(f)
+        self._rows.append(final)
+
+    def _highlight_md(self, f):
+        if f > 1.05:
+            return "**`%s`**" % f
+        elif f < 0.95:
+            return "*`%s`*" % f
+        return f
+
+    def _highlight_ansi(self, f):
+        if f > 1.05:
+            return Colored.text(f, Colored.GREEN)
+        elif f < 0.95:
+            return Colored.text(f, Colored.RED)
+        return f
+
+    def _pretty_str(self):
+        def pretty_row(margin, mwidth, joinc, row):
+            # pad the width to account for the invisible ANSI escape characters
+            ansi_pad = lambda x: 9 if x.startswith("\033[") else 0
+            text_align = lambda c: self.ALIGN_TXT.get(
+                self._aindexes.get(c, 'center')
+            )
+            return ''.join([
+                joinc + (
+                    "{margin}{val:%s%d}{margin}" % (
+                        text_align(c), mwidth[c] + ansi_pad(val)
+                    )
+                ).format(margin=' '*margin, val=val)
+                for c, val in enumerate(row)
+            ]) + joinc
+
+        list_str = []
+        padding = 1
+        margin = 1
+        mwidth = [w + padding for w in self._fields_mwidth]
+        # the header
+        list_str.append(pretty_row(
+            0, mwidth, '+', [('-' * (w + margin*2)) for w in mwidth]
+        ))
+        list_str.append(pretty_row(
+            margin, mwidth, '|', self._rows[0]
+        ))
+        list_str.append(list_str[0])
+        # the rest of the rows
+        for row in self._rows[1:]:
+            list_str.append(pretty_row(
+                margin, mwidth, '|', row
+            ))
+        # the footer
+        list_str.append(list_str[0])
+        return "\n".join(list_str)
+
+    def _md_str(self):
+        # markdown collapses runs of spaces, keep the alignment via '&nbsp;'
+        ensure_space = lambda row: [
+            f.replace(' ', r'&nbsp;') if ' '*2 in f else f for f in row
+        ]
+        md_row = lambda row: '|' + '|'.join(ensure_space(row)) + '|'
+        fields_align = md_row([
+            self.ALIGN_MD.get(
+                self._aindexes.get(c, 'center')
+            )
+            for c in range(self._fields_len)
+        ])
+        list_str = [md_row(self._rows[0]), fields_align]
+        for row in self._rows[1:]:
+            list_str.append(md_row(row))
+        return '\n'.join(list_str)
+
+class Timing:
+    SEC = 1e0
+    MS = 1e3
+    US = 1e6
+
+    @staticmethod
+    def set_affinity(*CPUs):
+        utests.ctiming_set_affinity(CPUs)
+
+    @staticmethod
+    def to_unit(nticks, scale):
+        # ctiming_frequency holds the number of ticks per second
+        return (nticks * scale) / utests.ctiming_frequency
+
+    def __init__(self, ufunc, nsamples, iteration, warmup, msleep):
+        self._ufunc = utests.ctiming(ufunc, iteration, warmup)
+        self._csamples = utests.ctiming_elapsed(self._ufunc)
+        self._samples = []
+        self._nsamples = nsamples
+        self._iteration = iteration
+        self._warmup = warmup
+        self._msleep = msleep
+
+    def metrics(self):
+        samples = self._samples
+        lx = np.log(samples)
+        gmean = np.exp(lx.sum()/len(samples))
+        gstd = np.exp(np.std(lx))
+        mean = np.mean(samples)
+        median = np.median(samples)
+        return dict(gmean=gmean, gstd=gstd, mean=mean, median=median)
+
+    def run(self, *args, **kwargs):
+        self._samples = self._run(*args, **kwargs)
+
+    def _run(self, *args, **kwargs):
+        # clear any previous C samples
+        self._csamples.clear()
+        if self._msleep > 0:
+            ssleep = self._msleep / 1000
+            for _ in range(self._nsamples):
+                time.sleep(ssleep)
+                self._ufunc(*args, **kwargs)
+        else:
+            for _ in range(self._nsamples):
+                self._ufunc(*args, **kwargs)
+
+        s = np.array(self._csamples).astype("float64")
+        s /= self._iteration
+        # remove outliers that fall outside 1.5*IQR
+        q_25, q_75 = np.percentile(s, [25, 75])
+        iqr = q_75 - q_25
+        half = iqr * 1.5
+        low = s >= (q_25 - half)
+        high = s <= (q_75 + half)
+        idx = low & high
+        return s[idx]
+
+class Benchmark:
+    ASCII = 0
+    MARKDOWN = 1
+
+    def __init__(self, **kwargs):
+        for attr, dval in (
+            ("filter", ".*"),
+            ("strides", [1]),
+            ("sizes", [1024]),
+            ("nsamples", 100),
+            ("iteration", 1),
+            ("warmup", 0),
+            ("msleep", 0),
+            ("unit_scale", 1000),
+            ("metric", "gmean"),
+            ("rand_range", None)
+        ):
+            setattr(self, '_' + attr, kwargs.pop(attr, dval))
+
+    def generate_tests(self):
+        tests = dict()
+        filter_rgx = re.compile(self._filter)
+
+        for ufunc_name in dir(np):
+            ufunc = getattr(np, ufunc_name)
+            if not isinstance(ufunc, np.ufunc):
+                continue
+
+            nin, nout, utypes = ufunc.nin, ufunc.nout, ufunc.types
+            permutes = [self._strides] * (nin + nout)
+            permutes = list(itertools.product(*permutes))
+
+            for tsym in utypes:
+                tin, tout = tsym.split('->')
+                for p in permutes:
+                    str_in = ' '.join([
+                        "%s::%d" % (tin[c], s) for c, s in enumerate(p[:nin])
+                    ])
+                    str_out = ' '.join([
+                        "%s::%d" % (tout[c], s) for c, s in enumerate(p[nin:])
+                    ])
+                    for size in self._sizes:
+                        case_name = "{ufunc_name}::{size:<6} {str_in} -> {str_out}".format(
+                            ufunc_name=ufunc_name, size=size, str_in=str_in, str_out=str_out
+                        )
+                        if not filter_rgx.match(case_name):
+                            continue
+                        cases = tests.setdefault(ufunc_name, {})
+                        cases[case_name] = dict(
+                            size=size, strides=p, types=tin+tout
+                        )
+        return tests
+
+    @staticmethod
+    def timing_ufunc(queue, test_cases, ufunc_name, nsamples, iteration, warmup, msleep, rand_range):
+        @functools.lru_cache(maxsize=1024)
+        def rand(size, dtype, prevent_overlap=0):
+            # 'prevent_overlap' only takes part in the cache key, so that each
+            # operand gets its own buffer even for equal sizes and dtypes
+            if dtype == '?':
+                return np.random.randint(0, 2, size=size, dtype=dtype)
+            elif not rand_range:
+                if dtype in 'bBhHiIlLqQ':
+                    return np.random.randint(1, 127, size=size, dtype=dtype)
+                else:
+                    return np.array(np.random.rand(size), dtype=dtype)
+            else:
+                ltype = dtype.lower() if dtype in 'GDF' else dtype
+                lrange = len(rand_range)//2
+                rand = np.empty(size, dtype=ltype)
+
+                # interleave the given ranges
+                for i, r in enumerate(zip(rand_range[0::2], rand_range[1::2])):
+                    rmin, rmax = np.array(r, dtype=ltype)
+                    rand[i::lrange] = np.random.uniform(low=rmin, high=rmax, size=size//lrange).astype(ltype)
+
+                return rand + rand*1j if dtype in 'GDF' else rand
+
+        timing = Timing(
+            ufunc=getattr(np, ufunc_name), nsamples=nsamples,
+            iteration=iteration, warmup=warmup, msleep=msleep
+        )
+        result = {}
+        for name, prob in test_cases.items():
+            size, strides, types = prob["size"], prob["strides"], prob["types"]
+            try:
+                timing.run(*[
+                    rand(size * strides[c], t, c)[::strides[c]]
+                    for c, t in enumerate(types)
+                ])
+                print('.', end='', flush=True)
+            except KeyboardInterrupt:
+                break
+            except Exception as err:
+                Colored.notify(f"Skipping test '{name}' -> {err}")
+                continue
+            result[name] = timing.metrics()
+        queue.put(result)
+        print("done", flush=True)
+
+    def run(self, tests):
+        multiprocessing.set_start_method('spawn')
+        for ufunc_name, test_cases in tests.items():
+            print("Benchmarking ufunc %s, %d cases " % (ufunc_name, len(test_cases)), end='', flush=True)
+            queue = multiprocessing.Queue()
+            p = multiprocessing.Process(target=self.timing_ufunc, args=(
+                queue, test_cases, ufunc_name, self._nsamples,
+                self._iteration, self._warmup, self._msleep, self._rand_range)
+            )
+            p.start()
+            # drain the queue before joining, otherwise a large result may
+            # leave the child process blocked on the underlying pipe
+            result = queue.get()
+            p.join()
+            if not result:
+                continue
+            for name, metrics in result.items():
+                test_cases[name].update(metrics)
+
+    def generate_table(self, tests_names, tests, only_changed=0, tformat="txt"):
+        assert(len(tests_names) == len(tests))
+        field_names = ["name of test"] + tests_names + [
+            "%s vs %s" % (t, tests_names[0]) for t in tests_names[1:]
+        ]
+        ratios_fields = list(range(len(tests_names) + 1, len(tests_names)*2))
+        table = Table(tformat, field_names, ratios_fields, {0: 'left'})
+
+        if len(tests) > 1 and only_changed != 0:
+            factor_l = 1.0 - only_changed
+            factor_h = 1.0 + only_changed
+            factor_falls = lambda f: f <= factor_l or f >= factor_h
+            self._compare(tests[0], tests[1:],
+                lambda case_name, metrics, ratios: table.add_row(
+                    [case_name] + metrics + ratios
+                ) if any([f and factor_falls(round(f, 2)) for f in ratios]) else None
+            )
+        else:
+            self._compare(tests[0], tests[1:],
+                lambda case_name, metrics, ratios: table.add_row(
+                    [case_name] + metrics + ratios
+                )
+            )
+        return str(table)
+
+    def _compare(self, tests, cmp_tests, append_to):
+        for ufunc_name, test_cases in tests.items():
+            test_cases = sorted(test_cases.items(), key=lambda k: (
+                k[1]["types"], k[1]["strides"], k[1]["size"]
+            ))
+            for case_name, case in test_cases:
+                metric = case.get(self._metric)
+                if not metric:
+                    cmplen = len(cmp_tests)
+                    append_to(case_name, [None]*(cmplen+1), [None]*cmplen)
+                    continue
+
+                metric = Timing.to_unit(metric, self._unit_scale)
+                cmp_metrics, cmp_ratios = [], []
+                for cmp_test in cmp_tests:
+                    cmp_case = cmp_test.get(ufunc_name, {}).get(case_name, {})
+                    cmp_metric = cmp_case.get(self._metric)
+                    if not cmp_metric:
+                        cmp_metrics.append(None)
+                        cmp_ratios.append(None)
+                        continue
+                    cmp_metric = Timing.to_unit(cmp_metric, self._unit_scale)
+                    try:
+                        ratio = metric/cmp_metric
+                    except ZeroDivisionError:
+                        ratio = None
+                    cmp_ratios.append(ratio)
+                    cmp_metrics.append(cmp_metric)
+                append_to(case_name, [metric] + cmp_metrics, cmp_ratios)
+
+def arg_output(arg):
+    output_path = os.path.abspath(arg)
+    output_name = os.path.splitext(os.path.basename(arg))
+    output_type, output_name = output_name[1][1:], output_name[0]
+
+    if not output_type or not output_name:
+        raise argparse.ArgumentTypeError(f"Invalid output path -> '{arg}'")
+    if output_type not in Table.FORMATS:
+        raise argparse.ArgumentTypeError(
+            f"Unsupported output format '{output_type}', expected a path that "
+            f"ends with one of {Table.FORMATS} -> '{arg}'"
+        )
+    return dict(path=output_path, name=output_name, type=output_type)
+
+def arg_compare(arg):
+    path = os.path.abspath(arg)
+    test_name = os.path.splitext(os.path.basename(path))[0]
+    data = {}
+    try:
+        with open(path, "r") as fd:
+            data = json.load(fd)
+    except IOError as err:
+        raise argparse.ArgumentTypeError(f"Unable to load JSON file -> {str(err)}")
+    except json.JSONDecodeError as err:
+        raise argparse.ArgumentTypeError(f"Invalid JSON file -> {path}, {str(err)}")
+    return (test_name, data)
+
+def arg_only_changed(arg):
+    try:
+        a = float(arg)
+    except ValueError:
+        raise argparse.ArgumentTypeError("Expected a floating point number")
+    if a < 0.0 or a > 1.0:
+        raise argparse.ArgumentTypeError("The value must fall between 0.0 and 1.0")
+    return a
+
+class arg_rand_range(argparse.Action):
+    def __call__(self, parser, args, values, option_string=None):
+        if len(values) not in (2, 4, 8):
+            parser.error(f"argument {option_string}: Invalid range")
+        setattr(args, self.dest, values)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawTextHelpFormatter
+    )
+    parser.add_argument("-o", "--output", nargs="+", type=arg_output,
+                        metavar=f"[PATH.({', '.join(Table.FORMATS)})]", default=[],
+                        help=("store the benchmark results in the given files, "
+                              "the extension determines the output format"))
+    parser.add_argument("-f", "--filter", metavar="REGEX", default=".*",
+                        help="regex to filter benchmark tests")
+    parser.add_argument("-c", "--compare", nargs="+", type=arg_compare, metavar="FILE", default=[],
+                        help="list of exported JSON files to compare with")
+    parser.add_argument("-d", "--export", metavar="PATH", default=None,
+                        help="store the result in a JSON file, to be used later with --compare")
+    parser.add_argument("--only-changed", type=arg_only_changed, default=0,
+                        help="show only the results that changed by at least the given factor; "
+                             "the value must fall between 0.0 and 1.0. "
+                             "NOTE: this option only works with '--compare'")
+
+    parser.add_argument("-n", "--nsamples", type=int, default=100,
+                        help="number of samples to be collected for each benchmark")
+
+    parser.add_argument("--iteration", type=int, default=10,
+                        help="number of iterations for each collected sample")
+    parser.add_argument("--warmup", type=int, default=5,
+                        help="number of uncollected iterations before each collected sample")
+
+    parser.add_argument("--strides", type=int, nargs="+", default=[1, 2],
+                        help="strides for input and output arrays")
+    parser.add_argument("--sizes", type=int, nargs="+", default=[1024, 2048, 4096],
+                        help="array sizes")
+
+    cpu_count = multiprocessing.cpu_count()
+    parser.add_argument("--cpu-affinity", type=int, nargs="+", choices=range(0, cpu_count), default=None,
+                        help="set the CPU affinity for the running ufunc, only supported on Linux")
+
+    metric_choices = ["gmean", "gstd", "mean", "median"]
+    parser.add_argument("--metric", choices=metric_choices, default="gmean",
+                        help="output metric")
+    units_choices = dict(sec=Timing.SEC, ms=Timing.MS, us=Timing.US)
+    parser.add_argument("--units", default="ms", choices=list(units_choices.keys()),
+                        help="units of the output values")
+    parser.add_argument("--msleep", type=float, default=0.1,
+                        help="suspend execution of the calling thread for at least the given "
+                             "number of milliseconds before collecting each sample")
+    parser.add_argument("--rand-range", type=float, nargs="+", default=None, action=arg_rand_range,
+                        help="specify a random range for the generated arrays. "
+                             "NOTE: multiple ranges will be interleaved")
+    args = parser.parse_args()
+    # 1- initialize
+    bench = Benchmark(filter=args.filter, strides=args.strides, sizes=args.sizes,
+                      nsamples=args.nsamples, iteration=args.iteration,
+                      warmup=args.warmup, msleep=args.msleep, metric=args.metric,
+                      unit_scale=units_choices[args.units],
+                      rand_range=args.rand_range)
+
+    # 2- fetch the generated tests
+    from numpy._pytesttester import _show_numpy_info
+    _show_numpy_info()
+    Colored.ok("Discovering benchmarks")
+    running_tests = bench.generate_tests()
+    if len(running_tests) < 1:
+        Colored.fatal("No benchmarks selected")
+
+    total_tests = 0
+    for ufunc, cases in running_tests.items():
+        total_tests += len(cases)
+    Colored.ok("Running %d total benchmarks from %d ufuncs" % (
+        total_tests, len(running_tests)
+    ))
+    if total_tests > 1024 * 10:
+        desc = input(Colored.text((
+            "That is a huge number of benchmarks, "
+            "you may want to use '--filter' to reduce them.\n"
+            "Do you want to continue? y or n? "
+        ), Colored.YELLOW))
+        desc = desc.strip().lower()
+        while True:
+            if desc.startswith('y'):
+                break
+            if desc.startswith('n'):
+                sys.exit(1)
+            desc = input(Colored.text('y or n? ', Colored.RED)).strip().lower()
+
+    # 3- set the CPU affinity for running the benchmark
+    if args.cpu_affinity:
+        Timing.set_affinity(*args.cpu_affinity[:cpu_count])
+
+    # 4- unleash the tests
+    bench.run(running_tests)
+
+    # 5- print the results
+    cmp_tests_name, cmp_tests_data = zip(*args.compare) if args.compare else ([], [])
+    all_tests = [running_tests] + list(cmp_tests_data)
+    all_names = ["current"] + list(cmp_tests_name)
+
+    final_result = bench.generate_table(
+        all_names, all_tests, only_changed=args.only_changed,
+        tformat=("ansi" if Colored.IS_TTYOUT else "txt")
+    )
+    print(final_result)
+
+    # 6- store the results into a JSON file
+    if args.export:
+        export_path = os.path.abspath(args.export)
+        Colored.notify(f"Exporting benchmarking result into '{export_path}'")
+        try:
+            with open(export_path, "w") as fd:
+                json.dump(running_tests, fd)
+        except IOError as err:
+            Colored.fatal(f"Failed to export benchmarking result, '{str(err)}'")
+
+    # 7- store the results into the output files
+    for out in args.output:
+        Colored.notify(f"Writing benchmarking result into {out['path']}")
+        all_names[0] = out["name"]
+        final_result = bench.generate_table(
+            all_names, all_tests, only_changed=args.only_changed, tformat=out["type"]
+        )
+        with open(out["path"], "w") as fd:
+            fd.write(f"metric: {args.metric}, units: {args.units}\n")
+            fd.write(final_result)
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index 750fbeb92a7b..e448fbe0f41a 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -584,6 +584,294 @@ fail:
     Py_XDECREF(core_dim_sizes);
     return NULL;
 }
+/*
+ ********************************************************************************
+ ** A simple hack to get the elapsed time of executing the inner loop of ufunc **
+ ********************************************************************************
+ */
+#if defined(_POSIX_THREADS) && (defined(__linux) || defined(__linux__))
+    #include <pthread.h>
+    #define CTIMING_AFFINITY
+#endif
+#if defined(_WIN32) || defined(WINCE)
+    #include <windows.h>
+#elif defined(__MACH__) && defined(__APPLE__)
+    #include <mach/mach.h>
+    #include <mach/mach_time.h>
+#else
+    #include <sys/time.h>
+    #include <time.h>
+#endif
+
+static struct
+{
+    PyUFuncGenericFunction gfunctions[128];
+    PyTypeObject type;
+    double frequency;
+#ifdef CTIMING_AFFINITY
+    cpu_set_t cpuset;
+    int have_affinity;
+#endif
+} ctiming;
+
+typedef struct
+{
+    PyObject *elapsed_ticks;
+    npy_uint32 iteration;
+    npy_uint32 warmup;
+} ctiming_attr;
+
+typedef struct
+{
+    void *cfunc_data;
+    PyUFuncGenericFunction cfunc;
+    ctiming_attr *attr;
+} ctiming_data;
+
+typedef struct
+{
+    char **args;
+    npy_intp const *dimensions;
+    npy_intp const *steps;
+    void *data;
+    npy_int64 elapsed;
+} ctiming_targ;
+
+// From (getTickCount) opencv/modules/core/src/system.cpp
+static NPY_INLINE npy_int64
+ctiming_tick_count(void)
+{
+#if defined(_WIN32) || defined(WINCE)
+    LARGE_INTEGER counter;
+    QueryPerformanceCounter(&counter);
+    return (npy_int64)counter.QuadPart;
+#elif defined(__linux) || defined(__linux__)
+    struct timespec tp;
+    clock_gettime(CLOCK_MONOTONIC, &tp);
+    return (npy_int64)tp.tv_sec*1000000000 + tp.tv_nsec;
+#elif defined(__MACH__) && defined(__APPLE__)
+    return (npy_int64)mach_absolute_time();
+#else
+    struct timeval tv;
+    struct timezone tz;
+    gettimeofday(&tv, &tz);
+    return (npy_int64)tv.tv_sec*1000000 + tv.tv_usec;
+#endif
+}
+// From (getTickFrequency) opencv/modules/core/src/system.cpp
+static NPY_INLINE double
+ctiming_frequency(void)
+{
+#if defined(_WIN32) || defined(WINCE)
+    LARGE_INTEGER freq;
+    QueryPerformanceFrequency(&freq);
+    return (double)freq.QuadPart;
+#elif defined(__linux) || defined(__linux__)
+    return 1e9;
+#elif defined(__MACH__) && defined(__APPLE__)
+    static double freq = 0;
+    if (freq == 0) {
+        mach_timebase_info_data_t sTimebaseInfo;
+        mach_timebase_info(&sTimebaseInfo);
+        freq = sTimebaseInfo.denom*1e9/sTimebaseInfo.numer;
+    }
+    return freq;
+#else
+    return 1e6;
+#endif
+}
+
+static npy_int64
+ctiming_run(char **args, npy_intp const *dimensions, npy_intp const *steps, ctiming_data data)
+{
+    npy_int64 warmup = (npy_int64)data.attr->warmup;
+    PyUFuncGenericFunction cfunc = data.cfunc;
+    while (warmup--) {
+        cfunc(args, dimensions, steps, data.cfunc_data);
+    }
+    npy_int64 before, elapsed;
+    npy_int64 iteration = (npy_int64)data.attr->iteration;
+    if (iteration == 1) {
+        before = ctiming_tick_count();
+        cfunc(args, dimensions, steps, data.cfunc_data);
+        elapsed = ctiming_tick_count() - before;
+    } else {
+        before = ctiming_tick_count();
+        while (iteration--) {
+            cfunc(args, dimensions, steps, data.cfunc_data);
+        }
+        elapsed = ctiming_tick_count() - before;
+    }
+    return elapsed;
+}
+
+#ifdef CTIMING_AFFINITY
+static void *
+ctiming_thfunc(void *_arg)
+{
+    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &ctiming.cpuset);
+    ctiming_targ *targ = _arg;
+    targ->elapsed = ctiming_run(targ->args, targ->dimensions, targ->steps, *((ctiming_data*)targ->data));
+    return NULL;
+}
+#endif
+
+static void
+ctiming_gfunc(char **args, npy_intp const *dimensions, npy_intp const *steps, void *_data)
+{
+    ctiming_data *data = (ctiming_data*)_data;
+    ctiming_attr *attr = data->attr;
+    npy_int64 elapsed;
+
+#ifdef CTIMING_AFFINITY
+    if (ctiming.have_affinity) {
+        ctiming_targ arg = {args, dimensions, steps, data, 0};
+        pthread_t thread;
+        // fall back to the calling thread if the timing thread cannot be spawned
+        if (pthread_create(&thread, NULL, ctiming_thfunc, &arg) == 0) {
+            pthread_join(thread, NULL);
+            elapsed = arg.elapsed;
+        }
+        else {
+            elapsed = ctiming_run(args, dimensions, steps, *data);
+        }
+    }
+#else
+    if (0) {}
+#endif
+    else {
+        elapsed = ctiming_run(args, dimensions, steps, *data);
+    }
+    PyObject *item = PyLong_FromLongLong(elapsed);
+    if (item) {
+        // PyList_Append adds its own reference
+        PyList_Append(attr->elapsed_ticks, item);
+        Py_DECREF(item);
+    }
+}
+
+static PyObject *
+ctiming_method(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+    PyUFuncObject *ufunc;
+    npy_uint32 iteration = 1, warmup = 0;
+    if (!PyArg_ParseTuple(args, "O|II", &ufunc, &iteration, &warmup)) {
+        return NULL;
+    }
+    if (!PyObject_IsInstance((PyObject *)ufunc, (PyObject *)&PyUFunc_Type)) {
+        PyErr_Format(PyExc_TypeError, "ufunc is required");
+        return NULL;
+    }
+
+    // a single allocation that holds the pointers table, the shared attributes
+    // and the per-type data: [ntypes pointers][ctiming_attr][ntypes ctiming_data]
+    ctiming_data **data = (ctiming_data**)PyArray_malloc(
+        sizeof(ctiming_attr) + ((sizeof(ctiming_data) + sizeof(ctiming_data*)) * ufunc->ntypes)
+    );
+    if (data == NULL) {
+        return PyErr_NoMemory();
+    }
+    PyUFuncObject *n_ufunc = PyObject_GC_New(PyUFuncObject, &ctiming.type);
+    if (n_ufunc == NULL) {
+        PyArray_free(data);
+        return NULL;
+    }
+    // shallow copy of everything but the object header
+    memcpy(((PyObject*)n_ufunc)+1, ((PyObject*)ufunc)+1, sizeof(PyUFuncObject) - sizeof(PyObject));
+
+    PyObject *elapsed_ticks = PyList_New(0);
+    if (elapsed_ticks == NULL) {
+        PyArray_free(data);
+        Py_DECREF(n_ufunc);
+        return NULL;
+    }
+
+    ctiming_attr *data_attr = (ctiming_attr*)&data[ufunc->ntypes];
+    data_attr->iteration = iteration;
+    data_attr->warmup = warmup;
+    data_attr->elapsed_ticks = elapsed_ticks;
+
+    ctiming_data *data_offset = (ctiming_data*)(data_attr + 1);
+    for (int i = 0; i < n_ufunc->ntypes; ++i) {
+        data[i] = data_offset + i;
+        data[i]->cfunc_data = n_ufunc->data[i];
+        data[i]->cfunc = n_ufunc->functions[i];
+        data[i]->attr = data_attr;
+    }
+    n_ufunc->data = (void**)data;
+    n_ufunc->functions = ctiming.gfunctions;
+    return (PyObject*)n_ufunc;
+}
+
+static PyObject *
+ctiming_elapsed(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+    PyUFuncObject *ufunc;
+    if (!PyArg_ParseTuple(args, "O", &ufunc)) {
+        return NULL;
+    }
+    if (!PyObject_IsInstance((PyObject *)ufunc, (PyObject *)&ctiming.type)) {
+        PyErr_Format(PyExc_TypeError, "wrapped ufunc is required");
+        return NULL;
+    }
+    PyObject *elapsed_ticks = ((ctiming_data*)ufunc->data[0])->attr->elapsed_ticks;
+    Py_INCREF(elapsed_ticks);
+    return elapsed_ticks;
+}
+
+static PyObject *
+ctiming_set_affinity(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+#ifdef CTIMING_AFFINITY
+    PyObject *obj;
+    if (!PyArg_ParseTuple(args, "O", &obj)) {
+        return NULL;
+    }
+    PyObject *seq_obj = PySequence_Fast(obj, "expected a sequence");
+    if (seq_obj == NULL) {
+        return NULL;
+    }
+    CPU_ZERO(&ctiming.cpuset);
+    PyObject **seq_items = PySequence_Fast_ITEMS(seq_obj);
+    Py_ssize_t seq_size = PySequence_Fast_GET_SIZE(seq_obj);
+    for (Py_ssize_t i = 0; i < seq_size; ++i) {
+        CPU_SET(PyLong_AsLong(seq_items[i]), &ctiming.cpuset);
+        if (PyErr_Occurred()) {
+            Py_DECREF(seq_obj);
+            return NULL;
+        }
+    }
+    Py_DECREF(seq_obj);
+    ctiming.have_affinity = 1;
+#else
+    (void) args;
+#endif
+    Py_RETURN_NONE;
+}
+
+static void
+ctiming_dealloc(PyUFuncObject *ufunc)
+{
+    PyObject_GC_UnTrack((PyObject*)ufunc);
+    ctiming_data *data = (ctiming_data*)ufunc->data[0];
+    Py_DECREF(data->attr->elapsed_ticks);
+    PyArray_free(ufunc->data);
+    PyObject_GC_Del(ufunc);
+}
+
+static int
+ctiming_init(PyObject *mod)
+{
+    memcpy(&ctiming.type, &PyUFunc_Type, sizeof(PyTypeObject));
+    ctiming.type.tp_name = "numpy.ufunc_ctiming";
+    ctiming.type.tp_dealloc = (destructor)ctiming_dealloc;
+
+    for (int i = 0; i < 128; ++i) {
+        ctiming.gfunctions[i] = ctiming_gfunc;
+    }
+
+    ctiming.frequency = ctiming_frequency();
+    PyObject *frequency = PyFloat_FromDouble(ctiming.frequency);
+    if (frequency == NULL) {
+        return -1;
+    }
+    // number of ticks per second
+    if (PyModule_AddObject(mod, "ctiming_frequency", frequency)) {
+        Py_DECREF(frequency);
+        return -1;
+    }
+    return 0;
+}
 
 // Testing the utilites of the CPU dispatcher
 #ifndef NPY_DISABLE_OPTIMIZATION
@@ -638,6 +926,22 @@ static PyMethodDef UMath_TestsMethods[] = {
         "internals. \n",
     },
     {"test_dispatch", UMath_Tests_test_dispatch, METH_NOARGS, NULL},
+    {"ctiming", ctiming_method, METH_VARARGS,
+     "Time the call of a ufunc's C inner-loop function.\n"
+     "Arguments: ufunc iteration(default:1) warmup(default:0)\n"
+     "Returns a wrapped ufunc that can be used to determine the "
+     "elapsed time of executing the inner loop of the ufunc.\n"
+    },
+    {"ctiming_elapsed", ctiming_elapsed, METH_VARARGS,
+     "Return the mutable list of elapsed ticks.\n"
+     "NOTE: use the attribute 'ctiming_frequency' to get the number of "
+     "ticks per second.\n"
+     "Arguments: wrapped_ufunc\n"
+    },
+    {"ctiming_set_affinity", ctiming_set_affinity, METH_VARARGS,
+     "Set the CPU affinity for running the wrapped ufunc.\n"
+     "Arguments: affinity (sequence of CPU numbers)\n"
+    },
     {NULL, NULL, 0, NULL}   /* Sentinel */
 };
@@ -692,5 +996,9 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
                          "cannot load _umath_tests module.");
         return NULL;
     }
+    if (ctiming_init(m) < 0) {
+        Py_DECREF(m);
+        return NULL;
+    }
     return m;
 }
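
Reviewer note: for poking at the new hooks without going through the driver script, here is a minimal sketch of how the three `_umath_tests` additions fit together (it assumes NumPy is built with this patch; the choice of `np.square`, the array size, and the iteration counts are arbitrary)::

    import numpy as np
    import numpy.core._umath_tests as utests

    # wrap np.square: 10 timed iterations per sample, 5 warm-up iterations
    wrapped = utests.ctiming(np.square, 10, 5)
    # mutable list; one entry of elapsed ticks is appended per inner-loop call
    ticks = utests.ctiming_elapsed(wrapped)

    # optional and Linux-only: pin the timed loop to CPU #0
    utests.ctiming_set_affinity((0,))

    a = np.random.rand(4096).astype('f4')
    for _ in range(100):
        wrapped(a)

    # 'ctiming_frequency' holds the number of ticks per second,
    # so this converts ticks into microseconds per iteration
    us = [t / 10 * 1e6 / utests.ctiming_frequency for t in ticks]
    print(min(us))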
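
Reviewer note: the `--rand-range` interleaving deserves a concrete example. Passing `--rand-range 0 1 100 200` makes the cached `rand()` helper draw even-indexed elements uniformly from [0, 1] and odd-indexed elements from [100, 200]; for a real dtype it is equivalent to the following sketch (size 8 chosen purely for illustration)::

    import numpy as np
    rand = np.empty(8, dtype='f')
    rand[0::2] = np.random.uniform(0, 1, size=4)      # range #1
    rand[1::2] = np.random.uniform(100, 200, size=4)  # range #2

This is handy for benchmarking loops whose speed depends on the input domain, e.g. branchy special-value handling.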
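
Reviewer note: `Timing.metrics()` summarizes the IQR-filtered samples with geometric statistics; for reference, the same quantities in plain NumPy terms (a sketch, with `x` standing in for the per-iteration tick samples)::

    import numpy as np
    x = np.array([9.8, 10.1, 10.0, 10.3])  # illustrative samples
    gmean = np.exp(np.log(x).mean())       # geometric mean
    gstd = np.exp(np.log(x).std())         # geometric standard deviation

The geometric mean damps the influence of occasional slow samples better than the arithmetic mean, which is why it is the default `--metric`.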