Update on "[inductor][cpp] GEMM template (infra and fp32)" · pytorch/pytorch@92f4ac4 · GitHub

Commit 92f4ac4

Author: Jiong Gong (committed)
Update on "[inductor][cpp] GEMM template (infra and fp32)"
This PR adds the Cpp template infrastructure and the initial FP32 GEMM template. See RFC #125683 for more background info.

1. Cpp template infrastructure
   Similar template abstractions as the CUTLASS template, i.e., `CppTemplate`, `CppTemplateKernel`, `CppTemplateBuffer`, plus the `MicroGemm` micro-kernel abstraction that can be used by Cpp GEMM templates.

2. Initial FP32 GEMM template
   This involves a GEMM template implementation, `CppPackedGemmTemplate`, that supports GEMM with a constant weight (`B`), requiring `N` to be a multiple of the register blocking while allowing static or dynamic sizes for the `M` (batch dim) of `A`. The `B` matrix is prepacked. This is a typical setting for inference workloads. The template handles thread decomposition (via `thread_blocking`) and cache blocking (via `cache_blocking`), then invokes `CppMicroGemm`, which handles register blocking, instruction selection, and other CPU architecture-specific optimizations. A `CppMicroGemmFP32Vec` micro-kernel implementation is provided for fp32 matmuls, implemented with the ATen vec abstraction.

3. Correctness and performance
   The changes have been validated with fp32 inference on the three benchmark suites (torchbench, huggingface and timm_models) with both static and dynamic shapes. Since this is an initial implementation, we are still working on further performance improvements with follow-up PRs, including optimizations in the kernels as well as fusions. The perf gains are only observed on a selective number of models compared to the ATen kernels, which are implemented with MKL. The gains are more obvious with dynamic shapes since MKL only supports packed GEMM for static shapes. Details are below.

Static shapes

| Benchmark | torchbench | huggingface | timm_models |
|------------|-------------|--------------|--------------|
| Multi-threaded (baseline) | 1.47x | 1.36x | 1.91x |
| Multi-threaded (max-autotune) | 1.47x | 1.36x | 1.92x |
| Single-threaded (baseline) | 1.56x | 1.19x | 1.51x |
| Single-threaded (max-autotune) | 1.56x | 1.19x | 1.52x |

Key models being sped up:
drq: 1.14x
soft_act: 1.12x
cait_m36_384: 1.18x

Dynamic shapes

| Benchmark | torchbench | huggingface | timm_models |
| --- | --- | --- | --- |
| Multi-threaded (baseline) | 1.43x | 1.28x | 1.85x |
| Multi-threaded (max-autotune) | 1.47x | 1.28x | 1.85x |
| Single-threaded (baseline) | 1.55x | 1.20x | 1.51x |
| Single-threaded (max-autotune) | 1.56x | 1.19x | 1.53x |

Key models being sped up:
BERT_pytorch: 1.22x
pyhpc_turbulent: 1.13x
soft_actor_critic: 1.77x
BlenderbotForCausalLM: 1.09x
cait_m36_384: 1.17x

cc voznesenskym penguinwu EikanWang Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
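As a rough usage sketch (not part of this commit), the snippet below shows how one might exercise the new Cpp GEMM template: compile a `Linear` with a constant weight under max-autotune so the autotuner can compare the ATen (MKL) kernel against `CppPackedGemmTemplate`. The `max_autotune_gemm_backends` value `"CPP,ATEN"` is an assumption about the backend-selection knob, not something spelled out in this commit message.

```python
import torch
import torch._inductor.config as inductor_config

# Assumed knobs: the "CPP" entry for max_autotune_gemm_backends is an assumption
# tied to this template work, not documented in this commit.
inductor_config.max_autotune = True
inductor_config.max_autotune_gemm_backends = "CPP,ATEN"

# Constant weight (B) that the template can prepack; M (batch dim of A) may be
# static or dynamic, while N must be a multiple of the register blocking.
mod = torch.nn.Linear(in_features=256, out_features=512, bias=False).eval()
x = torch.randn(64, 256)  # fp32 inference input

with torch.no_grad():
    compiled = torch.compile(mod, mode="max-autotune")
    y = compiled(x)  # autotuning benchmarks ATen vs. the Cpp GEMM template choice
```
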
2 parents 0162cf6 + b4f772b commit 92f4ac4

File tree

12 files changed: +116 −27 lines


cmake/Dependencies.cmake

Lines changed: 16 additions & 0 deletions
@@ -1841,6 +1841,8 @@ if(USE_KINETO)
       set(CUPTI_LIB_NAME "cupti.lib")
     endif()
 
+    set(NVPERF_HOST_LIB_NAME "libnvperf_host.so")
+
     find_library(CUPTI_LIBRARY_PATH ${CUPTI_LIB_NAME} PATHS
       ${CUDA_SOURCE_DIR}
       ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64
@@ -1855,13 +1857,27 @@ if(USE_KINETO)
       ${CUDA_SOURCE_DIR}/include
       NO_DEFAULT_PATH)
 
+    find_library(NVPERF_HOST_LIBRARY_PATH ${NVPERF_HOST_LIB_NAME} PATHS
+      ${CUDA_SOURCE_DIR}
+      ${CUDA_SOURCE_DIR}/lib
+      ${CUDA_SOURCE_DIR}/lib64
+      ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64
+      NO_DEFAULT_PATH)
+
     if(CUPTI_LIBRARY_PATH AND CUPTI_INCLUDE_DIR)
       message(STATUS " CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}")
       set(CUDA_cupti_LIBRARY ${CUPTI_LIBRARY_PATH})
       message(STATUS " CUDA_cupti_LIBRARY = ${CUDA_cupti_LIBRARY}")
+      # CUPTI Range Profiler requires the NVPerf library
+      # for configuring metrics
+      if(NVPERF_HOST_LIBRARY_PATH)
+        set(CUDA_nvperf_host_LIBRARY ${NVPERF_HOST_LIBRARY_PATH})
+        message(STATUS " CUDA_nvperf_host_LIBRARY = ${NVPERF_HOST_LIBRARY_PATH}")
+      endif()
       message(STATUS "Found CUPTI")
       set(LIBKINETO_NOCUPTI OFF CACHE STRING "" FORCE)
 
+
       # I've only tested this sanity check on Linux; if someone
       # runs into this bug on another platform feel free to
       # generalize it accordingly

test/profiler/test_profiler.py

Lines changed: 42 additions & 0 deletions
@@ -699,6 +699,48 @@ def create_mkldnn_tensor():
         if torch.cuda.is_available():
             check_metrics(stats, "device_memory_usage", deallocs=["[memory]"])
 
+    @unittest.skipIf(not kineto_available(), "Kineto is required")
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
+    def test_kineto_cupti_range_profiler(self):
+        """CUPTI provides a newer Profiling API from CUDA 10.0 that enables measuring
+        performance events for the GPU. This is supported as an experimental pytorch profiler feature.
+        Read more here https://docs.nvidia.com/cupti/r_main.html#r_profiler.
+        """
+        exp_config = _ExperimentalConfig(
+            profiler_metrics=[
+                # Metrics list at https://docs.nvidia.com/cupti/r_main.html#r_profiler
+                # or use kineto__tensor_core_insts, kineto__cuda_core_flops
+                "kineto__tensor_core_insts",
+                "dram__bytes_read.sum",
+                "dram__bytes_write.sum",
+            ],
+            profiler_measure_per_kernel=True,
+        )
+        with _profile(
+            use_cuda=True, use_kineto=True, experimental_config=exp_config
+        ) as p:
+            self.payload(use_cuda=True)
+
+        def check_trace(fname):
+            with open(fname) as f:
+                trace = json.load(f)
+                self.assertTrue("traceEvents" in trace)
+                events = trace["traceEvents"]
+                found_cupti_profiler_events = False
+                for evt in events:
+                    self.assertTrue("name" in evt)
+                    if "__cupti_profiler__" in evt["name"]:
+                        found_cupti_profiler_events = True
+                # PyTorch OSS CI runs in docker containers where the Range Profiler
+                # does not have sufficient privilege level (CUPTI_ERROR_INSUFFICIENT_PRIVILEGES).
+                # We can check that the profiler does not crash the job and the trace is not
+                # malformed, however do not check the actual presence of data.
+                self.assertTrue(1 or found_cupti_profiler_events)
+
+        with TemporaryFileName(mode="w+") as fname:
+            p.export_chrome_trace(fname)
+            check_trace(fname)
+
     @unittest.skipIf(
         IS_JETSON, "Jetson has a guard against OOM since host and gpu memory are shared"
     )

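For reference, the same experimental CUPTI Range Profiler configuration used in the test above can also be passed through the public `torch.profiler` API. A minimal sketch, assuming a CUDA build with Kineto and sufficient CUPTI profiling privileges; the metric names come from the test, and the output filename is arbitrary:

```python
import torch
from torch._C._profiler import _ExperimentalConfig
from torch.profiler import profile, ProfilerActivity

exp_config = _ExperimentalConfig(
    profiler_metrics=["kineto__tensor_core_insts", "dram__bytes_read.sum"],
    profiler_measure_per_kernel=True,
)

x = torch.randn(1024, 1024, device="cuda")
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    experimental_config=exp_config,
) as prof:
    torch.mm(x, x)

# Events whose names contain "__cupti_profiler__" carry the collected metrics.
prof.export_chrome_trace("range_profiler_trace.json")
```
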
torch/_inductor/autotune_process.py

Lines changed: 2 additions & 2 deletions
@@ -44,7 +44,7 @@
     from torch._inductor.select_algorithm import TritonTemplateCaller
 
 from . import config
-from .runtime.runtime_utils import do_bench, do_bench_cpu
+from .runtime.runtime_utils import do_bench_cpu, do_bench_gpu
 from .virtualized import V
 
 CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
@@ -592,7 +592,7 @@ def do_bench(
         device_idx = torch.cuda.current_device()
 
         with torch.cuda.device(device_idx):
-            out = do_bench(fn)
+            out = do_bench_gpu(fn)
             torch.cuda.synchronize()  # shake out any CUDA errors
 
         return out

torch/_inductor/codegen/multi_kernel.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 
 from .. import config
 from ..codecache import PyCodeCache, TritonFuture
-from ..runtime.runtime_utils import do_bench
+from ..runtime.runtime_utils import do_bench_gpu
 from ..utils import cache_on_self
 from ..virtualized import V
 from .common import TensorArg
@@ -339,7 +339,7 @@ def benchmark_sub_kernels(kernel_calls):
         be picked.
         """
         return [
-            do_bench(lambda: kernel_call(True), rep=40, fast_flush=True)
+            do_bench_gpu(lambda: kernel_call(True), rep=40, fast_flush=True)
             for kernel_call in kernel_calls
         ]

torch/_inductor/codegen/triton.py

Lines changed: 4 additions & 4 deletions
@@ -49,7 +49,7 @@
 from ..optimize_indexing import indexing_dtype_strength_reduction
 from ..runtime.hints import ReductionHint, TRITON_MAX_BLOCK
 from ..runtime.runtime_utils import (
-    do_bench,
+    do_bench_gpu,
     get_max_y_grid,
     green_text,
     next_power_of_2,
@@ -2651,7 +2651,7 @@ def codegen_kernel_benchmark(self, num_gb, grid=None):
 
         result.writeline("args = get_args()")
         result.writeline(
-            "ms = do_bench(lambda: call(args), rep=40, fast_flush=True)"
+            "ms = do_bench_gpu(lambda: call(args), rep=40, fast_flush=True)"
         )
         result.writeline(f"num_gb = {num_gb}")
         result.writeline("gb_per_s = num_gb / (ms / 1e3)")
@@ -4034,13 +4034,13 @@ def store_cache():
     else:
         # We have to clone the inplace updated arguments to avoid earlier calls
         # generating out of range indices for later calls.
-        ms = do_bench(lambda: call(wrapped_jit_function.clone_args(*args)[0]))
+        ms = do_bench_gpu(lambda: call(wrapped_jit_function.clone_args(*args)[0]))
 
         # overhead of cloning args gives bias for fusing the kernel
         # in the case of mutating/in-placeable second fusion
         # TODO - would be better as a hook in triton do_bench that reset
        # the input values between benchmarking
-        ms = ms - do_bench(lambda: wrapped_jit_function.clone_args(*args))
+        ms = ms - do_bench_gpu(lambda: wrapped_jit_function.clone_args(*args))
 
     log.debug(
         "The fused kernel for %s took %.3f ms to run",

torch/_inductor/fx_passes/pad_mm.py

Lines changed: 1 addition & 1 deletion
@@ -251,7 +251,7 @@ def should_pad_bench(
             return False
 
     do_bench = functools.partial(
-        torch._inductor.runtime.runtime_utils.do_bench,
+        torch._inductor.runtime.runtime_utils.do_bench_gpu,
         warmup=5,
     )

torch/_inductor/ir.py

Lines changed: 2 additions & 6 deletions
@@ -70,7 +70,7 @@
 )
 from .ops_handler import OpCounterCSE
 from .runtime.hints import ReductionHint
-from .runtime.runtime_utils import do_bench, do_bench_cpu
+from .runtime.runtime_utils import do_bench
 from .utils import (
     argsort,
     cache_on_self,
@@ -79,7 +79,6 @@
     convert_shape_to_symint,
     developer_warning,
     get_kernel_metadata,
-    is_cpu_device,
     is_dynamic,
     is_gpu,
     pad_listlike,
@@ -3628,10 +3627,7 @@ def __init__(self, name, input_nodes, layout):
 
     def benchmark(self, *args, out) -> float:
         algo = self.to_callable()
-        if is_cpu_device(args):
-            return do_bench_cpu(lambda: algo(*args, out=out))
-        else:
-            return do_bench(lambda: algo(*args, out=out))
+        return do_bench(algo, args, {"out": out})
 
     def call_name(self) -> str:
         raise NotImplementedError

torch/_inductor/runtime/runtime_utils.py

Lines changed: 12 additions & 2 deletions
@@ -10,6 +10,7 @@
 import time
 
 import torch
+from torch._inductor.utils import is_cpu_device
 
 
 def conditional_product(*args):
@@ -70,7 +71,16 @@ def get_max_y_grid():
     return 65535
 
 
-def do_bench(*args, **kwargs):
+def do_bench(fn, fn_args, fn_kwargs, **kwargs):
+    args = list(fn_args)
+    args.extend(fn_kwargs.values())
+    if is_cpu_device(args):
+        return do_bench_cpu(lambda: fn(*fn_args, **fn_kwargs), **kwargs)
+    else:
+        return do_bench_gpu(lambda: fn(*fn_args, **fn_kwargs), **kwargs)
+
+
+def do_bench_gpu(fn, **kwargs):
     @functools.lru_cache(None)
     def load_triton():
         try:
@@ -98,7 +108,7 @@ def load_triton():
 
     if quantile_field_name not in kwargs:
         kwargs[quantile_field_name] = (0.5, 0.2, 0.8)
-    return triton_do_bench(*args, **kwargs)[0]
+    return triton_do_bench(fn, **kwargs)[0]
 
 
 def do_bench_cpu(fn, warmup=5, times=20):

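With the change above, `do_bench` becomes a device-dispatching wrapper: callers hand it the function plus its positional and keyword arguments so it can inspect the tensors and route to `do_bench_cpu` or the Triton-backed `do_bench_gpu` (matching the updated call sites in `ir.py` and `select_algorithm.py`). A minimal sketch of the two call styles, assuming a build that includes this commit:

```python
import torch
from torch._inductor.runtime.runtime_utils import do_bench, do_bench_gpu

def algo(a, b, *, out):
    return torch.mm(a, b, out=out)

a, b = torch.randn(128, 128), torch.randn(128, 128)
out = torch.empty(128, 128)

# Dispatching wrapper: routes to do_bench_cpu here because all tensors are on CPU.
ms_cpu = do_bench(algo, (a, b), {"out": out})

# Explicit GPU path (requires CUDA + Triton): benchmark a zero-arg callable.
if torch.cuda.is_available():
    a_gpu, b_gpu = a.cuda(), b.cuda()
    ms_gpu = do_bench_gpu(lambda: torch.mm(a_gpu, b_gpu), rep=40, fast_flush=True)
```
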
torch/_inductor/runtime/triton_heuristics.py

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@
     ceildiv,
     conditional_product,
     create_bandwidth_info_str,
-    do_bench,
+    do_bench_gpu,
     dynamo_timed,
     get_first_attr,
     get_max_y_grid,
@@ -628,7 +628,7 @@ def kernel_call():
                 stream=stream,
             )
 
         return do_bench_gpu(kernel_call, rep=40, fast_flush=True)
 
     def clone_args(self, *args, **kwargs) -> Tuple[List[Any], Dict[str, Any]]:
         from ..compile_fx import clone_preserve_strides

torch/_inductor/select_algorithm.py

Lines changed: 2 additions & 6 deletions
@@ -38,10 +38,9 @@
 from .exc import CUDACompileError
 from .ir import ChoiceCaller, PrimitiveInfoType
 from .runtime.hints import DeviceProperties
-from .runtime.runtime_utils import do_bench, do_bench_cpu
+from .runtime.runtime_utils import do_bench
 from .utils import (
     get_dtype_size,
-    is_cpu_device,
     Placeholder,
     restore_stdout_stderr,
     sympy_dot,
@@ -847,10 +846,7 @@ def benchmark(self, *args, out):
             out_new, tuple(out.size()), tuple(out.stride())
         )
         out.copy_(out_new)  # for correctness checking
-        if is_cpu_device(args):
-            return do_bench_cpu(lambda: algo(*args))
-        else:
-            return do_bench(lambda: algo(*args))
+        return do_bench(algo, args, {})
 
     def to_callable(self):
         fn = self.choice.to_callable()
