Inductor logging + analysis of torch.profile #149697 · pytorch/pytorch
Pull request by exclamaforte

Status: Open. Wants to merge 1 commit into main.
Changes from all commits
704 changes: 704 additions & 0 deletions test/inductor/test_analysis.py

Large diffs are not rendered by default.

59 changes: 59 additions & 0 deletions test/profiler/test_profiler.py
@@ -27,6 +27,7 @@
import torch.optim
import torch.utils.data
from torch._C._profiler import _ExperimentalConfig, _ExtraFields_PyCall
from torch._inductor.ir import FixedLayout
from torch.autograd.profiler import KinetoStepTracker, profile as _profile
from torch.autograd.profiler_legacy import profile as _profile_legacy
from torch.profiler import (
@@ -2998,6 +2999,64 @@ def validate_json(prof):
assert "Overload Name" in key_averages.table()
validate_json(prof)

@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
# this checks whether a Triton-only backend can be used for max autotune
@unittest.skipIf(
torch.cuda.is_available()
and not torch._inductor.utils.use_triton_template(
FixedLayout(torch.device("cuda"), torch.float16, [400, 800])
),
"Solo triton backend not possible",
)
Comment on lines +3006 to +3010 (Contributor):

Ugh, this is a bit indirect. Could we just do the same checks we do in test_max_autotune that check whether we can run it? See is_big_gpu. I don't like reaching into implementation details when it doesn't add any benefit.
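A minimal sketch of the reviewer's suggestion, assuming is_big_gpu is importable from torch._inductor.utils as in the max-autotune tests; the decorator name and skip message are illustrative, not part of the PR:

import unittest

import torch
from torch._inductor.utils import is_big_gpu

# Hypothetical replacement for the FixedLayout/use_triton_template skip condition:
# reuse the same capability check the max-autotune tests rely on.
requires_triton_autotune = unittest.skipIf(
    not (torch.cuda.is_available() and is_big_gpu(0)),
    "requires a GPU large enough for Triton max-autotune GEMM templates",
)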

def test_profiler_debug_autotuner(self):
"""
This test makes sure that profiling events will be present when the kernel is run using the DebugAutotuner.
"""
in1 = torch.randn((400, 600), device="cuda", dtype=torch.float16)
in2 = torch.randn((600, 800), device="cuda", dtype=torch.float16)

Comment on lines +3015 to +3017 (Contributor):

nit: make the tensors aligned so we won't do padding
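For illustration, a hedged sketch of the nit: choosing GEMM dimensions that are all multiples of 16 so the fp16 inputs should not need shape padding (the exact sizes here are an assumption, not part of the PR):

# Hypothetical aligned shapes; every GEMM dimension is a multiple of 16.
in1 = torch.randn((400, 608), device="cuda", dtype=torch.float16)
in2 = torch.randn((608, 800), device="cuda", dtype=torch.float16)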

def mm():
return torch.mm(in1, in2)

pb_mm = torch.compile(
mm,
options={
"benchmark_kernel": True,
"max_autotune": True,
"max_autotune_gemm_backends": "TRITON",
"profile_bandwidth": True,
},
)
comp_mm = torch.compile(
mm,
options={
"benchmark_kernel": True,
"max_autotune": True,
"max_autotune_gemm_backends": "TRITON",
},
)

with profile() as prof1:
pb_mm()
with profile() as prof2:
comp_mm()

def names(prof):
return {
ev.name
for ev in prof.events()
if "mm" in ev.name or "triton" in ev.name
}

trace1 = "/tmp/trace1_pb.json"
trace2 = "/tmp/trace2_nopb.json"
prof1.export_chrome_trace(trace1)
prof2.export_chrome_trace(trace2)
Comment on lines +3053 to +3054 (Contributor):

What does this test?


n1 = names(prof1)
n2 = names(prof2)
self.assertEqual(n1, n2)


if __name__ == "__main__":
run_tests()
1 change: 1 addition & 0 deletions test/test_flop_counter.py
@@ -854,5 +854,6 @@ def test_scaled_mm(self):

self.assertExpectedInline(get_total_flops(mode), """860160""")


if __name__ == "__main__":
run_tests()
2 changes: 2 additions & 0 deletions torch/_inductor/analysis/README.md
@@ -0,0 +1,2 @@
# `torch._inductor.analysis`
Contains scripts for inductor performance analysis.
Empty file.
150 changes: 150 additions & 0 deletions torch/_inductor/analysis/device_info.py
@@ -0,0 +1,150 @@
from dataclasses import dataclass
from logging import info
from typing import Optional

import torch


@dataclass(frozen=True)
class DeviceInfo:
"""
Theoretical numbers from the datasheet. If two numbers are given (Tensor/Matrix Core vs. not),
the higher one is reported. Sparsity is not considered.

Bandwidth numbers are tricky, because there are platform differences that may not show up in the profiler trace.
For example, H100 SXM and H100 NVL have different DRAM bandwidths but can report the same device name.

tops: dict[torch.dtype, float]
dram_bw_gbs: float
dram_gb: float


# Indexing is based on `torch.cuda.get_device_name()`
# TODO investigate profiler support for tf32 and allow device to report correct number when it's turned on.
_device_mapping: dict[str, DeviceInfo] = {
Comment on lines +24 to +26 (Contributor):

Can we file an issue for this as a follow-up? It's not great that we are not doing this programmatically.

# Source: https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet
"NVIDIA H100": DeviceInfo(
Comment (Contributor):

How do we distinguish between H100 SXM and H100 NVL?

Reply (Contributor Author):

This is based on torch.cuda.get_device_name(), which is stored by the profiler and is available at runtime too. I'm not sure how to distinguish them, even at runtime.
Some ideas:

>>> torch.cuda.get_device_properties()
_CudaDeviceProperties(name='NVIDIA H100', major=9, minor=0, total_memory=97272MB, multi_processor_count=132, uuid=6efc17fa-5b7e-0452-613b-df241e45f2b8, L2_cache_size=60MB)
>>> torch.cuda.mem_get_info()
(99949740032, 101997215744)

tops={
torch.float64: 9.7,
torch.float32: 19.5,
torch.bfloat16: 1979.0,
torch.float16: 1979.0,
torch.float8_e8m0fnu: 3958.0,
torch.float8_e4m3fnuz: 3958.0,
Comment on lines +28 to +36 (Contributor):

I know the fbcode servers are clock-rate limited, so the numbers will be off for those. I think this is actually somewhat important for getting accurate numbers.

cc @bertmaher, who did a similar analysis here: https://fb.workplace.com/groups/420659799592399/posts/761265522198490/

How would you adjust for clock rate? Is something simple like current_clock_rate/default sufficient? I don't have a good sense of this.

Reply (Contributor Author):

Interested in Bert's opinion too. I would think that current_clock_rate/default would be fine, considering that most of the FLOPS calculations are just clock rate * core count * FLOPS per core.

torch.float8_e5m2: 3958.0,
torch.float8_e5m2fnuz: 3958.0,
torch.int8: 3958.0,
},
dram_bw_gbs=3350,
dram_gb=80,
),
# Source: https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet
"NVIDIA A100": DeviceInfo(
Comment (Contributor):

Similarly: [image attachment not rendered]

Reply (Contributor Author):

Yeah, I saw that. I'm not sure how we solve this in general for bandwidth (FLOPS seems fine). For example, on an 8x machine the interconnect bandwidth could be more important than DRAM bandwidth.

tops={
torch.float64: 19.5,
torch.float32: 19.5,
torch.bfloat16: 312.5,
torch.float16: 312.5,
# Not in datasheet: float8
torch.int8: 624.0,
},
dram_bw_gbs=2039.0,
dram_gb=80.0,
),
# Source: https://resources.nvidia.com/en-us-gpu-resources/l4-tensor-datasheet
"NVIDIA L4": DeviceInfo(
tops={
# This is a guess, not in datasheet
torch.float64: 15.1,
torch.float32: 30.3,
torch.bfloat16: 242.0,
torch.float16: 242.0,
torch.float8_e8m0fnu: 485.0,
torch.float8_e4m3fnuz: 485.0,
torch.float8_e5m2: 485.0,
torch.float8_e5m2fnuz: 485.0,
torch.int8: 485.0,
},
dram_bw_gbs=300,  # L4 datasheet lists 300 GB/s
dram_gb=24,
),
# Source: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/data-sheets/amd-instinct-mi300a-data-sheet.pdf
"AMD MI300A": DeviceInfo(
tops={
torch.float64: 122.6,
torch.float32: 122.6,
# torch.tf32: 490.3,
torch.bfloat16: 980.6,
torch.float16: 980.6,
torch.float8_e8m0fnu: 1961.2,
torch.float8_e4m3fnuz: 1961.2,
torch.float8_e5m2: 1961.2,
torch.float8_e5m2fnuz: 1961.2,
torch.int8: 1961.2,
},
dram_bw_gbs=5300.0,
dram_gb=128.0,
),
# Source: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/data-sheets/amd-instinct-mi300x-data-sheet.pdf
"AMD MI300X": DeviceInfo(
tops={
torch.float64: 163.4,
torch.float32: 163.4,
torch.bfloat16: 1307.4,
torch.float16: 1307.4,
torch.float8_e8m0fnu: 2614.9,
torch.float8_e4m3fnuz: 2614.9,
torch.float8_e5m2: 2614.9,
torch.float8_e5m2fnuz: 2614.9,
torch.int8: 2614.9,
},
dram_bw_gbs=5300.0,
dram_gb=192.0,
),
}
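Following up on the clock-rate discussion above, a minimal sketch of a current_clock_rate / default style adjustment. It uses pynvml and the SM clock, both of which are assumptions rather than anything this PR does:

import pynvml


def clock_adjusted_tops(datasheet_tops_value: float, device_index: int = 0) -> float:
    """Scale a datasheet TOPS number by current_sm_clock / max_sm_clock (sketch)."""
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        current_mhz = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
        max_mhz = pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM)
        return datasheet_tops_value * (current_mhz / max_mhz)
    finally:
        pynvml.nvmlShutdown()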


def lookup_device_info(name: str) -> Optional[DeviceInfo]:
"""
Problem: when diffing profiles between AMD and NVIDIA, we don't have access to the device information
of the other vendor's device. Also, since the analysis is static, we should be able to run it on a device unrelated
to the recorded one. Therefore, _device_mapping statically contains the information for many devices.
If one is missing, please run DeviceInfo.get_device_info() and add it to _device_mapping.
name (str): name of the device to look up. Should map onto torch.cuda.get_device_name().
"""
if name not in _device_mapping:
return None
return _device_mapping[name]


def datasheet_tops(dtype: torch.dtype) -> Optional[float]:
"""
Get the theoretical TOPS of the current device for a given dtype. Returns None if the device
or the dtype is not in the datasheet mapping above.
"""
name: Optional[str] = torch.cuda.get_device_name()
if name is None:
info("No device found, returning None")
return None
device_info = lookup_device_info(name)
if device_info is None:
log_str = f"Device {name} not in datasheet, returning None"
info(log_str)
return None
if dtype not in device_info.tops:
log_str = (
f"Device {name} does not have a datasheet entry for {dtype}, returning None"
)
info(log_str)
return None
return device_info.tops[dtype]
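For context, a short usage sketch of the helpers in this file, run on a CUDA machine; the FLOP count and kernel runtime below are made-up illustrative values:

import torch

from torch._inductor.analysis.device_info import datasheet_tops, lookup_device_info

info = lookup_device_info(torch.cuda.get_device_name())
peak_tops = datasheet_tops(torch.float16)  # theoretical fp16 TOPS, or None
if info is not None and peak_tops is not None:
    flops = 2 * 400 * 600 * 800  # one (400, 600) @ (600, 800) matmul
    runtime_s = 25e-6            # hypothetical measured kernel time
    achieved_tops = flops / runtime_s / 1e12
    print(f"fp16 utilization: {achieved_tops / peak_tops:.1%}")
    print(f"DRAM bandwidth: {info.dram_bw_gbs} GB/s, DRAM size: {info.dram_gb} GB")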