Create device_info to support datasheet estimations · pytorch/pytorch@f1ae257 · GitHub

Commit f1ae257

Create device_info to support datasheet estimations
1 parent aead262 commit f1ae257

File tree

torch/_inductor/analysis/device_info.py
torch/_inductor/analysis/profile_analysis.py
torch/_inductor/utils.py

3 files changed: +128 −55 lines changed

torch/_inductor/analysis/device_info.py

+112

@@ -0,0 +1,112 @@
import torch
from dataclasses import dataclass


@dataclass(frozen=True)
class DeviceInfo:
    """
    Theoretical numbers from the data sheet. If the sheet gives two numbers (Tensor/Matrix Core vs. not),
    the higher number is reported. Sparsity is not considered.

    Bandwidth numbers are tricky, because there are platform differences that may not show up in the
    profiler trace.
    """

    tops: dict[torch.dtype, float]
    dram_bw_tbs: float
    dram_gb: float


# TODO investigate profiler support for tf32 and allow device to report correct number when it's turned on.
_device_mapping: dict[str, DeviceInfo] = {
    # Source: https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet
    "NVIDIA H100": DeviceInfo(
        tops={
            torch.float64: 9.7,
            torch.float32: 19.5,
            torch.bfloat16: 1979.0,
            torch.float16: 1979.0,
            torch.float8_e8m0fnu: 3958.0,
            torch.float8_e4m3fn: 3958.0,
            torch.float8_e4m3fnuz: 3958.0,
            torch.float8_e5m2: 3958.0,
            torch.float8_e5m2fnuz: 3958.0,
            torch.int8: 3958.0,
        },
        dram_bw_tbs=3350,
        dram_gb=80,
    ),
    # Source: https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet
    "NVIDIA A100": DeviceInfo(
        tops={
            torch.float64: 19.5,
            torch.float32: 19.5,
            torch.bfloat16: 312.5,
            torch.float16: 312.5,
            # Not in datasheet: float8
            torch.int8: 624.0,
        },
        dram_bw_tbs=2039.0,
        dram_gb=80.0,
    ),
    # Source: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/data-sheets/amd-instinct-mi300a-data-sheet.pdf
    "AMD MI300A": DeviceInfo(
        tops={
            torch.float64: 122.6,
            torch.float32: 122.6,
            # torch.tf32: 490.3,
            torch.bfloat16: 980.6,
            torch.float16: 980.6,
            torch.float8_e8m0fnu: 1961.2,
            torch.float8_e4m3fn: 1961.2,
            torch.float8_e4m3fnuz: 1961.2,
            torch.float8_e5m2: 1961.2,
            torch.float8_e5m2fnuz: 1961.2,
            torch.int8: 1961.2,
        },
        dram_bw_tbs=5300.0,
        dram_gb=128.0,
    ),
    # Source: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/data-sheets/amd-instinct-mi300x-data-sheet.pdf
    "AMD MI300X": DeviceInfo(
        tops={
            torch.float64: 163.4,
            torch.float32: 163.4,
            torch.bfloat16: 1307.4,
            torch.float16: 1307.4,
            torch.float8_e8m0fnu: 2614.9,
            torch.float8_e4m3fn: 2614.9,
            torch.float8_e4m3fnuz: 2614.9,
            torch.float8_e5m2: 2614.9,
            torch.float8_e5m2fnuz: 2614.9,
            torch.int8: 2614.9,
        },
        dram_bw_tbs=5300.0,
        dram_gb=192.0,
    ),
}


def lookup_device_info(name: str) -> "DeviceInfo":
    """
    Problem: when diffing profiles between AMD and NVIDIA, we don't have access to the device information
    of the other one. Also, since the analysis is static, we should be able to do it on another device unrelated
    to the recorded device. Therefore, _device_mapping statically contains the information for lots of devices.
    If one is missing, please add an entry to _device_mapping from the device's datasheet.
    name (str): name of the device to look up. Should map onto torch.cuda.get_device_name().
    """
    if name not in _device_mapping:
        raise RuntimeError(
            f"Unsupported device in profile: {name}, please consider contributing to _device_mapping."
        )
    return _device_mapping[name]


def datasheet_tops(dtype: torch.dtype) -> float:
    """
    Get the theoretical TOPS of the current device for a given dtype. This can throw an exception if the device
    is not in the datasheet list above.
    """
    name = torch.cuda.get_device_name()
    device_info = lookup_device_info(name)
    return device_info.tops[dtype]
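
For reference, a minimal usage sketch of the new module (the device string and printed values are only illustrative; lookup is keyed on torch.cuda.get_device_name()):

    import torch
    from torch._inductor.analysis.device_info import datasheet_tops, lookup_device_info

    # Static lookup by name, e.g. when analyzing a trace recorded on a different machine.
    info = lookup_device_info("NVIDIA H100")
    print(info.tops[torch.bfloat16])       # 1979.0 peak TOPS from the datasheet
    print(info.dram_bw_tbs, info.dram_gb)  # peak DRAM bandwidth and capacity

    # Query the currently visible CUDA device instead; this raises a RuntimeError
    # if the device name is not present in _device_mapping.
    bf16_tops = datasheet_tops(torch.bfloat16)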

torch/_inductor/analysis/profile_analysis.py

+4 −55

@@ -8,13 +8,8 @@
 from typing import Any, Optional, Union

 import torch
-from torch._inductor.utils import (
-    flatten,
-    get_device_tflops,
-    get_gpu_dram_gbps,
-    tabulate_2d,
-    zip_dicts,
-)
+from torch._inductor.analysis.device_info import DeviceInfo, lookup_device_info
+from torch._inductor.utils import flatten, tabulate_2d, zip_dicts
 from torch.autograd import DeviceType
 from torch.utils._ordered_set import OrderedSet
 from torch.utils.flop_counter import flop_registry

@@ -212,50 +207,6 @@ def _augment_trace_helper(data: dict[str, Any]) -> dict[str, Any]:
     }


-@dataclass(frozen=True)
-class DeviceInfo:
-    tflops: dict[torch.dtype, float]
-    dram_bw_gbs: float
-
-    @staticmethod
-    def get_device_info() -> tuple[dict[torch.dtype, int], float]:
-        """
-        This is the info that populates DeviceInfo, but it needs to be run on each device separately.
-        For new hardware, run this function and then add the information to `_device_mapping`
-        """
-        # TODO support int dtypes
-        floats = [torch.float, torch.bfloat16, torch.float16]
-        return {
-            dtype: get_device_tflops(dtype) for dtype in floats
-        }, get_gpu_dram_gbps()
-
-
-_device_mapping: dict[str, DeviceInfo] = {
-    "NVIDIA H100": DeviceInfo(
-        tflops={
-            torch.float32: 0.033454080000000004,
-            torch.bfloat16: 0.5352652800000001,
-            torch.float16: 0.5352652800000001,
-        },
-        dram_bw_gbs=2446.848,
-    )
-}
-
-
-def lookup_device_info(name: str) -> "DeviceInfo":
-    """
-    problem: when diffing profiles between amd and nvidia, we don't have access to the device information
-    of the other one. Also, since the analysis is static, we should be able to do it on another device unrelated
-    to the recorded device. Therefore, _device_mapping statically contains the information for lots of devices.
-    If one is missing, please run DeviceInfo.get_device_info() and add it to _device_mapping.
-    """
-    if name not in _device_mapping:
-        raise RuntimeError(
-            f"Unsupported device in profile: {name}, consider contributing to _device_mapping."
-        )
-    return _device_mapping[name]
-
-
 @dataclass(frozen=True)
 class KernelStats:
     flops: int

@@ -386,9 +337,7 @@ def _compute_stats(self) -> None:
             achieved_flops = 0
         else:
             dtype = self.convert_dtype(event)
-            if event["name"].startswith("sm80_xmma_gemm_f32f32"):
-                breakpoint()
-            achieved_flops = 100 * op_flops / (1e12 * dev.info.tflops[dtype])
+            achieved_flops = 100 * op_flops / (1e12 * dev.info.tops[dtype])
     else:
         op_flops = 0
         achieved_flops = 0

@@ -397,7 +346,7 @@ def _compute_stats(self) -> None:
         assert dur != 0
         # 1000ms/s * gb / ms = gb/s
         op_gbps = 1e3 * event["args"]["kernel_num_gb"] / dur
-        achieved_bandwidth = 100 * op_gbps / dev.info.dram_bw_gbs
+        achieved_bandwidth = 100 * op_gbps / dev.info.dram_bw_tbs
     else:
         op_gbps = 0
         achieved_bandwidth = 0
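
To make the change above concrete, a small numeric sketch of the utilization math (the kernel figures are invented; op_flops is assumed to already be the kernel's achieved FLOP/s and kernel_num_gb its memory traffic in GB, as computed earlier in _compute_stats):

    import torch
    from torch._inductor.analysis.device_info import lookup_device_info

    info = lookup_device_info("NVIDIA H100")

    op_flops = 5.0e14                                                     # hypothetical achieved FLOP/s
    achieved_flops = 100 * op_flops / (1e12 * info.tops[torch.bfloat16])  # ~25% of the 1979 TOPS peak

    dur = 0.5                                                # kernel duration in ms
    kernel_num_gb = 1.0                                      # GB moved by the kernel
    op_gbps = 1e3 * kernel_num_gb / dur                      # 1000 ms/s * GB / ms = 2000 GB/s
    achieved_bandwidth = 100 * op_gbps / info.dram_bw_tbs    # ~60% of the 3350 GB/s datasheet figure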

torch/_inductor/utils.py

+12

@@ -56,6 +56,7 @@
 import sympy

 import torch
+from torch._inductor.analysis.device_info import datasheet_tops
 from torch._inductor.runtime.hints import DeviceProperties
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 from torch.utils._ordered_set import OrderedSet

@@ -1895,6 +1896,16 @@ def get_backend_num_stages() -> int:

 @functools.lru_cache(None)
 def get_device_tflops(dtype: torch.dtype) -> int:
+    """
+    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
+    then fall back to the inaccurate Triton estimation.
+    """
+    try:
+        return datasheet_tops(dtype)
+    except Exception:
+        # Not all devices are supported; fall back to Triton's theoretical estimate.
+        pass
+
     from triton.testing import get_max_simd_tflops, get_max_tensorcore_tflops

     assert dtype in (torch.float16, torch.bfloat16, torch.float32)

@@ -2846,6 +2857,7 @@ def get_ld_library_path() -> str:

     return path

+
 def tabulate_2d(elements: Sequence[Sequence[T]], headers: Sequence[T]) -> str:
     widths = [len(str(e)) for e in headers]
     for row in elements:
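
The net effect on get_device_tflops is sketched below (illustrative only; the Triton-based path is the pre-existing code and is unchanged):

    import torch
    from torch._inductor.utils import get_device_tflops

    # On a device listed in device_info.py (e.g. an H100), this now returns the
    # datasheet number for the dtype; on any other device datasheet_tops() raises,
    # the exception is swallowed, and the Triton-based estimate is used instead.
    tflops = get_device_tflops(torch.bfloat16)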

0 commit comments
