fix bw estimation · pytorch/pytorch@0d7e9f6 · GitHub
Commit 0d7e9f6

committed
fix bw estimation
1 parent 169c156 commit 0d7e9f6

File tree

2 files changed: +45 -7 lines changed


test/inductor/test_analysis.py

Lines changed: 41 additions & 3 deletions
@@ -9,7 +9,11 @@
 import torch
 import torch.nn.functional as F
 import torch.utils.flop_counter
-from torch._inductor.analysis.profile_analysis import _augment_trace_helper, main
+from torch._inductor.analysis.profile_analysis import (
+    _augment_trace_helper,
+    JsonProfile,
+    main,
+)
 from torch._inductor.utils import flatten, tabulate_2d, zip_dicts
 from torch.testing._internal.common_device_type import (
     dtypes,
@@ -297,10 +301,44 @@ def test_augment_trace_helper_args(self, device, dtype):
             om()
         trace1, trace2 = trace_files()
         p.export_chrome_trace(trace1)
-        # patch('sys.stdout', new_callable=StringIO) as mock_stdout,
         with patch("sys.argv", [*prefix, "--augment_trace", trace1, trace2]):
             main()
-        # self.assertEqual(mock_stdout.getvalue(), "")
+        profile = JsonProfile(trace2, 1, "foo")
+        rep = profile.report()
+        # If these fail, just update them. They could change over time
+        if device != "cpu":
+            self.assertTrue(len(rep.split("\n")) > 4)
+            self.assertIn("Kernel Name", rep)
+            self.assertIn("Kernel Count", rep)
+            self.assertIn("FLOPS", rep)
+            self.assertIn("bw gbps", rep)
+            self.assertIn("Dur (ms)", rep)
+            self.assertIn("Achieved", rep)
+            self.assertIn("|", rep)
+            self.assertIn("-----", rep)
+
+        # TODO we need a robust way of checking this report.
+        # In the mean time, make sure that no column is empty.
+        # TODO check to make sure all % values are less than 100%
+        tables = profile._create_tables(profile._devices)
+        for tab in tables.values():
+            header, rows = tab
+            ncols = len(header) - 1
+            seen = [False] * ncols
+            for row in rows.values():
+                for i in range(len(row)):
+                    try:
+                        val = float(row[i])
+                    except Exception:
+                        continue
+                    seen[i] = seen[i] or (val != 0.0)
+
+            if device != "cpu":
+                for i in range(len(seen)):
+                    self.assertTrue(
+                        seen[i],
+                        f"column values from column {i + 1} with header '{header[i + 1]}' are all zero",
+                    )
 
     @dtypes(torch.float, torch.double)
     def test_augment_trace_against_flop_counter(self, device, dtype):
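
The added assertions boil down to one invariant: after the trace is augmented, every numeric column in every report table should contain at least one non-zero value. Below is a standalone sketch of that check with a made-up (header, rows) table shaped the way the test treats the output of profile._create_tables; the real structure and values may differ.

# Standalone sketch of the "no column is all zeros" check added above.
# The table shape (a header list plus a dict of rows) and the example values
# are made up for illustration; only the checking logic mirrors the test.
header = ["Kernel Name", "Kernel Count", "FLOPS", "bw gbps", "Dur (ms)"]
rows = {
    "triton_mm_1": ["2", "1.2e12", "850.0", "0.41"],
    "triton_red_2": ["4", "0.0", "912.3", "0.10"],
}

ncols = len(header) - 1  # the first header entry labels the row key itself
seen = [False] * ncols
for row in rows.values():
    for i, cell in enumerate(row):
        try:
            val = float(cell)
        except ValueError:
            continue  # non-numeric cells are simply skipped
        seen[i] = seen[i] or (val != 0.0)

for i, ok in enumerate(seen):
    assert ok, f"column '{header[i + 1]}' is all zeros"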

torch/_inductor/analysis/profile_analysis.py

Lines changed: 4 additions & 4 deletions
@@ -160,7 +160,7 @@ def _estimate_gb(event: dict[str, Any]) -> float:
     This estimate isn't the best because it doesn't know if two input buffers are the same buffer, leading to an
     overestimate of the real achieved bandwidth.
     """
-    if "Input Type" not in event["args"] or "Input Dims" not in event["args"]:
+    if "Input type" not in event["args"] or "Input Dims" not in event["args"]:
         return 0
     sizes_and_types = zip(event["args"]["Input Dims"], event["args"]["Input type"])
     bw = 0
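
This hunk only fixes the key name ("Input Type" vs. "Input type") so the early return checks the same key that the zip on the next line actually reads. For context, here is a hedged sketch of that kind of traffic estimate: sum element count times element size over the recorded inputs. The dtype-size table and the function itself are illustrative, not the module's real implementation, and, as the docstring notes, aliased inputs get double counted.

# Illustrative sketch of estimating GB moved from "Input Dims" / "Input type".
# The dtype-size table is an assumption for the sketch, not PyTorch's lookup.
from math import prod
from typing import Any

_DTYPE_BYTES = {"float": 4, "double": 8, "c10::Half": 2, "c10::BFloat16": 2}

def estimate_gb(event: dict[str, Any]) -> float:
    args = event.get("args", {})
    if "Input type" not in args or "Input Dims" not in args:
        return 0.0
    total_bytes = 0
    for dims, type_name in zip(args["Input Dims"], args["Input type"]):
        if not dims:
            continue  # scalars / missing shapes contribute nothing in this sketch
        total_bytes += prod(dims) * _DTYPE_BYTES.get(type_name, 4)
    return total_bytes / 1e9

example_event = {
    "args": {
        "Input Dims": [[1024, 1024], [1024, 1024]],
        "Input type": ["float", "float"],
    }
}
print(estimate_gb(example_event))  # ~0.0084 GB for two fp32 1024x1024 inputs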
@@ -386,7 +386,7 @@ def _compute_stats(self) -> None:
                     achieved_flops = 0
                 else:
                     dtype = self.convert_dtype(event)
-                    achieved_flops = op_flops / (1e12 * dev.info.tflops[dtype])
+                    achieved_flops = 100 * op_flops / (1e12 * dev.info.tflops[dtype])
             else:
                 op_flops = 0
                 achieved_flops = 0
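
The only change here is the factor of 100: the ratio of achieved to peak FLOPS is now reported as a percentage rather than a 0-1 fraction, matching the "Achieved" columns the new test looks for. A small worked example with illustrative numbers, assuming op_flops is the kernel's achieved FLOP/s and the device peak is given in TFLOPS:

op_flops = 2.5e12      # illustrative: kernel achieves 2.5 TFLOP/s
peak_tflops = 10.0     # illustrative: device peak of 10 TFLOPS
achieved_flops = 100 * op_flops / (1e12 * peak_tflops)
print(achieved_flops)  # 25.0, i.e. 25% of peak rather than 0.25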
@@ -395,7 +395,7 @@ def _compute_stats(self) -> None:
                 assert dur != 0
                 # 1000ms/s * gb / ms = gb/s
                 op_gbps = 1e3 * event["args"]["kernel_num_gb"] / dur
-                achieved_bandwidth = op_gbps / dev.info.dram_bw_gbs
+                achieved_bandwidth = 100 * op_gbps / dev.info.dram_bw_gbs
             else:
                 op_gbps = 0
                 achieved_bandwidth = 0
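
The same fix applies to bandwidth: op_gbps already follows the unit comment (1000 ms/s * GB / ms = GB/s), and the utilization is now scaled to a percentage of the device's DRAM bandwidth. With illustrative numbers:

kernel_num_gb = 0.008    # illustrative: kernel moves ~8 MB
dur = 0.02               # illustrative: kernel duration in ms
dram_bw_gbs = 1600.0     # illustrative: device DRAM bandwidth in GB/s

op_gbps = 1e3 * kernel_num_gb / dur               # 400.0 GB/s achieved
achieved_bandwidth = 100 * op_gbps / dram_bw_gbs  # 25.0, i.e. 25% of peak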
@@ -534,7 +534,7 @@ def dump(self, out: str) -> None:
 
 def parse_profile_event_list(
     benchmark_name: str,
-    event_list: torch.autograd.profiler_util.EventList | dict[str, Any],
+    event_list: Union[torch.autograd.profiler_util.EventList, dict[str, Any]],
     wall_time_ms: float,
     nruns: int,
     device_name: str,
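
The annotation change swaps PEP 604 syntax for typing.Union. The commit does not say why, but a plausible reason is that `X | Y` in an annotation evaluated at definition time requires Python 3.10+ (absent `from __future__ import annotations`), while Union[...] also works on older interpreters. A minimal sketch, unrelated to the real function body:

from typing import Any, Union

def parse_event_list(event_list: Union[list, dict[str, Any]]) -> int:
    # Stand-in body; only the annotation style is the point of this sketch.
    return len(event_list)

print(parse_event_list({"ev": 1}))  # 1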

0 commit comments