Inductor logging + analysis of torch.profile · pytorch/pytorch@57716b8 · GitHub

Commit 57716b8

Inductor logging + analysis of torch.profile
1 parent e9e1aac commit 57716b8

19 files changed: +1883 −55 lines

test/inductor/test_analysis.py

+704
Large diffs are not rendered by default.
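
The 704-line test_analysis.py diff is collapsed above. As background for the commit title, the sketch below (not taken from the collapsed file; the model and output path are illustrative assumptions) shows how a torch.profiler trace, the "torch.profile" artifact that the analysis consumes, is typically produced and exported:

import torch
from torch.profiler import profile, ProfilerActivity

# Profile a small CPU-only workload so the sketch runs anywhere.
model = torch.nn.Linear(512, 512)
x = torch.randn(64, 512)

with profile(activities=[ProfilerActivity.CPU]) as prof:
    model(x)

# The chrome-trace JSON written here is the kind of input that
# performance-analysis scripts can parse offline.
prof.export_chrome_trace("/tmp/example_trace.json")
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))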
+87
@@ -0,0 +1,87 @@
# Owner(s): ["module: inductor"]

import torch
import torch.utils.flop_counter
from torch._inductor.debug import DebugContext
from torch._inductor.graph import GraphLowering
from torch._inductor.virtualized import V
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing._internal.common_cuda import SM70OrLater
from torch.testing._internal.common_device_type import (
    dtypes,
    instantiate_device_type_tests,
    skipCUDAIf,
)
from torch.testing._internal.common_utils import run_tests, TestCase


def FlopCounterMode(*args, **kwargs):
    return torch.utils.flop_counter.FlopCounterMode(*args, **kwargs, display=False)


def get_total_flops(mode):
    return sum(v for _, v in mode.flop_counts["Global"].items())


def random_tensor(size, dtype, **kwargs):
    if dtype in [torch.half, torch.bfloat16, torch.float, torch.double]:
        return torch.randn(size, dtype=dtype, **kwargs)
    elif dtype in [torch.uint8, torch.int8, torch.short, torch.int, torch.long]:
        return torch.randint(0, 100, size, dtype=dtype, **kwargs)
    else:
        raise ValueError("Unsupported data type")


def cT(device, dtype):
    def T(*shape, requires_grad=False):
        return random_tensor(
            shape, requires_grad=requires_grad, device=device, dtype=dtype
        )

    return T


class TestScheduler(TestCase):
    @dtypes(torch.float, torch.double)
    @skipCUDAIf(not SM70OrLater, "GPU capability is < SM70")
    def test_flop_counter_op(self, device, dtype):
        T = cT(device, dtype)

        def composite(x, y, z):
            tmp = torch.mm(x + 10, y / 12)
            return torch.mm(tmp, z)

        def composite_relu(x, y):
            tmp = torch.mm(x, y)
            return torch.relu(tmp)

        test_cases = [
            (torch.mm, [T(4, 5), T(5, 6)], {}),
            (torch.add, [T(4, 5), T(4, 5)], {}),
            (composite, [T(5, 4), T(4, 3), T(3, 12)], {}),
            (composite_relu, [T(5, 4), T(4, 3)], {}),
        ]
        for op, example_inputs, kwargs in test_cases:
            comp = torch.compile(op)
            with FlopCounterMode() as mode:
                comp(*example_inputs, **kwargs)
            gm = make_fx(op)(*example_inputs, **kwargs)
            reference_flops = get_total_flops(mode)

            graph = GraphLowering(gm)

            with V.set_graph_handler(graph), V.set_debug_handler(DebugContext()):
                graph.run(*example_inputs, **kwargs)
                graph.init_wrapper_code()
                graph._update_scheduler()
                scheduler_flops = 0
                for node in graph.scheduler.nodes:
                    flops = node.estimate_flops()
                    scheduler_flops += flops if flops is not None else 0
            self.assertEqual(reference_flops, scheduler_flops, msg=f"op = {op}")


instantiate_device_type_tests(TestScheduler, globals())

if __name__ == "__main__":
    run_tests()
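
The test above compares the Inductor scheduler's estimate_flops() against FlopCounterMode. As a quick sanity check of the reference convention assumed here (PyTorch's flop counter attributes 2 * M * K * N FLOPs to an (M, K) @ (K, N) matmul), this minimal CPU-only sketch reproduces the first test case by hand:

import torch
from torch.utils.flop_counter import FlopCounterMode

# The (4, 5) @ (5, 6) matmul from test_cases should report 2 * 4 * 5 * 6 = 240.
with FlopCounterMode(display=False) as mode:
    torch.mm(torch.randn(4, 5), torch.randn(5, 6))

total = sum(mode.flop_counts["Global"].values())
assert total == 240, total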

test/inductor/test_utils.py

+107
@@ -3,8 +3,10 @@
 from sympy import Symbol, sympify
 
 import torch
+from torch._inductor.fx_utils import count_flops_fx, countable_fx
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import sympy_str, sympy_subs
+from torch._inductor.virtualized import V
 
 
 class TestUtils(TestCase):
@@ -81,6 +83,111 @@ def test_sympy_str(self):
         self.assertEqual(sympy_str(sympify("a-b")), "a - b")
         self.assertEqual(sympy_str(sympify("a+-b")), "a - b")
 
+    def test_flops_fx(self):
+        def create_fx_node(
+            aten: torch._ops.OpOverloadPacket, args, kwargs
+        ) -> tuple[torch.fx.Node, torch.fx.Node]:
+            node1 = torch.fx.Node(
+                graph=torch.fx.Graph(),
+                name="",
+                op="call_function",
+                target=aten,
+                args=args,
+                kwargs=kwargs,
+            )
+            name: str = aten.overloads()[0]
+            op_overload: torch._ops.OpOverload = getattr(aten, name)
+            node2 = torch.fx.Node(
+                graph=torch.fx.Graph(),
+                name="",
+                op="call_function",
+                target=op_overload,
+                args=args,
+                kwargs=kwargs,
+            )
+            return node1, node2
+
+        with V.set_fake_mode(
+            torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
+        ):
+            trues = [
+                (
+                    torch.ops.aten.addmm,
+                    (torch.Tensor(4, 4), torch.Tensor(4, 5), torch.Tensor(5, 4)),
+                    {},
+                ),
+                (
+                    torch.ops.aten.bmm,
+                    (torch.Tensor(10, 4, 5), torch.Tensor(10, 5, 4)),
+                    {},
+                ),
+                (torch.ops.aten.mm, (torch.Tensor(2, 3), torch.Tensor(3, 2)), {}),
+                (
+                    torch.ops.aten.convolution,
+                    (
+                        torch.Tensor(2, 3, 3),
+                        torch.Tensor(2, 2, 2),
+                        torch.Tensor(2),
+                        (1, 1),
+                        (0, 0),
+                        (1, 1),
+                        True,
+                        (0, 0),
+                        1,
+                    ),
+                    {},
+                ),
+                (
+                    torch.ops.aten._convolution,
+                    (
+                        torch.Tensor(2, 2, 2),
+                        torch.Tensor(2, 2, 2),
+                        torch.Tensor(2),
+                        (1,),
+                        (0,),
+                        (1,),
+                        True,
+                        (0,),
+                        1,
+                        False,
+                        True,
+                        False,
+                    ),
+                    {},
+                ),
+            ]
+            # we don't support pointwise ops
+            falses = [
+                (
+                    torch.ops.aten.add,
+                    (torch.Tensor(1, 2, 3), torch.Tensor(1, 2, 3)),
+                    {},
+                ),
+                (
+                    torch.ops.aten.mul,
+                    (torch.Tensor(1, 2, 3), torch.Tensor(1, 2, 3)),
+                    {},
+                ),
+            ]
+            for t, args, kwargs in trues:
+                fx_node_1, fx_node_2 = create_fx_node(t, args, kwargs)
+                self.assertTrue(
+                    countable_fx(fx_node_1), f"Expected true {t}: {fx_node_1}"
+                )
+                self.assertTrue(
+                    countable_fx(fx_node_2), f"Expected true {t}: {fx_node_2}"
+                )
+                self.assertNotEqual(count_flops_fx(fx_node_1), None)
+                self.assertNotEqual(count_flops_fx(fx_node_2), None)
+            for f, args, kwargs in falses:
+                fx_node_1, fx_node_2 = create_fx_node(f, args, kwargs)
+                self.assertFalse(
+                    countable_fx(fx_node_1), f"Expected false {f}: {fx_node_1}"
+                )
+                self.assertFalse(
+                    countable_fx(fx_node_2), f"Expected false {f}: {fx_node_2}"
+                )
+
 
 if __name__ == "__main__":
     run_tests()

test/profiler/test_profiler.py

+59
@@ -27,6 +27,7 @@
 import torch.optim
 import torch.utils.data
 from torch._C._profiler import _ExperimentalConfig, _ExtraFields_PyCall
+from torch._inductor.ir import FixedLayout
 from torch.autograd.profiler import KinetoStepTracker, profile as _profile
 from torch.autograd.profiler_legacy import profile as _profile_legacy
 from torch.profiler import (
@@ -2998,6 +2999,64 @@ def validate_json(prof):
         assert "Overload Name" in key_averages.table()
         validate_json(prof)
 
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
+    # this checks whether a Triton-only backend can be used for max autotune
+    @unittest.skipIf(
+        torch.cuda.is_available()
+        and not torch._inductor.utils.use_triton_template(
+            FixedLayout(torch.device("cuda"), torch.float16, [400, 800])
+        ),
+        "Solo triton backend not possible",
+    )
+    def test_profiler_debug_autotuner(self):
+        """
+        This test makes sure that profiling events are present when the kernel is run using the DebugAutotuner.
+        """
+        in1 = torch.randn((400, 600), device="cuda", dtype=torch.float16)
+        in2 = torch.randn((600, 800), device="cuda", dtype=torch.float16)
+
+        def mm():
+            return torch.mm(in1, in2)
+
+        pb_mm = torch.compile(
+            mm,
+            options={
+                "benchmark_kernel": True,
+                "max_autotune": True,
+                "max_autotune_gemm_backends": "TRITON",
+                "profile_bandwidth": True,
+            },
+        )
+        comp_mm = torch.compile(
+            mm,
+            options={
+                "benchmark_kernel": True,
+                "max_autotune": True,
+                "max_autotune_gemm_backends": "TRITON",
+            },
+        )
+
+        with profile() as prof1:
+            pb_mm()
+        with profile() as prof2:
+            comp_mm()
+
+        def names(prof):
+            return {
+                ev.name
+                for ev in prof.events()
+                if "mm" in ev.name or "triton" in ev.name
+            }
+
+        trace1 = "/tmp/trace1_pb.json"
+        trace2 = "/tmp/trace2_nopb.json"
+        prof1.export_chrome_trace(trace1)
+        prof2.export_chrome_trace(trace2)
+
+        n1 = names(prof1)
+        n2 = names(prof2)
+        self.assertEqual(n1, n2)
+
 
 if __name__ == "__main__":
     run_tests()
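
Because the new test writes both chrome traces to /tmp, the same comparison can also be done offline. A hedged sketch (assuming the trace paths above, and that the exported files use the standard Kineto chrome-trace layout with a top-level "traceEvents" list):

import json


def kernel_names(trace_path):
    # Collect event names that look like matmul or Triton kernels,
    # mirroring the filter used by names() in the test above.
    with open(trace_path) as f:
        events = json.load(f)["traceEvents"]
    return {
        e.get("name", "")
        for e in events
        if "mm" in e.get("name", "") or "triton" in e.get("name", "")
    }


n1 = kernel_names("/tmp/trace1_pb.json")
n2 = kernel_names("/tmp/trace2_nopb.json")
assert n1 == n2, n1.symmetric_difference(n2)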

test/test_flop_counter.py

+1
@@ -854,5 +854,6 @@ def test_scaled_mm(self):
 
         self.assertExpectedInline(get_total_flops(mode), """860160""")
 
+
 if __name__ == "__main__":
     run_tests()

torch/_inductor/analysis/README.md

+2
@@ -0,0 +1,2 @@
# `torch._inductor.analysis`
Contains scripts for inductor performance analysis.

torch/_inductor/analysis/__init__.py

Whitespace-only changes.

0 commit comments
