Scheduler Flops refactor (#152708) · pytorch/pytorch@da0b89b · GitHub

Commit da0b89b

exclamaforte authored and pytorchmergebot committed
Scheduler Flops refactor (#152708)
This refactors `estimate_flops` and `get_estimated_runtime` on scheduler nodes:

1. New function on `BaseSchedulerNode`: `estimate_flops`. It now works with all types of IR nodes, not just `ExternalKernels`.
2. Extends `get_estimated_runtime` to work with non-`ExternalKernels`.

Prelude to: #149697

Testing: new unit tests cover the functionality.

Pull Request resolved: #152708
Approved by: https://github.com/xmfan, https://github.com/eellison
1 parent 073b025 commit da0b89b
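
For orientation before the diff: the message above names the two scheduler-level entry points, but the hunks below only show the supporting FX helpers and their tests. The following is a minimal sketch of how the refactored API might be consumed; the exact signatures (`estimate_flops()` returning `Optional[int]`, `get_estimated_runtime()` returning a float) are assumptions inferred from the commit message, and `summarize_node` is a hypothetical helper, not part of this PR.

from typing import Optional

def summarize_node(node) -> str:
    # `node` is assumed to be a BaseSchedulerNode. Per the commit message,
    # estimate_flops now works for all IR node types, not just ExternalKernels.
    flops: Optional[int] = node.estimate_flops()
    runtime = node.get_estimated_runtime()  # likewise extended to non-ExternalKernels
    if flops is None:
        return f"{node.get_name()}: runtime ~{runtime:.3g} (flops unknown)"
    return f"{node.get_name()}: {flops} flops, runtime ~{runtime:.3g}"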

File tree

4 files changed: +318 additions, -61 deletions
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
# Owner(s): ["module: inductor"]


import torch
import torch.utils.flop_counter
from torch._dynamo.utils import counters
from torch._inductor.ir import FixedLayout
from torch._inductor.utils import fresh_inductor_cache
from torch.testing._internal.common_cuda import SM70OrLater
from torch.testing._internal.common_device_type import (
    dtypes,
    instantiate_device_type_tests,
    skipCUDAIf,
)
from torch.testing._internal.common_utils import parametrize, run_tests, TestCase


def FlopCounterMode(*args, **kwargs):
    return torch.utils.flop_counter.FlopCounterMode(*args, **kwargs, display=False)


def get_total_flops(mode):
    return sum(v for _, v in mode.flop_counts["Global"].items())


def random_tensor(size, dtype, **kwargs):
    if dtype in [torch.half, torch.bfloat16, torch.float, torch.double]:
        return torch.randn(size, dtype=dtype, **kwargs)
    elif dtype in [torch.uint8, torch.int8, torch.short, torch.int, torch.long]:
        return torch.randint(0, 100, size, dtype=dtype, **kwargs)
    else:
        raise ValueError("Unsupported data type")


def cT(device, dtype):
    def T(*shape, requires_grad=False):
        return random_tensor(
            shape, requires_grad=requires_grad, device=device, dtype=dtype
        )

    return T


class TestScheduler(TestCase):
    @dtypes(torch.float, torch.float16)
    @skipCUDAIf(not SM70OrLater, "GPU capability is < SM70")
    @parametrize(
        "options",
        [
            {
                "max_autotune": True,
                "max_autotune_gemm_backends": "TRITON",
                "force_disable_caches": True,
            },
            {
                "max_autotune": True,
                "max_autotune_gemm_backends": "TRITON,ATEN",
                "force_disable_caches": True,
            },
        ],
    )
    def test_flop_counter_op(self, device, dtype, options):
        if device == "cpu":
            return
        if (
            options["max_autotune_gemm_backends"] == "TRITON"
            and torch.cuda.is_available()
            and not torch._inductor.utils.use_triton_template(
                FixedLayout(torch.device("cuda"), torch.float16, [400, 800])
            )
        ):
            return
        T = cT(device, dtype)

        def composite(x, y, z):
            tmp = torch.mm(x + 10, y / 12)
            return torch.mm(tmp, z)

        def composite_relu(x, y):
            tmp = torch.mm(x, y)
            return torch.relu(tmp)

        test_cases = [
            (torch.mm, [T(4, 5), T(5, 6)], {}),
            (torch.add, [T(4, 5), T(4, 5)], {}),
            (composite, [T(5, 4), T(4, 3), T(3, 12)], {}),
            (composite_relu, [T(5, 4), T(4, 3)], {}),
        ]
        for op, example_inputs, kwargs in test_cases:
            comp = torch.compile(op, options=options)
            # The next two lines are required; otherwise the flops will be
            # cached from previous runs of this function.
            torch._dynamo.reset()
            with fresh_inductor_cache():
                # Actually run to set the counters.
                comp(*example_inputs, **kwargs)
                with FlopCounterMode() as mode:
                    comp(*example_inputs, **kwargs)
            reference_flops = get_total_flops(mode)

            self.assertEqual(
                reference_flops,
                counters["inductor"]["flop_count"],
                msg=f"op = {op} reference flops = {reference_flops} != counters {counters['inductor']['flop_count']}",
            )
            if op != torch.add:
                self.assertNotEqual(reference_flops, 0, msg=f"op = {op} is 0 flops")
            counters["inductor"]["flop_count"] = 0


instantiate_device_type_tests(TestScheduler, globals())

if __name__ == "__main__":
    run_tests()
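
For reference, the numbers these assertions compare against follow the standard matmul formula: an [m, k] @ [k, n] matmul does one multiply and one add per accumulated output element, i.e. 2*m*k*n flops, while pointwise ops such as torch.add have no flop formula registered and count as 0 (hence the `op != torch.add` carve-out above). A quick sanity check of the test cases:

def mm_flops(m: int, k: int, n: int) -> int:
    # One multiply + one add for each of the m*n outputs, over the shared dim k.
    return 2 * m * k * n

assert mm_flops(4, 5, 6) == 240                       # the torch.mm case
assert mm_flops(5, 4, 3) + mm_flops(5, 3, 12) == 480  # composite: two chained mms
assert mm_flops(5, 4, 3) == 120                       # composite_relu: relu adds 0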

test/inductor/test_utils.py

Lines changed: 107 additions & 0 deletions
@@ -3,8 +3,10 @@
 from sympy import Symbol, sympify
 
 import torch
+from torch._inductor.fx_utils import count_flops_fx, countable_fx
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import sympy_str, sympy_subs
+from torch._inductor.virtualized import V
 
 
 class TestUtils(TestCase):
@@ -81,6 +83,111 @@ def test_sympy_str(self):
         self.assertEqual(sympy_str(sympify("a-b")), "a - b")
         self.assertEqual(sympy_str(sympify("a+-b")), "a - b")
 
+    def test_flops_fx(self):
+        def create_fx_node(
+            aten: torch._ops.OpOverloadPacket, args, kwargs
+        ) -> tuple[torch.fx.Node, torch.fx.Node]:
+            node1 = torch.fx.Node(
+                graph=torch.fx.Graph(),
+                name="",
+                op="call_function",
+                target=aten,
+                args=args,
+                kwargs=kwargs,
+            )
+            name: str = aten.overloads()[0]
+            op_overload: torch._ops.OpOverload = getattr(aten, name)
+            node2 = torch.fx.Node(
+                graph=torch.fx.Graph(),
+                name="",
+                op="call_function",
+                target=op_overload,
+                args=args,
+                kwargs=kwargs,
+            )
+            return node1, node2
+
+        with V.set_fake_mode(
+            torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
+        ):
+            trues = [
+                (
+                    torch.ops.aten.addmm,
+                    (torch.Tensor(4, 4), torch.Tensor(4, 5), torch.Tensor(5, 4)),
+                    {},
+                ),
+                (
+                    torch.ops.aten.bmm,
+                    (torch.Tensor(10, 4, 5), torch.Tensor(10, 5, 4)),
+                    {},
+                ),
+                (torch.ops.aten.mm, (torch.Tensor(2, 3), torch.Tensor(3, 2)), {}),
+                (
+                    torch.ops.aten.convolution,
+                    (
+                        torch.Tensor(2, 3, 3),
+                        torch.Tensor(2, 2, 2),
+                        torch.Tensor(2),
+                        (1, 1),
+                        (0, 0),
+                        (1, 1),
+                        True,
+                        (0, 0),
+                        1,
+                    ),
+                    {},
+                ),
+                (
+                    torch.ops.aten._convolution,
+                    (
+                        torch.Tensor(2, 2, 2),
+                        torch.Tensor(2, 2, 2),
+                        torch.Tensor(2),
+                        (1,),
+                        (0,),
+                        (1,),
+                        True,
+                        (0,),
+                        1,
+                        False,
+                        True,
+                        False,
+                    ),
+                    {},
+                ),
+            ]
+            # we don't support pointwise ops
+            falses = [
+                (
+                    torch.ops.aten.add,
+                    (torch.Tensor(1, 2, 3), torch.Tensor(1, 2, 3)),
+                    {},
+                ),
+                (
+                    torch.ops.aten.mul,
+                    (torch.Tensor(1, 2, 3), torch.Tensor(1, 2, 3)),
+                    {},
+                ),
+            ]
+            for t, args, kwargs in trues:
+                fx_node_1, fx_node_2 = create_fx_node(t, args, kwargs)
+                self.assertTrue(
+                    countable_fx(fx_node_1), f"Expected true {t}: {fx_node_1}"
+                )
+                self.assertTrue(
+                    countable_fx(fx_node_2), f"Expected true {t}: {fx_node_2}"
+                )
+                self.assertNotEqual(count_flops_fx(fx_node_1), None)
+                self.assertNotEqual(count_flops_fx(fx_node_2), None)
+            for f, args, kwargs in falses:
+                fx_node_1, fx_node_2 = create_fx_node(f, args, kwargs)
+                self.assertFalse(
+                    countable_fx(fx_node_1), f"Expected false {f}: {fx_node_1}"
+                )
+                self.assertFalse(
+                    countable_fx(fx_node_2), f"Expected false {f}: {fx_node_2}"
+                )
 
 if __name__ == "__main__":
     run_tests()
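
The trues/falses split in this test mirrors `torch.utils.flop_counter.flop_registry`, the table that `countable_fx` consults: it maps op overload packets for matmul- and convolution-family ops (plus attention variants) to flop formulas, and pointwise ops like `aten.add` and `aten.mul` simply have no entry. A quick way to inspect what is countable (a sketch, not part of the diff):

from torch.utils.flop_counter import flop_registry

# Keys are OpOverloadPackets, so countable_fx reduces to a membership test.
for packet in sorted(flop_registry, key=str):
    print(packet)  # e.g. aten.addmm, aten.bmm, aten.convolution, aten.mm, ...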

torch/_inductor/fx_utils.py

Lines changed: 33 additions & 0 deletions
@@ -8,6 +8,7 @@
 import torch
 import torch.fx
 from torch._dispatch.python import enable_python_dispatcher
+from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.fx.experimental.symbolic_shapes import (
     compute_unbacked_bindings,
     rebind_unbacked,
@@ -17,6 +18,7 @@
 from torch.utils import _pytree as pytree
 from torch.utils._ordered_set import OrderedSet
 from torch.utils._pytree import tree_map
+from torch.utils.flop_counter import flop_registry
 
 from .virtualized import V
 
@@ -250,3 +252,34 @@ def realizes_inputs(node: torch.fx.Node) -> bool:
 
     # Otherwise, assume node isn't realized
     return False
+
+
+def count_flops_fx(node: torch.fx.Node) -> Optional[int]:
+    if isinstance(node.target, str):
+        return None
+    with FakeTensorMode(allow_non_fake_inputs=True):
+        success, args, kwargs = get_fake_args_kwargs(node)
+
+        if success:
+            with torch.utils.flop_counter.FlopCounterMode(
+                display=False
+            ) as flop_counter_mode:
+                node.target(*args, **kwargs)
+
+            counted_flops = flop_counter_mode.get_total_flops()
+            return counted_flops
+    return None
+
+
+def countable_fx(node: torch.fx.Node) -> bool:
+    """
+    Whether or not we can count the flops of an FX node.
+    """
+    assert isinstance(node, torch.fx.Node)
+    if not hasattr(node, "target"):
+        return False
+    target = node.target
+    if not hasattr(target, "overloadpacket"):
+        return target in flop_registry
+    packet = target.overloadpacket
+    return packet in flop_registry
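
The unit test above exercises these helpers on hand-built FX nodes; on a traced graph they compose as below. This is a minimal sketch under assumptions: `make_fx` is used purely for illustration, and the expected output presumes the 2*m*k*n matmul formula; `count_flops_fx` returns None whenever a node's arguments cannot be fakeified.

import torch
from torch._inductor.fx_utils import count_flops_fx, countable_fx
from torch.fx.experimental.proxy_tensor import make_fx

def f(a, b):
    return torch.relu(a @ b)

gm = make_fx(f, tracing_mode="fake")(torch.randn(4, 5), torch.randn(5, 6))
for node in gm.graph.nodes:
    if node.op == "call_function" and countable_fx(node):
        # Only ops whose overload packet is in flop_registry land here;
        # relu is pointwise, so only the mm node should be counted.
        print(node.target, count_flops_fx(node))  # expect: aten.mm.default 240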
