refactor scheduler countflops and runtime · pytorch/pytorch@da087e3 · GitHub
Commit da087e3
refactor scheduler countflops and runtime
1 parent e9e1aac commit da087e3

File tree

4 files changed: +260 -60 lines changed
Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
# Owner(s): ["module: inductor"]

import torch
import torch.utils.flop_counter
from torch._inductor.debug import DebugContext
from torch._inductor.graph import GraphLowering
from torch._inductor.virtualized import V
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing._internal.common_cuda import SM70OrLater
from torch.testing._internal.common_device_type import (
    dtypes,
    instantiate_device_type_tests,
    skipCUDAIf,
)
from torch.testing._internal.common_utils import run_tests, TestCase


def FlopCounterMode(*args, **kwargs):
    return torch.utils.flop_counter.FlopCounterMode(*args, **kwargs, display=False)


def get_total_flops(mode):
    return sum(v for _, v in mode.flop_counts["Global"].items())


def random_tensor(size, dtype, **kwargs):
    if dtype in [torch.half, torch.bfloat16, torch.float, torch.double]:
        return torch.randn(size, dtype=dtype, **kwargs)
    elif dtype in [torch.uint8, torch.int8, torch.short, torch.int, torch.long]:
        return torch.randint(0, 100, size, dtype=dtype, **kwargs)
    else:
        raise ValueError("Unsupported data type")


def cT(device, dtype):
    def T(*shape, requires_grad=False):
        return random_tensor(
            shape, requires_grad=requires_grad, device=device, dtype=dtype
        )

    return T


class TestScheduler(TestCase):
    @dtypes(torch.float, torch.double)
    @skipCUDAIf(not SM70OrLater, "GPU capability is < SM70")
    def test_flop_counter_op(self, device, dtype):
        T = cT(device, dtype)

        def composite(x, y, z):
            tmp = torch.mm(x + 10, y / 12)
            return torch.mm(tmp, z)

        def composite_relu(x, y):
            tmp = torch.mm(x, y)
            return torch.relu(tmp)

        test_cases = [
            (torch.mm, [T(4, 5), T(5, 6)], {}),
            (torch.add, [T(4, 5), T(4, 5)], {}),
            (composite, [T(5, 4), T(4, 3), T(3, 12)], {}),
            (composite_relu, [T(5, 4), T(4, 3)], {}),
        ]
        for op, example_inputs, kwargs in test_cases:
            comp = torch.compile(op)
            with FlopCounterMode() as mode:
                comp(*example_inputs, **kwargs)
            gm = make_fx(op)(*example_inputs, **kwargs)
            reference_flops = get_total_flops(mode)

            graph = GraphLowering(gm)

            with V.set_graph_handler(graph), V.set_debug_handler(DebugContext()):
                graph.run(*example_inputs, **kwargs)
                graph.init_wrapper_code()
                graph._update_scheduler()
                scheduler_flops = 0
                for node in graph.scheduler.nodes:
                    flops = node.estimate_flops()
                    scheduler_flops += flops if flops is not None else 0
            self.assertEqual(reference_flops, scheduler_flops, msg=f"op = {op}")


instantiate_device_type_tests(TestScheduler, globals())

if __name__ == "__main__":
    run_tests()
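For reference, the reference-count side of this test reduces to a standalone pattern: FlopCounterMode records per-operator counts under the "Global" key, and summing them gives the total that the scheduler estimate is compared against. A minimal sketch, not part of the commit; the shapes are illustrative:

```python
import torch
from torch.utils.flop_counter import FlopCounterMode

# Count FLOPs for a single (4, 5) @ (5, 6) matmul, the same way the
# test's FlopCounterMode/get_total_flops helpers do.
x, y = torch.randn(4, 5), torch.randn(5, 6)
with FlopCounterMode(display=False) as mode:
    torch.mm(x, y)

total = sum(v for _, v in mode.flop_counts["Global"].items())
print(total)  # 2 * 4 * 5 * 6 = 240 FLOPs for the matmul
```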

test/inductor/test_utils.py

Lines changed: 107 additions & 0 deletions

@@ -3,8 +3,10 @@
 from sympy import Symbol, sympify

 import torch
+from torch._inductor.fx_utils import count_flops_fx, countable_fx
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import sympy_str, sympy_subs
+from torch._inductor.virtualized import V


 class TestUtils(TestCase):
@@ -81,6 +83,111 @@ def test_sympy_str(self):
         self.assertEqual(sympy_str(sympify("a-b")), "a - b")
         self.assertEqual(sympy_str(sympify("a+-b")), "a - b")

+    def test_flops_fx(self):
+        def create_fx_node(
+            aten: torch._ops.OpOverloadPacket, args, kwargs
+        ) -> tuple[torch.fx.Node, torch.fx.Node]:
+            node1 = torch.fx.Node(
+                graph=torch.fx.Graph(),
+                name="",
+                op="call_function",
+                target=aten,
+                args=args,
+                kwargs=kwargs,
+            )
+            name: str = aten.overloads()[0]
+            op_overload: torch._ops.OpOverload = getattr(aten, name)
+            node2 = torch.fx.Node(
+                graph=torch.fx.Graph(),
+                name="",
+                op="call_function",
+                target=op_overload,
+                args=args,
+                kwargs=kwargs,
+            )
+            return node1, node2
+
+        with V.set_fake_mode(
+            torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
+        ):
+            trues = [
+                (
+                    torch.ops.aten.addmm,
+                    (torch.Tensor(4, 4), torch.Tensor(4, 5), torch.Tensor(5, 4)),
+                    {},
+                ),
+                (
+                    torch.ops.aten.bmm,
+                    (torch.Tensor(10, 4, 5), torch.Tensor(10, 5, 4)),
+                    {},
+                ),
+                (torch.ops.aten.mm, (torch.Tensor(2, 3), torch.Tensor(3, 2)), {}),
+                (
+                    torch.ops.aten.convolution,
+                    (
+                        torch.Tensor(2, 3, 3),
+                        torch.Tensor(2, 2, 2),
+                        torch.Tensor(2),
+                        (1, 1),
+                        (0, 0),
+                        (1, 1),
+                        True,
+                        (0, 0),
+                        1,
+                    ),
+                    {},
+                ),
+                (
+                    torch.ops.aten._convolution,
+                    (
+                        torch.Tensor(2, 2, 2),
+                        torch.Tensor(2, 2, 2),
+                        torch.Tensor(2),
+                        (1,),
+                        (0,),
+                        (1,),
+                        True,
+                        (0,),
+                        1,
+                        False,
+                        True,
+                        False,
+                    ),
+                    {},
+                ),
+            ]
+            # we don't support pointwise ops
+            falses = [
+                (
+                    torch.ops.aten.add,
+                    (torch.Tensor(1, 2, 3), torch.Tensor(1, 2, 3)),
+                    {},
+                ),
+                (
+                    torch.ops.aten.mul,
+                    (torch.Tensor(1, 2, 3), torch.Tensor(1, 2, 3)),
+                    {},
+                ),
+            ]
+            for t, args, kwargs in trues:
+                fx_node_1, fx_node_2 = create_fx_node(t, args, kwargs)
+                self.assertTrue(
+                    countable_fx(fx_node_1), f"Expected true {t}: {fx_node_1}"
+                )
+                self.assertTrue(
+                    countable_fx(fx_node_2), f"Expected true {t}: {fx_node_2}"
+                )
+                self.assertNotEqual(count_flops_fx(fx_node_1), None)
+                self.assertNotEqual(count_flops_fx(fx_node_2), None)
+            for f, args, kwargs in falses:
+                fx_node_1, fx_node_2 = create_fx_node(f, args, kwargs)
+                self.assertFalse(
+                    countable_fx(fx_node_1), f"Expected false {f}: {fx_node_1}"
+                )
+                self.assertFalse(
+                    countable_fx(fx_node_2), f"Expected false {f}: {fx_node_2}"
+                )
+

 if __name__ == "__main__":
     run_tests()
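The two nodes returned by create_fx_node exist because an fx node's target can be either an OpOverloadPacket (torch.ops.aten.mm) or a resolved OpOverload (e.g. torch.ops.aten.mm.default), and countable_fx must accept both forms. A small sketch of the distinction, independent of the test:

```python
import torch

packet = torch.ops.aten.mm        # OpOverloadPacket: what the test passes as `aten`
name = packet.overloads()[0]      # an overload name, e.g. "default"
overload = getattr(packet, name)  # OpOverload: the resolved form

# countable_fx maps an overload back to its packet before consulting the
# flop registry; the packet is recoverable from the overload.
print(overload.overloadpacket is packet)  # True
```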

torch/_inductor/fx_utils.py

Lines changed: 30 additions & 0 deletions

@@ -8,6 +8,7 @@
 import torch
 import torch.fx
 from torch._dispatch.python import enable_python_dispatcher
+from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.fx.experimental.symbolic_shapes import (
     compute_unbacked_bindings,
     rebind_unbacked,
@@ -17,6 +18,7 @@
 from torch.utils import _pytree as pytree
 from torch.utils._ordered_set import OrderedSet
 from torch.utils._pytree import tree_map
+from torch.utils.flop_counter import flop_registry

 from .virtualized import V

@@ -250,3 +252,31 @@ def realizes_inputs(node: torch.fx.Node) -> bool:

     # Otherwise, assume node isn't realized
     return False
+
+
+def count_flops_fx(node: torch.fx.Node) -> Optional[int]:
+    if isinstance(node.target, str):
+        return None
+    with FakeTensorMode(allow_non_fake_inputs=True):
+        success, args, kwargs = get_fake_args_kwargs(node)
+
+        if success:
+            with torch.utils.flop_counter.FlopCounterMode(
+                display=False
+            ) as flop_counter_mode:
+                node.target(*args, **kwargs)
+
+            counted_flops = flop_counter_mode.get_total_flops()
+            return counted_flops
+        return None
+
+
+def countable_fx(node: torch.fx.Node) -> bool:
+    assert isinstance(node, torch.fx.Node)
+    if not hasattr(node, "target"):
+        return False
+    target = node.target
+    if not hasattr(target, "overloadpacket"):
+        return target in flop_registry
+    packet = target.overloadpacket
+    return packet in flop_registry
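Taken together, countable_fx gates which fx nodes the scheduler will try to count, and count_flops_fx dry-runs the op under a fake tensor mode with FlopCounterMode to produce the number. A usage sketch mirroring the new test above (the hand-built node and its shapes are illustrative, not from the commit):

```python
import torch
from torch._inductor.fx_utils import count_flops_fx, countable_fx
from torch._inductor.virtualized import V

with V.set_fake_mode(torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)):
    node = torch.fx.Node(
        graph=torch.fx.Graph(),
        name="",
        op="call_function",
        target=torch.ops.aten.mm,
        args=(torch.Tensor(4, 5), torch.Tensor(5, 6)),
        kwargs={},
    )
    print(countable_fx(node))    # True: aten.mm is in the flop registry
    print(count_flops_fx(node))  # expected 2 * 4 * 5 * 6 = 240
```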

torch/_inductor/scheduler.py

Lines changed: 36 additions & 60 deletions

@@ -28,7 +28,7 @@
 from torch._dynamo.utils import counters, dynamo_timed
 from torch._inductor.codecache import LambdaFuture, PyCodeCache
 from torch._inductor.metrics import get_metric_table, is_metric_table_enabled
-from torch.fx.experimental.symbolic_shapes import free_symbols, free_unbacked_symbols
+from torch.fx.experimental.symbolic_shapes import free_symbols
 from torch.utils._ordered_set import OrderedSet
 from torch.utils._sympy.symbol import free_symbol_is_type, symbol_is_type, SymT
 from torch.utils._triton import has_triton
@@ -39,8 +39,8 @@
 from .comm_analysis import estimate_nccl_collective_runtime
 from .dependencies import Dep, MemoryDep, StarDep, WeakDep
 from .exc import GPUTooOldForTriton, TritonMissing
+from .fx_utils import count_flops_fx, countable_fx
 from .ir import (
-    ComputedBuffer,
     get_device_type,
     GraphPartitionSignature,
     MultiOutput,
@@ -783,6 +783,21 @@ def get_buf_bytes(

         return buf_byte_accesses

+    @cache_on_self
+    def estimate_flops(self) -> int | None:
+        if self.node is None:
+            return None
+        fx_node = self.node.get_origin_node()
+        if fx_node is None:
+            return None
+        if not countable_fx(fx_node):
+            return None
+
+        flops = count_flops_fx(fx_node)
+
+        resolved_flops = V.graph.sizevars.size_hints((flops,), fallback=0)[0]
+        return resolved_flops
+
     @cache_on_self
     def get_estimated_runtime(self) -> float:
         """
@@ -823,57 +838,29 @@ def get_estimated_runtime(self) -> float:
         except Exception:
             return 0

-        if isinstance(self, ExternKernelSchedulerNode):
-            assert isinstance(self.node, ir.ExternKernel), f"{type(self.node)=}"
-            op = kernel_name_to_op.get(
-                getattr(self.node, "python_kernel_name", ""), None
+        if isinstance(self, FusedSchedulerNode):
+            flops_est: int | None = sum(
+                filter(
+                    None,
+                    (node.estimate_flops() for node in self.get_nodes()),
+                )
             )
+        else:
+            flops_est = self.estimate_flops()

-            # if there is a resolved op, dry-run using fake mode and record flop count
-            if op is not None:
-                from torch._subclasses.fake_tensor import FakeTensorMode
-                from torch.utils.flop_counter import FlopCounterMode
-
-                if any(
-                    len(free_unbacked_symbols(n.get_numel())) > 0
-                    for n in self.node.inputs
-                ):
-                    # Tensor has unbacked symints, we don't know how to estimate
-                    # runtime for that today
-                    return 0
-
-                with (
-                    FakeTensorMode() as fake_mode,
-                    FlopCounterMode(display=False) as flop_counter_mode,
-                    V.set_current_node(self.node.fx_node),
-                    V.set_fake_mode(fake_mode),
-                ):
-                    from .ir import ir_node_to_tensor
-
-                    fake_inputs = [
-                        ir_node_to_tensor(input, guard_shape=False)
-                        for input in self.node.inputs
-                    ]
-                    cls = self.node.__class__
-                    cls.process_kernel(op, *fake_inputs, **self.node.kwargs)
-
-                # TODO(xmfan): find a better heuristic to model FLOPS/latency relationship
-                factor = 1.0
-                counted_flops = flop_counter_mode.get_total_flops()
-                counted_bytes = self.get_read_write_buffers_sizes()
-                compute_time = (factor * counted_flops / gpu_flops) * 1e9
-                transfer_time = counted_bytes / gpu_memory_bandwidth
-
-                # Return estimated runtime in nanoseconds
-                return max(compute_time, transfer_time)
-
-        elif isinstance(self, FusedSchedulerNode) or isinstance(
-            self.node, ComputedBuffer
-        ):
-            # Return estimated runtime in nanoseconds (bytes / gbps)
+        if flops_est == 0 or flops_est is None:
+            # no flops estimate, so fall back to memory estimate
             return self.get_read_write_buffers_sizes() / gpu_memory_bandwidth

-        return 0
+        # TODO(xmfan): find a better heuristic to model FLOPS/latency relationship
+        factor = 1.0
+        counted_bytes = self.get_read_write_buffers_sizes()
+        counted_bytes = 0 if counted_bytes is None else counted_bytes
+        compute_time = (factor * flops_est / gpu_flops) * 1e9
+        transfer_time = counted_bytes / gpu_memory_bandwidth
+
+        # Return estimated runtime in nanoseconds
+        return max(compute_time, transfer_time)

     def get_template_node(self) -> Optional[ir.TemplateBuffer]:
         return None
@@ -987,17 +974,6 @@ def should_prune(dep: Dep) -> bool:
             node.set_read_writes(node.read_writes.remove_reads(deps_to_prune))


-# TODO(xmfan): reuse: an existing mapping for this if it exists, or formalize this into ir.py:ExternKernel
-kernel_name_to_op = {
-    "extern_kernels.convolution": torch.ops.aten.convolution,
-    "extern_kernels.mm": torch.ops.aten.mm,
-    "extern_kernels.bmm": torch.ops.aten.bmm,
-    "extern_kernels.addmm": torch.ops.aten.addmm,
-    "extern_kernels._scaled_mm": torch.ops.aten._scaled_mm,
-    "extern_kernels._scaled_grouped_mm": torch.ops.aten._scaled_grouped_mm,
-}
-
-
 class ExternKernelSchedulerNode(BaseSchedulerNode):
     def __init__(self, scheduler: Scheduler, node: ir.Operation) -> None:
         super().__init__(scheduler)
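Net effect of the scheduler change: per-node FLOP counts now come from estimate_flops() (summed over the constituents of a FusedSchedulerNode), and get_estimated_runtime() is a plain roofline estimate: the larger of compute time and transfer time wins, with a pure-memory fallback when no FLOPs can be counted. Illustrative arithmetic with made-up device peaks, not numbers from the commit:

```python
# Roofline estimate as in the refactored get_estimated_runtime().
# All device numbers below are hypothetical placeholders.
flops_est = 2 * 4096 * 4096 * 4096   # a 4096^3 matmul: ~1.37e11 FLOPs
counted_bytes = 3 * 4096 * 4096 * 2  # three fp16 buffers read/written

gpu_flops = 312e12                   # hypothetical peak, FLOP/s
gpu_memory_bandwidth = 2039          # hypothetical peak, GB/s (so bytes / GBps -> ns)

factor = 1.0
compute_time = (factor * flops_est / gpu_flops) * 1e9  # nanoseconds
transfer_time = counted_bytes / gpu_memory_bandwidth   # nanoseconds

# The node is modeled as bound by whichever side takes longer.
print(max(compute_time, transfer_time))  # ~4.4e5 ns: compute-bound here
```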

0 commit comments