auto functionalize base_hop · pytorch/pytorch@419d795 · GitHub

Commit 419d795

auto functionalize base_hop

ghstack-source-id: 5b120c8
Pull Request resolved: #151067

1 parent d3c6f5d commit 419d795

File tree

11 files changed: +497 -111 lines changed
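
Background for the diff below: a BaseHOP subgraph that mutates one of its inputs used to be rejected during tracing; with this change the mutation is auto-functionalized instead, i.e. the HOP call is rewritten into torch.ops.higher_order.auto_functionalized_v2, which runs a functional copy of the subgraph and threads the would-be-mutated tensors through _all_bases. The schema produced by the new gen_schema hook (first hunk below) carries the mutation annotations (e.g. Tensor(a1!)) that the pass appears to rely on via _op_schema. A minimal hand-rolled sketch of the idea in plain Python, illustrative only and not the PyTorch internals:

import torch

def mutating_subgraph(x, y):
    x.add_(1)          # in-place mutation of an input
    return x + y

def auto_functionalized(subgraph, *operands, bases):
    # Run the subgraph against clones of the "bases" (tensors the subgraph
    # would mutate), so the caller's tensors are left untouched.
    clones = {id(b): b.clone() for b in bases}
    remapped = [clones.get(id(a), a) for a in operands]
    out = subgraph(*remapped)
    # Return the regular outputs plus the updated bases; a caller that wants
    # the mutation to be visible copies them back explicitly.
    return out, [clones[id(b)] for b in bases]

x, y = torch.randn(3, 3), torch.randn(3, 3)
out, (new_x,) = auto_functionalized(mutating_subgraph, x, y, bases=(x,))
# x itself is unchanged here; re-apply the mutation only if desired:
# x.copy_(new_x)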

test/dynamo/test_base_hop.py

Lines changed: 192 additions & 35 deletions
@@ -28,6 +28,66 @@ def __init__(self):
     def __call__(self, subgraph, *operands, scheme):
         return super().__call__(subgraph, *operands, scheme=scheme)
 
+    def gen_schema(self, subgraph, *operands, scheme):
+        # Idea 1: using inspect.signature and sample inputs to generate a schema
+        # Idea 2: we still need to know how to call into subgraph/fn given the inputs.
+        # wrap_subgraphs gives two callable to call into subgraph.
+        from torch._higher_order_ops.schema import (
+            CFunctionSchemaGen,
+            HopArgumentInfoGen,
+        )
+        from torch._higher_order_ops.utils import (
+            check_input_alias_and_mutation_return_ouputs,
+        )
+
+        (
+            mutated_inp_idx,
+            inp_inp_alias,
+            inp_out_alias,
+            out_out_alias,
+            output,
+        ) = check_input_alias_and_mutation_return_ouputs(subgraph, operands)
+        assert (
+            len(inp_inp_alias) == 0
+            and len(inp_out_alias) == 0
+            and len(out_out_alias) == 0
+        ), f"Aliasing is not suppported for HOP subgraph. {subgraph}"
+
+        args = [
+            HopArgumentInfoGen.from_example(
+                subgraph, name="subgraph", default_value=None, is_mutated=False
+            )
+        ]
+        for idx, arg in enumerate(operands):
+            example_value = arg
+            arg_name = f"operands{idx}"
+            args.append(
+                HopArgumentInfoGen.from_example(
+                    example_value=example_value,
+                    name=arg_name,
+                    default_value=None,
+                    is_mutated=idx in mutated_inp_idx,
+                )
+            )
+
+        args.append(
+            HopArgumentInfoGen.from_example(
+                example_value=scheme,
+                name="scheme",
+                default_value=scheme,
+                is_mutated=False,
+                kw_only=True,
+            )
+        )
+        output = HopArgumentInfoGen.from_example(
+            example_value=output,
+            name="output",
+            default_value=None,
+            is_mutated=False,
+            kw_only=False,
+        )
+        return CFunctionSchemaGen.from_hop_argument_info(str(self), args, output)
+
 
 invoke_quant_test = InvokeQuantTest()
 
@@ -93,7 +153,7 @@ def f(x, y):
         self.assertEqual(len(schemas), 1)
         self.assertExpectedInline(
             str(schemas[0]),
-            """invoke_quant_test(Any subgraph, Tensor arg0, Tensor arg1, str scheme="nf4") -> ((Tensor))""",  # noqa: B950
+            """invoke_quant_test(Any subgraph, Tensor operands0, Tensor operands1, *, str scheme="nf4") -> ((Tensor))""",  # noqa: B950
         )
 
     def test_schema_gen_pytree_in_out(self):
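
A hedged sketch of exercising the gen_schema hook directly, assuming the InvokeQuantTest() instance defined in the first hunk is in scope and that a direct eager call works; the expected string is taken from the expecttest assertions in this file, so treat the output as illustrative:

import torch

def subgraph(x, y):
    return (x @ y).sin().cos()

x = torch.randn(3, 3)
y = torch.randn(3, 3)

# invoke_quant_test is the test-only InvokeQuantTest() instance from above.
schema = invoke_quant_test.gen_schema(subgraph, x, y, scheme="nf4")
print(schema)
# expected, per the assertions in this file:
# invoke_quant_test(Any subgraph, Tensor operands0, Tensor operands1, *, str scheme="nf4") -> ((Tensor))
# if the subgraph mutated operands0, that argument would instead carry a
# mutation annotation, e.g. Tensor(a1!) operands0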
@@ -121,7 +181,7 @@ def f(x, y):
         self.assertEqual(len(schemas), 1)
         self.assertExpectedInline(
             str(schemas[0]),
-            """invoke_quant_test(Any subgraph, Tensor arg0, Tensor arg1, str scheme="nf4") -> (Tensor, Tensor, Tensor, Tensor)""",  # noqa: B950
+            """invoke_quant_test(Any subgraph, Tensor operands0, Tensor operands1, *, str scheme="nf4") -> (Tensor, Tensor, Tensor, Tensor)""",  # noqa: B950
         )
 
     def test_schema_gen_single_return_with_mutation(self):
@@ -135,15 +195,40 @@ def inner(x, y):
 
         backend = EagerAndRecordGraphs()
 
-        @torch.compile(backend=backend, fullgraph=True)
         def f(x, y):
             return invoke_quant_test(inner, x, y, scheme="nf4")
 
-        with self.assertRaisesRegex(
-            RuntimeError,
-            "Encountered input mutation during higher order op tracing for HOP",
-        ):
-            f(x.clone(), y)
+        torch.compile(f, backend=backend, fullgraph=True)(x.clone(), y)
+        self.assertEqual(len(backend.graphs), 1)
+        self.assertExpectedInline(
+            normalize_graph(backend.graphs[0]),
+            """\
+class GraphModule(torch.nn.Module):
+    def forward(self, L_x_: "f32[3, 3]", L_y_: "f32[3, 3]"):
+        l_x_ = L_x_
+        l_y_ = L_y_
+
+        subgraph_0 = self.subgraph_0
+        invoke_quant_test = torch.ops.higher_order.invoke_quant_test(subgraph_0, l_x_, l_y_, scheme = 'nf4'); subgraph_0 = l_x_ = l_y_ = None
+        getitem: "f32[3, 3]" = invoke_quant_test[0]; invoke_quant_test = None
+        return (getitem,)
+
+    class subgraph_0(torch.nn.Module):
+        def forward(self, l_x_: "f32[3, 3]", l_y_: "f32[3, 3]"):
+            add_: "f32[3, 3]" = l_x_.add_(1); add_ = None
+
+            mul_: "f32[3, 3]" = l_y_.mul_(-1); mul_ = None
+
+            matmul: "f32[3, 3]" = l_x_ @ l_y_; l_x_ = l_y_ = None
+            sin: "f32[3, 3]" = matmul.sin(); matmul = None
+            cos: "f32[3, 3]" = sin.cos(); sin = None
+            return (cos,)
+""",  # noqa: B950
+        )
+        self.assertExpectedInline(
+            str(find_hop_schema(backend.graphs[0], invoke_quant_test)[0]),
+            """invoke_quant_test(Any subgraph, Tensor(a1!) operands0, Tensor(a2!) operands1, *, str scheme="nf4") -> ((Tensor))""",
+        )
 
     def test_schema_gen_pytree_in_out_with_mutation(self):
         def inner(x_y):
@@ -161,15 +246,46 @@ def inner(x_y):
 
         backend = EagerAndRecordGraphs()
 
-        @torch.compile(backend=backend, fullgraph=True)
         def f(x, y):
             return invoke_quant_test(inner, [x, y], scheme="nf4")
 
-        with self.assertRaisesRegex(
-            RuntimeError,
-            "Encountered input mutation during higher order op tracing for HOP",
-        ):
-            f(x.clone(), y)
+        torch.compile(f, backend=backend, fullgraph=True)(x.clone(), y)
+        self.assertEqual(len(backend.graphs), 1)
+        self.assertExpectedInline(
+            normalize_graph(backend.graphs[0]),
+            """\
+class GraphModule(torch.nn.Module):
+    def forward(self, L_x_: "f32[3, 3]", L_y_: "f32[3, 3]"):
+        l_x_ = L_x_
+        l_y_ = L_y_
+
+        subgraph_0 = self.subgraph_0
+        invoke_quant_test = torch.ops.higher_order.invoke_quant_test(subgraph_0, l_x_, l_y_, scheme = 'nf4'); subgraph_0 = l_x_ = l_y_ = None
+        getitem: "f32[3, 3]" = invoke_quant_test[0]
+        getitem_1: "f32[3, 3]" = invoke_quant_test[1]
+        getitem_2: "f32[3, 3]" = invoke_quant_test[2]
+        getitem_3: "f32[3, 3]" = invoke_quant_test[3]; invoke_quant_test = None
+        return (getitem, getitem_1, getitem_2, getitem_3)
+
+    class subgraph_0(torch.nn.Module):
+        def forward(self, l_x_: "f32[3, 3]", l_y_: "f32[3, 3]"):
+            add_: "f32[3, 3]" = l_x_.add_(1); add_ = None
+
+            matmul: "f32[3, 3]" = l_x_ @ l_y_
+            sin: "f32[3, 3]" = matmul.sin(); matmul = None
+            child: "f32[3, 3]" = sin.cos(); sin = None
+
+            child_1: "f32[3, 3]" = l_x_ + l_y_
+            child_2: "f32[3, 3]" = l_x_ - l_y_
+
+            child_3: "f32[3, 3]" = l_x_ @ l_y_; l_x_ = l_y_ = None
+            return (child, child_1, child_2, child_3)
+""",  # noqa: B950
+        )
+        self.assertExpectedInline(
+            str(find_hop_schema(backend.graphs[0], invoke_quant_test)[0]),
+            """invoke_quant_test(Any subgraph, Tensor(a1!) operands0, Tensor operands1, *, str scheme="nf4") -> (Tensor, Tensor, Tensor, Tensor)""",  # noqa: B950
+        )
 
     def test_none_input(self):
         def inner(x, y):
@@ -239,6 +355,44 @@ def forward(self, l_y_: "f32[3, 4]"):
 """,
         )
 
+    def test_auto_functionalize(self):
+        def inner(x, y):
+            x.add_(1)
+            return x + y
+
+        backend = AotEagerAndRecordGraphs()
+
+        def f(x, y):
+            return invoke_quant_test(inner, x, y, scheme="nf4")
+
+        x = torch.randn(3, 3, requires_grad=False)
+        x_clone = x.clone()
+        y = torch.randn(3, 3, requires_grad=True)
+        compiled_out = torch.compile(f, backend=backend, fullgraph=True)(x, y)
+        # assert x is not mutated
+        self.assertEqual(x, x_clone)
+        self.assertEqual(compiled_out, x + y + 1)
+        self.assertEqual(len(backend.fw_graphs), 1)
+        self.assertExpectedInline(
+            normalize_graph(backend.fw_graphs[0]),
+            """\
+class GraphModule(torch.nn.Module):
+    def forward(self, primals_1: "f32[3, 3]", primals_2: "f32[3, 3]"):
+        functiona_schema_0 = self.functiona_schema_0
+        auto_functionalized_subgraph_0 = self.auto_functionalized_subgraph_0
+        auto_functionalized_v2 = torch.ops.higher_order.auto_functionalized_v2(torch.ops.higher_order.invoke_quant_test, subgraph = auto_functionalized_subgraph_0, operands1 = primals_2, scheme = 'nf4', _operands0_base_index = 0, _all_bases = [primals_1], _op_schema = functiona_schema_0); auto_functionalized_subgraph_0 = functiona_schema_0 = None
+        getitem: "f32[3, 3]" = auto_functionalized_v2[0]; auto_functionalized_v2 = None
+        return (getitem, primals_1, primals_2)
+
+    class auto_functionalized_subgraph_0(torch.nn.Module):
+        def forward(self, arg0_1: "f32[3, 3]", arg1_1: "f32[3, 3]"):
+            add_: "f32[3, 3]" = torch.ops.aten.add_.Tensor(arg0_1, 1); arg0_1 = None
+
+            add: "f32[3, 3]" = torch.ops.aten.add.Tensor(add_, arg1_1); add_ = arg1_1 = None
+            return (add,)
+""",  # noqa: B950
+        )
+
     @torch._dynamo.config.patch(assume_static_by_default=True)
     def test_aot_eager(self):
         def inner(x, y):
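
An annotated restatement of the auto_functionalized_v2 call in the forward graph above; the per-keyword readings are inferred from this graph dump and the test's assertions, so treat them as a reading aid rather than a specification:

# torch.ops.higher_order.auto_functionalized_v2(
#     torch.ops.higher_order.invoke_quant_test,  # the HOP being functionalized
#     subgraph=auto_functionalized_subgraph_0,   # functional version of the inner graph
#     operands1=primals_2,                       # non-mutated operand, passed by name
#     scheme='nf4',                              # kw-only argument from the HOP schema
#     _operands0_base_index=0,                   # mutated operand -> entry 0 of _all_bases
#     _all_bases=[primals_1],                    # tensors backing mutated arguments
#     _op_schema=functiona_schema_0,             # schema produced by gen_schema
# )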
@@ -265,16 +419,17 @@ def f(x, y):
             """\
 class GraphModule(torch.nn.Module):
     def forward(self, primals_1: "f32[3, 3]", primals_2: "f32[3, 3]"):
-        subgraph0 = self.subgraph0
-        invoke_quant_test = torch.ops.higher_order.invoke_quant_test(subgraph0, primals_1, primals_2, scheme = 'nf4'); subgraph0 = None
-        getitem: "f32[3, 3]" = invoke_quant_test[0]; invoke_quant_test = None
+        functiona_schema_0 = self.functiona_schema_0
+        auto_functionalized_subgraph_0 = self.auto_functionalized_subgraph_0
+        auto_functionalized_v2 = torch.ops.higher_order.auto_functionalized_v2(torch.ops.higher_order.invoke_quant_test, subgraph = auto_functionalized_subgraph_0, operands0 = primals_1, operands1 = primals_2, scheme = 'nf4', _all_bases = [], _op_schema = functiona_schema_0); auto_functionalized_subgraph_0 = functiona_schema_0 = None
+        getitem: "f32[3, 3]" = auto_functionalized_v2[0]; auto_functionalized_v2 = None
         return (getitem, primals_1, primals_2)
 
-    class subgraph0(torch.nn.Module):
+    class auto_functionalized_subgraph_0(torch.nn.Module):
         def forward(self, arg0_1: "f32[3, 3]", arg1_1: "f32[3, 3]"):
             mm: "f32[3, 3]" = torch.ops.aten.mm.default(arg0_1, arg1_1); arg0_1 = arg1_1 = None
-            sin: "f32[3, 3]" = torch.ops.aten.sin.default(mm); mm = None
-            cos: "f32[3, 3]" = torch.ops.aten.cos.default(sin); sin = None
+            sin_: "f32[3, 3]" = torch.ops.aten.sin_.default(mm); mm = None
+            cos: "f32[3, 3]" = torch.ops.aten.cos.default(sin_); sin_ = None
             return (cos,)
 """,  # NOQA: B950
         )
@@ -285,20 +440,21 @@ def forward(self, arg0_1: "f32[3, 3]", arg1_1: "f32[3, 3]"):
             """\
 class GraphModule(torch.nn.Module):
     def forward(self, primals_1: "f32[3, 3]", primals_2: "f32[3, 3]", tangents_1: "f32[3, 3]"):
-        subgraph1 = self.subgraph1
-        invoke_quant_test_1 = torch.ops.higher_order.invoke_quant_test(subgraph1, primals_1, primals_2, tangents_1, scheme = 'nf4'); subgraph1 = primals_1 = primals_2 = tangents_1 = None
-        getitem_1: "f32[3, 3]" = invoke_quant_test_1[0]
-        getitem_2: "f32[3, 3]" = invoke_quant_test_1[1]; invoke_quant_test_1 = None
+        functiona_schema_1 = self.functiona_schema_1
+        auto_functionalized_subgraph_1 = self.auto_functionalized_subgraph_1
+        auto_functionalized_v2_1 = torch.ops.higher_order.auto_functionalized_v2(torch.ops.higher_order.invoke_quant_test, subgraph = auto_functionalized_subgraph_1, operands0 = primals_1, operands1 = primals_2, operands2 = tangents_1, scheme = 'nf4', _all_bases = [], _op_schema = functiona_schema_1); auto_functionalized_subgraph_1 = primals_1 = primals_2 = tangents_1 = functiona_schema_1 = None
+        getitem_1: "f32[3, 3]" = auto_functionalized_v2_1[0]
+        getitem_2: "f32[3, 3]" = auto_functionalized_v2_1[1]; auto_functionalized_v2_1 = None
         return (getitem_1, getitem_2)
 
-    class subgraph1(torch.nn.Module):
+    class auto_functionalized_subgraph_1(torch.nn.Module):
         def forward(self, arg0_1: "f32[3, 3]", arg1_1: "f32[3, 3]", arg2_1: "f32[3, 3]"):
             mm: "f32[3, 3]" = torch.ops.aten.mm.default(arg0_1, arg1_1)
             clone: "f32[3, 3]" = torch.ops.aten.clone.default(mm)
-            sin: "f32[3, 3]" = torch.ops.aten.sin.default(mm); mm = None
-            cos: "f32[3, 3]" = torch.ops.aten.cos.default(sin); cos = None
-            sin_1: "f32[3, 3]" = torch.ops.aten.sin.default(sin); sin = None
-            neg: "f32[3, 3]" = torch.ops.aten.neg.default(sin_1); sin_1 = None
+            sin_: "f32[3, 3]" = torch.ops.aten.sin_.default(mm); mm = None
+            cos: "f32[3, 3]" = torch.ops.aten.cos.default(sin_); cos = None
+            sin: "f32[3, 3]" = torch.ops.aten.sin.default(sin_); sin_ = None
+            neg: "f32[3, 3]" = torch.ops.aten.neg.default(sin); sin = None
             mul: "f32[3, 3]" = torch.ops.aten.mul.Tensor(arg2_1, neg); arg2_1 = neg = None
             cos_1: "f32[3, 3]" = torch.ops.aten.cos.default(clone); clone = None
             mul_1: "f32[3, 3]" = torch.ops.aten.mul.Tensor(mul, cos_1); mul = cos_1 = None
@@ -320,21 +476,22 @@ def inner2(x, y):
 
         x = torch.randn(3, 3)
         y = torch.randn(3, 3)
+        x_clone = x.clone()
+        y_clone = y.clone()
 
         @torch.compile(backend="eager", fullgraph=True)
         def f(inner, x, y):
             return invoke_quant_test(inner, x, y, scheme="nf4")
 
+        compiled_f = torch.compile(f, backend="eager", fullgraph=True)
+
         with self.assertRaisesRegex(
             RuntimeError, "Encountered aliasing during higher order op tracing for HOP"
         ):
-            f(inner, x, y)
+            compiled_f(inner, x, y)
 
-        with self.assertRaisesRegex(
-            RuntimeError,
-            "Encountered input mutation during higher order op tracing for HOP",
-        ):
-            f(inner2, x, y)
+        compiled_out = compiled_f(inner2, x, y)
+        self.assertEqual(compiled_out, f(inner2, x_clone, y_clone))
 
     def test_eager_call(self):
         def inner(x, y):

test/higher_order_ops/test_invoke_quant.py

Lines changed: 13 additions & 13 deletions
@@ -13,7 +13,6 @@
 from torch._higher_order_ops import InvokeQuant
 from torch._inductor import config
 from torch._inductor.pattern_matcher import (
-    Arg,
     CallFunction,
     Ignored,
     Match,
@@ -119,9 +118,10 @@ def fn(x, y, z):
         logs = "\n".join(r.getMessage() for r in log.records)
         f = FileCheck()
         f.check("AFTER POST GRAD")
-        f.check("subgraph0").check("subgraph1")
-        for _ in range(2):
-            f.check("torch.ops.higher_order.invoke_quant(").check_same("nf4")
+        f.check("subgraph0_1")
+        f.check("torch.ops.higher_order.invoke_quant(").check_same("nf4")
+        f.check("subgraph0_0")
+        f.check("torch.ops.higher_order.invoke_quant(").check_same("nf4")
         f.run(logs)
 
 
@@ -159,15 +159,15 @@ def fn_no_match(x, y, z):
 
         @register_graph_pattern(
             CallFunction(
-                torch.ops.aten.mm,
-                CallFunction(
-                    torch.ops.higher_order.invoke_quant,
-                    Ignored(),
-                    Ignored(),
-                    Ignored(),
-                    scheme="nf4",
-                ),
-                Arg(),
+                torch.ops.higher_order.auto_functionalized_v2,
+                Ignored(),
+                subgraph=Ignored(),
+                arg0=Ignored(),
+                arg1=Ignored(),
+                scheme="nf4",
+                quant_options=Ignored(),
+                _all_bases=Ignored(),
+                _op_schema=Ignored(),
             ),
             pass_dict=test_pass,
         )
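
Post-grad graphs now carry the functionalized wrapper rather than a bare invoke_quant call, so the test's pattern is re-registered against auto_functionalized_v2. A hedged sketch of the node shape the new pattern is presumably meant to match; keyword names mirror the registration above, while the exact post-grad call is an assumption:

# auto_functionalized_v2 = torch.ops.higher_order.auto_functionalized_v2(
#     torch.ops.higher_order.invoke_quant,  # positional: the wrapped HOP
#     subgraph=...,                         # the invoke_quant subgraph module
#     arg0=..., arg1=...,                   # operands, passed as keywords
#     scheme='nf4',
#     quant_options=...,
#     _all_bases=[],                        # nothing mutated in this case
#     _op_schema=...,
# )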

torch/_dynamo/variables/higher_order_ops.py

Lines changed: 1 addition & 1 deletion
@@ -3105,7 +3105,7 @@ def maybe_positional_arg_names(func):
 class BaseHOPVariable(WrapHigherOrderVariable):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.supports_input_mutation = False
+        self.supports_input_mutation = True
         self.supports_aliasing = False
 
     def python_type(self):
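
A hedged illustration of the user-visible effect of flipping supports_input_mutation, mirroring test_auto_functionalize above; it reuses the test-only invoke_quant_test HOP from test/dynamo/test_base_hop.py, and before this commit the compiled call raised the input-mutation error instead:

import torch
# invoke_quant_test is the test-only BaseHOP instance from test/dynamo/test_base_hop.py.

def inner(x, y):
    x.add_(1)              # input mutation inside the HOP subgraph
    return x + y

def f(x, y):
    return invoke_quant_test(inner, x, y, scheme="nf4")

x, y = torch.randn(3, 3), torch.randn(3, 3)
# Previously: RuntimeError("Encountered input mutation during higher order op
# tracing for HOP"). Now the mutation is auto-functionalized and this compiles.
out = torch.compile(f, backend="aot_eager", fullgraph=True)(x, y)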

0 commit comments