[Inductor] Pattern matcher support for mutable ops with non-view inputs · pytorch/pytorch@eaf12c8 · GitHub
Commit eaf12c8

[Inductor] Pattern matcher support for mutable ops with non-view inputs
ghstack-source-id: daace00
Pull Request resolved: #152767
ghstack-source-id: dbf44a7
Pull Request resolved: #152775
1 parent 56d6d4d commit eaf12c8

File tree

4 files changed: +100 −3 lines changed

test/inductor/test_pattern_matcher.py (+75)

@@ -4,6 +4,7 @@
 import os
 import unittest
 from typing import Callable, Optional
+import functools

 import torch
 import torch._dynamo.config as dynamo_config

@@ -44,6 +45,7 @@
 )
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU
 from torch.utils import _pytree as pytree
+from torch.library import register_fake


 aten = torch.ops.aten

@@ -1655,6 +1657,79 @@ def my_func_static(x, w, epsilon):
         test, (code,) = run_and_get_code(my_func_static, *inputs)
         self.assertTrue("static_scaled_int8_quant" not in code)

+    def test_mutable_op_nonview_inputs_register_replacement(self):
+        @torch.library.custom_op("mylib::foo_inplace", mutates_args={"x"})
+        def foo_inplace(x: torch.Tensor) -> None:
+            x.add_(1)
+
+        # NOTE: only returning None is supported; the custom op cannot return `out`.
+        @torch.library.custom_op("mylib::bar", mutates_args={"out"})
+        def bar_out(x: torch.Tensor, out: torch.Tensor) -> None:
+            out.copy_(x + 2)
+
+        @register_fake("mylib::bar")
+        def bar_out_fake(x: torch.Tensor, out: torch.Tensor) -> None:
+            return None
+
+        @torch.library.custom_op("mylib::foobar_out", mutates_args={"out"})
+        def foobar_out(x: torch.Tensor, out: torch.Tensor) -> None:
+            x.add_(1)
+            out.copy_(x + 7)
+
+        def mutable_ops_pattern(x, out):
+            foo_inplace(x)
+            bar_out(x, out)
+            return out
+
+        def mutable_ops_replacement(x, out):
+            foobar_out(x, out)
+            return out
+
+        inp = torch.randn(3)
+
+        my_patterns = PatternMatcherPass()
+        register_replacement(
+            search_fn=mutable_ops_pattern,
+            replace_fn=mutable_ops_replacement,
+            example_inputs=[inp.clone().detach(), inp.clone().detach()],
+            trace_fn=functools.partial(fwd_only, apply_auto_functionalize=True),
+            pass_dicts=my_patterns,
+        )
+
+        count = 0
+
+        def custom_pass(graph: torch.fx.Graph):
+            nonlocal count
+            count = my_patterns.apply(graph)
+
+        def custom_backend(graph: torch.fx.GraphModule, example_inputs):
+            from torch._inductor import config
+
+            current_config = config.shallow_copy_dict()
+            from torch._inductor.compile_fx import compile_fx
+
+            current_config["post_grad_custom_post_pass"] = custom_pass
+            return compile_fx(graph, example_inputs, config_patches=current_config)
+
+        # user function
+        @torch.compile(fullgraph=True, backend=custom_backend)
+        def f(x):
+            x = x.clone()
+            out = torch.zeros_like(x)
+            foo_inplace(x)
+            bar_out(x, out)
+            return out
+
+        def f_replaced(x):
+            x = x.clone()
+            out = torch.zeros_like(x)
+            foobar_out(x, out)
+            return out
+
+        self.assertEqual(f(inp.clone().detach()), f_replaced(inp.clone().detach()))
+        self.assertEqual(count, 1)

 if __name__ == "__main__":
     if IS_LINUX and HAS_GPU:
(new file)

@@ -0,0 +1,6 @@
+DEBUG NOTES:
+1. Even the non-mutable-op version doesn't work right now, because the custom op is automatically wrapped in auto_functionalized / auto_functionalized_v2,
+   while the pattern is looking for vanilla ops.
+   TODO: we should convert the pattern to auto_functionalized_v2 and then do matching.
+   - Richard said we can maybe use torch.func.functionalize + make_fx.
+2. After the non-mutable-op version is fixed, we will move to the mutable-op-nonview version.
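Note 1's suggestion is essentially what the fwd_only change further down implements, except via the internal dispatch_functionalize helper rather than torch.func.functionalize: functionalize the pattern callable before make_fx so the mutable custom op is traced as an auto_functionalized node the matcher can look for. A minimal sketch of that idea (mylib::sketch_inplace is an illustrative op mirroring the test, not something this commit adds):

import torch
from torch.fx.experimental.proxy_tensor import make_fx
from torch._subclasses.functional_tensor import dispatch_functionalize

# Illustrative mutable custom op, shaped like the ones in the new test.
@torch.library.custom_op("mylib::sketch_inplace", mutates_args={"x"})
def sketch_inplace(x: torch.Tensor) -> None:
    x.add_(1)

def pattern(x):
    sketch_inplace(x)
    return x

# Functionalize first, then trace: the traced graph is expected to contain an
# auto_functionalized(_v2) call instead of the raw in-place mylib::sketch_inplace.
gm = make_fx(dispatch_functionalize(pattern), tracing_mode="real")(torch.randn(3))
print(gm.graph)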

torch/_inductor/fx_passes/post_grad.py (+1)

@@ -217,6 +217,7 @@ def post_grad_passes(gm: torch.fx.GraphModule, is_inference: bool):
         ),
     )
     gm.graph.lint()
+    print(f"after post_grad_passes: gm: {gm}")


 def prepare_softmax_pattern(x, dim):

torch/_inductor/pattern_matcher.py (+18 −3)

@@ -77,6 +77,7 @@
 from . import config
 from .decomposition import select_decomp_table
 from .lowering import fallback_node_due_to_unsupported_type
+from torch._subclasses.functional_tensor import FunctionalTensorMode, FunctionalTensor, dispatch_functionalize


 log = logging.getLogger(__name__)

@@ -1017,8 +1018,12 @@ def run(obj: PatternExpr, output_name: str = "output") -> str:
         """

         pp = PatternPrettyPrinter()
-        assert hasattr(obj, "pretty_print")
-        out_str = obj.pretty_print(pp=pp)
+        print(f"obj: {obj}, type(obj): {type(obj)}")
+        if isinstance(obj, KeywordArg):
+            out_str = obj.name
+        else:
+            assert hasattr(obj, "pretty_print")
+            out_str = obj.pretty_print(pp=pp)

         output = [
             f"{pp.memoized_objs_names[key]} = {pp.memoized_objs_pp[key]}"

@@ -1072,6 +1077,7 @@ def register(
         target: Union[torch.fx.node.Target, None] = None,
         prepend: bool = False,
     ) -> None:
+        print(f"target: {target}, self.pattern: {self.pattern}")
         if target is None:
             assert hasattr(self.pattern, "fns")
             for fn in self.pattern.fns:

@@ -1902,6 +1908,12 @@ def __getitem__(self, item: tuple[str, torch.fx.node.Target]) -> list[PatternEntry]:
         return self.patterns[item]

     def apply(self, gm: Union[torch.fx.GraphModule, torch.fx.Graph]) -> int:
+        import traceback
+        traceback.print_stack()
+        if "pass_pattern_" in str(traceback.format_stack()):
+            print(f"PatternMatcherPass: apply: entering, pass_pattern_, gm: {gm}")
+            for op, target in self.patterns:
+                print(f"self.patterns: op: {op}, target: {target}")
         if not self.patterns:
             return 0
         if isinstance(gm, torch.fx.GraphModule):

@@ -2082,15 +2094,18 @@ def fwd_only(
     fn: Callable[..., Any],
     args: Sequence[Any],
     *,
+    apply_auto_functionalize: bool = False,
     run_functional_passes: bool = True,
     get_decomp_fn: Optional[Callable[..., Any]] = None,
 ) -> torch.fx.GraphModule:
     """Build a normalized inference graph, for use with fx_to_pattern"""
-    # TODO - look into using aot autograd, asserting no mutating ops here
+
     with enable_python_dispatcher():
         decompositions = (
             get_decomp_fn() if get_decomp_fn is not None else select_decomp_table()
         )
+        if apply_auto_functionalize:
+            fn = dispatch_functionalize(fn)
         gm = make_fx(fn, decompositions, tracing_mode="real")(*args)

     from .fx_passes.post_grad import remove_noop_ops
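For reference, the test above exercises the new apply_auto_functionalize flag through functools.partial(fwd_only, apply_auto_functionalize=True) as the trace_fn. A minimal standalone sketch of calling fwd_only directly with the flag (mylib::demo_inplace is illustrative and not part of the commit); with the flag set, the pattern is functionalized before make_fx, so the traced graph is expected to contain auto_functionalized nodes rather than the raw in-place op:

import torch
from torch._inductor.pattern_matcher import fwd_only

# Illustrative mutable custom op, shaped like the ones in the new test.
@torch.library.custom_op("mylib::demo_inplace", mutates_args={"x"})
def demo_inplace(x: torch.Tensor) -> None:
    x.add_(1)

def pattern(x):
    demo_inplace(x)
    return x

# Trace the mutable pattern through the new functionalizing path.
gm = fwd_only(pattern, (torch.randn(3),), apply_auto_functionalize=True)
print(gm.graph)  # expected to show an auto_functionalized(_v2) call, not an in-place op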
