Commit 346e0f6
[Inductor] short-term fix for needs_fixed_stride_order silent incorrectness (#133452) (#133888)
This is a low-risk short-term fix for #128084, for the purposes of 2.4.1. The actual fix for that issue is riskier and we'll target 2.5.

needs_fixed_stride_order is silently incorrect with args that are mutable, because it creates clones of those args, writes into them, and doesn't update the original args.

This PR makes needs_fixed_stride_order not apply to inputs that are being mutated. It doesn't completely fix the problem, but it makes things less incorrect: most of the time the input already has the correct strides but Inductor fails to recognize it, and in those cases writing directly to the input is fine.

Test Plan:
- new test

Pull Request resolved: #133452
Approved by: https://github.com/eellison
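To make the failure mode concrete, here is a minimal standalone sketch (not part of this commit; constrained_call is a hypothetical helper) of the "clone, write into the clone, drop the result" pattern: the in-place write lands in the clone, and the caller's tensor is silently left stale.

import torch

# Hypothetical sketch of the bug pattern: a stride "constraint" that clones a
# mutated argument, so the in-place write goes into the clone, not the original.
def constrained_call(op, dst, src):
    dst_fixed = dst.contiguous()  # clones when dst is not already contiguous
    op(dst_fixed, src)            # the mutation happens on the clone...
    return dst                    # ...and the caller's dst never sees it

dst = torch.zeros(2, 3).t()       # non-contiguous view, so .contiguous() clones
src = torch.ones(3, 2)
constrained_call(torch.Tensor.copy_, dst, src)
print(dst)                        # still all zeros: the update was silently lost

The new test below exercises exactly this shape of program (a mutating custom op tagged needs_fixed_stride_order) through Inductor's fallback path.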
1 parent 362a6ca commit 346e0f6

File tree

3 files changed: +66 -4 lines

test/inductor/test_torchinductor.py
Lines changed: 29 additions & 0 deletions

@@ -9845,6 +9845,35 @@ def fn(x):
         # But because our custom op needs fixed layout, the assertions in the custom op will pass
         self.common(fn, (inp,), check_lowp=False)
 
+    @config.patch(implicit_fallbacks=True)
+    def test_mutable_custom_op_fixed_layout(self):
+        with torch.library._scoped_library("mylib", "DEF") as lib:
+            lib.define(
+                "copy_(Tensor(a!) dst, Tensor src) -> ()",
+                tags=torch.Tag.needs_fixed_stride_order,
+            )
+
+            @torch.library.impl(lib, "copy_", "Meta")
+            def _(dst, src):
+                return None
+
+            @torch.library.impl(lib, "copy_", "CompositeExplicitAutograd")
+            def _(dst, src):
+                dst.copy_(src)
+
+            def f(x):
+                full_default_3 = torch.full([3], 7.0, device="cpu")
+                chunk_cat_default_1 = torch.ops.mylib.copy_.default(full_default_3, x)
+                mul_out = torch.mul(full_default_3, full_default_3)
+                return mul_out
+
+            x = torch.arange(3, dtype=torch.float, device="cpu")
+            eager_out = f(x)
+
+            compiled_inductor_f = torch.compile(f, backend="inductor", fullgraph=True)
+            compiled_inductor_out = compiled_inductor_f(x)
+            self.assertEqual(compiled_inductor_out, eager_out)
+
     @requires_gpu()
     @config.patch(implicit_fallbacks=True)
     def test_custom_op_fixed_layout_channels_last(self):

torch/_inductor/graph.py
Lines changed: 5 additions & 3 deletions

@@ -1,4 +1,5 @@
 # mypy: allow-untyped-defs
+import functools
 import itertools
 import logging
 import operator
@@ -934,12 +935,13 @@ def get_custom_op_layout_constraints(target, args, kwargs):
                 # We have to set the current args because call_function will immediately
                 # evaluate this lowering after creating the fallback, without evaluating
                 # the layout constraint
-                args, kwargs = constrain_to_fx_strides(
-                    self.current_node, *args, **kwargs
+                constrain_fn = functools.partial(
+                    constrain_to_fx_strides, ignore_mutated_args_FIXME=True
                 )
+                args, kwargs = constrain_fn(self.current_node, *args, **kwargs)
                 # Also register the layout constraint so when the fallback
                 # is used again, we can constrain the args to the same layout
-                layout_constraint = constrain_to_fx_strides
+                layout_constraint = constrain_fn
             return layout_constraint, args, kwargs
 
         if target not in lowerings:
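A note on the graph.py hunk above: functools.partial bakes ignore_mutated_args_FIXME=True into the callable once, so the same configured function is both applied immediately and registered as the reusable layout_constraint. A tiny standalone sketch of that pattern (made-up names, not Inductor's API):

import functools

# Made-up stand-in for constrain_to_fx_strides: the keyword toggles behavior.
def constrain(node, *args, ignore_mutated_args_FIXME=False, **kwargs):
    mode = "skip-mutated-args" if ignore_mutated_args_FIXME else "constrain-all-args"
    return mode, node, args, kwargs

# Bind the flag once; every later call site gets the same behavior for free.
constrain_fn = functools.partial(constrain, ignore_mutated_args_FIXME=True)

print(constrain_fn("node_a", 1, 2))          # used immediately at lowering time
registered_layout_constraint = constrain_fn  # stored for later fallback reuse
print(registered_layout_constraint("node_b"))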

torch/_inductor/lowering.py
Lines changed: 32 additions & 1 deletion

@@ -2004,13 +2004,44 @@ def require_channels_last(_, *args, **kwargs):
     return args, kwargs
 
 
-def constrain_to_fx_strides(fx_node, *args, **kwargs):
+def constrain_to_fx_strides(fx_node, *args, ignore_mutated_args_FIXME=False, **kwargs):
     def apply_constraint(arg, fx_arg):
         if isinstance(arg, ir.IRNode):
             stride_order = ir.get_stride_order(fx_arg.meta["val"].stride())
             return ir.ExternKernel.require_stride_order(arg, stride_order)
         return arg
 
+    # There's a silent incorrectness bug where, if we constrain a mutated arg,
+    # we may end up cloning it, writing in-place to the clone, and then using
+    # the original value (instead of the cloned value). Our short-term fix for this
+    # is to never constrain mutated args; longer term we do want to fix this.
+    # https://github.com/pytorch/pytorch/issues/128084
+    if ignore_mutated_args_FIXME:
+        assert isinstance(fx_node.target, torch._ops.OpOverload)
+        schema = fx_node.target._schema
+
+        def maybe_apply_constraint(schema_arg, arg, fx_arg):
+            if schema_arg.alias_info is not None and schema_arg.alias_info.is_write:
+                return arg
+            return apply_constraint(arg, fx_arg)
+
+        new_args = []
+        new_kwargs = {}
+
+        for idx, (arg, fx_arg) in enumerate(zip(args, fx_node.args)):
+            schema_arg = schema.arguments[idx]
+            new_args.append(maybe_apply_constraint(schema_arg, arg, fx_arg))
+
+        schema_kwargs = {arg.name: arg for arg in schema.arguments}
+
+        for key in kwargs.keys():
+            arg = kwargs[key]
+            fx_arg = fx_node.kwargs[key]
+            schema_arg = schema_kwargs[key]
+            new_kwargs[key] = maybe_apply_constraint(schema_arg, arg, fx_arg)
+
+        return tuple(new_args), new_kwargs
+
     args = tuple(
         apply_constraint(arg, fx_arg) for arg, fx_arg in zip(args, fx_node.args)
     )
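The mutated-arg check in maybe_apply_constraint comes straight from the op schema's alias information: an argument counts as mutated when its alias_info is present and flagged as a write. As a quick illustration (not part of the diff), the same check can be run by hand on any OpOverload; aten::copy_ is used below only as a familiar example of a mutating op.

import torch

# Sketch of the schema inspection maybe_apply_constraint relies on: an argument
# is treated as mutated when its alias_info exists and is flagged as a write.
schema = torch.ops.aten.copy_.default._schema
for arg in schema.arguments:
    is_mutated = arg.alias_info is not None and arg.alias_info.is_write
    print(f"{arg.name}: mutated={is_mutated}")
# Expected: "self" (declared Tensor(a!)) is mutated; "src" and "non_blocking"
# are not, so only "self" would be exempt from the stride-order constraint.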
