Commit 1ba7adc
[Inductor] short-term fix for needs_fixed_stride_order silent incorrectness
This is a low-risk short-term fix for #128084, for the purposes of 2.4.1. The actual fix for that issue is more risky and we'll target 2.5.

needs_fixed_stride_order is silently incorrect with args that are mutable because it creates clones of those args, writes into them, and doesn't update the original args.

This PR makes it so that needs_fixed_stride_order doesn't apply to inputs that are being mutated. This PR doesn't completely fix the problem, but it makes it less incorrect: most of the time the input already has the correct strides but inductor fails to recognize it, and in those cases writing directly to the input is fine.

Test Plan:
- new test

ghstack-source-id: 8d19392
Pull Request resolved: #133452
1 parent cd565bc commit 1ba7adc
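To illustrate the failure mode described above, here is a toy sketch (not Inductor's actual code; `constrain_by_cloning` and `mutable_op` are made-up stand-ins): satisfying a stride constraint by cloning an argument that the op mutates means the write lands in the clone and the caller's buffer never sees it, which is why this fix leaves mutated inputs unconstrained.

```python
import torch

def constrain_by_cloning(arg):
    # Stand-in for a stride-order constraint: materialize a fresh copy.
    return arg.contiguous().clone()

def mutable_op(ret, src):
    ret.copy_(src)  # writes in place into whatever tensor it is handed

buf = torch.full([3], 7.0)
src = torch.arange(3, dtype=torch.float)

mutable_op(constrain_by_cloning(buf), src)   # buggy path: write lands in the clone
print(buf)                                   # still tensor([7., 7., 7.])

mutable_op(buf, src)                         # fixed path: mutated arg passed through
print(buf)                                   # tensor([0., 1., 2.])
```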

File tree

4 files changed: +70 -4 lines changed

test/inductor/test_torchinductor.py

Lines changed: 31 additions & 0 deletions
@@ -10571,6 +10571,37 @@ def fn(x):
                 # But because our custom op needs fixed layout, the assertions in the custom op will pass
                 self.common(fn, (inp,), check_lowp=False)

+    @config.patch(implicit_fallbacks=True)
+    def test_mutable_custom_op_fixed_layout(self):
+        with torch.library._scoped_library("mylib", "DEF") as lib:
+            lib.define(
+                "copy_(Tensor(a!) ret, Tensor tensors, int dim) -> ()",
+                tags=torch.Tag.needs_fixed_stride_order,
+            )
+
+            @torch.library.impl(lib, "copy_", "Meta")
+            def _(ret, tensors, dim):
+                return None
+
+            @torch.library.impl(lib, "copy_", "CPU")
+            def _(ret, tensors, dim):
+                ret.copy_(tensors)
+
+            def f(x):
+                full_default_3 = torch.full([3], 7.0, device="cpu")
+                chunk_cat_default_1 = torch.ops.mylib.copy_.default(
+                    full_default_3, x, 0
+                )
+                mul_out = torch.mul(full_default_3, full_default_3)
+                return mul_out
+
+            x = torch.arange(3, dtype=torch.float, device="cpu")
+            eager_out = f(x)
+
+            compiled_inductor_f = torch.compile(f, backend="inductor", fullgraph=True)
+            compiled_inductor_out = compiled_inductor_f(x)
+            self.assertEqual(compiled_inductor_out, eager_out)
+
     @requires_gpu()
     @config.patch(implicit_fallbacks=True)
     def test_custom_op_fixed_layout_channels_last(self):
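For concreteness, the values the new test compares, worked out in eager mode (the pre-fix compiled result below is inferred from the commit message, not reproduced here):

```python
import torch

x = torch.arange(3, dtype=torch.float)   # tensor([0., 1., 2.])
buf = torch.full([3], 7.0)               # tensor([7., 7., 7.])
buf.copy_(x)                             # mylib.copy_ writes x into buf in place
print(torch.mul(buf, buf))               # eager_out: tensor([0., 1., 4.])

# Before this fix, the compiled graph could clone `buf` to satisfy the stride
# constraint; the write would then land in the clone and the multiply would
# still see the 7s, yielding tensor([49., 49., 49.]) -- the silent incorrectness.
```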

torch/_inductor/graph.py

Lines changed: 4 additions & 3 deletions
@@ -991,12 +991,13 @@ def get_custom_op_layout_constraints(
                 # We have to OrderedSet the current args because call_function will immediately
                 # evaluate this lowering after creating the fallback, without evaluating
                 # the layout constraint
-                args, kwargs = constrain_to_fx_strides(
-                    self.current_node, *args, **kwargs
+                constrain_fn = functools.partial(
+                    constrain_to_fx_strides, ignore_mutated_args_FIXME=True
                 )
+                args, kwargs = constrain_fn(self.current_node, *args, **kwargs)
                 # Also register the layout constraint so when the fallback
                 # is used again, we can constrain the args to the same layout
-                layout_constraint = constrain_to_fx_strides
+                layout_constraint = constrain_fn
             return layout_constraint, args, kwargs

         if target not in lowerings:
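The point of the `functools.partial` here is that the flag travels with the function object: both the immediate call and the `layout_constraint` registered for later fallback uses take the same skip-mutated-args path. A minimal standalone illustration (a toy stand-in, not the real lowering):

```python
import functools

# Toy stand-in: only the keyword-bound variant skips mutated args, and
# partial() bakes that choice into constrain_fn once and for all.
def constrain_to_fx_strides(fx_node, *args, ignore_mutated_args_FIXME=False, **kwargs):
    return "skip mutated args" if ignore_mutated_args_FIXME else "constrain everything"

constrain_fn = functools.partial(constrain_to_fx_strides, ignore_mutated_args_FIXME=True)

print(constrain_to_fx_strides("node"))  # constrain everything
print(constrain_fn("node"))             # skip mutated args, also when reused later
```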

torch/_inductor/lowering.py

Lines changed: 23 additions & 1 deletion
@@ -2067,13 +2067,35 @@ def require_channels_last(_, *args, **kwargs):
     return args, kwargs


-def constrain_to_fx_strides(fx_node, *args, **kwargs):
+def constrain_to_fx_strides(fx_node, *args, ignore_mutated_args_FIXME=False, **kwargs):
     def apply_constraint(arg, fx_arg):
         if isinstance(arg, ir.IRNode):
             stride_order = ir.get_stride_order(fx_arg.meta["val"].stride())
             return ir.ExternKernel.require_stride_order(arg, stride_order)
         return arg

+    if ignore_mutated_args_FIXME:
+        assert isinstance(fx_node.target, torch._ops.OpOverload)
+        schema = fx_node.target._schema
+
+        new_args = []
+        new_kwargs = {}
+        schema_args, schema_kwargs = torch._library.utils.schema_args_kwargs(schema)
+        for arg, fx_arg, schema_arg in zip(args, fx_node.args, schema_args):
+            if schema_arg.alias_info is not None and schema_arg.alias_info.is_write:
+                new_args.append(arg)
+            else:
+                new_args.append(apply_constraint(arg, fx_arg))
+        for key in kwargs:
+            arg = kwargs[key]
+            fx_arg = fx_node.kwargs[key]
+            schema_arg = schema_kwargs[key]
+            if schema_arg.alias_info is not None and schema_arg.alias_info.is_write:
+                new_kwargs[key] = arg
+            else:
+                new_kwargs[key] = apply_constraint(arg, fx_arg)
+        return tuple(new_args), new_kwargs
+
     args = tuple(
         apply_constraint(arg, fx_arg) for arg, fx_arg in zip(args, fx_node.args)
     )
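The new branch keys on the schema's alias annotations. A quick way to see which arguments it leaves unconstrained, using the schema string from the new test (assumes `torch._C.parse_schema` as in recent PyTorch releases):

```python
import torch

schema = torch._C.parse_schema(
    "mylib::copy_(Tensor(a!) ret, Tensor tensors, int dim) -> ()"
)
for arg in schema.arguments:
    mutated = arg.alias_info is not None and arg.alias_info.is_write
    # `ret` carries the (a!) write annotation and is passed through untouched;
    # `tensors` and `dim` still get the fixed-stride constraint applied.
    print(arg.name, "mutated" if mutated else "read-only")
```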

torch/_library/utils.py

Lines changed: 12 additions & 0 deletions
@@ -177,6 +177,18 @@ def fill_defaults(schema, args, kwargs):
     return tuple(new_args), new_kwargs


+def schema_args_kwargs(schema):
+    args = []
+    kwargs = {}
+    for i in range(len(schema.arguments)):
+        info = schema.arguments[i]
+        if info.kwarg_only:
+            kwargs[info.name] = info
+            continue
+        args.append(info)
+    return tuple(args), kwargs
+
+
 def zip_schema(
     schema: _C.FunctionSchema, args: Tuple[Any, ...], kwargs: Dict[str, Any]
 ) -> Iterable[Tuple[_C.Argument, Any]]:
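A sketch of what the new helper returns: positional arguments as a tuple of `Argument` objects and kwarg-only arguments as a name-to-`Argument` dict. The `mylib::scale` signature is hypothetical, and the import requires this patch to be applied:

```python
import torch
from torch._library.utils import schema_args_kwargs  # added by this commit

schema = torch._C.parse_schema(
    "mylib::scale(Tensor(a!) out, Tensor src, *, float alpha=1.0) -> ()"
)
args, kwargs = schema_args_kwargs(schema)
print([a.name for a in args])   # positional: ['out', 'src']
print(list(kwargs.keys()))      # kwarg-only: ['alpha']
```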
