[invoke_subgraph] Run missing graph passes recursively · pytorch/pytorch@9b0f583 · GitHub
Commit 9b0f583

[invoke_subgraph] Run missing graph passes recursively

ghstack-source-id: 48ffd03
Pull Request resolved: #152675

[invoke_subgraph] Force the output to have same strides as meta

1 parent 99e6c92 commit 9b0f583
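
The two passes touched here (Inductor's view_to_reshape and FX's FakeTensorProp) previously walked only the top-level graph, while an invoke_subgraph region keeps its body as a nested GraphModule that the parent graph reaches through a get_attr node. A minimal sketch of the traversal both fixes rely on (the helper name and shape are mine, for illustration only):

import torch.fx as fx

def nested_subgraph_modules(gm: fx.GraphModule):
    # Yield (attr_name, child GraphModule) pairs reachable from top-level
    # get_attr nodes; invoke_subgraph bodies hang off the parent module this
    # way, so a pass that only scans gm.graph never sees their nodes.
    for node in gm.graph.nodes:
        if node.op == "get_attr" and isinstance(node.target, str) and "." not in node.target:
            sub = getattr(gm, node.target, None)
            if isinstance(sub, fx.GraphModule):
                yield node.target, sub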

File tree

4 files changed: +93 -0 lines changed

test/higher_order_ops/test_invoke_subgraph.py

Lines changed: 40 additions & 0 deletions
@@ -617,6 +617,46 @@ def forward(self, a: "f32[8]", l_y_: "f32[8]"):
 """,
             )
 
+    def test_view_to_reshape(self):
+        @mark_compile_region
+        def gn(x):
+            x = torch.sin(x)
+            x = x.view(1, 8)
+            return torch.sin(x)
+
+        def fn(x):
+            return gn(x)
+
+        x = torch.randn(8, requires_grad=False)
+
+        torch._dynamo.reset()
+        backend = InductorAndRecordGraphs()
+        torch.compile(fn, backend=backend, fullgraph=True)(x)
+
+        if not TEST_WITH_CROSSREF:
+            self.assertExpectedInline(
+                normalize_gm(
+                    backend.inductor_graphs[0].print_readable(print_output=False)
+                ),
+                """\
+class <lambda>(torch.nn.Module):
+    def forward(self, arg0_1: "f32[8]"):
+        repeated_subgraph0 = self.repeated_subgraph0
+        invoke_subgraph = torch.ops.higher_order.invoke_subgraph(repeated_subgraph0, 'subgraph_0', arg0_1); repeated_subgraph0 = arg0_1 = None
+        getitem: "f32[1, 8]" = invoke_subgraph[0]; invoke_subgraph = None
+        return (getitem,)
+
+    class repeated_subgraph0(torch.nn.Module):
+        def forward(self, arg0_1: "f32[8]"):
+            sin: "f32[8]" = torch.ops.aten.sin.default(arg0_1); arg0_1 = None
+
+            view: "f32[1, 8]" = torch.ops.aten.reshape.default(sin, [1, 8]); sin = None
+
+            sin_1: "f32[1, 8]" = torch.ops.aten.sin.default(view); view = None
+            return (sin_1,)
+""",
+            )
+
     def test_normalize_gm(self):
         @mark_compile_region
         def gn(x, y):
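
The interesting part of the expected output is inside repeated_subgraph0: the x.view(1, 8) written in gn shows up as torch.ops.aten.reshape.default, i.e. the view_to_reshape pass now reaches the subgraph body. For comparison, tracing the same body to aten ops outside of Inductor still shows the original view op (a quick illustrative check, not part of the test):

import torch
from torch.fx.experimental.proxy_tensor import make_fx

def body(x):
    x = torch.sin(x)
    x = x.view(1, 8)
    return torch.sin(x)

gm = make_fx(body)(torch.randn(8))
# The aten-level graph still contains torch.ops.aten.view.default here;
# Inductor's post-grad view_to_reshape pass is what rewrites it to reshape.
print(gm.graph)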

torch/_inductor/fx_passes/post_grad.py

Lines changed: 9 additions & 0 deletions
@@ -1207,6 +1207,15 @@ def view_to_reshape(gm):
     ):
         nd.target = torch.ops.aten.reshape.default
 
+    subgraph_names: OrderedSet[str] = OrderedSet()
+    for node in sorted(gm.graph.find_nodes(op="get_attr")):
+        attr_name = node.target
+        if "." not in attr_name and attr_name not in subgraph_names:
+            sub_mod = getattr(gm, attr_name)
+            if isinstance(sub_mod, torch.fx.GraphModule):
+                subgraph_names.add(attr_name)
+                view_to_reshape(sub_mod)
+
 
 def should_prefer_unfused_addmm(match):
     inp = match.kwargs["inp"]
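
The pre-existing part of the pass (the retarget shown in the context lines above) only touched nodes of gm.graph; the added loop re-invokes view_to_reshape on every child GraphModule found behind a top-level get_attr node, deduplicating by attribute name. A self-contained approximation of the combined behavior, exercised on a flat aten graph built with make_fx (function name and toy graph are mine; the real pass lives in this file and handles more cases):

import torch
import torch.fx as fx
from torch.fx.experimental.proxy_tensor import make_fx

def view_to_reshape_sketch(gm: fx.GraphModule) -> None:
    # Retarget aten.view -> aten.reshape in this graph ...
    for nd in gm.graph.nodes:
        if nd.op == "call_function" and nd.target is torch.ops.aten.view.default:
            nd.target = torch.ops.aten.reshape.default
    # ... then recurse into nested GraphModules referenced by get_attr nodes.
    for nd in gm.graph.nodes:
        if nd.op == "get_attr" and isinstance(nd.target, str) and "." not in nd.target:
            sub = getattr(gm, nd.target, None)
            if isinstance(sub, fx.GraphModule):
                view_to_reshape_sketch(sub)
    gm.recompile()

x = torch.randn(8)
gm = make_fx(lambda t: t.view(2, 4).mul(2))(x)   # contains aten.view.default
view_to_reshape_sketch(gm)
assert all(nd.target is not torch.ops.aten.view.default for nd in gm.graph.nodes)
assert torch.allclose(gm(x), x.view(2, 4).mul(2))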

torch/_inductor/ir.py

Lines changed: 15 additions & 0 deletions
@@ -7431,6 +7431,10 @@ def _has_aliased_buffers(buffers: Sequence[IRNode]) -> bool:
 
 @ir_dataclass(frozen=False)
 class InvokeSubgraph(ExternKernel):
+    """
+    Implementation of InvokeSubgraph HOP
+    """
+
     subgraph: Optional[Subgraph] = None
     operands: Optional[list[TensorBox]] = None
     outputs: Optional[list[MultiOutput]] = None
@@ -7515,6 +7519,17 @@ def create_output(output: IRNode, ind: int):
                     skip_size_stride_alignment_checks=True,
                 )
 
+        # Force the output strides to be same as the original strides
+        new_outputs = []
+        fake_outputs = V.graph.current_node.meta["val"]
+        for idx, output in enumerate(outputs):
+            if isinstance(output, (ShapeAsConstantBuffer, NoneAsConstantBuffer)):
+                new_outputs.append(output)
+            else:
+                example_stride = handle_sym_expr(fake_outputs[idx].stride())
+                new_outputs.append(cls.require_exact_strides(output, example_stride))
+        outputs = new_outputs
+
         outputs = [create_output(output, i) for i, output in enumerate(outputs)]
         invoke_subgraph.outputs = outputs
         return outputs
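
The second hunk enforces that each realized output of the HOP carries the strides recorded on the current node's meta "val" (the fake-tensor outputs), skipping constant and None outputs. Conceptually this is the same contract as the eager-level helper sketched below, which materializes a copy with the promised layout when the strides disagree (my illustration only; the real code goes through cls.require_exact_strides on Inductor IR and handle_sym_expr for symbolic strides):

import torch

def match_example_strides(out: torch.Tensor, example: torch.Tensor) -> torch.Tensor:
    # If the computed output's strides differ from what the meta/fake tensor
    # promised, materialize a tensor with the promised strides and copy into it.
    if out.stride() == example.stride():
        return out
    fixed = torch.empty_strided(
        example.size(), example.stride(), dtype=out.dtype, device=out.device
    )
    fixed.copy_(out)
    return fixed

example = torch.empty(4, 6)      # contiguous: stride (6, 1)
out = torch.randn(6, 4).t()      # same shape (4, 6) but stride (1, 6)
assert match_example_strides(out, example).stride() == example.stride()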

torch/fx/passes/fake_tensor_prop.py

Lines changed: 29 additions & 0 deletions
@@ -7,6 +7,7 @@
 from torch.fx._compatibility import compatibility
 from torch.fx.experimental.proxy_tensor import py_sym_types, snapshot_fake
 from torch.fx.node import map_aggregate
+from torch.utils._ordered_set import OrderedSet
 
 
 __all__ = ["FakeTensorProp"]
@@ -36,13 +37,41 @@ def __init__(
         self._mode = mode
         mode.epoch += 1
         mode.reset_nt_tensor_id_counter()
+        self.seen_subgraphs: OrderedSet[str] = OrderedSet()
 
     def run_node(self, n: Node):
         from torch.fx.experimental.symbolic_shapes import (
             compute_unbacked_bindings,
             rebind_unbacked,
         )
 
+        if (
+            n.op == "call_function"
+            and n.target is torch.ops.higher_order.invoke_subgraph
+            and n.args[1] not in self.seen_subgraphs
+        ):
+            # Prevent redundant fake tensor prop for invoke_subgraphs. Note that
+            # there is also fake tensor caching for the entire subgraph. This
+            # happens the next time we call `run_node` for the same subgraph,
+            # which goes through super.run_node and caches the fake tensor prop.
+            # Therefore, we are propagating fake tensor through the subgraphs
+            # twice.
+            assert isinstance(n.args[1], str)
+            assert (
+                isinstance(n.args[0], torch.fx.Node)
+                and n.args[0].op == "get_attr"
+                and isinstance(n.args[0].target, str)
+            )
+            self.seen_subgraphs.add(n.args[1])
+            operands = n.args[2:]
+            example_inputs = []
+            for operand in operands:
+                assert isinstance(operand, torch.fx.Node) and "val" in operand.meta
+                example_inputs.append(operand.meta["val"])
+            return FakeTensorProp(
+                getattr(self.module, n.args[0].target), mode=self._mode
+            ).propagate(*example_inputs)
+
         result = super().run_node(n)
         rebind_unbacked(self._mode.shape_env, n, result)