[invoke_subgraph] Run missing graph passes recursively · pytorch/pytorch@b428bb0 · GitHub

Commit b428bb0

[invoke_subgraph] Run missing graph passes recursively
ghstack-source-id: b0a382c
Pull Request resolved: #152675

[invoke_subgraph] Force the output to have same strides as meta
1 parent 6a5d145 commit b428bb0
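
As context for the diffs below, a hedged sketch of how an invoke_subgraph call site sits in an FX graph; the layout is inferred from how the changed code indexes n.args, and all names here are illustrative, not from the commit:

# Hypothetical FX graph fragment:
#   %repeated_subgraph0 : get_attr target naming a GraphModule on the parent
#   %out = call_function[target=torch.ops.higher_order.invoke_subgraph](
#              args=(%repeated_subgraph0, "subgraph_0", %x, %y))
# args[0] is the get_attr node for the subgraph submodule, args[1] is an
# identifier, and args[2:] are the operands, which matches the n.args[0]
# and n.args[2:] accesses in the diffs below.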

File tree

torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py
torch/_inductor/fx_passes/post_grad.py
torch/_inductor/ir.py
torch/fx/passes/fake_tensor_prop.py

4 files changed: +48 -0 lines changed

torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py

Lines changed: 14 additions & 0 deletions

@@ -13,6 +13,7 @@
 from torch import Tensor
 from torch._dispatch.python import enable_python_dispatcher
 from torch._dynamo.utils import detect_fake_mode, lazy_format_graph_code
+from torch._inductor.utils import OrderedSet
 from torch._logging import getArtifactLogger, trace_structured
 from torch._subclasses.functional_tensor import FunctionalTensorMode
 from torch.fx.experimental.proxy_tensor import make_fx

@@ -183,6 +184,19 @@ def aot_dispatch_base_graph(
     # there should be *NO* mutating ops in the graph at this point.
     copy_count = assert_functional_graph(fw_module.graph)
     fw_module.graph.eliminate_dead_code()
+
+    # Call DCE on the subgraphs
+    # TODO - Consider updating the eliminate_dead_code to work recursively.
+    seen_subgraphs: OrderedSet[str] = OrderedSet()
+    for nd in fw_module.graph.find_nodes(
+        op="call_function", target=torch.ops.higher_order.invoke_subgraph
+    ):
+        subgraph_name = nd.args[0].target
+        if subgraph_name not in seen_subgraphs:
+            seen_subgraphs.add(subgraph_name)
+            subgraph = getattr(fw_module, nd.args[0].target)
+            subgraph.graph.eliminate_dead_code()
+            subgraph.recompile()
     fw_module.recompile()

     copy_count2 = assert_functional_graph(fw_module.graph)
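
The outer fw_module.graph.eliminate_dead_code() call stops at the graph boundary, which is why the new loop visits each subgraph module explicitly. A minimal standalone sketch of the per-graph behavior (plain FX, not from the commit):

import torch
from torch.fx import symbolic_trace

class Inner(torch.nn.Module):
    def forward(self, x):
        dead = x + 1  # no users: removable only by DCE on *this* graph
        return x * 2

inner = symbolic_trace(Inner())
inner.graph.eliminate_dead_code()  # mirrors subgraph.graph.eliminate_dead_code()
inner.recompile()                  # mirrors subgraph.recompile()
print(inner.code)                  # the unused `x + 1` node is gone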

torch/_inductor/fx_passes/post_grad.py

Lines changed: 10 additions & 0 deletions

@@ -1207,6 +1207,16 @@ def view_to_reshape(gm):
     ):
         nd.target = torch.ops.aten.reshape.default

+    seen_subgraphs: OrderedSet[str] = OrderedSet()
+    for nd in gm.graph.find_nodes(
+        op="call_function", target=torch.ops.higher_order.invoke_subgraph
+    ):
+        subgraph_name = nd.args[0].target
+        if subgraph_name not in seen_subgraphs:
+            seen_subgraphs.add(subgraph_name)
+            subgraph = getattr(gm, nd.args[0].target)
+            view_to_reshape(subgraph)
+

 def should_prefer_unfused_addmm(match):
     inp = match.kwargs["inp"]
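
This pass and the DCE change above share the same visit-once traversal; a hedged generic form of that pattern (the helper name and apply_pass parameter are hypothetical):

import torch
from torch._inductor.utils import OrderedSet

def for_each_invoke_subgraph(gm, apply_pass):
    # Run `apply_pass` once per distinct subgraph submodule; the same
    # get_attr target can back several invoke_subgraph call sites.
    seen: OrderedSet[str] = OrderedSet()
    for nd in gm.graph.find_nodes(
        op="call_function", target=torch.ops.higher_order.invoke_subgraph
    ):
        name = nd.args[0].target  # args[0]: get_attr node for the subgraph
        if name not in seen:
            seen.add(name)
            apply_pass(getattr(gm, name))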

torch/_inductor/ir.py

Lines changed: 15 additions & 0 deletions

@@ -7431,6 +7431,10 @@ def _has_aliased_buffers(buffers: Sequence[IRNode]) -> bool:

 @ir_dataclass(frozen=False)
 class InvokeSubgraph(ExternKernel):
+    """
+    Implementation of InvokeSubgraph HOP
+    """
+
     subgraph: Optional[Subgraph] = None
     operands: Optional[list[TensorBox]] = None
     outputs: Optional[list[MultiOutput]] = None

@@ -7515,6 +7519,17 @@ def create_output(output: IRNode, ind: int):
             skip_size_stride_alignment_checks=True,
         )

+        # Force the output strides to be same as the original strides
+        new_outputs = []
+        fake_outputs = V.graph.current_node.meta["val"]
+        for idx, output in enumerate(outputs):
+            if isinstance(output, (ShapeAsConstantBuffer, NoneAsConstantBuffer)):
+                new_outputs.append(output)
+            else:
+                example_stride = handle_sym_expr(fake_outputs[idx].stride())
+                new_outputs.append(cls.require_exact_strides(output, example_stride))
+        outputs = new_outputs
+
         outputs = [create_output(output, i) for i, output in enumerate(outputs)]
         invoke_subgraph.outputs = outputs
         return outputs
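
The motivation for pinning strides: downstream code was traced against the meta ("fake") outputs, so a real output whose layout diverges from fake_outputs[idx].stride() can break consumers specialized on that layout. A minimal stride illustration in plain PyTorch (not the Inductor IR path itself):

import torch

a = torch.empty(4, 6)         # contiguous: strides (6, 1)
b = a.t().contiguous().t()    # same shape, but strides (1, 4)
assert a.shape == b.shape and a.stride() != b.stride()

# require_exact_strides(output, example_stride) plays the role of an
# explicit copy into the expected layout:
fixed = torch.empty_strided(b.shape, a.stride()).copy_(b)
assert fixed.stride() == a.stride()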

torch/fx/passes/fake_tensor_prop.py

Lines changed: 9 additions & 0 deletions

@@ -43,6 +43,15 @@ def run_node(self, n: Node):
             rebind_unbacked,
         )

+        if (
+            n.op == "call_function"
+            and n.target is torch.ops.higher_order.invoke_subgraph
+        ):
+            subgraph_example_inputs = [a.meta["val"] for a in n.args[2:]]  # type: ignore[union-attr,arg-type]
+            FakeTensorProp(
+                getattr(self.module, n.args[0].target), mode=self._mode  # type: ignore[union-attr,arg-type]
+            ).propagate(*subgraph_example_inputs)
+
         result = super().run_node(n)
         rebind_unbacked(self._mode.shape_env, n, result)
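
What the new branch accomplishes, shown on a flat graph: FakeTensorProp records a FakeTensor in each node's meta["val"], and the branch above re-runs that propagation inside every invoke_subgraph submodule using the call site's already-fake inputs. A small standalone sketch:

import torch
from torch.fx import symbolic_trace
from torch.fx.passes.fake_tensor_prop import FakeTensorProp

def f(x, y):
    return x @ y + 1

gm = symbolic_trace(f)
FakeTensorProp(gm).propagate(torch.empty(2, 3), torch.empty(3, 4))

# Every node now carries a fake value with shape/stride metadata:
out = next(n for n in gm.graph.nodes if n.op == "output").args[0]
print(out.meta["val"].shape)  # torch.Size([2, 4])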
