[dynamo][optimizers] Install ID_GUARDED tensors into the Fx graph · pytorch/pytorch@b496d8b · GitHub
Commit b496d8b

[dynamo][optimizers] Install ID_GUARDED tensors into the Fx graph
ghstack-source-id: a1ea341 Pull Request resolved: #147824
1 parent 86f3953 commit b496d8b

File tree

7 files changed: +38 -40 lines

test/dynamo/test_decorators.py

Lines changed: 5 additions & 30 deletions
@@ -425,41 +425,16 @@ def forward(self, a, *args):
     def _test_mark_static_address(self, guarded):
         # This test verifies that dynamo properly marks inputs as static
         # when using the mark_static_address API.
-        # On 1st compile, we expect the input to be marked as static, with guarded
-        # set depending on the `guarded` flag.
-        # On 2nd compile, we expect the input to be unmarked
-        # if inlining NN modules, we expect metadata to be present on the tensor, indicating
-        # the static address type of the input
-        # if not inlining NN modules, we expect the tensor to be present in the buffers attribute
-        # of the graph.
+        # For both inline_inbuilt_nn_modules True and False, we expect the
+        # tensor to be present in the buffers attribute of the graph.

         compiles_with_buffers = 0
         compiles = 0

         def debug_compiler(gm, _):
             nonlocal compiles_with_buffers
             nonlocal compiles
-            if torch._dynamo.config.inline_inbuilt_nn_modules:
-                input_node = [
-                    n
-                    for n in gm.graph.nodes
-                    if n.op == "placeholder" and n.name == "l_x_"
-                ]
-                self.assertEqual(len(input_node), 1)
-                input_node = input_node[0]
-                if compiles == 0:
-                    self.assertEqual(
-                        input_node.meta["tensor_dict"]["_dynamo_static_input_type"],
-                        "guarded" if guarded else "unguarded",
-                    )
-                elif compiles == 1:
-                    self.assertFalse(
-                        "_dynamo_static_input_type" in input_node.meta["tensor_dict"]
-                    )
-                else:
-                    raise RuntimeError(f"Unexpected number of compiles: {compiles}")
-            else:
-                compiles_with_buffers += len(gm._buffers) > 0
+            compiles_with_buffers += len(gm._buffers) > 0
             compiles += 1
             return gm

@@ -472,7 +447,7 @@ def fn(x):
         torch._dynamo.mark_static_address(inp, guard=guarded)

         fn(inp)
-        if not torch._dynamo.config.inline_inbuilt_nn_modules:
+        if guarded:
             self.assertEqual(compiles_with_buffers, 1)

         inp2 = torch.ones(2)

@@ -482,7 +457,7 @@ def fn(x):
         # should not be incremented
         fn(inp2)

-        if not torch._dynamo.config.inline_inbuilt_nn_modules:
+        if guarded:
             self.assertEqual(compiles_with_buffers, 1)

         self.assertEqual(compiles, 2 if guarded else 1)
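
For context (not part of the commit), a minimal sketch of the API this test exercises, assuming only the public torch._dynamo.mark_static_address entry point and a custom backend. With guard=True the input gets an ID guard and, per the assertions above, shows up as a buffer on the compiled GraphModule:

import torch

compiles_with_buffers = 0

def debug_compiler(gm, example_inputs):
    # Count compiles in which the static tensor was installed on the graph
    # module (as a buffer) rather than lifted as a placeholder input.
    global compiles_with_buffers
    compiles_with_buffers += len(gm._buffers) > 0
    return gm

@torch.compile(backend=debug_compiler)
def fn(x):
    return x + 1

inp = torch.ones(2)
torch._dynamo.mark_static_address(inp, guard=True)
fn(inp)
print(compiles_with_buffers)  # expected to be 1, mirroring the test above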

test/dynamo/test_subclasses.py

Lines changed: 2 additions & 2 deletions
@@ -1848,9 +1848,9 @@ def inner_compile(
             extern_node_serializer: Optional[Callable[[list[Any]], Any]] = None,
         ):
             if dynamic:
-                self.assertEqual(static_input_idxs, [2, 3, 4])
+                self.assertEqual(static_input_idxs, [0, 1, 2, 3, 4])
             else:
-                self.assertEqual(static_input_idxs, [1, 2])
+                self.assertEqual(static_input_idxs, [0, 1, 2])
             return gm

         compiler = functools.partial(compile_fx, inner_compile=inner_compile)

torch/_dynamo/output_graph.py

Lines changed: 12 additions & 0 deletions
@@ -1361,6 +1361,13 @@ def compile_and_call_fx_graph(self, tx, rv, root, replaced_outputs):

             tx.output.current_tracer._maybe_preserve_original_meta(tx, output_node)
         if not config.do_not_emit_runtime_asserts:
+            # There is a rare scenario where codegen_suffix adds a new entry
+            # to self.nn_modules while `root` knows only about the
+            # nn_modules at the time of its creation. This causes failures
+            # while creating the graph module because self.graph and root
+            # are out of sync. This only happens for `get_attr` nodes, so
+            # here we clean up the get_attr nodes that are unused.
+            self.remove_unused_get_attr_nodes()
             insert_deferred_runtime_asserts(
                 fx.GraphModule(root, self.graph),
                 self.shape_env,

@@ -1568,6 +1575,11 @@ def example_inputs(self) -> list[torch.Tensor]:
         result = [arg.example for arg in self.graphargs]
         return result

+    def remove_unused_get_attr_nodes(self) -> None:
+        for node in reversed(list(self.graph.nodes)):
+            if node.op == "get_attr" and len(list(node.users)) == 0:
+                self.remove_node(node)
+
     def remove_unused_graphargs(self) -> None:
         # NB: It's always OK to drop GraphArg for symbols that ended up being
         # specialized. You don't even have to make a guard for it, because
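
As a standalone illustration of the cleanup the new remove_unused_get_attr_nodes method performs, here is a sketch on a plain torch.fx graph (the real method goes through OutputGraph.remove_node, which also updates Dynamo's side tables):

import torch
from torch import fx

def remove_unused_get_attr_nodes(graph: fx.Graph) -> None:
    # Walk the graph in reverse and erase get_attr nodes with no users,
    # mirroring the new OutputGraph helper.
    for node in reversed(list(graph.nodes)):
        if node.op == "get_attr" and len(node.users) == 0:
            graph.erase_node(node)

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("b", torch.ones(2))

    def forward(self, x):
        return x + self.b

gm = fx.symbolic_trace(M())
# Simulate the out-of-sync state described above: a dangling get_attr
# that nothing in the graph consumes.
output_node = list(gm.graph.nodes)[-1]
with gm.graph.inserting_before(output_node):
    gm.graph.get_attr("b")
remove_unused_get_attr_nodes(gm.graph)
gm.recompile()
print(gm.graph)  # the dangling get_attr is gone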

torch/_dynamo/variables/builder.py

Lines changed: 15 additions & 0 deletions
@@ -1664,6 +1664,21 @@ def wrap_tensor(self, value: torch.Tensor):
                 value, self.name, source=source
             )

+        if get_static_address_type(value) == "guarded":
+            # If it's a guarded tensor, we can install the parameter directly
+            # into the Fx graph instead of lifting it as an input. Lifting
+            # offers no benefit, such as regional compilation, since we still
+            # guard on the tensor's ID. Moreover, installing it in the Fx graph
+            # eliminates the pre-graph bytecode required to extract the tensor
+            # from locals/globals, reducing overhead. This can lead to
+            # significant cost savings, especially for optimizers handling many
+            # tensors.
+            self.install_guards(GuardBuilder.ID_MATCH)
+            self.assert_not_wrapped_by_this_graph(value)
+            return self.tx.output.register_attr_or_module(
+                value, self.name, source=source
+            )
+
         if is_constant_source(source):
             self.assert_not_wrapped_by_this_graph(value)
             return self.tx.output.register_attr_or_module(
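
To make the lift-vs-install distinction in the comment concrete, a hand-built torch.fx sketch (illustrative only; Dynamo's register_attr_or_module does considerably more bookkeeping than shown here):

import torch
from torch import fx

t = torch.ones(3)

# Lifted form: the tensor arrives as a graph input, so pre-graph bytecode
# must fetch it from locals/globals and pass it in on every call.
g_lifted = fx.Graph()
x = g_lifted.placeholder("x")
p = g_lifted.placeholder("lifted_t")
g_lifted.output(g_lifted.call_function(torch.add, (x, p)))
lifted = fx.GraphModule(torch.nn.Module(), g_lifted)

# Installed form: the tensor lives on the module and is fetched via
# get_attr inside the graph, so only `x` crosses the graph boundary.
root = torch.nn.Module()
root.register_buffer("installed_t", t)
g_installed = fx.Graph()
x2 = g_installed.placeholder("x")
a = g_installed.get_attr("installed_t")
g_installed.output(g_installed.call_function(torch.add, (x2, a)))
installed = fx.GraphModule(root, g_installed)

print(lifted(torch.zeros(3), t))   # the tensor must be threaded through
print(installed(torch.zeros(3)))   # the tensor is baked into the module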

torch/_dynamo/variables/optimizer.py

Lines changed: 2 additions & 2 deletions
@@ -358,7 +358,7 @@ def wrap_tensor(self, tx: "InstructionTranslator", tensor_value):
             # mark these tensors as static for cudagraphs
             mark_static_address(tensor_value)
             source = self.tensor_to_source[tensor_value]
-            self.static_tensor_names.add(tx.output.module_key_name(source.name))
+            self.static_tensor_names.add(tx.output.module_key_name(source.name()))
         elif tensor_value in self.grad_to_source:
             source = self.grad_to_source[tensor_value]
         else:

@@ -367,7 +367,7 @@ def wrap_tensor(self, tx: "InstructionTranslator", tensor_value):

             global_name = tx.store_global_weakref_by_id(GLOBAL_KEY_PREFIX, tensor_value)
             source = GlobalWeakRefSource(global_name)
-            self.static_tensor_names.add(tx.output.module_key_name(source.name))
+            self.static_tensor_names.add(tx.output.module_key_name(source.name()))

         return VariableTracker.build(tx, tensor_value, source)
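
The two one-character changes above fix a classic Python slip: when name is a method, source.name evaluates to the bound method object rather than the string it returns, so the wrong key ends up in static_tensor_names. A minimal sketch with a hypothetical Source class (for illustration only):

class Source:
    def name(self) -> str:
        return "optimizer_state['exp_avg']"  # hypothetical key

source = Source()
print(source.name)    # <bound method Source.name of ...>, not a string
print(source.name())  # the actual name to pass to module_key_name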

torch/_functorch/aot_autograd.py

Lines changed: 2 additions & 0 deletions
@@ -1031,6 +1031,8 @@ def _try_get_metadata_from_dynamo(
         aot_autograd_arg_pos_to_source.append(source)

     # Collect the dynamo graph inputs
+    # TODO(mlazos): Revisit if this is still needed. With Dynamo install ID
+    # matched tensors back into the Fx graph, this might not be necessary.
     static_input_indices = []
     for pos, node in enumerate(mod.graph.find_nodes(op="placeholder")):
         assert hasattr(node, "_dynamo_source")
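
For reference, a simplified sketch of what this loop feeds into: collecting the positions of placeholders that Dynamo has tagged as static. The real code keys off the _dynamo_source attached to each placeholder; the tensor_dict metadata key below is the one checked in the test_decorators test above, so treat this as schematic only:

from torch import fx

def collect_static_input_indices(mod: fx.GraphModule) -> list[int]:
    # Schematic only: gather placeholder positions whose tensors Dynamo
    # marked static (guarded or unguarded) via mark_static_address.
    indices = []
    for pos, node in enumerate(mod.graph.find_nodes(op="placeholder")):
        if node.meta.get("tensor_dict", {}).get("_dynamo_static_input_type"):
            indices.append(pos)
    return indices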

torch/_inductor/utils.py

Lines changed: 0 additions & 6 deletions
@@ -2085,12 +2085,6 @@ def num_fw_fixed_arguments(dynamo_gm_num_inputs: int, aot_fw_gm_num_inputs: int)
     # AOT won't lift any parameters if we're inlining NN Modules
     # however desugaring subclasses will still add arguments
     # resulted in extra fixed inputs https://github.com/pytorch/pytorch/issues/130502
-    if (
-        torch._dynamo.config.inline_inbuilt_nn_modules
-        and not torch._dynamo.utils.is_parameter_freezing()
-    ):
-        return 0
-
     return aot_fw_gm_num_inputs - dynamo_gm_num_inputs - num_rng_seed_offset_inputs
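
With the early-return removed, the count is always the plain difference. For example (hypothetical numbers, purely to illustrate the formula): with 12 AOT forward-graph inputs, 8 Dynamo graph inputs, and 1 RNG seed/offset input, num_fw_fixed_arguments returns 12 - 8 - 1 = 3 fixed arguments.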
