AOTI freezing: fix test issues and enable by default · pytorch/pytorch@058a891 · GitHub

Commit 058a891

AOTI freezing: fix test issues and enable by default
ghstack-source-id: d1bf7c3
Pull Request resolved: #149961
1 parent 84c905a commit 058a891

File tree

7 files changed: +51 -39 lines changed

benchmarks/dynamo/common.py

Lines changed: 1 addition & 1 deletion
@@ -2917,7 +2917,7 @@ def parse_args(args=None):
     )
     parser.add_argument("--cosine", action="store_true", help="use cosine similarity")
     parser.add_argument(
-        "--freezing", action="store_true", help="turn on freezing", default=False
+        "--freezing", action="store_true", help="turn on freezing", default=None
     )
     parser.add_argument(
         "--inductor-config",

test/inductor/test_aot_inductor.py

Lines changed: 24 additions & 16 deletions
@@ -122,6 +122,10 @@
         raise


+def _is_cpu_freezing(self):
+    return (config.freezing is None or config.freezing) and self.device != GPU_TYPE
+
+
 class AOTInductorTestsTemplate:
     def test_simple(self):
         class Model(torch.nn.Module):
@@ -4137,16 +4141,16 @@ def forward(self, a):
         a = torch.randn(batch, M, K, device=self.device)
         example_inputs = (a,)

-        kernel_calls = (
-            [
+        is_cpu_freezing = _is_cpu_freezing(self)
+        if self.device == GPU_TYPE:
+            kernel_calls = [
                 ("triton_poi_fused_0", 1),
                 (f"aoti_torch_{GPU_TYPE}_addmm_out", 2),
             ]
-            if self.device == GPU_TYPE
-            else [
-                ("aoti_torch_cpu_addmm_out", 2),
-            ]
-        )
+        elif is_cpu_freezing:
+            kernel_calls = [("cpp_fused_0", 1)]
+        else:
+            kernel_calls = [("aoti_torch_cpu_addmm_out", 2)]

         # test default debug printing all tensor values codegen
         with config.patch({"aot_inductor.debug_intermediate_value_printer": "2"}):
@@ -4170,7 +4174,9 @@ def forward(self, a):
             ).run(code)

         # test printing selected kernel's tensor values codegen
-        filtered_kernel_name = f"aoti_torch_{self.device}_addmm_out"
+        filtered_kernel_name = (
+            "cpp_fused_0" if is_cpu_freezing else f"aoti_torch_{self.device}_addmm_out"
+        )
         with config.patch(
             {
                 "aot_inductor.debug_intermediate_value_printer": "2",
@@ -4181,7 +4187,7 @@ def forward(self, a):
                 AOTIRunnerUtil.legacy_compile, model, example_inputs
             )
             filtered_kernel_calls = [
-                (filtered_kernel_name, 2),
+                (filtered_kernel_name, 1 if is_cpu_freezing else 2),
             ]
             for kernel_call, count in filtered_kernel_calls:
                 FileCheck().check_count(
@@ -4226,17 +4232,18 @@ def forward(self, a):
         batch = 2
         a = torch.randn(batch, M, K, device=self.device)
         example_inputs = (a,)
-        kernel_calls = (
-            f"aoti_torch_{GPU_TYPE}_addmm_out"
-            if self.device == GPU_TYPE
-            else "aoti_torch_cpu_addmm_out"
+
+        kernel_call = (
+            "graph_1_cpp_fused_0"
+            if _is_cpu_freezing(self)
+            else f"aoti_torch_{self.device}_addmm_out"
         )
         with config.patch({"cpp.enable_kernel_profile": enable_kernel_profile}):
             _, code = run_and_get_cpp_code(
                 AOTIRunnerUtil.compile, model, example_inputs
             )
             shim_fn_codes = (
-                f'RECORD_FUNCTION("{kernel_calls}", c10::ArrayRef<c10::IValue>());'
+                f'RECORD_FUNCTION("{kernel_call}", c10::ArrayRef<c10::IValue>());'
             )
             if enable_kernel_profile:
                 FileCheck().check(shim_fn_codes).run(code)
@@ -4486,14 +4493,15 @@ def forward(self, a, b, c):
         so_path, code = run_and_get_cpp_code(
             AOTIRunnerUtil.legacy_compile, model, example_inputs
         )
-        lowerbound_check = "u1 >= 1" if mark_unbacked else "u0 >= 2"
+        varname = f"u{int(mark_unbacked) + (2 if _is_cpu_freezing(self) else 0)}"
+        lowerbound_check = f"{varname} >= {1 if mark_unbacked else 2}"
         FileCheck().check_count(lowerbound_check, 1).run(code)

         compiled = AOTIRunnerUtil.legacy_load(self.device, so_path)
         compiled(*example_inputs)

         # Check the runtime assertion.
-        with self.assertRaisesRegex(Exception, ""):
+        with self.assertRaises(Exception):
             unexpected_inputs = (torch.ones(0, device=self.device), b, c)
             compiled(*unexpected_inputs)

torch/_inductor/codegen/cpp.py

Lines changed: 1 addition & 1 deletion
@@ -5262,7 +5262,7 @@ def codegen_group(self, name=None) -> str:
         prefix = "graph_" + str(graph_id) + "_" if graph_id is not None else ""
         code.writelines(
             [
-                f'RECORD_FUNCTION("{prefix + kernel_name}", c10::ArrayRef<c10::IValue>({{}}));'
+                f'RECORD_FUNCTION("{prefix + kernel_name}", c10::ArrayRef<c10::IValue>());'
             ]
         )
         for old, new in self.args.aliases():
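Because this line lives inside an f-string, the old doubled braces rendered as a literal {} argument in the generated C++ (c10::ArrayRef<c10::IValue>({})); dropping them emits the bare default-constructed form that the kernel-profile test above now checks for. A small sketch of what the writeline produces, with illustrative prefix and kernel-name values:

prefix = "graph_1_"          # illustrative per-graph prefix built in codegen_group
kernel_name = "cpp_fused_0"  # illustrative fused CPU kernel name

line = f'RECORD_FUNCTION("{prefix + kernel_name}", c10::ArrayRef<c10::IValue>());'
print(line)
# RECORD_FUNCTION("graph_1_cpp_fused_0", c10::ArrayRef<c10::IValue>());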

torch/_inductor/codegen/triton.py

Lines changed: 3 additions & 3 deletions
@@ -3695,12 +3695,12 @@ def add_constexpr_arg(arg_name):
         if (
             len(non_constexpr_signature(signature)) == 4
         ):  # input, output and 2 args
-            tile_hint = "tile_hint=TileHint.SQUARE,"
+            tile_hint = " tile_hint=TileHint.SQUARE,"
         else:
-            tile_hint = "tile_hint=TileHint.DEFAULT,"
+            tile_hint = " tile_hint=TileHint.DEFAULT,"
         heuristics_line = f"""
 @triton_heuristics.{self._get_heuristic()}(
-    size_hints={size_hints!r}, {tile_hint}
+    size_hints={size_hints!r},{tile_hint}
     filename=__file__,
     triton_meta={triton_meta!r},
     inductor_meta={inductor_meta!r},

torch/_inductor/compile_fx.py

Lines changed: 18 additions & 15 deletions
@@ -13,7 +13,7 @@
 import warnings
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from contextlib import AbstractContextManager
+from contextlib import AbstractContextManager, nullcontext
 from inspect import currentframe
 from itertools import count
 from operator import attrgetter
@@ -1656,35 +1656,34 @@ def compile_fx_aot(
     model_: GraphModule,
     example_inputs_: list[InputType],
     inner_compile: _CompileFxCallable = compile_fx_inner,
-    config_patches: Optional[dict[str, str]] = None,
+    config_patches: Optional[dict[str, Any]] = None,
 ) -> Union[list[str], str]:
     assert isinstance(model_, GraphModule), model_

     # [See NOTE] Unwrapping subclasses AOT
     unwrap_tensor_subclass_parameters(model_)

-    config_patches: dict[str, Any] = (
-        {"cpp_wrapper": True}
-        if config_patches is None
-        else {**config_patches, "cpp_wrapper": True}
-    )
+    if config_patches is None:
+        config_patches = {}

-    output_path = config_patches.get(
-        "aot_inductor.output_path", config.aot_inductor.output_path
+    config_patches.update(
+        cpp_wrapper=True,
+        freezing=config.freezing
+        if config.freezing is not None
+        else not config.aot_inductor.use_runtime_constant_folding,
     )

-    if output_path:
+    if output_path := config_patches.get(
+        "aot_inductor.output_path", config.aot_inductor.output_path
+    ):
         assert not output_path.endswith(".pt2"), (
             "The output path for aot_compile should not have an extension with .pt2 "
             "this is for specifying the output path for the .so in AOTInductor. "
             "If you would like to package the AOTInductor generated files "
             "into a pt2, please call `torch._inductor.aoti_compile_and_package`."
         )
     else:
-        config_patches = {
-            **config_patches,
-            "aot_inductor.output_path": code_hash(model_.code),
-        }
+        config_patches["aot_inductor.output_path"] = code_hash(model_.code)

     extern_node_serializer = config_patches.pop("extern_node_serializer", None)
     saved_compile_id = model_.meta.get("dynamo_compile_id", None)
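The update() call above is where AOTI freezing becomes the default: an explicit config.freezing value still wins, but when it is left at None, freezing is enabled unless runtime constant folding was requested. A minimal standalone sketch of that resolution (the helper function is illustrative, not part of the patch):

def resolve_aoti_freezing(freezing, use_runtime_constant_folding):
    # Illustrative restatement of the freezing=... expression in config_patches.update():
    # an explicit True/False is honored; otherwise default to freezing unless the
    # caller asked for runtime constant folding instead.
    if freezing is not None:
        return freezing
    return not use_runtime_constant_folding

assert resolve_aoti_freezing(None, False) is True    # new default: freezing on
assert resolve_aoti_freezing(None, True) is False
assert resolve_aoti_freezing(False, False) is False  # user opt-out respected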
@@ -1789,7 +1788,11 @@ def fw_compiler_freezing(
     if tracing_context.fw_metadata:
         static_input_idxs = tracing_context.fw_metadata.static_input_indices

-    with mock.patch.object(fake_mode, "allow_non_fake_inputs", True):
+    with (
+        mock.patch.object(fake_mode, "allow_non_fake_inputs", True)
+        if fake_mode
+        else nullcontext()
+    ):
         optimized_function = inner_compile(
             opt_model,
             aot_example_inputs,
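The newly imported nullcontext makes the fake-mode patch optional: the attribute is only patched when a fake mode actually exists, and a no-op context is used otherwise, so the freezing path no longer assumes fake_mode is set. A self-contained sketch of the same pattern, using a stand-in FakeMode class:

from contextlib import nullcontext
from unittest import mock

class FakeMode:
    # Stand-in for the real fake tensor mode; only the patched attribute matters here.
    allow_non_fake_inputs = False

def compile_under_patch(fake_mode):
    with (
        mock.patch.object(fake_mode, "allow_non_fake_inputs", True)
        if fake_mode
        else nullcontext()
    ):
        # Inside the block the attribute is temporarily True when a fake mode exists.
        return fake_mode.allow_non_fake_inputs if fake_mode else "no fake mode"

assert compile_under_patch(FakeMode()) is True
assert compile_under_patch(None) == "no fake mode"  # no fake mode: no-op context, no crash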

torch/_inductor/config.py

Lines changed: 1 addition & 1 deletion
@@ -861,7 +861,7 @@ def decide_compile_threads() -> int:
 # Freezing will attempt to inline weights as constants in optimization
 # and run constant folding and other optimizations on them. After freezing, weights
 # can no longer be updated.
-freezing: bool = os.environ.get("TORCHINDUCTOR_FREEZING", "0") == "1"
+freezing: Optional[bool] = get_tristate_env("TORCHINDUCTOR_FREEZING")

 # Make freezing invalidate the eager Parameters of nn modules, to avoid memory overhead
 # of potentially keeping multiple copies of weights.
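With the annotation widened to Optional[bool], an unset TORCHINDUCTOR_FREEZING no longer forces freezing off; it leaves the decision to compile_fx_aot above. A rough sketch of what a tri-state environment lookup does (an illustrative approximation, not the repo's get_tristate_env implementation):

import os
from typing import Optional

def tristate_env(name: str) -> Optional[bool]:
    # "1" -> True, "0" -> False, unset or anything else -> None (no opinion).
    value = os.environ.get(name)
    if value == "1":
        return True
    if value == "0":
        return False
    return None

freezing = tristate_env("TORCHINDUCTOR_FREEZING")  # None unless the user set it explicitly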

torch/fx/graph_module.py

Lines changed: 3 additions & 2 deletions
@@ -535,8 +535,9 @@ def __init__(
         if self.graph._tracer_extras:
             self._tracer_extras = self.graph._tracer_extras

-        # Dictionary to store metadata
-        self.meta: dict[str, Any] = {}
+        # Dictionary to store metadata. Initialize with the root metadata, if present,
+        # to avoid losing information when doing fx transformations.
+        self.meta: dict[str, Any] = root.meta if isinstance(root, GraphModule) else {}
         self._replace_hooks: list[Callable] = []
         self._create_node_hooks: list[Callable] = []
         self._erase_node_hooks: list[Callable] = []
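This is what keeps module-level metadata such as dynamo_compile_id (read via model_.meta.get() in compile_fx_aot above) alive when an fx pass rebuilds a GraphModule from an existing one. A short example of the behavior after this change:

import torch
from torch import fx

class M(torch.nn.Module):
    def forward(self, x):
        return x + 1

gm = fx.symbolic_trace(M())
gm.meta["dynamo_compile_id"] = "example-id"  # metadata a later compile step relies on

# Rebuilding a GraphModule from an existing one (a common fx-transform pattern)
# now starts from the root's meta dict instead of an empty one.
new_gm = fx.GraphModule(gm, gm.graph)
assert new_gm.meta.get("dynamo_compile_id") == "example-id"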
