thread through specialization to compile_fx · pytorch/pytorch@e6ecccb · GitHub

Commit e6ecccb

thread through specialization to compile_fx
ghstack-source-id: 5577884 Pull Request resolved: #152650
1 parent 9914ca3 commit e6ecccb

File tree: 5 files changed, +25 −11 lines

torch/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -2359,10 +2359,10 @@ def apply_options(self, options: _Optional[dict[str, _Any]]):
                 )
             self.config[attr_name] = val
 
-    def __call__(self, model_, inputs_):
+    def __call__(self, model_, inputs_, **kwargs):
         from torch._inductor.compile_fx import compile_fx
 
-        return compile_fx(model_, inputs_, config_patches=self.config)
+        return compile_fx(model_, inputs_, config_patches=self.config, **kwargs)
 
     def get_compiler_config(self):
         from torch._inductor.compile_fx import get_patched_config_dict
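
A minimal sketch of the pattern this hunk introduces in the inductor backend wrapper in torch/__init__.py (the class and helper below are illustrative stand-ins, not the real torch APIs): __call__ now accepts **kwargs and forwards them, so a `specialization` keyword passed by Dynamo reaches compile_fx without the wrapper needing to know about it.

def fake_compile_fx(model_, inputs_, config_patches=None, **kwargs):
    # Stand-in for torch._inductor.compile_fx.compile_fx.
    return {"config_patches": config_patches, **kwargs}


class InductorWrapperSketch:
    def __init__(self, config):
        self.config = config

    def __call__(self, model_, inputs_, **kwargs):
        # Forward whatever extra keywords the caller supplied.
        return fake_compile_fx(model_, inputs_, config_patches=self.config, **kwargs)


wrapper = InductorWrapperSketch({"max_autotune": True})
print(wrapper(object(), [], specialization="spec"))
# -> {'config_patches': {'max_autotune': True}, 'specialization': 'spec'}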

torch/_dynamo/output_graph.py

Lines changed: 5 additions & 6 deletions
@@ -1499,8 +1499,7 @@ def compile_and_call_fx_graph(self, tx, rv, root):
             compiled_fns = []
             with self.restore_global_state():
                 for specialization in backend_specializations:
-                    modified_gm = specialize(gm, specialization)
-                    compiled_fns.append(self.call_user_compiler(modified_gm))
+                    compiled_fns.append(self.call_user_compiler(modified_gm, specialization))
 
             from torch.fx._lazy_graph_module import _LazyGraphModule
 
@@ -1545,16 +1544,16 @@ def placeholders(self) -> list[fx.Node]:
     def graphargs(self) -> list[GraphArg]:
         return [node.meta["grapharg"] for node in self.placeholders]
 
-    def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
+    def call_user_compiler(self, gm: fx.GraphModule, **kwargs) -> CompiledFn:
         with dynamo_timed(
             "OutputGraph.call_user_compiler",
             phase_name="backend_compile",
             log_pt2_compile_event=True,
             dynamo_compile_column_us="aot_autograd_cumulative_compile_time_us",
         ):
-            return self._call_user_compiler(gm)
+            return self._call_user_compiler(gm, **kwargs)
 
-    def _call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
+    def _call_user_compiler(self, gm: fx.GraphModule, **kwargs) -> CompiledFn:
         assert self.compiler_fn is not None
         tot = 0
         placeholders = []
@@ -1584,7 +1583,7 @@ def _call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             compiler_fn = self.compiler_fn
             if config.verify_correctness:
                 compiler_fn = WrapperBackend(compiler_fn)
-            compiled_fn = compiler_fn(gm, self.example_inputs())
+            compiled_fn = compiler_fn(gm, self.example_inputs(), **kwargs)
             _step_logger()(logging.INFO, f"done compiler function {name}")
             assert callable(compiled_fn), "compiler_fn did not return callable"
         except (TensorifyScalarRestartAnalysis, ShortenTraceback):
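
For context, a hedged sketch of the Dynamo-side plumbing these hunks set up (class and backend names below are illustrative, not the real OutputGraph API): call_user_compiler now accepts **kwargs and threads them through _call_user_compiler into the user backend, so the specialization loop in compile_and_call_fx_graph can hand each specialization to the compiler through the same entry point. The sketch passes the specialization as a keyword, since the new signature only collects extras via **kwargs.

from typing import Any, Callable


class OutputGraphSketch:
    def __init__(self, compiler_fn: Callable[..., Any], example_inputs: list[Any]):
        self.compiler_fn = compiler_fn
        self._example_inputs = example_inputs

    def example_inputs(self) -> list[Any]:
        return list(self._example_inputs)

    def call_user_compiler(self, gm: Any, **kwargs: Any) -> Any:
        # The real method also wraps this call in dynamo_timed bookkeeping.
        return self._call_user_compiler(gm, **kwargs)

    def _call_user_compiler(self, gm: Any, **kwargs: Any) -> Any:
        # Extra keywords (e.g. specialization=...) reach the backend unchanged.
        return self.compiler_fn(gm, self.example_inputs(), **kwargs)


def toy_backend(gm, example_inputs, specialization=None):
    # Stand-in for a real backend such as the inductor wrapper above.
    return lambda: f"ran {gm} specialized for {specialization}"


graph = OutputGraphSketch(toy_backend, example_inputs=[0])
compiled_fns = [
    graph.call_user_compiler("gm", specialization=spec)
    for spec in ("dim0=1", "dim0=8")
]
print(compiled_fns[1]())  # -> "ran gm specialized for dim0=8"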

torch/_functorch/aot_autograd.py

Lines changed: 4 additions & 1 deletion
@@ -489,6 +489,7 @@ def process_inputs(
     fake_mode: FakeTensorMode,
     shape_env: Optional[ShapeEnv],
     ignore_shape_env: bool = False,
+    specialization=None,
 ) -> FakifiedFlatArgs:
     with fake_mode:
 
@@ -547,6 +548,7 @@ def convert(idx, x):
                 symbolic_context=symbolic_context,
                 source=source,
                 trace=trace,
+                specialization=specialization
             )
             return result
 
@@ -1083,6 +1085,7 @@ def aot_module_simplified(
     cudagraphs: Optional[BoxedBool] = None,
     boxed_forward_device_index: Optional[BoxedDeviceIndex] = None,
     ignore_shape_env: bool = False,
+    specializations=None,
 ) -> nn.Module:
     """
     This is the simplified or low overhead version of aot_module. For frontends
@@ -1154,7 +1157,7 @@ def aot_module_simplified(
     )
     fake_mode, shape_env = construct_fake_mode(full_args, aot_config)
     fake_flat_args = process_inputs(
-        full_args, aot_config, fake_mode, shape_env, ignore_shape_env
+        full_args, aot_config, fake_mode, shape_env, ignore_shape_env, specializations=specializations
     )
 
     def dispatch_and_compile():
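
A hedged sketch of the AOTAutograd threading in this file (function names below are stand-ins, not the real API): the optional specialization travels from aot_module_simplified through process_inputs to the per-input fakification call. Note the diff spells the new keyword `specializations` at the call site and `specialization` in the process_inputs signature; the sketch uses a single spelling per layer to keep it runnable.

from typing import Any, Optional


def fakify_input(x: Any, *, specialization: Optional[Any] = None) -> Any:
    # Stand-in for fake_mode.from_tensor(x, ..., specialization=...).
    return ("fake", x, specialization)


def process_inputs_sketch(
    flat_args: list[Any], specialization: Optional[Any] = None
) -> list[Any]:
    # Mirrors process_inputs: every flat argument is fakified with the same
    # specialization riding along.
    return [fakify_input(a, specialization=specialization) for a in flat_args]


def aot_module_simplified_sketch(
    flat_args: list[Any], specializations: Optional[Any] = None
) -> list[Any]:
    # Mirrors aot_module_simplified: accept the specialization(s) and hand
    # them to input processing.
    return process_inputs_sketch(flat_args, specialization=specializations)


print(aot_module_simplified_sketch([1, 2], specializations={"idxs": [0], "hints": [4]}))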

torch/_subclasses/fake_tensor.py

Lines changed: 4 additions & 0 deletions
@@ -354,6 +354,7 @@ def from_real_tensor(
         source: Optional[Source] = None,
         symbolic_context: Optional[SymbolicContext] = None,
         trace: bool = True,
+        specialization=None,
     ) -> FakeTensor:
         # see note [Tensor Fakification and Symbol Caching]
         if not symbolic_context and not source and shape_env:
@@ -408,6 +409,7 @@ def mk_fake_tensor(
                 source=source,
                 symbolic_context=symbolic_context,
                 trace=trace,
+                specialization=specialization
             )
             if out is NotImplemented:
                 raise UnsupportedFakeTensorException("meta converter nyi")
@@ -2864,6 +2866,7 @@ def from_tensor(
         source: Optional[Source] = None,
         symbolic_context: Optional[SymbolicContext] = None,
         trace: bool = True,
+        specialization=None,
     ) -> FakeTensor:
         shape_env: Optional[ShapeEnv] = self.shape_env
         if static_shapes is None:
@@ -2880,6 +2883,7 @@ def from_tensor(
             source=source,
             symbolic_context=symbolic_context,
             trace=trace,
+            specialization=specialization,
         )
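
The fake_tensor.py hunks are pure pass-through. A compact sketch of the two layers involved (class names are assumed stand-ins; the method names from_tensor and from_real_tensor come from the diff): both gain a specialization=None parameter and forward it to the meta converter without inspecting it.

from typing import Any, Optional


class MetaConverterSketch:
    # Stand-in for the meta converter invoked in meta_utils.py (see below).
    def __call__(self, t: Any, *, specialization: Optional[Any] = None) -> Any:
        return ("meta", t, specialization)


class FakeTensorConverterSketch:
    # Stand-in for the converter's from_real_tensor.
    def __init__(self) -> None:
        self.meta_converter = MetaConverterSketch()

    def from_real_tensor(self, t: Any, *, specialization: Optional[Any] = None) -> Any:
        return self.meta_converter(t, specialization=specialization)


class FakeTensorModeSketch:
    # Stand-in for FakeTensorMode.from_tensor.
    def __init__(self) -> None:
        self.converter = FakeTensorConverterSketch()

    def from_tensor(self, t: Any, *, specialization: Optional[Any] = None) -> Any:
        return self.converter.from_real_tensor(t, specialization=specialization)


print(FakeTensorModeSketch().from_tensor("x", specialization="spec"))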

torch/_subclasses/meta_utils.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -276,8 +276,15 @@ def describe_storage(
276276
return r
277277

278278
def describe_tensor(
279-
self, t: torch.Tensor, *, recurse: bool = True, trace: bool = False
279+
self, t: torch.Tensor, *, recurse: bool = True, trace: bool = False, specialization=None
280280
) -> MetaTensorDesc:
281+
if specialization:
282+
t = t.to('meta')
283+
shape = list(t.shape)
284+
for i, hint in zip(specialization.idxs, specialization.hints):
285+
shape[i] = hint
286+
t = torch.ones(shape, dtype=t.dtype, device='meta')
287+
281288
is_leaf = safe_is_leaf(t)
282289
is_view = t._is_view()
283290
is_sparse = t.is_sparse
@@ -1844,6 +1851,7 @@ def __call__(
18441851
# when source is not None. Because we refakify after Dynamo is done,
18451852
# we don't want to dump info again from AOTAutograd, it is redundant.
18461853
trace: bool = True,
1854+
specialization = None,
18471855
) -> _TensorT:
18481856
callback_: _MetaTensorCallback[_TensorT]
18491857
if callback is None:
@@ -1886,7 +1894,7 @@ def __call__(
18861894

18871895
# Describe the tensor. NB: do NOT disable ambient modes, we may need
18881896
# to query them when figuring out what to put in here
1889-
t_desc = self.describer.describe_tensor(t, trace=trace)
1897+
t_desc = self.describer.describe_tensor(t, trace=trace, specialization=specialization)
18901898

18911899
if trace:
18921900
assert source is not None
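
The describe_tensor hunk is where the specialization is finally consumed. A hedged sketch of that logic in isolation (the Specialization container below is illustrative; only the idxs/hints attribute names come from the diff): when a specialization is present, the tensor handed to the describer is replaced by a meta tensor whose specialized dimensions are overwritten with the concrete hints.

from dataclasses import dataclass

import torch


@dataclass
class Specialization:
    # Illustrative container; the diff only shows that the object exposes
    # .idxs (which dimensions to pin) and .hints (the sizes to pin them to).
    idxs: list[int]
    hints: list[int]


def specialize_for_description(t: torch.Tensor, specialization=None) -> torch.Tensor:
    if not specialization:
        return t
    t = t.to("meta")  # keep metadata only, drop real storage
    shape = list(t.shape)
    for i, hint in zip(specialization.idxs, specialization.hints):
        shape[i] = hint  # replace the dynamic dim with its concrete hint
    return torch.ones(shape, dtype=t.dtype, device="meta")


x = torch.empty(7, 3)
print(specialize_for_description(x, Specialization(idxs=[0], hints=[128])).shape)
# -> torch.Size([128, 3])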
