[multigraph] use specializations in compile_and_call_fx_graph · pytorch/pytorch@f4dd47a · GitHub


Commit f4dd47a

[multigraph] use specializations in compile_and_call_fx_graph

ghstack-source-id: c2445e9
Pull Request resolved: #153449

1 parent 535fc62 commit f4dd47a
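For context, the user-facing hook exercised by the test added below is the new `specialize_on` argument to `torch._dynamo.decorators.mark_dynamic`: the marked dimension stays dynamic, but the backend is also asked to compile specialized variants guarded by the given predicates. A minimal usage sketch (the shapes and the CUDA device are illustrative assumptions, not part of this commit):

import torch

@torch.compile(dynamic=False)
def matmul(a, b):
    return torch.mm(a, b)

a = torch.randn(16, 1280, device="cuda", dtype=torch.bfloat16)
b = torch.randn(1280, 16, device="cuda", dtype=torch.bfloat16)

# Keep dim 0 of `a` dynamic, but also request a specialized compile for
# the case a.shape[0] == 16; other sizes fall back to the dynamic graph.
torch._dynamo.decorators.mark_dynamic(a, 0, specialize_on=[lambda s: s == 16])

out = matmul(a, b)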

File tree

12 files changed: +214 -32 lines changed

test/inductor/test_torchinductor.py

Lines changed: 44 additions & 0 deletions
@@ -10464,6 +10464,50 @@ def f(x):
             self.assertEqual(out_ref.stride(), out_test.stride())
             self.assertEqual(x_ref, x_test)
 
+    @requires_gpu()
+    @skip_if_not_triton
+    @unittest.skipIf(
+        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
+    )
+    def test_inductor_multiple_specializations(self):
+        from triton.testing import do_bench
+
+        @torch.compile(
+            options={
+                "max_autotune": True,
+                "max_autotune_gemm_backends": "TRITON",
+            },
+            dynamic=False,
+        )
+        def inductor_matmul(a, b):
+            torch._check(a.shape[0] == b.shape[1])
+            return (m, torch.mm(a, b))
+
+        m = 16
+        k = 1280
+        dynamic_a = torch.randn(m, k, device=GPU_TYPE, dtype=torch.bfloat16)
+        dynamic_specialized_a = torch.randn(m, k, device=GPU_TYPE, dtype=torch.bfloat16)
+        b = torch.randn(k, m, device=GPU_TYPE, dtype=torch.bfloat16)
+        torch._dynamo.decorators.mark_dynamic(
+            dynamic_a,
+            0,
+        )
+        torch._dynamo.decorators.mark_dynamic(
+            dynamic_specialized_a,
+            0,
+            specialize_on=[lambda x0: x0 == 16],
+        )
+        torch._dynamo.decorators.mark_dynamic(
+            b,
+            1,
+        )
+        dynamic = do_bench(lambda: inductor_matmul(dynamic_a, b))
+        torch._dynamo.reset()
+        dynamic_specialized = do_bench(
+            lambda: inductor_matmul(dynamic_specialized_a, b)
+        )
+        self.assertGreaterEqual(dynamic, dynamic_specialized)
+
     @requires_gpu()
     def test_stride_preservation_with_stride_modifying_fx_pass(self):
         def f(x):

torch/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -2359,10 +2359,10 @@ def apply_options(self, options: _Optional[dict[str, _Any]]):
             )
             self.config[attr_name] = val
 
-    def __call__(self, model_, inputs_):
+    def __call__(self, model_, inputs_, **kwargs):
         from torch._inductor.compile_fx import compile_fx
 
-        return compile_fx(model_, inputs_, config_patches=self.config)
+        return compile_fx(model_, inputs_, config_patches=self.config, **kwargs)
 
     def get_compiler_config(self):
         from torch._inductor.compile_fx import get_patched_config_dict

torch/_dynamo/output_graph.py

Lines changed: 65 additions & 6 deletions
@@ -33,7 +33,7 @@
 import sys
 import traceback
 import weakref
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 from typing import Any, Callable, cast, Optional, TYPE_CHECKING, Union
 
 import sympy
@@ -44,6 +44,7 @@
 import torch.nn
 import torch.utils._pytree as pytree
 from torch import fx
+from torch._C._dynamo import guards
 from torch._dynamo.exc import ShortenTraceback, TensorifyScalarRestartAnalysis
 from torch._guards import (
     CompileContext,
@@ -61,6 +62,7 @@
     guard_scalar,
     is_symbolic,
     ShapeEnv,
+    Specialization,
 )
 from torch.fx.passes.runtime_assert import insert_deferred_runtime_asserts
 from torch.multiprocessing.reductions import StorageWeakRef
@@ -157,6 +159,8 @@
 graph_sizes_log = torch._logging.getArtifactLogger(__name__, "graph_sizes")
 trace_call_log = torch._logging.getArtifactLogger(__name__, "trace_call")
 
+RootGuardManager = guards.RootGuardManager
+
 
 @dataclass(frozen=True)
 class VariableTrackerCacheKey:
@@ -1528,7 +1532,62 @@ def compile_and_call_fx_graph(self, tx, rv, root):
 
         counters["stats"]["unique_graphs"] += 1
         # This is safe because we pre-process name to be unique
-        self.install_global_unsafe(name, compiled_fn)
+        if specializations := old_fake_mode.shape_env.specializations:
+            specialization_guards = []
+            specialization_cache: dict[Specialization, Callable[[Any], Any]] = {}
+            preserved_graphargs = [
+                replace(node.meta["grapharg"], _example=None)
+                for node in self.placeholders
+            ]
+            sources = [a.source for a in self.graphargs]
+            for specialization in specializations:
+                source_index = sources.index(specialization.source)
+                check_fn_source = inspect.getsource(specialization.check_fn).strip()
+                check_fn = guards.LAMBDA_GUARD(  # type: ignore[attr-defined]
+                    specialization.check_fn,
+                    [check_fn_source],
+                )
+
+                log.debug(
+                    "Compiling backend specialized graph with specialization=%s",
+                    check_fn_source,
+                )
+
+                specialization_guards.append(
+                    (
+                        functools.partial(
+                            lambda idx, args, check_fn=check_fn: check_fn(
+                                args[idx]
+                            ),
+                            source_index,
+                        ),
+                        specialization,
+                    )
+                )
+
+            @torch._dynamo.disable(reason="do not trace Dynamo-compiled graph")
+            def specialized_dispatch(*args, **kwargs):
+                for check_fn, specialization in specialization_guards:
+                    if check_fn(args):
+                        if specialization in specialization_cache:
+                            return specialization_cache[specialization](
+                                *args, **kwargs
+                            )
+                        for node, grapharg, arg in zip(
+                            self.placeholders, preserved_graphargs, args
+                        ):
+                            node.meta["grapharg"] = replace(grapharg, _example=arg)
+                        specialization_cache[specialization] = (
+                            self.call_user_compiler(
+                                gm, specialization=specialization
+                            )
+                        )
+                        return specialization_cache[specialization](*args, **kwargs)
+                return compiled_fn(*args, **kwargs)
+
+            self.install_global_unsafe(name, specialized_dispatch)
+        else:
+            self.install_global_unsafe(name, compiled_fn)
 
         cg = PyCodegen(tx)
         cg.make_call_generated_code(name)
@@ -1542,16 +1601,16 @@ def placeholders(self) -> list[fx.Node]:
     def graphargs(self) -> list[GraphArg]:
         return [node.meta["grapharg"] for node in self.placeholders]
 
-    def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
+    def call_user_compiler(self, gm: fx.GraphModule, **kwargs) -> CompiledFn:
         with dynamo_timed(
             "OutputGraph.call_user_compiler",
             phase_name="backend_compile",
             log_pt2_compile_event=True,
             dynamo_compile_column_us="aot_autograd_cumulative_compile_time_us",
         ):
-            return self._call_user_compiler(gm)
+            return self._call_user_compiler(gm, **kwargs)
 
-    def _call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
+    def _call_user_compiler(self, gm: fx.GraphModule, **kwargs) -> CompiledFn:
         assert self.compiler_fn is not None
         tot = 0
         placeholders = []
@@ -1581,7 +1640,7 @@ def _call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             compiler_fn = self.compiler_fn
             if config.verify_correctness:
                 compiler_fn = WrapperBackend(compiler_fn)
-            compiled_fn = compiler_fn(gm, self.example_inputs())
+            compiled_fn = compiler_fn(gm, self.example_inputs(), **kwargs)
             _step_logger()(logging.INFO, f"done compiler function {name}")
             assert callable(compiled_fn), "compiler_fn did not return callable"
         except (TensorifyScalarRestartAnalysis, ShortenTraceback):
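The `specialized_dispatch` wrapper installed above is a guard-then-dispatch pattern: walk a list of (check_fn, specialization) pairs, lazily compile and cache a variant the first time its guard matches, and otherwise fall back to the generic compiled function. A standalone sketch of the same pattern, with a hypothetical `compile_variant` standing in for `call_user_compiler`:

from typing import Any, Callable

def make_dispatcher(
    guards: list[tuple[Callable[[tuple], bool], Any]],
    compile_variant: Callable[[Any], Callable[..., Any]],  # stand-in for call_user_compiler
    generic_fn: Callable[..., Any],
) -> Callable[..., Any]:
    cache: dict[Any, Callable[..., Any]] = {}

    def dispatch(*args, **kwargs):
        for check_fn, key in guards:
            if check_fn(args):
                # Compile the specialized variant lazily, once per key.
                if key not in cache:
                    cache[key] = compile_variant(key)
                return cache[key](*args, **kwargs)
        # No specialization matched: fall back to the generic compiled graph.
        return generic_fn(*args, **kwargs)

    return dispatch

# Example: specialize when the first argument has 16 rows.
dispatch = make_dispatcher(
    guards=[(lambda args: args[0].shape[0] == 16, "m=16")],
    compile_variant=lambda key: (lambda a, b: a @ b),  # toy "compiled" variant
    generic_fn=lambda a, b: a @ b,
)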

torch/_dynamo/repro/after_dynamo.py

Lines changed: 5 additions & 3 deletions
@@ -110,7 +110,7 @@ def add_paths(exc):
         # Check for either accuracy (level 4) or other type of failures.
         if config.repro_level == 4:
             # Check Accuracy
-            compiled_gm = compiler_fn(copy.deepcopy(gm), example_inputs)
+            compiled_gm = compiler_fn(copy.deepcopy(gm), example_inputs, **kwargs)
             if _accuracy_fails(gm, example_inputs, compiler_fn):
                 log.warning(
                     "Accuracy failed for the TorchDynamo produced graph. Creating script to minify the error."
@@ -125,7 +125,9 @@ def add_paths(exc):
                 raise exc
         else:
             try:
-                compiled_gm = compiler_fn(copy.deepcopy(gm), example_inputs)
+                compiled_gm = compiler_fn(
+                    copy.deepcopy(gm), example_inputs, **kwargs
+                )
                 run_fwd_maybe_bwd(compiled_gm, example_inputs)
             except Exception as exc:
                 log.warning(
@@ -147,7 +149,7 @@ def add_paths(exc):
                 add_paths(exc)
                 raise
     else:
-        compiled_gm = compiler_fn(gm, example_inputs)
+        compiled_gm = compiler_fn(gm, example_inputs, **kwargs)
 
     return compiled_gm
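With `_call_user_compiler` and the repro wrapper above now forwarding `**kwargs`, a backend callable may receive extra keyword arguments such as `specialization`. A hedged sketch of a custom backend that simply tolerates the extra keywords (which keywords actually arrive, if any, is an implementation detail of this change):

import torch

def my_backend(gm: torch.fx.GraphModule, example_inputs, **kwargs):
    # `kwargs` may carry backend-specific extras (e.g. `specialization`);
    # a simple backend can ignore them and run the captured graph eagerly.
    if kwargs:
        print(f"backend received extra kwargs: {sorted(kwargs)}")
    return gm.forward

@torch.compile(backend=my_backend)
def f(x):
    return x.sin() + 1

f(torch.randn(4))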

torch/_dynamo/variables/builder.py

Lines changed: 4 additions & 0 deletions
@@ -3053,6 +3053,7 @@ def update_dim2constraint(dim, constraint_range, name):
         dynamic_strides = []
         constraint_sizes = []
         constraint_strides = []
+        specialize_on = []
         for i in range(e.dim()):
             # NB: mark dynamic has precedence over static
             marked_strict_unbacked = i in getattr(
@@ -3063,6 +3064,8 @@ def update_dim2constraint(dim, constraint_range, name):
             marked_weak_dynamic = i in getattr(e, "_dynamo_weak_dynamic_indices", set())
             marked_static = i in getattr(e, "_dynamo_static_indices", set())
 
+            specialize_on.append(getattr(e, "_specialize_on", {}).get(i, []))
+
             # Reflect the user directive in the frame_state
             # For dynamic, apply None always
 
@@ -3182,6 +3185,7 @@ def update_dim2constraint(dim, constraint_range, name):
             dynamic_strides=dynamic_strides,
             constraint_sizes=constraint_sizes,
             constraint_strides=constraint_strides,
+            specialize_on=specialize_on,
             view_base_context=view_base_context,
             tensor_source=source,
             shape_env_to_source_to_symbol_cache=shape_env_to_source_to_symbol_cache,

torch/_export/non_strict_utils.py

Lines changed: 5 additions & 3 deletions
@@ -143,9 +143,11 @@ def fakify(
             constraint_sizes[i] = RelaxedUnspecConstraint(warn_only=False)  # type: ignore[call-overload]
         else:
             dynamic_sizes.append(DimDynamic.STATIC)
-    symbolic_context = StatelessSymbolicContext(
-        dynamic_sizes=dynamic_sizes,
-        constraint_sizes=constraint_sizes,  # type: ignore[arg-type]
+    symbolic_context: StatelessSymbolicContext = (  # make mypy happy
+        StatelessSymbolicContext(
+            dynamic_sizes=dynamic_sizes,
+            constraint_sizes=constraint_sizes,  # type: ignore[arg-type]
+        )
     )
     t_id = id(t)
     assert mode.shape_env is not None

torch/_functorch/aot_autograd.py

Lines changed: 4 additions & 1 deletion
@@ -31,7 +31,7 @@
     _pytree_subclasses_that_lose_info,
     make_fx,
 )
-from torch.fx.experimental.symbolic_shapes import ShapeEnv
+from torch.fx.experimental.symbolic_shapes import ShapeEnv, Specialization
 from torch.utils._python_dispatch import is_traceable_wrapper_subclass
 
 
@@ -489,6 +489,7 @@ def process_inputs(
     fake_mode: FakeTensorMode,
     shape_env: Optional[ShapeEnv],
     ignore_shape_env: bool = False,
+    specialization: Optional[Specialization] = None,
 ) -> FakifiedFlatArgs:
     with fake_mode:
 
@@ -547,6 +548,7 @@ def convert(idx, x):
             symbolic_context=symbolic_context,
             source=source,
             trace=trace,
+            specialization=specialization,
         )
     return result
 
@@ -1084,6 +1086,7 @@ def aot_module_simplified(
     cudagraphs: Optional[BoxedBool] = None,
     boxed_forward_device_index: Optional[BoxedDeviceIndex] = None,
    ignore_shape_env: bool = False,
+    specialization: Optional[Specialization] = None,
 ) -> nn.Module:
     """
     This is the simplified or low overhead version of aot_module. For frontends

torch/_inductor/compile_fx.py

Lines changed: 6 additions & 0 deletions
@@ -127,6 +127,7 @@
 
     from torch._inductor.output_code import _StrideExprStr
     from torch._ops import OpOverload
+    from torch.fx.experimental.symbolic_shapes import Specialization
 
     from .ir import ExternKernelNode
 
@@ -1914,6 +1915,7 @@ def compile_fx(
     config_patches: Optional[dict[str, Any]] = None,
    decompositions: Optional[dict[OpOverload, Callable[..., Any]]] = None,
    ignore_shape_env: bool = False,
+    specialization: Optional[Specialization] = None,
 ) -> Union[Callable[[list[object]], Sequence[torch.Tensor]], str, list[str]]:
     """
     Main entry point for compiling given FX graph. Despite the fact that this
@@ -1939,6 +1941,7 @@ def compile_fx(
             inner_compile=config.patch(config_patches)(inner_compile),
             decompositions=decompositions,
             ignore_shape_env=ignore_shape_env,
+            specialization=specialization,
         )
 
     # TODO: This probably shouldn't be a recursive call
@@ -1995,13 +1998,15 @@ def compile_fx(
             inner_compile=functools.partial(inner_compile, cpp_wrapper=True),
             decompositions=decompositions,
             ignore_shape_env=ignore_shape_env,
+            specialization=specialization,
         )
 
     recursive_compile_fx = functools.partial(
         compile_fx,
         inner_compile=inner_compile,
        decompositions=decompositions,
        ignore_shape_env=ignore_shape_env,
+        specialization=specialization,
     )
 
     if not graph_returns_tuple(model_):
@@ -2332,6 +2337,7 @@ def bw_compiler(
             cudagraphs=cudagraphs,
             boxed_forward_device_index=forward_device,
             ignore_shape_env=ignore_shape_env,
+            specialization=specialization,
         )(model_, example_inputs_)
     except ShortenTraceback as e:
         # We will also shorten the traceback inside dynamo.

torch/_subclasses/fake_tensor.py

Lines changed: 9 additions & 1 deletion
@@ -59,7 +59,11 @@
 
     from torch._guards import Source
     from torch._ops import OpOverload
-    from torch.fx.experimental.symbolic_shapes import ShapeEnv, SymbolicContext
+    from torch.fx.experimental.symbolic_shapes import (
+        ShapeEnv,
+        Specialization,
+        SymbolicContext,
+    )
 
 log = logging.getLogger(__name__)
 
@@ -354,6 +358,7 @@ def from_real_tensor(
         source: Optional[Source] = None,
         symbolic_context: Optional[SymbolicContext] = None,
         trace: bool = True,
+        specialization: Optional[Specialization] = None,
     ) -> FakeTensor:
         # see note [Tensor Fakification and Symbol Caching]
         if not symbolic_context and not source and shape_env:
@@ -408,6 +413,7 @@ def mk_fake_tensor(
             source=source,
             symbolic_context=symbolic_context,
             trace=trace,
+            specialization=specialization,
         )
         if out is NotImplemented:
             raise UnsupportedFakeTensorException("meta converter nyi")
@@ -2864,6 +2870,7 @@ def from_tensor(
         source: Optional[Source] = None,
         symbolic_context: Optional[SymbolicContext] = None,
         trace: bool = True,
+        specialization: Optional[Specialization] = None,
     ) -> FakeTensor:
         shape_env: Optional[ShapeEnv] = self.shape_env
         if static_shapes is None:
@@ -2880,6 +2887,7 @@ def from_tensor(
             source=source,
             symbolic_context=symbolic_context,
             trace=trace,
+            specialization=specialization,
         )
0 commit comments