[multigraph] use backend specializations in compile_and_call_fx_graph #152601
```diff
@@ -1 +1 @@
-14256e6040d9e14698a877924456cdd92bfcd01d
+8eeef7f5b5363e9f35576184659226cc082311d6
```
@@ -10464,6 +10464,52 @@ def f(x):

```python
        self.assertEqual(out_ref.stride(), out_test.stride())
        self.assertEqual(x_ref, x_test)

    @requires_gpu()
    @skip_if_not_triton
    @unittest.skipIf(
        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
    )
    def test_inductor_multiple_specializations(self):
        from triton.testing import do_bench

        @torch.compile(
            options={
                "max_autotune": True,
                "max_autotune_gemm_backends": "TRITON",
            },
            dynamic=False,
        )
        def inductor_matmul(a, b):
            torch._check(a.shape[0] == b.shape[1])
            return (m, torch.mm(a, b))

        m = 16
        k = 1280
        dynamic_a = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
```
Collaborator: Hi, the function is decorated with …
```python
        dynamic_specialized_a = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
        b = torch.randn(k, m, device="cuda", dtype=torch.bfloat16)
        torch._dynamo.decorators.mark_dynamic(
            dynamic_a,
            0,
        )
        torch._dynamo.decorators.mark_dynamic(
            dynamic_specialized_a,
            0,
            backend_specializations=[
                (16, lambda x0: x0 == 16),
            ],
```
Comment on lines +10498 to +10500

Contributor: Do we have an API flow for when you want to specify conditions on multiple vars? E.g. you don't necessarily want to specialize on …

Contributor (Author): Not at the moment. vLLM actually only has one symbolic variable (https://www.anyscale.com/blog/continuous-batching-llm-inference), so we don't need to worry about that for our first customer. That being said, I'm happy to bikeshed what a better multi-var API may look like during composability.
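For illustration, one purely hypothetical shape a multi-variable API could take (nothing below exists in torch today; `mark_dynamic_group`, its arguments, and the tuple-of-symbols check_fn are all invented):

```python
# Hypothetical sketch only: a joint specialization over several dynamic dims.
import torch

def mark_dynamic_group(tensors_and_dims, specializations):
    """Invented helper: mark each (tensor, dim) dynamic and record one
    check_fn that receives one symbol per (tensor, dim) pair."""
    for t, d in tensors_and_dims:
        torch._dynamo.decorators.mark_dynamic(t, d)
    # A real version would hand `specializations` to the backend; this sketch
    # only shows the intended call shape.
    return specializations

a = torch.randn(16, 1280)
b = torch.randn(1280, 16)
mark_dynamic_group(
    [(a, 0), (b, 1)],
    specializations=[
        ((16, 16), lambda s0, s1: s0 == 16 and s1 == 16),
    ],
)
```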
```python
        )
        torch._dynamo.decorators.mark_dynamic(
            b,
            1,
        )
        dynamic = do_bench(lambda: inductor_matmul(dynamic_a, b))
        torch._dynamo.reset()
        dynamic_specialized = do_bench(
            lambda: inductor_matmul(dynamic_specialized_a, b)
        )
```
Comment on lines +10508 to +10510

Contributor: We should check the output code.
```python
        self.assertGreaterEqual(dynamic, dynamic_specialized)

    @requires_gpu()
    def test_stride_preservation_with_stride_modifying_fx_pass(self):
        def f(x):
```
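Following up on the suggestion above to check the output code: in inductor tests this kind of assertion is usually written with `torch._inductor.utils.run_and_get_code`. A rough sketch, assuming it were dropped into `test_inductor_multiple_specializations` (the string asserted on is an assumption, not taken from this PR):

```python
# Rough sketch only: capture and inspect the generated code for the
# specialized compile, assuming the test's local names are in scope.
from torch._inductor.utils import run_and_get_code

_, codes = run_and_get_code(inductor_matmul, dynamic_specialized_a, b)
# The exact pattern to assert on is an assumption; the point is that the
# generated output code is available for direct inspection.
self.assertTrue(any("triton" in code for code in codes))
```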
```diff
@@ -44,6 +44,7 @@
 import torch.nn
 import torch.utils._pytree as pytree
 from torch import fx
+from torch._C._dynamo import guards
 from torch._dynamo.exc import ShortenTraceback, TensorifyScalarRestartAnalysis
 from torch._guards import (
     CompileContext,
@@ -157,6 +158,8 @@
 graph_sizes_log = torch._logging.getArtifactLogger(__name__, "graph_sizes")
 trace_call_log = torch._logging.getArtifactLogger(__name__, "trace_call")
 
+RootGuardManager = guards.RootGuardManager
+
 
 @dataclass(frozen=True)
 class VariableTrackerCacheKey:
```
```diff
@@ -1496,8 +1499,34 @@ def compile_and_call_fx_graph(self, tx, rv, root):
             # a lot of fake_tensor ownership assumptions and runs afoul of detect_fake_mode
             self.tracing_context.fake_mode = backend_fake_mode
 
+        specialized_compiles = []
         with self.restore_global_state():
             compiled_fn = self.call_user_compiler(gm)
+            sources = [a.source for a in self.graphargs]
+            for specialization in old_fake_mode.shape_env.backend_specializations:
+                source_index = sources.index(specialization.source)
+                check_fn_source = inspect.getsource(specialization.check_fn).strip()
+                check_fn = guards.LAMBDA_GUARD(  # type: ignore[attr-defined]
+                    specialization.check_fn,
+                    [check_fn_source],
+                )
+
+                log.debug(
+                    "Compiling backend specialized graph with specialization=%s",
+                    check_fn_source,
+                )
+
+                specialized_compiles.append(
+                    (
+                        functools.partial(
+                            lambda idx, args, check_fn=check_fn: check_fn(
+                                args[idx]
+                            ),
+                            source_index,
+                        ),
+                        self.call_user_compiler(gm, specialization=specialization),
```
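A side note on the lambda above: the `check_fn=check_fn` default argument pins the current loop value, since Python closures otherwise bind late and every lambda built in the loop would see the last `check_fn`. A standalone illustration (not PR code):

```python
# Standalone illustration of the late-binding pitfall the default argument avoids.
checks = [lambda v: v == 1, lambda v: v == 2]

late_bound = [lambda x: check(x) for check in checks]
early_bound = [lambda x, check=check: check(x) for check in checks]

print([fn(1) for fn in late_bound])   # [False, False]: both closures see checks[-1]
print([fn(1) for fn in early_bound])  # [True, False]: each keeps its own check
```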
Comment on lines +1506 to +1527

Contributor: This calls the backend compiler with the (tensor_args,) for this graph and the specialization argument, right? I'm not sure this is the right design. The (tensor_args,) don't have the same shape as the specialization -- will that be a problem? An alternative design is that there is some lazy dispatching layer right after Dynamo but before AOTAutograd. Let's say the user calls the following for the first time: … Then this traces out a graph from Dynamo with dynamic shapes. Then, on future calls to torch.compile: … One way to implement this is: …

Contributor: The benefit of the alternative lazy design is that the backend doesn't need to work hard to figure out how to do the specialization: it's almost like calling regular torch.compile again, except it is able to skip Dynamo. One side effect is that we don't have to impose constraints on the strides (this PR needs to do that because it needs to figure out how to create a FakeTensor, right?).

Contributor: I think this makes sense. cc @anijain2305 for thoughts as well.

Contributor: There are a few details that we need to think about … Maybe we have the bytecode that calls the …

Contributor: @anijain2305 thoughts on #153449?

Contributor: What is the issue with the current implementation? It's not bad. It gives the hierarchical feel, which kind of makes sense in this case.

Contributor: Consider the following code:

```python
x = torch.randn(3)
mark_dynamic(x, 0, backend_specializations=[1, 2])
torch.compile(f)(x)

x = torch.randn(1)
torch.compile(f)(x)

x = torch.randn(2)
torch.compile(f)(x)
```

On the first torch.compile call, we will attempt to compile all of the backend specializations. That torch.compile call only has one set of sample inputs (of shape [3]). The problems I'm worried about are: … The lazier design (#153449) solves this by (a) deferring compilation of shape [1] and shape [2] until we actually see inputs of those shapes and (b) if the strides change then it's a recompile.
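A rough sketch of the lazy-dispatch idea described in this thread; every name below is invented for illustration, and this is neither this PR's code nor what #153449 implements:

```python
# Invented sketch of a lazy dispatch layer sitting after Dynamo: each
# specialization is compiled only when an input first matches its check_fn.
class LazySpecializationDispatcher:
    def __init__(self, dynamo_gm, generic_compiled, backend_compile, check_fns):
        self.dynamo_gm = dynamo_gm                # graph traced once with dynamic shapes
        self.generic_compiled = generic_compiled  # the dynamic-shape compilation
        self.backend_compile = backend_compile    # e.g. an AOTAutograd/Inductor entry point
        # [check_fn, compiled_or_None]: compiled lazily on the first matching call
        self.entries = [[check_fn, None] for check_fn in check_fns]

    def __call__(self, *args):
        for entry in self.entries:
            check_fn, compiled = entry
            if check_fn(*args):
                if compiled is None:
                    # Real args are available here, so fake tensors for this
                    # specialization can take their actual shapes and strides.
                    entry[1] = compiled = self.backend_compile(self.dynamo_gm, args)
                return compiled(*args)
        return self.generic_compiled(*args)
```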
```diff
+                    )
+                )
+
         from torch.fx._lazy_graph_module import _LazyGraphModule
```
```diff
@@ -1528,7 +1557,18 @@ def compile_and_call_fx_graph(self, tx, rv, root):
 
         counters["stats"]["unique_graphs"] += 1
         # This is safe because we pre-process name to be unique
-        self.install_global_unsafe(name, compiled_fn)
+        if specialized_compiles:
+
+            @torch._dynamo.disable(reason="do not trace Dynamo-compiled graph")
+            def specialized_dispatch(*args, **kwargs):
+                for check_fn, specialized_compiled_fn in specialized_compiles:
+                    if check_fn(args):
+                        return specialized_compiled_fn(*args, **kwargs)
+                return compiled_fn(*args, **kwargs)
+
+            self.install_global_unsafe(name, specialized_dispatch)
+        else:
+            self.install_global_unsafe(name, compiled_fn)
 
         cg = PyCodegen(tx)
         cg.make_call_generated_code(name)
```
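For readers tracing through `specialized_dispatch`: each stored `check_fn` was built with `functools.partial`, so at runtime it receives the flat argument tuple and indexes out the single graph input it guards. A tiny standalone illustration with a stand-in check (the real code wraps the user check in `guards.LAMBDA_GUARD`):

```python
import functools

import torch

# Stand-in for the wrapped specialization check; here it inspects the chosen
# argument directly rather than going through guards.LAMBDA_GUARD.
def _inner_check(arg):
    return arg.shape[0] == 16

source_index = 0  # position of the guarded input among the flat graph args
check_fn = functools.partial(lambda idx, args: _inner_check(args[idx]), source_index)

args = (torch.randn(16, 8), torch.randn(8, 16))
print(check_fn(args))  # True: the first graph arg has size 16 in dim 0
```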
```diff
@@ -1542,16 +1582,16 @@ def placeholders(self) -> list[fx.Node]:
     def graphargs(self) -> list[GraphArg]:
         return [node.meta["grapharg"] for node in self.placeholders]
 
-    def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
+    def call_user_compiler(self, gm: fx.GraphModule, **kwargs) -> CompiledFn:
         with dynamo_timed(
             "OutputGraph.call_user_compiler",
             phase_name="backend_compile",
             log_pt2_compile_event=True,
             dynamo_compile_column_us="aot_autograd_cumulative_compile_time_us",
         ):
-            return self._call_user_compiler(gm)
+            return self._call_user_compiler(gm, **kwargs)
 
-    def _call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
+    def _call_user_compiler(self, gm: fx.GraphModule, **kwargs) -> CompiledFn:
         assert self.compiler_fn is not None
         tot = 0
         placeholders = []
@@ -1581,7 +1621,7 @@ def _call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             compiler_fn = self.compiler_fn
             if config.verify_correctness:
                 compiler_fn = WrapperBackend(compiler_fn)
-            compiled_fn = compiler_fn(gm, self.example_inputs())
+            compiled_fn = compiler_fn(gm, self.example_inputs(), **kwargs)
             _step_logger()(logging.INFO, f"done compiler function {name}")
             assert callable(compiled_fn), "compiler_fn did not return callable"
         except (TensorifyScalarRestartAnalysis, ShortenTraceback):
```
Review comment: intentional?