Commit 77428b8
[aoti] fix corner case in unbacked replacements for atomically_apply_size_hint
ghstack-source-id: f4396c4 Pull Request resolved: #153768
1 parent 8ac82a1 commit 77428b8
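
In short, the fix makes the unbacked replacement map be built once from all deferred runtime asserts (rather than per expression) and oriented consistently, so equalities recorded via torch._check can be chained when Inductor picks example sizes for autotuning. A rough standalone sketch of that orientation step, using placeholder sympy symbols rather than a real ShapeEnv:

    # Illustrative only: how an equality learned from torch._check(a == b) becomes
    # a single, consistently oriented rewrite rule (mirrors the diff below).
    import sympy

    s0, u0, u1 = sympy.symbols("s0 u0 u1", integer=True, positive=True)

    equalities = [sympy.Eq(s0 + u0, u1)]  # e.g. "s0 + u0 is known to equal u1"

    replacements = {}
    for eq in equalities:
        lhs, rhs = eq.lhs, eq.rhs
        # sympy.Basic.compare gives a deterministic ordering: the side that sorts
        # later is always rewritten in terms of the side that sorts earlier, so
        # both orderings of the same equality produce the same rule.
        l2r = lhs.compare(rhs) == 1
        src, dst = (lhs, rhs) if l2r else (rhs, lhs)
        replacements[src] = dst

    print(replacements)  # one rewrite rule; both expressions get the same size hint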

File tree

2 files changed: +144 -29 lines changed

test/inductor/test_aot_inductor.py

Lines changed: 110 additions & 7 deletions
@@ -1299,13 +1299,7 @@ def forward(self, values, repeats, mask, embeddings, x, z, scalar):
 
                 unbacked_add_expr = backed + unbacked
                 repeated = x.repeat(unbacked_add_expr, 1)
-                return torch.cat(
-                    [
-                        repeated,
-                        index_select,
-                    ],
-                    dim=1,
-                )
+                return torch.cat([repeated, index_select], dim=1)
 
         example_inputs = (
             torch.ones(64, dtype=torch.int64, device=self.device),
@@ -1327,6 +1321,115 @@ def forward(self, values, repeats, mask, embeddings, x, z, scalar):
         }
         self.check_model(Repro(), example_inputs, dynamic_shapes=spec)
 
+    def test_size_with_unbacked_add_expr_transitive(self):
+        # Edge case with torch._check(expr1, expr2) + torch._check(expr2, unbacked).
+        # When generating example input sizes for autotuning, it should coalesce
+        # expr1, expr2, unbacked into a single size.
+        if self.device != GPU_TYPE:
+            raise unittest.SkipTest("requires GPU")
+
+        class Repro(torch.nn.Module):
+            def forward(self, values, repeats, mask, embeddings, x, y, z, lst):
+                u0, u1, random_unbacked = lst.tolist()
+                torch._check_is_size(u0)
+                torch._check_is_size(u1)
+                backed = z.size(0)
+                backed1 = z.size(1)
+
+                repeated = x.repeat(backed + u0, 1)
+                repeated1 = y.repeat(backed1 + u1, 1)
+                out = torch.empty_like(repeated)
+                add_kernel[(out.numel(),)](
+                    repeated, repeated, out, out.numel(), BLOCK_SIZE=2
+                )
+
+                torch._check(repeated1.size(0) == out.size(0))
+                torch._check(out.size(0) == random_unbacked)
+
+                index = torch.repeat_interleave(values, repeats)
+                index_select = torch.index_select(embeddings, 0, index)
+
+                cat = torch.cat([out, index_select], dim=1)
+                add = repeated + repeated1
+                return cat, add
+
+        example_inputs = (
+            torch.ones(64, dtype=torch.int64, device=self.device),
+            torch.ones(64, dtype=torch.int64, device=self.device) * 24,
+            torch.ones((768,), dtype=torch.int64, device=self.device).bool(),
+            torch.randn((401, 8), dtype=torch.bfloat16, device=self.device),
+            torch.randn((2, 256), dtype=torch.bfloat16, device=self.device),
+            torch.randn((2, 256), dtype=torch.bfloat16, device=self.device),
+            torch.ones(758, 758, dtype=torch.int64, device=self.device),
+            torch.tensor(
+                [10, 10, 2 * (758 + 10)], dtype=torch.int32, device=self.device
+            ),
+        )
+        spec = {
+            "values": (Dim.DYNAMIC,),
+            "repeats": (Dim.DYNAMIC,),
+            "mask": (Dim.DYNAMIC,),
+            "embeddings": (Dim.DYNAMIC, Dim.STATIC),
+            "x": (Dim.DYNAMIC, Dim.STATIC),
+            "y": (Dim.DYNAMIC, Dim.STATIC),
+            "z": (Dim.DYNAMIC, Dim.DYNAMIC),
+            "lst": (Dim.STATIC,),
+        }
+        self.check_model(Repro(), example_inputs, dynamic_shapes=spec)
+
+    @config.patch({"unbacked_symint_fallback": 1024})
+    def test_size_with_unbacked_add_and_mul_expr(self):
+        # Edge case with torch._check(add_expr, mul_expr). When generating example
+        # input sizes for autotuning, make sure they coalesce into a single size.
+        if self.device != GPU_TYPE:
+            raise unittest.SkipTest("requires GPU")
+
+        class Repro(torch.nn.Module):
+            def forward(self, values, repeats, mask, embeddings, x, y, z, lst):
+                u0, u1, u2 = lst.tolist()
+                torch._check_is_size(u0)
+                torch._check_is_size(u1)
+                torch._check_is_size(u2)
+                backed = z.size(0)
+                backed1 = z.size(1)
+
+                unbacked_add_expr = backed + u0
+                unbacked_mul_expr = backed1 + (u1 * u2)
+                repeated0 = x.repeat(unbacked_add_expr, 1)
+                repeated1 = y.repeat(unbacked_mul_expr, 1)
+                out0 = torch.empty_like(repeated0)
+                out1 = torch.empty_like(repeated1)
+                add_kernel[(out0.numel(),)](
+                    repeated0, repeated0, out0, out0.numel(), BLOCK_SIZE=2
+                )
+                add_kernel[(out1.numel(),)](
+                    repeated1, repeated1, out1, out1.numel(), BLOCK_SIZE=2
+                )
+
+                return torch.cat([out1, out0], dim=1)
+
+        example_inputs = (
+            torch.ones(64, dtype=torch.int64, device=self.device),
+            torch.ones(64, dtype=torch.int64, device=self.device) * 24,
+            torch.ones((768,), dtype=torch.int64, device=self.device).bool(),
+            torch.randn((401, 8), dtype=torch.bfloat16, device=self.device),
+            torch.randn((2, 256), dtype=torch.bfloat16, device=self.device),
+            torch.randn((2, 256), dtype=torch.bfloat16, device=self.device),
+            torch.ones(758, 758, dtype=torch.int64, device=self.device),
+            torch.tensor([10, 5, 2], dtype=torch.int32, device=self.device),
+        )
+        spec = {
+            "values": (Dim.DYNAMIC,),
+            "repeats": (Dim.DYNAMIC,),
+            "mask": (Dim.DYNAMIC,),
+            "embeddings": (Dim.DYNAMIC, Dim.STATIC),
+            "x": (Dim.DYNAMIC, Dim.STATIC),
+            "y": (Dim.DYNAMIC, Dim.STATIC),
+            "z": (Dim.DYNAMIC, Dim.DYNAMIC),
+            "lst": (Dim.STATIC,),
+        }
+        self.check_model(Repro(), example_inputs, dynamic_shapes=spec)
+
     @skipIfXpu(msg="_scaled_dot_product_flash_attention is not supported on XPU yet")
     def test_fallback_kernel_with_symexpr_output(self):
         if self.device != GPU_TYPE:
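
Both new tests drive a user-defined Triton kernel through AOTInductor: add_kernel is imported from PyTorch's shared Triton test utilities rather than defined in this file. For readers of the diff, a minimal sketch of such a pointwise add kernel, assuming the standard Triton API; the real helper may differ in details:

    import triton
    import triton.language as tl

    @triton.jit
    def add_kernel(in_ptr0, in_ptr1, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
        # Each program instance handles one BLOCK_SIZE-wide slice; the mask guards
        # the tail when n_elements is not a multiple of BLOCK_SIZE.
        pid = tl.program_id(axis=0)
        offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        mask = offsets < n_elements
        x = tl.load(in_ptr0 + offsets, mask=mask)
        y = tl.load(in_ptr1 + offsets, mask=mask)
        tl.store(out_ptr + offsets, x + y, mask=mask)

    # Launched as in the tests above:
    # add_kernel[(out.numel(),)](a, b, out, out.numel(), BLOCK_SIZE=2)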

torch/_inductor/sizevars.py

Lines changed: 34 additions & 22 deletions
@@ -8,7 +8,11 @@
 import sympy
 from sympy import Expr
 
-from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols, ShapeEnv
+from torch.fx.experimental.symbolic_shapes import (
+    free_unbacked_symbols,
+    has_free_unbacked_symbols,
+    ShapeEnv,
+)
 from torch.utils._ordered_set import OrderedSet
 from torch.utils._sympy.functions import FloorDiv, ModularIndexing
 from torch.utils._sympy.symbol import symbol_is_type, SymT
@@ -62,7 +66,7 @@ def __init__(self, shape_env=None) -> None:
         self.shape_env = shape_env
         self.var_to_val = self.shape_env.var_to_val
         self.replacements: dict[sympy.Symbol, Expr] = self.shape_env.replacements
-        self.unbacked_replacements: dict[Expr, Expr] = {}
+        self.unbacked_replacements: Optional[dict[Expr, Expr]] = None
         # Maps of dynamic sizes that have to be precomputed on the host to the kernel args.
         # The basic idea is if we have some complicated sympy expression
         # f(s0), we may choose to precompute it on the host and then replace
@@ -639,7 +643,7 @@ def _stride_vars(
         )
         return strides
 
-    def _get_unbacked_replacements(self, expr: Expr) -> dict[Expr, Expr]:
+    def _get_unbacked_replacements(self) -> dict[Expr, Expr]:
         """
         This helps with covering unbacked symint cases where you may have two
         expressions: s0 + u0 and u1. And s0 + u0 is known to be equal to u1
@@ -649,33 +653,41 @@ def _get_unbacked_replacements(self, expr: Expr) -> dict[Expr, Expr]:
         hint for both s0 + u0 and u1, but it first needs to know they are equal.
         Then it can substitute s0 + u0 for u1.
         """
-        if expr in self.unbacked_replacements:
-            return self.unbacked_replacements[expr]
+        if self.unbacked_replacements is not None:
+            return self.unbacked_replacements
 
-        runtime_asserts = itertools.chain.from_iterable(
-            self.shape_env.deferred_runtime_asserts.get(u, [])
-            for u in free_unbacked_symbols(expr)
-        )
-        equalities = (
-            assertion.expr
-            for assertion in runtime_asserts
-            if isinstance(assertion.expr, sympy.Equality)
-        )
-        replacements = {eq.rhs: eq.lhs for eq in equalities}
+        self.unbacked_replacements = {}
+        for assertions in self.shape_env.deferred_runtime_asserts.values():
+            for assertion in assertions:
+                if not isinstance(assertion.expr, sympy.Equality):
+                    continue
 
-        self.unbacked_replacements[expr] = replacements
-        return replacements
+                lhs, rhs = assertion.expr.lhs, assertion.expr.rhs
+                l2r = lhs.compare(rhs) == 1  # see sympy.Basic.compare
+                src = lhs if l2r else rhs
+                dst = rhs if l2r else lhs
+                self.unbacked_replacements[src] = dst
+        return self.unbacked_replacements
 
     def atomically_apply_size_hint(
         self, expr: Union[Expr, int], *, fallback: Optional[int] = None
     ) -> Union[Expr, int]:
-        if isinstance(expr, int):
+        if isinstance(expr, (int, sympy.Integer)):
             return int(expr)
 
-        # Make sure to substitute with the factored version
-        # e.g. 10*(s0 + u0) instead of 10*s0 + 10*u0
-        unbacked_replacements = self._get_unbacked_replacements(expr)
-        expr = sympy.factor(expr).subs(unbacked_replacements)
+        if has_free_unbacked_symbols(expr):
+
+            def _sub_unbacked_exprs(expr: Expr) -> Expr:
+                replacements = self._get_unbacked_replacements()
+                while True:
+                    new_expr = expr.subs(replacements)
+                    if new_expr == expr:
+                        return new_expr
+                    expr = sympy.factor(new_expr)
+
+            # Make sure to substitute with the factored version
+            # e.g. 10*(s0 + u0) instead of 10*s0 + 10*u0
+            expr = _sub_unbacked_exprs(sympy.factor(expr))
 
         # For multiple expressions that depend on an unbacked symint,
         # we want to compute them consistently for a size hint we have chosen.
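
The reason substitution now runs to a fixed point: with transitive torch._check equalities (the new test asserts repeated1.size(0) == out.size(0) and out.size(0) == random_unbacked), a single pass of subs may leave an expression that can still be rewritten. A hedged standalone illustration of the loop in _sub_unbacked_exprs, with made-up symbols and an assumed orientation of the replacement map:

    # Illustration only: chained rewrite rules can require several subs/factor
    # passes before the expression stops changing.
    import sympy

    s0, s1, u0, u1, u2 = sympy.symbols("s0 s1 u0 u1 u2", integer=True, positive=True)

    # Two equalities as they might be oriented after canonicalization:
    #   s1 + u1 == s0 + u0   and   u2 == s1 + u1
    replacements = {s1 + u1: s0 + u0, u2: s1 + u1}

    expr = sympy.factor(10 * u2)  # a size expression seen while autotuning
    while True:
        new_expr = expr.subs(replacements)
        if new_expr == expr:  # fixed point: nothing left to rewrite
            break
        expr = sympy.factor(new_expr)

    print(expr)  # 10*(s0 + u0) -- every size known equal to u2 shares this hint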
