[inductor] fix MA on poor gpu · pytorch/pytorch@69006be · GitHub

Commit 69006be
[inductor] fix MA on poor gpu
[ghstack-poisoned]
1 parent 0f051ea commit 69006be
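
For context, "MA" in the commit title is inductor's max-autotune mode, which benchmarks several kernel candidates per op and picks the fastest. A minimal, illustrative way to exercise that path (not part of this commit; assumes a CUDA build):

import torch

def f(a, b):
    return a.matmul(b)

a = torch.randn(512, 512, device="cuda", dtype=torch.bfloat16)
b = torch.randn(512, 512, device="cuda", dtype=torch.bfloat16)

# mode="max-autotune" enables torch._inductor.config.max_autotune,
# so inductor benchmarks Triton template and extern kernel candidates.
opt_f = torch.compile(f, mode="max-autotune")
print(opt_f(a, b).shape)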

File tree

2 files changed (+49, -3 lines)


test/inductor/test_max_autotune.py

Lines changed: 44 additions & 0 deletions
@@ -9,6 +9,7 @@
 from torch._dynamo import reset
 from torch._dynamo.exc import BackendCompilerFailed
 from torch._dynamo.testing import rand_strided, reset_rng_state
+from torch._dynamo.utils import same
 from torch._inductor import config
 from torch._inductor.autotune_process import (
     BenchmarkRequest,
@@ -979,6 +980,49 @@ def mock_lookup(self, *args, **kwargs):
             torch.compile(lambda a, b: a.matmul(b))(a, b)
         self.assertIn("NoValidChoicesError", str(context.exception))
 
+    @unittest.skipIf(
+        not torch.cuda.is_available()
+        or torch.cuda.get_device_properties().total_memory < 2e10,
+        "Only if the GPU has at least 20GB memory to be safe",
+    )
+    @config.patch(force_shape_pad=True, max_autotune=True)
+    def test_linear_and_cel(self):
+        """
+        Simulate a GPU without enough SMs. Make sure max-autotune still
+        works even when the MultiTritonTemplate encapsulates just extern
+        kernels.
+        """
+
+        def mock_is_big_gpu(*args, **kwargs):
+            return False
+
+        B, T, C, V = 32, 1024, 768, 50257
+
+        linear = nn.Linear(C, V).bfloat16().to(device=GPU_TYPE)
+        ce = torch.nn.CrossEntropyLoss()
+
+        def f(x, y):
+            x.grad = None
+            linear.weight.grad = None
+            linear.bias.grad = None
+
+            loss = ce(linear(x), y)
+            loss.backward()
+            return loss
+
+        x = torch.randn(B * T, C, requires_grad=True).cuda().bfloat16()
+        x.retain_grad()
+        y = torch.randint(0, V, (B * T,)).cuda()
+
+        import torch._inductor.utils as inductor_utils
+
+        with unittest.mock.patch.object(inductor_utils, "is_big_gpu", mock_is_big_gpu):
+            opt_f = torch.compile(f)
+
+            expect = (f(x, y), x.grad, linear.weight.grad, linear.bias.grad)
+            actual = (opt_f(x, y), x.grad, linear.weight.grad, linear.bias.grad)
+            assert same(expect, actual, tol=1e-2), f"ref:\n{expect}\nact:\n{actual}"
+
 
 @instantiate_parametrized_tests
 class TestMaxAutotuneRemoteCache(TestCase):
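
The new test forces the small-GPU path by patching torch._inductor.utils.is_big_gpu, the check inductor uses to decide whether Triton templates are enabled; when it returns False, max-autotune is left with only extern (ATen/cuBLAS) choices, which is the situation this commit fixes, as the test's docstring suggests. A standalone sketch of the same patching pattern outside the test harness (illustrative shapes; assumes a CUDA build):

import unittest.mock

import torch
import torch._inductor.utils as inductor_utils

# Pretend every GPU is "small" so max-autotune can only pick extern kernels.
with unittest.mock.patch.object(inductor_utils, "is_big_gpu", lambda *a, **k: False):
    opt_mm = torch.compile(lambda a, b: a @ b, mode="max-autotune")
    a = torch.randn(256, 256, device="cuda")
    b = torch.randn(256, 256, device="cuda")
    out = opt_mm(a, b)  # first call compiles while the patch is active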

torch/_inductor/ir.py

Lines changed: 5 additions & 3 deletions
@@ -1150,9 +1150,11 @@ def inner_reduction_splits(
             # No need to split.
             return ReductionHint.INNER, split
         if input_node is not None and isinstance(input_node, TensorBox):
-            new_ranges, new_reduction_ranges = extract_input_node_reduction_ranges(
-                input_node
-            )
+            with patch.object(FlexibleLayout, "allow_indexing", True):
+                (
+                    new_ranges,
+                    new_reduction_ranges,
+                ) = extract_input_node_reduction_ranges(input_node)
             if new_ranges is not None and new_reduction_ranges is not None:
                 extracted_numel_hint = V.graph.sizevars.symbolic_hint(
                     sympy_product(new_ranges + new_reduction_ranges)
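
The ir.py change wraps the range extraction in patch.object(FlexibleLayout, "allow_indexing", True), which temporarily flips a class attribute and restores it when the block exits. The same unittest.mock pattern on a toy class (names below are illustrative, not inductor's):

from unittest.mock import patch

class Layout:
    allow_indexing = False  # class-level flag, normally off

def can_index():
    return Layout.allow_indexing

# patch.object sets the attribute for the duration of the with-block and
# restores the original value afterwards, even if an exception is raised.
with patch.object(Layout, "allow_indexing", True):
    assert can_index() is True
assert can_index() is False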

0 commit comments