[multigraph] fix composabilty with aotautograd cache (#153526) · pytorch/pytorch@e7bf72c

Commit e7bf72c

bobrenjc93 authored and pytorchmergebot committed
[multigraph] fix composabilty with aotautograd cache (#153526)
AOTAutogradCache uses FXGraphCache, which uses the tracing context to get the ShapeEnv. Although the TracingContext's global_context is cleared by the time we get around to reusing it, we don't actually need it: we only need the ShapeEnv in the TracingContext, which is not cleared at the end of dynamo and does persist. This PR adds the tracing context manager around the specialized compile to ensure our caching infrastructure can get access to the ShapeEnv. A test was also added to prove correctness.

Pull Request resolved: #153526
Approved by: https://github.com/jamesjwu, https://github.com/zou3519
ghstack dependencies: #153433, #153449
1 parent 7183f52 commit e7bf72c
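As a rough illustration of the mechanism (a minimal sketch, not the PR's or FXGraphCache's actual code; the helper name and standalone setup below are made up), cache consumers that call TracingContext.try_get() can only reach the ShapeEnv while a tracing(...) block is active:

from torch._guards import TracingContext, tracing
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.fx.experimental.symbolic_shapes import ShapeEnv


def shape_env_from_ambient_context():
    # Hypothetical helper: look up the active TracingContext the way cache
    # code does; with no tracing(...) block active there is nothing to read.
    ctx = TracingContext.try_get()
    if ctx is None or ctx.fake_mode is None:
        return None
    return ctx.fake_mode.shape_env


fake_mode = FakeTensorMode(shape_env=ShapeEnv())
ctx = TracingContext(fake_mode)

assert shape_env_from_ambient_context() is None  # no ambient tracing context yet
with tracing(ctx):
    # Inside the context manager the persisted ShapeEnv is reachable again,
    # which is what wrapping the specialized compile guarantees.
    assert shape_env_from_ambient_context() is fake_mode.shape_env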

File tree: 2 files changed (+48 -3 lines)

test/dynamo/test_aot_autograd_cache.py

Lines changed: 43 additions & 0 deletions

@@ -272,6 +272,49 @@ def fn(x, y):
         self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
         self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)

+    @inductor_config.patch("fx_graph_remote_cache", False)
+    @inductor_config.patch("fx_graph_cache", True)
+    @functorch_config.patch({"enable_autograd_cache": True})
+    def test_multi_graph_specialization(self):
+        """
+        Verify multi graph specializations all cache hit
+        """
+
+        def fn(x):
+            return x * 5
+
+        a = torch.randn(5)
+        a8 = torch.randn(8)
+        a16 = torch.randn(16)
+        torch._dynamo.mark_dynamic(
+            a,
+            0,
+            specialize_on=[
+                lambda x: x == 8,
+                lambda x: x == 16,
+            ],
+        )
+
+        compiled_fn = torch.compile(fn, backend="inductor")
+
+        # A first call should miss in the cache.
+        compiled_fn(a)
+        compiled_fn(a8)
+        compiled_fn(a16)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 3)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 3)
+
+        self._clear_dynamo_and_codecache()
+
+        # A second call should hit on all 3 graphs
+        compiled_fn(a)
+        compiled_fn(a8)
+        compiled_fn(a16)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 3)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 3)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 3)
+
     @inductor_config.patch("fx_graph_remote_cache", False)
     @inductor_config.patch("fx_graph_cache", True)
     @functorch_config.patch({"enable_autograd_cache": True})

torch/_dynamo/output_graph.py

Lines changed: 5 additions & 3 deletions

@@ -51,6 +51,7 @@
     CompileId,
     GlobalContextCheckpointState,
     Source,
+    tracing,
     TracingContext,
 )
 from torch._subclasses.fake_tensor import FakeTensor

@@ -1753,9 +1754,10 @@ def specialized_dispatch(*args, **kwargs):
                 # Modify gm so AOTAutogradCache key changes per specialization
                 gm.meta["specialization"] = specialization
                 example_inputs: list[Tensor] = list(args)
-                specialization_cache[specialization] = (
-                    self.call_user_compiler(gm, example_inputs)
-                )
+                with tracing(self.tracing_context):
+                    specialization_cache[specialization] = (
+                        self.call_user_compiler(gm, example_inputs)
+                    )

             return specialization_cache[specialization](*args, **kwargs)
         return compiled_fn(*args, **kwargs)
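For context, here is a condensed, hypothetical sketch of the dispatch pattern this hunk modifies (names and surrounding structure are simplified, not the real OutputGraph implementation). Each specialization is compiled lazily on first use, and the change is that the deferred compile now runs inside tracing(self.tracing_context) so its cache lookups can see the persisted ShapeEnv:

from torch._guards import tracing


def make_specialized_dispatch(
    tracing_context, compile_fn, generic_compiled_fn, specialization_guards
):
    # Hypothetical factory condensing the OutputGraph pattern: pick a matching
    # specialization at call time, compile it lazily, and cache the result.
    specialization_cache = {}

    def specialized_dispatch(*args, **kwargs):
        for check_fn, specialization in specialization_guards:
            if check_fn(args):
                if specialization not in specialization_cache:
                    # The fix: re-enter dynamo's TracingContext so the deferred
                    # compile (and the FXGraphCache / AOTAutogradCache lookups
                    # inside it) can reach the persisted ShapeEnv.
                    with tracing(tracing_context):
                        specialization_cache[specialization] = compile_fn(
                            specialization, list(args)
                        )
                return specialization_cache[specialization](*args, **kwargs)
        # No specialization matched: fall back to the generic compiled graph.
        return generic_compiled_fn(*args, **kwargs)

    return specialized_dispatch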
