None on "Introduce unsafe way to mark functions as cacheable" · pytorch/pytorch@83e2d32 · GitHub

Commit 83e2d32

committed
None on "Introduce unsafe way to mark functions as cacheable"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov [ghstack-poisoned]
2 parents a14d391 + 411b9bf commit 83e2d32

4 files changed: +98 −13 lines

test/inductor/test_codecache.py

Lines changed: 66 additions & 3 deletions
@@ -1471,11 +1471,15 @@ def backend(gm_, args_, **kwargs_):
     @config.patch({"fx_graph_cache": True})
     @config.patch({"fx_graph_remote_cache": False})
     @functorch_config.patch({"enable_autograd_cache": True})
+    @parametrize("device", (GPU_TYPE, "cpu"))
     @parametrize("format", ("binary", "unpacked"))
     @parametrize("dynamic", (False, True))
-    def test_basic(self, format: str, dynamic: bool) -> None:
-        mod = torch.nn.Linear(1, 3)
-        x = torch.randn(4, 1)
+    def test_basic(self, device: str, format: str, dynamic: bool) -> None:
+        if device == GPU_TYPE and not HAS_GPU:
+            raise unittest.SkipTest(f"requires {GPU_TYPE}")
+
+        mod = torch.nn.Linear(1, 3, device=device)
+        x = torch.randn(4, 1, device=device)
         if dynamic:
             torch._dynamo.mark_dynamic(x, 0)
 
@@ -1562,6 +1566,65 @@ def f(x):
             compiled_out = loaded(*args)[0]
             self.assertEqual(eager_out, compiled_out)
 
+    @config.patch({"fx_graph_cache": True})
+    @config.patch({"fx_graph_remote_cache": False})
+    @functorch_config.patch({"enable_autograd_cache": True})
+    @parametrize("device", (GPU_TYPE, "cpu"))
+    def test_modify_unpacked_file(self, device: str) -> None:
+        if device == GPU_TYPE and not HAS_GPU:
+            raise unittest.SkipTest(f"requires {GPU_TYPE}")
+
+        x = torch.ones(4, device=device)
+
+        def f(x):
+            with torch.no_grad():
+                return 2 * x, x.sin()
+
+        eager_out = f(x)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            with fresh_inductor_cache():
+                gm, args, kwargs = self.capture(f)(x)
+                assert not kwargs
+
+                compiled_artifact = torch._inductor.standalone_compile(gm, args)
+                compiled_out = compiled_artifact(*args)
+                self.assertEqual(eager_out, compiled_out)
+
+                compiled_artifact.save(path=temp_dir, format="unpacked")
+
+            self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)
+
+            with fresh_inductor_cache():
+                # Now modify the output file and expect to see the changes
+                for subdir in os.listdir(temp_dir):
+                    if subdir in ["aotautograd", "fxgraph"]:
+                        continue
+                    subdir_path = os.path.join(temp_dir, subdir)
+                    for file in os.listdir(subdir_path):
+                        file_path = os.path.join(subdir_path, file)
+                        assert os.path.isfile(file_path)
+                        with open(file_path) as f:
+                            file_contents = f.read()
+                        if device == GPU_TYPE:
+                            file_contents = file_contents.replace(
+                                "tmp1 = 2.0", "tmp1 = 8.0"
+                            )
+                        else:
+                            assert device == "cpu"
+                            file_contents = file_contents.replace(
+                                "auto tmp1 = static_cast<float>(2.0);",
+                                "auto tmp1 = static_cast<float>(8.0);",
+                            )
+                        with open(file_path, "w") as f:
+                            f.write(file_contents)
+
+                loaded = torch._inductor.CompiledArtifact.load(
+                    path=temp_dir, format="unpacked"
+                )
+                compiled_out = loaded(*args)
+                self.assertEqual(4 * eager_out[0], compiled_out[0])
+
         self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
 
     @unittest.skipIf(IS_FBCODE, "torch import error")
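For orientation, a minimal sketch (not part of the diff) of the round trip this test exercises, assuming you already have an FX graph `gm` and example inputs `args` (the test obtains them via its own `capture` helper; the save directory here is illustrative):

    import torch

    compiled = torch._inductor.standalone_compile(gm, args)
    compiled.save(path="/tmp/unpacked_artifacts", format="unpacked")  # editable on-disk layout
    # ... optionally hand-edit the generated output-code files under that directory ...
    loaded = torch._inductor.CompiledArtifact.load(
        path="/tmp/unpacked_artifacts", format="unpacked"
    )
    out = loaded(*args)  # runs the (possibly modified) kernels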

torch/_inductor/output_code.py

Lines changed: 10 additions & 8 deletions
@@ -654,14 +654,9 @@ def prepare_for_serialization(self) -> None:
         self.current_callable = None
         self.recursively_apply_fns = None
 
-    def after_deserialization(self, constants: CompiledFxGraphConstants) -> str:
-        from torch._dynamo.utils import counters, dynamo_timed
-        from torch._inductor.codecache import (
-            cpp_prefix_path,
-            get_path,
-            PyCodeCache,
-            write_atomic,
-        )
+    def write_to_disk(self) -> str:
+        from torch._dynamo.utils import counters
+        from torch._inductor.codecache import cpp_prefix_path, get_path, write_atomic
 
         # See _save_graph(); we don't store the callable in the cache entry so
         # recreate it here from the PyCodeCache disk cache.
@@ -682,6 +677,13 @@ def after_deserialization(self, constants: CompiledFxGraphConstants) -> str:
         self.source_code = code
 
         write_atomic(artifact_path, code, make_dirs=True)
+        return artifact_path
+
+    def after_deserialization(self, constants: CompiledFxGraphConstants) -> str:
+        from torch._dynamo.utils import dynamo_timed
+        from torch._inductor.codecache import PyCodeCache
+
+        artifact_path = self.write_to_disk()
 
         try:
             with dynamo_timed(
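The refactor splits `after_deserialization` so that writing the generated source to disk no longer requires a constants table or rebuilding the callable. A rough sketch of the new call pattern, assuming `entry_path` points at a pickled `CompiledFxGraph` cache entry (the name is illustrative; this mirrors how standalone_compile.py uses it below):

    import pickle

    with open(entry_path, "rb") as f:
        graph = pickle.load(f)              # a CompiledFxGraph
    artifact_path = graph.write_to_disk()   # materializes source_code, returns its path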

torch/_inductor/runtime/cache_dir_utils.py

Lines changed: 4 additions & 0 deletions
@@ -37,11 +37,15 @@ def triton_cache_dir(device: int) -> str:
 
 @contextmanager
 def temporary_cache_dir(directory: str) -> Generator[None, None, None]:
+    from torch._inductor.utils import clear_inductor_caches
+
     original = os.environ.get("TORCHINDUCTOR_CACHE_DIR")
     os.environ["TORCHINDUCTOR_CACHE_DIR"] = directory
     try:
+        clear_inductor_caches()
         yield
     finally:
+        clear_inductor_caches()
         if original is None:
             del os.environ["TORCHINDUCTOR_CACHE_DIR"]
         else:
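A hedged usage sketch of `temporary_cache_dir` after this change: in-memory Inductor caches are now cleared on entry and on exit, so entries keyed against the previous cache directory cannot leak across the boundary (the path below is illustrative):

    from torch._inductor.runtime.cache_dir_utils import temporary_cache_dir

    with temporary_cache_dir("/tmp/scratch_cache"):
        ...  # compilations in this scope read and write /tmp/scratch_cache
    # TORCHINDUCTOR_CACHE_DIR is restored here, with in-memory caches cleared again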

torch/_inductor/standalone_compile.py

Lines changed: 18 additions & 2 deletions
@@ -3,6 +3,7 @@
 import copy
 import logging
 import os
+import pickle
 import shutil
 from contextlib import AbstractContextManager, nullcontext
 from typing import Any, Callable, Literal, Optional, TYPE_CHECKING
@@ -69,7 +70,7 @@ def save(
                 "CompiledArtifact.save failed to save since there's no artifact to save"
             )
         artifact_bytes, cache_info = self._artifacts
-        assert len(cache_info.aot_autograd_artifacts) == 1
+        assert len(cache_info.aot_autograd_artifacts) == 1, cache_info
         key = cache_info.aot_autograd_artifacts[0]
 
         if format == "binary":
@@ -92,9 +93,24 @@ def save(
             assert os.path.isdir(path)
             shutil.rmtree(path, ignore_errors=True)
 
+            from .codecache import FxGraphCache
+
             with temporary_cache_dir(path):
                 # This function unpacks the cache artifacts to disk
-                torch.compiler.load_cache_artifacts(artifact_bytes)
+                loaded_cache_info = torch.compiler.load_cache_artifacts(
+                    artifact_bytes
+                )
+                assert loaded_cache_info is not None
+                # Now write all the output_code artifacts to disk so that
+                # they can be inspected and modified
+                for key in loaded_cache_info.inductor_artifacts:
+                    subdir = FxGraphCache._get_tmp_dir_for_key(key)
+                    assert os.path.exists(subdir)
+                    for path in sorted(os.listdir(subdir)):
+                        with open(os.path.join(subdir, path), "rb") as f:
+                            graph = pickle.load(f)
+                        output_file = graph.write_to_disk()
+                        log.info("Output code written to: %s", output_file)
 
     @staticmethod
     def load(

0 commit comments