Keep raw cubin file around in case it gets deleted underneath us (#153064) · pytorch/pytorch@4976b1a · GitHub

Commit 4976b1a

jamesjwu authored and pytorchmergebot committed
Keep raw cubin file around in case it gets deleted underneath us (#153064)
This diff hardens StaticCudaLauncher in the event that a cubin file gets deleted out from under us. We store the raw cubin on the static cuda launcher and reload it as needed. On cold start, this can happen if the cubin file is created by triton and gets deleted before we can load the kernel in the parent process.

We don't want to store the entire cubin both in file format and in memory for caching purposes, so we delete it before caching the data. In the unlikely event that we can't find or load the necessary file on warm start, we skip the stored triton launcher and fall back to regular triton.

This comes at a cost to worker memory, but it's not more memory than regular triton workers already take, so it should be okay.

Tests:
- Make test_static_cuda_launcher always delete the cubin path and reload it

Fixes #153030

Pull Request resolved: #153064
Approved by: https://github.com/oulgen, https://github.com/jansel
1 parent 13bdfe6 commit 4976b1a
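For readers skimming the diff, the core pattern is small: keep the raw cubin bytes alongside the file path, and rewrite the file from those bytes if the path has gone missing before the kernel is loaded, then drop both once the kernel is loaded. The sketch below is a minimal, standalone illustration of that idea, not the PyTorch implementation; the RawBackedKernel class and its fields are hypothetical stand-ins for StaticallyLaunchedCudaKernel.

import os
import tempfile
from typing import Optional


class RawBackedKernel:
    """Minimal sketch: hold both a cubin file path and the raw bytes it came from."""

    def __init__(self, cubin_path: str, cubin_raw: bytes) -> None:
        self.cubin_path: Optional[str] = cubin_path
        self.cubin_raw: Optional[bytes] = cubin_raw

    def reload_cubin_from_raw(self, filepath: str) -> str:
        """If the on-disk cubin vanished, recreate it from the raw bytes."""
        if self.cubin_path is None:
            assert self.cubin_raw is not None, "raw cubin was already dropped"
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, "wb") as f:
                f.write(self.cubin_raw)
            self.cubin_path = filepath
        return self.cubin_path

    def load_kernel(self) -> bytes:
        """Stand-in for the real loader: read the cubin, then drop both copies."""
        assert self.cubin_path is not None
        with open(self.cubin_path, "rb") as f:
            data = f.read()
        # Once loaded, neither the path nor the raw bytes are needed anymore.
        self.cubin_path = None
        self.cubin_raw = None
        return data


if __name__ == "__main__":
    tmpdir = tempfile.mkdtemp()
    path = os.path.join(tmpdir, "kernel.cubin")
    raw = b"\x7fELF-fake-cubin"
    with open(path, "wb") as f:
        f.write(raw)
    kernel = RawBackedKernel(path, raw)

    os.remove(path)           # the file gets deleted underneath us
    kernel.cubin_path = None  # and we notice the path is no longer valid
    kernel.reload_cubin_from_raw(path)
    print(len(kernel.load_kernel()), "bytes reloaded")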

File tree

4 files changed: +51 −3 lines changed

test/inductor/test_static_cuda_launcher.py

Lines changed: 5 additions & 0 deletions

@@ -54,6 +54,11 @@ def _make_launcher(
         cubin_file = self.write_cubin_to_tmp(compiled_kernel)
         compiled_kernel._cubin_path = cubin_file
         result = StaticallyLaunchedCudaKernel(compiled_kernel)
+        # Test reload cubin from raw here
+        old_cubin_path = result.cubin_path
+        assert old_cubin_path is not None
+        result.cubin_path = None
+        result.reload_cubin_from_raw(old_cubin_path)
         device_interface = get_interface_for_device("cuda")
         result.load_kernel(device_interface.current_device())
         return result

torch/_inductor/runtime/static_cuda_launcher.py

Lines changed: 16 additions & 0 deletions

@@ -1,4 +1,5 @@
 import functools
+import os
 from typing import Any, Optional
 from typing_extensions import Unpack
 
@@ -34,6 +35,7 @@ class StaticallyLaunchedCudaKernel:
 
     def __init__(self, kernel: CompiledKernel) -> None:
         self.name = kernel.src.fn.__name__
+        self.cubin_raw = kernel.asm.get("cubin", None)
         self.cubin_path = kernel._cubin_path
 
         # Used by torch.compile to filter constants in older triton versions
@@ -87,6 +89,19 @@ def __init__(self, kernel: CompiledKernel) -> None:
                "Static cuda launcher only supports num_ctas == 1"
            )
 
+    def reload_cubin_from_raw(self, filepath: str) -> str:
+        """
+        If the cubin file triton generated gets deleted under us, we can
+        reload it from the raw cubin file.
+        """
+        if self.cubin_path is None:
+            assert self.cubin_raw is not None
+            os.makedirs(os.path.dirname(filepath), exist_ok=True)
+            with open(filepath, "wb") as f:
+                f.write(self.cubin_raw)
+            self.cubin_path = filepath
+        return self.cubin_path
+
     def load_kernel(self, device: int) -> None:
         from torch._C import _StaticCudaLauncher
 
@@ -100,6 +115,7 @@ def load_kernel(self, device: int) -> None:
        )
        # Don't need the cubin path anymore now that we've loaded
        self.cubin_path = None
+       self.cubin_raw = None
 
    @staticmethod
    @functools.lru_cache

torch/_inductor/runtime/triton_heuristics.py

Lines changed: 17 additions & 3 deletions

@@ -506,6 +506,16 @@ def prepare_for_pickle(self) -> tuple[Any, Any, Any, Any, Any]:
         self.launchers = []
         return old_values
 
+    def prepare_for_caching(self) -> None:
+        """
+        Statically launched CUDA kernels have a raw cubin on them
+        that we don't need to store in the cache (since TritonBundler handles the collection for us)
+        """
+        for result in self.compile_results:
+            if isinstance(result, StaticTritonCompileResult):
+                # Don't save this in the inductor cache, as it is very large
+                result.kernel.cubin_raw = None
+
     def __getstate__(self) -> dict[str, Any]:
         assert not self.launchers, (
             "pickle should not be called with after make_launchers()"
@@ -1268,9 +1278,13 @@ def reload_cubin_path(self):
             f"{self.kernel.name}.cubin",
         )
         if not os.path.exists(cubin_location):
-            raise RuntimeError(
-                "Cubin file saved by TritonBundler not found at %s", cubin_location
-            )
+            if self.kernel.cubin_raw is not None:
+                # We saved the raw cubin, so write it to the appropriate location
+                self.kernel.reload_cubin_from_raw(cubin_location)
+            else:
+                raise RuntimeError(
+                    "Cubin file saved by TritonBundler not found at %s", cubin_location
+                )
         self.kernel.cubin_path = cubin_location
 
     def make_launcher(self) -> LauncherType:
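The prepare_for_caching hook above exists for the size trade-off the commit message describes: the bundler already persists the cubin file, so the raw bytes don't need to ride along in the pickled cache entry. Below is a hedged, self-contained sketch of that trade-off; KernelLike and its made-up /tmp path are hypothetical, and the real method lives on CachingAutotuner and walks compile_results.

import copy
import pickle


class KernelLike:
    """Hypothetical stand-in for a kernel holder carrying raw cubin bytes."""

    def __init__(self, raw: bytes) -> None:
        self.cubin_raw = raw
        self.cubin_path = "/tmp/kernel.cubin"  # made-up path, for illustration only


def prepare_for_caching(kernel: KernelLike) -> None:
    # The bundler stores the cubin file separately, so drop the in-memory copy
    # before the object is pickled into the cache.
    kernel.cubin_raw = None


if __name__ == "__main__":
    k = KernelLike(b"\x00" * 1_000_000)  # pretend 1 MB cubin
    cached = copy.deepcopy(k)            # mirrors the deepcopy before caching
    prepare_for_caching(cached)
    print(len(pickle.dumps(k)), "bytes with raw cubin")
    print(len(pickle.dumps(cached)), "bytes after prepare_for_caching")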

torch/_inductor/triton_bundler.py

Lines changed: 13 additions & 0 deletions

@@ -173,7 +173,9 @@ def put_static_autotuner(cls, key: str, kernel: "CachingAutotuner") -> None: #
         # for FXGraphCache
         old_values = kernel.prepare_for_pickle()
         new_kernel = copy.deepcopy(kernel)
+        new_kernel.prepare_for_caching()
         new_kernel._reload_kernel = None
+
         entries.append(
             StaticallyLaunchedAutotuner(
                 key,
@@ -223,6 +225,17 @@ def load_autotuners(
         kernel_names = []
         with dynamo_timed("TritonBundler.load_cached_static_autotuners"):
             for result in static_autotuners:
+                try:
+                    # Make sure the cubin path exists and is valid
+                    for compile_result in result.kernel.compile_results:
+                        compile_result.reload_cubin_path()
+                except RuntimeError as e:
+                    log.warning(
+                        "Failed to reload cubin file for statically launchable autotuner %s: %s",
+                        result.kernel_name,
+                        e,
+                    )
+                    continue
                 # We make a future instead of returning the kernel here so that
                 # kernels that are not statically launchable (i.e. cache miss)
                 # can launch a worker without waiting on the blocking step of
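The try/except added above encodes the warm-start fallback from the commit message: if a cached kernel's cubin can't be recovered, the entry is skipped with a warning rather than failing compilation, and regular triton is used instead. A hedged, runnable sketch of that policy follows; FakeEntry and load_cached_kernels are hypothetical stand-ins for the real loop in TritonBundler.load_autotuners.

import logging

logging.basicConfig(level=logging.WARNING)
log = logging.getLogger("bundle-load-sketch")


class FakeEntry:
    """Hypothetical stand-in for a cached, statically launchable autotuner."""

    def __init__(self, name: str, has_cubin: bool) -> None:
        self.name = name
        self.has_cubin = has_cubin

    def reload_cubin_path(self) -> None:
        # The real code checks the TritonBundler's on-disk cubin and then the
        # raw bytes; here we just simulate success or failure.
        if not self.has_cubin:
            raise RuntimeError(f"cubin for {self.name} not found")


def load_cached_kernels(entries):
    """Skip (with a warning) any entry whose cubin cannot be recovered."""
    loaded = []
    for entry in entries:
        try:
            entry.reload_cubin_path()
        except RuntimeError as exc:
            log.warning("Failed to reload cubin for %s: %s", entry.name, exc)
            continue  # this kernel falls back to regular triton
        loaded.append(entry)
    return loaded


if __name__ == "__main__":
    entries = [FakeEntry("triton_poi_fused_0", True), FakeEntry("triton_red_fused_1", False)]
    print([e.name for e in load_cached_kernels(entries)])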
