Pass inductor config for static cuda launcher to workers (#153382) · pytorch/pytorch@dda2c7c · GitHub
[go: up one dir, main page]

Skip to content

Commit dda2c7c

Browse files
jamesjwu authored and pytorchmergebot committed
Pass inductor config for static cuda launcher to workers (#153382)
Async compile workers generally don't respect inductor configs that get changed in the middle of execution, because the workers warm up early. StaticCudaLauncher is especially susceptible to this because it affects triton compilation without being part of the inductor meta. So we'll pass it in via extra configs on each worker run.

Pull Request resolved: #153382
Approved by: https://github.com/masnesral, https://github.com/jansel
1 parent 6a28cc8 commit dda2c7c

File tree

3 files changed

+38
-10
lines changed

3 files changed

+38
-10
lines changed

test/inductor/test_static_cuda_launcher.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import random
44
import tempfile
5+
from unittest import mock
56

67
import torch
78
from torch._dynamo.device_interface import get_interface_for_device
@@ -496,6 +497,24 @@ def fn(x):
496497
compiled_result = compiled_fn(arg)
497498
self.assertEqual(eager_result, compiled_result)
498499

500+
@skipIfRocm
501+
def test_disable_static_cuda_launcher(self):
502+
@torch.compile
503+
def fn(x, y):
504+
return torch.cat(((x * 4), y + 10))
505+
506+
# Test that static cuda launcher is in fact disabled
507+
with torch._inductor.config.patch("use_static_cuda_launcher", False):
508+
x = torch.rand(20, device="cuda")
509+
y = torch.rand(20, device="cuda")
510+
with mock.patch(
511+
"torch._inductor.runtime.triton_heuristics.StaticTritonCompileResult.make_launcher"
512+
) as mocked:
513+
result = fn(x, y)
514+
mocked.assert_not_called()
515+
516+
self.assertEqual(result, torch.cat(((x * 4), y + 10)))
517+
499518

500519
if __name__ == "__main__":
501520
from torch._inductor.test_case import run_tests

torch/_inductor/async_compile.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,11 +351,15 @@ def reload_kernel_in_parent():
351351
# process pool is running, so pass them to the subprocess to reset.
352352
env_vars = ["TORCHINDUCTOR_CACHE_DIR", "TRITON_CACHE_DIR"]
353353
extra_env = {v: os.environ[v] for v in env_vars if v in os.environ}
354+
extra_config = {
355+
"use_static_cuda_launcher": torch._inductor.config.use_static_cuda_launcher
356+
}
354357

355358
task = self.process_pool().submit(
356359
_worker_compile_triton,
357360
load_kernel,
358361
extra_env,
362+
extra_config,
359363
)
360364

361365
def get_result() -> CachingAutotuner:

torch/_inductor/runtime/compile_tasks.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import warnings
99
from pathlib import Path
1010
from types import ModuleType
11-
from typing import Callable, TYPE_CHECKING
11+
from typing import Any, Callable, TYPE_CHECKING
1212

1313

1414
if TYPE_CHECKING:
@@ -48,15 +48,20 @@ def _set_triton_ptxas_path() -> None:
4848

4949

5050
def _worker_compile_triton(
51-
load_kernel: Callable[[], CachingAutotuner], extra_env: dict[str, str]
51+
load_kernel: Callable[[], CachingAutotuner],
52+
extra_env: dict[str, str],
53+
extra_config: dict[str, Any],
5254
) -> tuple[CachingAutotuner, int]:
5355
_set_triton_ptxas_path()
5456
os.environ.update(extra_env)
55-
start_ns = time.time_ns()
56-
kernel = load_kernel()
57-
kernel.precompile(warm_cache_only=True)
58-
elapsed_ns = time.time_ns() - start_ns
59-
kernel.prepare_for_pickle()
60-
# We can release this memory in the compile subprocesses:
61-
linecache.clearcache()
62-
return kernel, elapsed_ns // 1000
57+
from torch._inductor import config
58+
59+
with config.patch(extra_config):
60+
start_ns = time.time_ns()
61+
kernel = load_kernel()
62+
kernel.precompile(warm_cache_only=True)
63+
elapsed_ns = time.time_ns() - start_ns
64+
kernel.prepare_for_pickle()
65+
# We can release this memory in the compile subprocesses:
66+
linecache.clearcache()
67+
return kernel, elapsed_ns // 1000

0 commit comments

Comments (0)