[BE] Import CUDATemplateCaller at module level in select_algorithm.py instead of lazily inside each function that uses it

henrylhtsang · henrylhtsang · commit e593dcdbed19 · 2025-05-16T15:02:08.000-07:00
Differential Revision: [D74911280](https://our.internmc.facebook.com/intern/diff/D74911280/) [ghstack-poisoned]
diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py
@@ -29,6 +29,7 @@
 from torch._dynamo.device_interface import get_interface_for_device
 from torch._dynamo.testing import rand_strided
 from torch._dynamo.utils import counters, dynamo_timed, identity, preserve_rng_state
+from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
 from torch._inductor.utils import clear_on_fresh_inductor_cache
 from torch.utils._filelock import FileLock
 from torch.utils._ordered_set import OrderedSet
@@ -1824,8 +1825,6 @@ def __call__(
         precompilation_timeout_seconds: int = 60 * 60,
         return_multi_template=False,
     ):
-        from .codegen.cuda.cuda_kernel import CUDATemplateCaller
-
         # Templates selected with input_gen_fns require specific input data to avoid IMA
         # Passing custom input gen fns to benchmark_fusion NYI, so skip deferred template selection
         # TODO(jgong5): support multi-template on CPU
@@ -2131,10 +2130,6 @@ def wait_on_futures():
                 timeout=precompilation_timeout_seconds,
             ):
                 if e := future.exception():
-                    from torch._inductor.codegen.cuda.cuda_kernel import (
-                        CUDATemplateCaller,
-                    )
-
                     if isinstance(e, CUDACompileError) and isinstance(
                         futures[future], CUDATemplateCaller
                     ):
@@ -2253,8 +2248,6 @@ def benchmark_choices(
             try:
                 timing = cls.benchmark_choice(choice, autotune_args)
             except CUDACompileError as e:
-                from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
-
                 if not isinstance(choice, CUDATemplateCaller):
                     log.error(
                         "CUDA compilation error during autotuning: \n%s. \nIgnoring this choice.",
@@ -2265,8 +2258,6 @@ def benchmark_choices(
                 log.warning("Not yet implemented: %s", e)
                 timing = float("inf")
             except RuntimeError as e:
-                from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
-
                 if not isinstance(choice, CUDATemplateCaller):
                     log.error(
                         "CUDA runtime error during autotuning: \n%s. \nIgnoring this choice.",