[cutlass backend] Reduce log level for cutlass runtime error by henrylhtsang · Pull Request #153457 · pytorch/pytorch
Closed · wants to merge 6 commits
16 changes: 15 additions & 1 deletion torch/_inductor/autotune_process.py
@@ -720,7 +720,7 @@ def make_run_fn(
         workspace_ptr = c_void_p(self.workspace.data_ptr())
 
         # Generate partial function.
-        return functools.partial(
+        ret = functools.partial(
             run_method,
             *args,
             *self.extra_args,
@@ -729,6 +729,20 @@
             stream_ptr,
         )
 
+        # sanity check to make sure we cleanup run fn properly
+        try:
+            ret()
+        except RuntimeError as e:
+            err_msg = str(e)
+
+            def raise_runtime_error():
+                raise RuntimeError(err_msg)
+
+            self.cleanup_run_fn()
+            return raise_runtime_error
+
+        return ret
+
     def update_workspace_size(self) -> None:
         if self._workspace_size_updated:
             return
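
The new block above probes the freshly built run function once: if it raises a RuntimeError (for example a CUTLASS kernel that cannot execute on the current GPU), the workspace is released immediately and the caller instead receives a small closure that re-raises the captured error later as an ordinary RuntimeError, rather than a function holding dangling resources. Below is a minimal standalone sketch of this "probe once, defer the error" pattern; build_kernel_run_fn and cleanup are placeholder callables for illustration, not PyTorch APIs.

# A minimal sketch of the "probe once, defer the error" pattern above.
# build_kernel_run_fn and cleanup are placeholder callables, not PyTorch APIs.
import functools
from typing import Callable


def make_run_fn(
    build_kernel_run_fn: Callable[..., None],
    cleanup: Callable[[], None],
    *args,
) -> Callable[[], None]:
    ret = functools.partial(build_kernel_run_fn, *args)

    # Probe the kernel once; if it cannot run (e.g. the GPU is unsupported),
    # release resources now and hand back a callable that re-raises the error.
    try:
        ret()
    except RuntimeError as e:
        err_msg = str(e)

        def raise_runtime_error() -> None:
            raise RuntimeError(err_msg)

        cleanup()
        return raise_runtime_error

    return ret

The benchmarking loop can then call whatever make_run_fn returned without special-casing broken kernels; a failing candidate simply surfaces as a RuntimeError at call time, where the autotuner's exception handling decides how loudly to report it.
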
28 changes: 24 additions & 4 deletions torch/_inductor/select_algorithm.py
@@ -1789,6 +1789,16 @@ def create_precompile_key(
 
 
 class AlgorithmSelectorCache(PersistentCache):
+    """
+    A persistent cache for algorithm selection results used in autotuning of GEMMs
+    and convolutions.
+
+    This class includes precompilation and benchmarking of the kernels.
+
+    The cache is keyed by input characteristics (sizes, strides, dtypes, etc.) but
+    doesn't depend on the output layout.
+    """
+
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
@@ -2265,16 +2275,26 @@ def benchmark_choices(
                 log.warning("Not yet implemented: %s", e)
                 timing = float("inf")
             except RuntimeError as e:
+                from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
+
                 msg = str(e)
                 if "invalid argument" in msg:
                     msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n"
                 else:
                     if "illegal memory access" in msg:
                         msg += "\n\nEither error in template or triton bug.\n"
-                log.error(
-                    "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
-                    msg,
-                )
+
+                if isinstance(choice, CUDATemplateCaller):
+                    log.debug(
+                        "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
+                        msg,
+                        exc_info=True,
+                    )
+                else:
+                    log.error(
+                        "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
+                        msg,
+                    )
                 timing = float("inf")
             except AssertionError as e:
                 raise AssertionError(  # noqa: B904
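
Combined with the change in autotune_process.py, a CUTLASS candidate that fails at runtime is now skipped with timing = float("inf") and logged only at DEBUG (with the traceback kept via exc_info=True), while runtime errors from other backends keep their ERROR level. The following is a rough sketch of that routing, using a stand-in CUDATemplateCaller class and a simplified benchmark_choice that are illustrative only, not the actual torch._inductor code.

# A rough sketch (not the torch._inductor implementation) of routing expected
# backend failures to DEBUG while keeping unexpected ones at ERROR.
import logging

log = logging.getLogger(__name__)


class CUDATemplateCaller:
    """Stand-in for torch._inductor's CUDATemplateCaller choice type."""

    def benchmark(self) -> float:
        raise RuntimeError("CUTLASS kernel unsupported on this GPU")


def benchmark_choice(choice) -> float:
    try:
        return choice.benchmark()
    except RuntimeError as e:
        msg = str(e)
        if isinstance(choice, CUDATemplateCaller):
            # CUTLASS candidates are expected to fail on some configurations;
            # keep the traceback, but only under debug logging.
            log.debug(
                "Runtime error during autotuning:\n%s.\nIgnoring this choice.",
                msg,
                exc_info=True,
            )
        else:
            log.error(
                "Runtime error during autotuning:\n%s.\nIgnoring this choice.",
                msg,
            )
        return float("inf")  # an infinite timing makes this choice lose autotuning

Keeping the ERROR path for non-CUTLASS callers preserves visibility into genuinely unexpected failures, which is why only the CUDATemplateCaller branch is demoted.
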