[cutlass backend] Reduce log level for cutlass runtime error · pytorch/pytorch@bf953f6 · GitHub
[go: up one dir, main page]

Skip to content

Commit bf953f6

Browse files
committed
[cutlass backend] Reduce log level for cutlass runtime error
Pull Request resolved: #153457 ghstack-source-id: 284547176 @exported-using-ghexport Differential Revision: [D74629230](https://our.internmc.facebook.com/intern/diff/D74629230/)
1 parent bd14409 commit bf953f6

File tree

2 files changed

+34
-5
lines changed

2 files changed

+34
-5
lines changed

torch/_inductor/autotune_process.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,7 @@ def make_run_fn(
718718
workspace_ptr = c_void_p(self.workspace.data_ptr())
719719

720720
# Generate partial function.
721-
return functools.partial(
721+
ret = functools.partial(
722722
run_method,
723723
*args,
724724
*self.extra_args,
@@ -727,6 +727,20 @@ def make_run_fn(
727727
stream_ptr,
728728
)
729729

730+
# sanity check to make sure we cleanup run fn properly
731+
try:
732+
ret()
733+
except RuntimeError as e:
734+
err_msg = str(e)
735+
736+
def raise_runtime_error():
737+
raise RuntimeError(err_msg)
738+
739+
self.cleanup_run_fn()
740+
return raise_runtime_error
741+
742+
return ret
743+
730744
def update_workspace_size(self) -> None:
731745
if self._workspace_size_updated:
732746
return

torch/_inductor/select_algorithm.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2265,16 +2265,31 @@ def benchmark_choices(
22652265
log.warning("Not yet implemented: %s", e)
22662266
timing = float("inf")
22672267
except RuntimeError as e:
2268+
from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
2269+
2270+
if not isinstance(choice, CUDATemplateCaller):
2271+
log.error(
2272+
"CUDA runtime error during autotuning: \n%s. \nIgnoring this choice.",
2273+
e,
2274+
)
22682275
msg = str(e)
22692276
if "invalid argument" in msg:
22702277
msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n"
22712278
else:
22722279
if "illegal memory access" in msg:
22732280
msg += "\n\nEither error in template or triton bug.\n"
2274-
log.error(
2275-
"Runtime error during autotuning: \n%s. \nIgnoring this choice.",
2276-
msg,
2277-
)
2281+
2282+
if isinstance(choice, CUDATemplateCaller):
2283+
log.debug(
2284+
"Runtime error during autotuning: \n%s. \nIgnoring this choice.",
2285+
msg,
2286+
exc_info=True,
2287+
)
2288+
else:
2289+
log.error(
2290+
"Runtime error during autotuning: \n%s. \nIgnoring this choice.",
2291+
msg,
2292+
)
22782293
timing = float("inf")
22792294
except AssertionError as e:
22802295
raise AssertionError( # noqa: B904

0 commit comments

Comments (0)