Commit c94a2f6

[cutlass backend] Reduce log level for cutlass runtime error
Differential Revision: [D74629230](https://our.internmc.facebook.com/intern/diff/D74629230/)
ghstack-source-id: 283608237
Pull Request resolved: #153457

1 parent c2e601c commit c94a2f6

File tree

2 files changed: +33 −7


torch/_inductor/autotune_process.py (+13 −1)

```diff
@@ -721,7 +721,7 @@ def make_run_fn(
         workspace_ptr = c_void_p(self.workspace.data_ptr())
 
         # Generate partial function.
-        return functools.partial(
+        ret = functools.partial(
             run_method,
             *args,
             *self.extra_args,
@@ -730,6 +730,18 @@ def make_run_fn(
             stream_ptr,
         )
 
+        # sanity check to make sure we cleanup run fn properly
+        try:
+            ret()
+        except RuntimeError as e:
+            err_msg = str(e)
+            def dummy_function():
+                raise RuntimeError(err_msg)
+            self.cleanup_run_fn()
+            return dummy_function
+
+        return ret
+
     def update_workspace_size(self) -> None:
         if self._workspace_size_updated:
             return
```
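For context on this hunk: the run fn built by `make_run_fn` is now invoked once as a sanity check, and if that first call raises a `RuntimeError` the workspace is cleaned up immediately and a stub that re-raises the captured message is returned instead. Below is a minimal standalone sketch of that pattern; the class name and the `run_method` argument are illustrative, not the actual benchmark-request API.

```python
import functools
from typing import Any, Callable


class BenchmarkRequestSketch:
    """Illustrative stand-in for the benchmark-request object in the diff above."""

    def cleanup_run_fn(self) -> None:
        # Placeholder for releasing workspace memory / kernel handles.
        pass

    def make_run_fn(self, run_method: Callable[..., Any], *args: Any) -> Callable[[], Any]:
        ret = functools.partial(run_method, *args)

        # Sanity check: call the candidate kernel once so a broken choice fails
        # here, where cleanup_run_fn() can still run, rather than inside the
        # timing loop with the workspace left allocated.
        try:
            ret()
        except RuntimeError as e:
            err_msg = str(e)

            def dummy_function() -> None:
                # Re-raise the captured error when the autotuner later calls the fn.
                raise RuntimeError(err_msg)

            self.cleanup_run_fn()
            return dummy_function

        return ret
```

A caller would benchmark the returned callable as usual; a failing candidate then surfaces as a `RuntimeError` raised by the stub rather than as a crash against a half-torn-down run fn.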

torch/_inductor/select_algorithm.py (+20 −6)

```diff
@@ -2265,16 +2265,30 @@ def benchmark_choices(
                 log.warning("Not yet implemented: %s", e)
                 timing = float("inf")
             except RuntimeError as e:
+                from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
+
+                if not isinstance(choice, CUDATemplateCaller):
+                    log.error(
+                        "CUDA compilation error during autotuning: \n%s. \nIgnoring this choice.",
+                        e,
+                    )
                 msg = str(e)
                 if "invalid argument" in msg:
                     msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n"
+                elif "illegal memory access" in msg:
+                    msg += "\n\nEither error in template or triton bug.\n"
+
+                if isinstance(choice, CUDATemplateCaller):
+                    log.debug(
+                        "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
+                        msg,
+                        exc_info=True,
+                    )
                 else:
-                    if "illegal memory access" in msg:
-                        msg += "\n\nEither error in template or triton bug.\n"
-                    log.error(
-                        "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
-                        msg,
-                    )
+                    log.error(
+                        "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
+                        msg,
+                    )
                 timing = float("inf")
             except AssertionError as e:
                 raise AssertionError(  # noqa: B904
```
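This hunk is where the "reduce log level" of the commit title happens: runtime errors from CUTLASS choices (`CUDATemplateCaller`) are demoted from `log.error` to `log.debug` with `exc_info=True`, while all other choices keep the error-level message. Below is a rough standalone sketch of that routing; the helper name and the `is_cutlass_choice` flag are illustrative, whereas the real code checks `isinstance(choice, CUDATemplateCaller)` inside `benchmark_choices`.

```python
import logging

log = logging.getLogger(__name__)


def log_autotune_runtime_error(exc: RuntimeError, is_cutlass_choice: bool) -> None:
    """Route an autotuning RuntimeError to DEBUG or ERROR, mirroring the diff above."""
    msg = str(exc)
    if "invalid argument" in msg:
        msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n"
    elif "illegal memory access" in msg:
        msg += "\n\nEither error in template or triton bug.\n"

    if is_cutlass_choice:
        # CUTLASS candidates failing at runtime is expected noise during
        # autotuning, so this commit demotes them to DEBUG; exc_info keeps
        # the traceback available when debug logging is enabled.
        log.debug(
            "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
            msg,
            exc_info=True,
        )
    else:
        log.error(
            "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
            msg,
        )
```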
