
Commit 76f182f

henrylhtsang authored and pytorchmergebot committed
[cutlass backend] Reduce log level for cutlass compilation error (#153397)
Differential Revision: [D74596410](https://our.internmc.facebook.com/intern/diff/D74596410/)

This change should only affect the cutlass backend. CUDA compilation errors are expected with that backend, and they are already handled and cached well, so the logging level for them is reduced.

Pull Request resolved: #153397

Approved by: https://github.com/ColinPeppler, https://github.com/Skylion007
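As a minimal sketch (not the PyTorch source) of the logging policy this commit applies: failures that are expected and already handled downstream are logged at DEBUG with the traceback preserved, while anything unexpected still surfaces at ERROR. `ExpectedCompileError` and `try_compile` below are hypothetical stand-ins.

```python
import logging

log = logging.getLogger(__name__)


class ExpectedCompileError(Exception):
    """Hypothetical stand-in for an anticipated failure like CUDACompileError."""


def try_compile(candidate, compile_fn):
    # Sketch of the policy: expected failures stay quiet, unexpected ones stay loud.
    try:
        return compile_fn(candidate)
    except ExpectedCompileError:
        # Expected and handled/cached elsewhere: keep the traceback, but at debug level.
        log.debug("Compilation failed for %s", candidate, exc_info=True)
        return None
    except Exception as e:
        # Anything else is genuinely unexpected and is logged as an error.
        log.error("Unexpected error for %s: %s", candidate, e)
        raise
```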
1 parent 3bde364 commit 76f182f

File tree

1 file changed: +32 −14 lines changed

torch/_inductor/select_algorithm.py

```diff
@@ -2076,14 +2076,15 @@ def precompile_with_captured_stdout(choice) -> tuple[None, int]:
                 return None, elapsed_ns // 1000
 
             def on_complete(future):
-                _, precompile_elapsed_us = future.result()
-                elapsed_seconds = precompile_elapsed_us / 1e6
-                elapsed_times[future] = elapsed_seconds
-                log.debug(
-                    "Precompilation complete for future: %s, elapsed time: %.02fs",
-                    future,
-                    elapsed_seconds,
-                )
+                if not future.exception():
+                    _, precompile_elapsed_us = future.result()
+                    elapsed_seconds = precompile_elapsed_us / 1e6
+                    elapsed_times[future] = elapsed_seconds
+                    log.debug(
+                        "Precompilation complete for future: %s, elapsed time: %.02fs",
+                        future,
+                        elapsed_seconds,
+                    )
 
             executor = ThreadPoolExecutor(max_workers=num_workers)
             async_compile = torch._inductor.async_compile.AsyncCompile()
```
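For context on the guard added above: `Future.result()` re-raises the task's exception, even inside an `add_done_callback` hook, so checking `future.exception()` first lets a failed precompile skip the timing bookkeeping without raising in the callback. A self-contained sketch of that behavior:

```python
from concurrent.futures import ThreadPoolExecutor


def task(ok: bool) -> int:
    if not ok:
        raise RuntimeError("compile failed")
    return 42


def on_complete(future):
    # future.exception() returns None when the task succeeded.
    if not future.exception():
        print("result:", future.result())
    # Failed futures fall through; their exceptions are reported elsewhere.


with ThreadPoolExecutor(max_workers=2) as pool:
    for ok in (True, False):
        pool.submit(task, ok).add_done_callback(on_complete)
```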
```diff
@@ -2130,9 +2131,23 @@ def wait_on_futures():
                     timeout=precompilation_timeout_seconds,
                 ):
                     if e := future.exception():
-                        log.error(
-                            "Exception %s for benchmark choice %s", e, futures[future]
+                        from torch._inductor.codegen.cuda.cuda_kernel import (
+                            CUDATemplateCaller,
                         )
+
+                        if isinstance(e, CUDACompileError) and isinstance(
+                            futures[future], CUDATemplateCaller
+                        ):
+                            log.debug(
+                                "Exception %s for benchmark choice %s",
+                                e,
+                                futures[future],
+                                exc_info=True,
+                            )
+                        else:
+                            log.error(
+                                "Exception %s for benchmark choice %s", e, futures[future]
+                            )
                     else:
                         counters["inductor"]["select_algorithm_num_precompiles"] += 1
                         log.info(
```
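Note that the demoted branch passes `exc_info=True`, so the traceback is not lost: it is attached to the log record and emitted whenever debug logging is enabled. A small illustration (the choice name is made up):

```python
import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

try:
    raise ValueError("nvcc error: ...")  # stand-in for a CUDACompileError
except ValueError as e:
    # The full traceback rides along with the record, but only at DEBUG level.
    log.debug("Exception %s for benchmark choice %s", e, "choice_0", exc_info=True)
```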
```diff
@@ -2238,10 +2253,13 @@ def benchmark_choices(
             try:
                 timing = cls.benchmark_choice(choice, autotune_args)
             except CUDACompileError as e:
-                log.error(
-                    "CUDA compilation error during autotuning: \n%s. \nIgnoring this choice.",
-                    str(e),
-                )
+                from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
+
+                if not isinstance(choice, CUDATemplateCaller):
+                    log.error(
+                        "CUDA compilation error during autotuning: \n%s. \nIgnoring this choice.",
+                        e,
+                    )
                 timing = float("inf")
             except NotImplementedError as e:
                 log.warning("Not yet implemented: %s", e)
```
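Even when the error log is skipped for a `CUDATemplateCaller`, the handler still assigns `timing = float("inf")`, which is enough to drop the choice from autotuning: an infinite timing can never win the final minimum. A sketch of that convention:

```python
# Failed candidates get an infinite timing so they can never be selected.
timings = {"choice_a": 0.12, "choice_b": float("inf"), "choice_c": 0.09}
best = min(timings, key=timings.__getitem__)
assert best == "choice_c"
```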

0 commit comments