[cutlass backend] Reduce log level for cutlass runtime error by henrylhtsang · Pull Request #153457 · pytorch/pytorch
Closed · wants to merge 6 commits
16 changes: 15 additions & 1 deletion torch/_inductor/autotune_process.py
@@ -720,7 +720,7 @@ def make_run_fn(
         workspace_ptr = c_void_p(self.workspace.data_ptr())
 
         # Generate partial function.
-        return functools.partial(
+        ret = functools.partial(
             run_method,
             *args,
             *self.extra_args,
@@ -729,6 +729,20 @@
             stream_ptr,
         )
 
+        # sanity check to make sure we cleanup run fn properly
+        try:
+            ret()
+        except RuntimeError as e:
+            err_msg = str(e)
+
+            def raise_runtime_error():
+                raise RuntimeError(err_msg)
+
+            self.cleanup_run_fn()
+            return raise_runtime_error
+
+        return ret
+
     def update_workspace_size(self) -> None:
         if self._workspace_size_updated:
             return
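
The new block above probes the freshly built run function once: if it raises a RuntimeError (for example a CUTLASS kernel that cannot execute on the current GPU), the workspace is released immediately and the caller instead receives a small closure that re-raises the captured error later as an ordinary RuntimeError, rather than a function holding dangling resources. Below is a minimal standalone sketch of this "probe once, defer the error" pattern; build_kernel_run_fn and cleanup are placeholder callables for illustration, not PyTorch APIs.

# A minimal sketch of the "probe once, defer the error" pattern above.
# build_kernel_run_fn and cleanup are placeholder callables, not PyTorch APIs.
import functools
from typing import Callable


def make_run_fn(
    build_kernel_run_fn: Callable[..., None],
    cleanup: Callable[[], None],
    *args,
) -> Callable[[], None]:
    ret = functools.partial(build_kernel_run_fn, *args)

    # Probe the kernel once; if it cannot run (e.g. the GPU is unsupported),
    # release resources now and hand back a callable that re-raises the error.
    try:
        ret()
    except RuntimeError as e:
        err_msg = str(e)

        def raise_runtime_error() -> None:
            raise RuntimeError(err_msg)

        cleanup()
        return raise_runtime_error

    return ret

The benchmarking loop can then call whatever make_run_fn returned without special-casing broken kernels; a failing candidate simply surfaces as a RuntimeError at call time, where the autotuner's exception handling decides how loudly to report it.
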
28 changes: 24 additions & 4 deletions torch/_inductor/select_algorithm.py
@@ -1789,6 +1789,16 @@ def create_precompile_key(
 
 
 class AlgorithmSelectorCache(PersistentCache):
+    """
+    A persistent cache for algorithm selection results used in autotuning of GEMMs
+    and convolutions.
+
+    This class includes precompilation and benchmarking of the kernels.
+
+    The cache is keyed by input characteristics (sizes, strides, dtypes, etc.) but
+    doesn't depend on the output layout.
+    """
+
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
@@ -2265,16 +2275,26 @@ def benchmark_choices(
                 log.warning("Not yet implemented: %s", e)
                 timing = float("inf")
             except RuntimeError as e:
+                from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
+
                 msg = str(e)
                 if "invalid argument" in msg:
                     msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n"
                 else:
                     if "illegal memory access" in msg:
                         msg += "\n\nEither error in template or triton bug.\n"
-                log.error(
-                    "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
-                    msg,
-                )
+
+                if isinstance(choice, CUDATemplateCaller):
+                    log.debug(
+                        "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
+                        msg,
+                        exc_info=True,
+                    )
+                else:
+                    log.error(
+                        "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
+                        msg,
+                    )
                 timing = float("inf")
             except AssertionError as e:
                 raise AssertionError(  # noqa: B904
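
Combined with the change in autotune_process.py, a CUTLASS candidate that fails at runtime is now skipped with timing = float("inf") and logged only at DEBUG (with the traceback kept via exc_info=True), while runtime errors from other backends keep their ERROR level. The following is a rough sketch of that routing, using a stand-in CUDATemplateCaller class and a simplified benchmark_choice that are illustrative only, not the actual torch._inductor code.

# A rough sketch (not the torch._inductor implementation) of routing expected
# backend failures to DEBUG while keeping unexpected ones at ERROR.
import logging

log = logging.getLogger(__name__)


class CUDATemplateCaller:
    """Stand-in for torch._inductor's CUDATemplateCaller choice type."""

    def benchmark(self) -> float:
        raise RuntimeError("CUTLASS kernel unsupported on this GPU")


def benchmark_choice(choice) -> float:
    try:
        return choice.benchmark()
    except RuntimeError as e:
        msg = str(e)
        if isinstance(choice, CUDATemplateCaller):
            # CUTLASS candidates are expected to fail on some configurations;
            # keep the traceback, but only under debug logging.
            log.debug(
                "Runtime error during autotuning:\n%s.\nIgnoring this choice.",
                msg,
                exc_info=True,
            )
        else:
            log.error(
                "Runtime error during autotuning:\n%s.\nIgnoring this choice.",
                msg,
            )
        return float("inf")  # an infinite timing makes this choice lose autotuning

Keeping the ERROR path for non-CUTLASS callers preserves visibility into genuinely unexpected failures, which is why only the CUDATemplateCaller branch is demoted.
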