8000 [ROCm] add flag torch.backends.miopen.immediate by naromero77amd · Pull Request #158951 · pytorch/pytorch · GitHub
[go: up one dir, main page]

Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions aten/src/ATen/Context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,14 @@ void Context::setBenchmarkLimitCuDNN(int b) {
benchmark_limit_cudnn = b;
}

bool Context::immediateMiopen() const {
return immediate_miopen;
}

void Context::setImmediateMiopen(bool b) {
immediate_miopen = b;
}

bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
Expand Down
3 changes: 3 additions & 0 deletions aten/src/ATen/Context.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ class TORCH_API Context {
void setBenchmarkCuDNN(bool);
int benchmarkLimitCuDNN() const;
void setBenchmarkLimitCuDNN(int);
bool immediateMiopen() const;
void setImmediateMiopen(bool);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool);
bool deterministicMkldnn() const;
Expand Down Expand Up @@ -440,6 +442,7 @@ class TORCH_API Context {
bool enabled_overrideable = true;
bool allow_fp16_bf16_reduction_mathSDP = false;
bool benchmark_cudnn = false;
bool immediate_miopen = false;
Float32MatmulPrecision float32_matmul_precision =
c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
? at::Float32MatmulPrecision::HIGH
Expand Down
18 changes: 6 additions & 12 deletions aten/src/ATen/native/miopen/Conv_miopen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -724,8 +724,7 @@ void raw_miopen_convolution_forward_out(
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic);

if (deterministic && !benchmark) {
// immediate mode is triggered for the specific combination of benchmark=off deterministic=on
if (at::globalContext().immediateMiopen()) {
uint64_t solution_id;
Workspace workspace = chooseSolution<miopenConvFwdAlgorithm_t>(args, &solution_id);

Expand Down Expand Up @@ -833,8 +832,7 @@ void raw_miopen_depthwise_convolution_forward_out(
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic);

if (deterministic && !benchmark) {
// immediate mode is triggered for the specific combination of benchmark=off deterministic=on
if (at::globalContext().immediateMiopen()) {
uint64_t solution_id;
Workspace workspace = chooseSolution<miopenConvFwdAlgorithm_t>(args, &solution_id);

Expand Down Expand Up @@ -989,8 +987,7 @@ void raw_miopen_convolution_backward_weight_out(
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic);

if (deterministic && !benchmark) {
// immediate mode is triggered for the specific combination of benchmark=off deterministic=on
if (at::globalContext().immediateMiopen()) {
uint64_t solution_id;
Workspace workspace = chooseSolution<miopenConvBwdWeightsAlgorithm_t>(args, &solution_id);

Expand Down Expand Up @@ -1034,8 +1031,7 @@ void raw_miopen_depthwise_convolution_backward_weight_out(
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic);

if (deterministic && !benchmark) {
// immediate mode is triggered for the specific combination of benchmark=off deterministic=on
if (at::globalContext().immediateMiopen()) {
uint64_t solution_id;
Workspace workspace = chooseSolution<miopenConvBwdWeightsAlgorithm_t>(args, &solution_id);

Expand Down Expand Up @@ -1240,8 +1236,7 @@ void raw_miopen_convolution_backward_input_out(
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic);

if (deterministic && !benchmark) {
// immediate mode is triggered for the specific combination of benchmark=off deterministic=on
if (at::globalContext().immediateMiopen()) {
uint64_t solution_id;
Workspace workspace = chooseSolution<miopenConvBwdDataAlgorithm_t>(args, &solution_id);

Expand Down Expand Up @@ -1350,8 +1345,7 @@ void raw_miopen_depthwise_convolution_backward_input_out(
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic);

if (deterministic && !benchmark) {
// immediate mode is triggered for the specific combination of benchmark=off deterministic=on
if (at::globalContext().immediateMiopen()) {
uint64_t solution_id;
Workspace workspace = chooseSolution<miopenConvBwdDataAlgorithm_t>(args, &solution_id);

Expand Down
13 changes: 13 additions & 0 deletions docs/source/backends.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,19 @@ These backends include:

```

## torch.backends.miopen

```{eval-rst}
.. automodule:: torch.backends.miopen
```

```{eval-rst}
.. attribute:: immediate

A :class:`bool` that, if True, causes MIOpen to use Immediate Mode
(https://rocm.docs.amd.com/projects/MIOpen/en/latest/how-to/find-and-immediate.html).
```

## torch.backends.mps

```{eval-rst}
Expand Down
2 changes: 2 additions & 0 deletions torch/_C/__init__.pyi.in
Original file line number Diff line number Diff line change
Expand Up @@ -1213,6 +1213,8 @@ def _get_mkldnn_enabled() -> _bool: ... # THPModule_userEnabledMkldnn
def _set_mkldnn_enabled(arg: _bool) -> None: ... # THPModule_setUserEnabledMkldnn
def _get_cudnn_benchmark() -> _bool: ... # THPModule_benchmarkCuDNN
def _set_cudnn_benchmark(arg: _bool) -> None: ... # THPModule_setBenchmarkCuDNN
def _get_miopen_immediate() -> _bool: ... # THPModule_immediateMiopen
def _set_miopen_immediate(arg: _bool) -> None: ... # THPModule_setImmediateMiopen
def _get_cudnn_deterministic() -> _bool: ... # THPModule_deterministicCuDNN
def _set_cudnn_deterministic(arg: _bool) -> None: ... # THPModule_setDeterministicCuDNN
def _get_mkldnn_deterministic() -> _bool: ... # THPModule_deterministicMkldnn
Expand Down
1 change: 1 addition & 0 deletions torch/_dynamo/trace_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@
"torch._C._get_cublas_allow_tf32",
"torch._C._get_cudnn_allow_tf32",
"torch._C._get_cudnn_benchmark",
"torch._C._get_miopen_immediate",
"torch._C._get_cudnn_deterministic",
"torch._C._get_cudnn_enabled",
"torch._C._get_custom_class_python_wrapper",
Expand Down
1 change: 1 addition & 0 deletions torch/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def __init__(self, m, name):
cusparselt as cusparselt,
kleidiai as kleidiai,
mha as mha,
miopen as miopen,
mkl as mkl,
mkldnn as mkldnn,
mps as mps,
Expand Down
53 changes: 53 additions & 0 deletions torch/backends/miopen/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# mypy: allow-untyped-defs
import sys
from contextlib import contextmanager

import torch
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule


def set_flags(_immediate=None):
    """Set the MIOpen backend flags and return the previous values.

    Args:
        _immediate: if not ``None``, the new value for the MIOpen
            immediate-mode flag.

    Returns:
        A 1-tuple with the flag value that was in effect before this call,
        suitable for passing back via ``set_flags(*saved)`` to restore it.
    """
    saved = (torch._C._get_miopen_immediate(),)
    if _immediate is not None:
        torch._C._set_miopen_immediate(_immediate)
    return saved


@contextmanager
def flags(immediate=False):
    """Context manager that temporarily sets the MIOpen immediate-mode flag.

    The previous flag value is captured on entry and restored on exit, even
    if the body raises.
    """
    with __allow_nonbracketed_mutation():
        saved = set_flags(immediate)
    try:
        yield
    finally:
        # restore whatever was in effect before entering the context
        with __allow_nonbracketed_mutation():
            set_flags(*saved)


# The magic here is to allow us to intercept code like this:
#
#   torch.backends.miopen.immediate = True


class MiopenModule(PropModule):
    """Module proxy exposing ``immediate`` as a settable attribute backed by
    the global context (torch._C)."""

    # Reads/writes route through the C bindings so the flag stays in sync
    # with at::globalContext().
    immediate = ContextProp(
        torch._C._get_miopen_immediate, torch._C._set_miopen_immediate
    )

    def __init__(self, m, name):
        super().__init__(m, name)


# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
# Swapping in MiopenModule lets plain attribute assignment on this module
# (e.g. setting `immediate`) be intercepted by ContextProp.
sys.modules[__name__] = MiopenModule(sys.modules[__name__], __name__)

# Add type annotation for the replaced module
immediate: bool
25 changes: 25 additions & 0 deletions torch/csrc/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1172,6 +1172,29 @@ static PyObject* THPModule_benchmarkCuDNN(PyObject* _unused, PyObject* noargs) {
Py_RETURN_FALSE;
}

// Python binding for torch._C._set_miopen_immediate: validates that the
// argument is a Python bool, then updates the global-context flag.
static PyObject* THPModule_setImmediateMiopen(
    PyObject* _unused,
    PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      PyBool_Check(arg),
      "set_immediate_miopen expects a bool, but got ",
      THPUtils_typename(arg));
  const bool enabled = (arg == Py_True);
  at::globalContext().setImmediateMiopen(enabled);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

// Python binding for torch._C._get_miopen_immediate: reports the current
// MIOpen immediate-mode flag as a Python bool.
static PyObject* THPModule_immediateMiopen(
    PyObject* _unused,
    PyObject* noargs) {
  PyObject* result =
      at::globalContext().immediateMiopen() ? Py_True : Py_False;
  Py_INCREF(result); // same refcount behavior as Py_RETURN_TRUE/FALSE
  return result;
}

static PyObject* THPModule_setAllowTF32CuBLAS(
PyObject* _unused,
PyObject* arg) {
Expand Down Expand Up @@ -1642,6 +1665,8 @@ static std::initializer_list<PyMethodDef> TorchMethods = {
{"_set_onednn_allow_tf32", THPModule_setAllowTF32OneDNN, METH_O, nullptr},
{"_get_cudnn_benchmark", THPModule_benchmarkCuDNN, METH_NOARGS, nullptr},
{"_set_cudnn_benchmark", THPModule_setBenchmarkCuDNN, METH_O, nullptr},
{"_get_miopen_immediate", THPModule_immediateMiopen, METH_NOARGS, nullptr},
{"_set_miopen_immediate", THPModule_setImmediateMiopen, METH_O, nullptr},
{"_get_cudnn_deterministic",
THPModule_deterministicCuDNN,
METH_NOARGS,
Expand Down
Loading
0