Update on "Attempt a mixed precision fused adam" · pytorch/pytorch@6a74269 · GitHub


Commit 6a74269

Update on "Attempt a mixed precision fused adam"
[ghstack-poisoned]
1 parent 6815585 commit 6a74269

File tree

6 files changed: +100 -75 lines changed

aten/src/ATen/native/ForeachUtils.h
Lines changed: 35 additions & 35 deletions

@@ -326,41 +326,41 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype(
           "-th Tensor is not.");
       return {t->device(), t->scalar_type()};
     }();
-    TORCH_CHECK(
-        std::all_of(
-            nested_tensorlist.cbegin(),
-            nested_tensorlist.cend(),
-            [&](const auto& tensorlist) -> bool {
-              if (tensorlist.size() == 0) {
-                return true;
-              }
-              const auto& tensor = tensorlist[tensor_index];
-              // note(crcrpar): Currently the scope of this function is
-              // optimizers so there could be `state_steps` and other scalars
-              // whose elements are float tensors no matter what the parameter's
-              // dtype is.
-              if (!tensor.has_value()) {
-                return true;
-              } else {
-                const auto s = tensor->scalar_type();
-                const auto d = tensor->device();
-                // Note: `step` or `state_step` is float32 by default.
-                if (key.first == d) {
-                  return key.second == s || s == at::ScalarType::Float ||
-                      s == at::ScalarType::Double;
-                } else if (d.is_cpu()) {
-                  // note(crcrpar): There are some test cases (e.g.
-                  // TestOptim::test_adam) where state_steps are on CPU and the
-                  // others are on CUDA. Currently a state_step Tensor has the
-                  // dtype of float.
-                  return s == at::ScalarType::Float ||
-                      s == at::ScalarType::Double;
-                } else {
-                  return false;
-                }
-              }
-            }),
-        "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding");
+    // TORCH_CHECK(
+    //     std::all_of(
+    //         nested_tensorlist.cbegin(),
+    //         nested_tensorlist.cend(),
+    //         [&](const auto& tensorlist) -> bool {
+    //           if (tensorlist.size() == 0) {
+    //             return true;
+    //           }
+    //           const auto& tensor = tensorlist[tensor_index];
+    //           // note(crcrpar): Currently the scope of this function is
+    //           // optimizers so there could be `state_steps` and other scalars
+    //           // whose elements are float tensors no matter what the parameter's
+    //           // dtype is.
+    //           if (!tensor.has_value()) {
+    //             return true;
+    //           } else {
+    //             const auto s = tensor->scalar_type();
+    //             const auto d = tensor->device();
+    //             // Note: `step` or `state_step` is float32 by default.
+    //             if (key.first == d) {
+    //               return key.second == s || s == at::ScalarType::Float ||
+    //                   s == at::ScalarType::Double;
+    //             } else if (d.is_cpu()) {
+    //               // note(crcrpar): There are some test cases (e.g.
+    //               // TestOptim::test_adam) where state_steps are on CPU and the
+    //               // others are on CUDA. Currently a state_step Tensor has the
+    //               // dtype of float.
+    //               return s == at::ScalarType::Float ||
+    //                   s == at::ScalarType::Double;
+    //             } else {
+    //               return false;
+    //             }
+    //           }
+    //         }),
+    //     "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding");
     if (!grouped_tensors_with_indices.count(key)) {
       grouped_tensors_with_indices.insert(
           {key,
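
Note: below is a minimal Python sketch (not part of the commit; the helper name is
illustrative) of the invariant the now-commented-out TORCH_CHECK enforced: tensors
at the same index must share the first list's device and dtype, except fp32/fp64
scalars such as `state_steps`, which may also live on CPU.

# Sketch only: mirrors the grouping rule relaxed by this commit.
from typing import Optional
import torch

def grouping_key_accepts(
    key_device: torch.device,
    key_dtype: torch.dtype,
    tensor: Optional[torch.Tensor],
) -> bool:
    if tensor is None:
        return True  # undefined optional entries (e.g. empty max_exp_avg_sqs) pass
    s, d = tensor.dtype, tensor.device
    if d == key_device:
        # exact dtype match, or a float32/float64 scalar such as a step counter
        return s == key_dtype or s in (torch.float32, torch.float64)
    if d.type == "cpu":
        # state_steps may sit on CPU while the other tensors are on CUDA
        return s in (torch.float32, torch.float64)
    return False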

aten/src/ATen/native/cuda/FusedAdamWKernel.cu
Lines changed: 5 additions & 4 deletions

@@ -50,10 +50,11 @@ void _fused_adamw_kernel_cuda_(
         grad_scale,
         found_inf);
   } else {
-    TORCH_CHECK(
-        at::native::check_fast_path_restrictions(
-            {params, grads, exp_avgs, exp_avg_sqs}),
-        "params, grads, exp_avgs, and exp_avg_sqs must have same dtype, device, and layout");
+    // TORCH_CHECK(
+    //     at::native::check_fast_path_restrictions(
+    //         {params, grads, exp_avgs, exp_avg_sqs}),
+    //     "params, grads, exp_avgs, and exp_avg_sqs must have same dtype,
+    //     device, and layout");
     _fused_adamw_cuda_impl_(
         params,
         grads,

aten/src/ATen/native/cuda/fused_adam_impl.cu
Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ void _fused_adam_cuda_impl_(
       kHalf,
       kBFloat16,
       params[0].scalar_type(),
-      "fused_adam_kernel_cuda",
+      "fused_adam_mp_kernel_cuda",
       [&]() {
         multi_tensor_apply_for_fused_optimizer<4>(
             tensor_lists,

aten/src/ATen/native/cuda/fused_adamw_impl.cu
Lines changed: 51 additions & 20 deletions

@@ -31,26 +31,57 @@ void _fused_adamw_cuda_impl_(
       found_inf.has_value() ? found_inf->data_ptr<float>() : nullptr;
   const float* lr_ptr = nullptr;

-  AT_DISPATCH_FLOATING_TYPES_AND2(
-      kHalf,
-      kBFloat16,
-      params[0].scalar_type(),
-      "fused_adamw_kernel_cuda",
-      [&]() {
-        multi_tensor_apply_for_fused_optimizer<4>(
-            tensor_lists,
-            state_steps,
-            FusedAdamMathFunctor<scalar_t, 4, ADAM_MODE::ADAMW, false>(),
-            lr_ptr, // unused
-            lr,
-            beta1,
-            beta2,
-            weight_decay,
-            eps,
-            maximize,
-            grad_scale_ptr,
-            found_inf_ptr);
-      });
+  if (params[0].scalar_type() != exp_avgs[0].scalar_type()) {
+    AT_DISPATCH_FLOATING_TYPES_AND2(
+        kHalf,
+        kBFloat16,
+        params[0].scalar_type(),
+        "fused_adamw_kernel_cuda",
+        [&]() {
+          multi_tensor_apply_for_fused_optimizer<4>(
+              tensor_lists,
+              state_steps,
+              FusedAdamMathFunctorMP<
+                  scalar_t,
+                  float,
+                  float,
+                  BFloat16,
+                  BFloat16,
+                  4,
+                  ADAM_MODE::ADAMW,
+                  false>(),
+              lr_ptr, // unused
+              lr,
+              beta1,
+              beta2,
+              weight_decay,
+              eps,
+              maximize,
+              grad_scale_ptr,
+              found_inf_ptr);
+        });
+  } else {
+    AT_DISPATCH_FLOATING_TYPES_AND2(
+        kHalf,
+        kBFloat16,
+        params[0].scalar_type(),
+        "fused_adamw_kernel_cuda",
+        [&]() {
+          multi_tensor_apply_for_fused_optimizer<4>(
+              tensor_lists,
+              state_steps,
+              FusedAdamMathFunctor<scalar_t, 4, ADAM_MODE::ADAMW, false>(),
+              lr_ptr, // unused
+              lr,
+              beta1,
+              beta2,
+              weight_decay,
+              eps,
+              maximize,
+              grad_scale_ptr,
+              found_inf_ptr);
+        });
+  }
 }

 // The following overload simply has a Tensor lr
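
Note: the new branch keys purely on a dtype mismatch between params[0] and
exp_avgs[0], and the FusedAdamMathFunctorMP template arguments above pair the
param dtype with bfloat16 moments. The sketch below is one hypothetical way such
a mismatch could arise; casting materialized state like this is an assumption,
not a workflow shown in this commit.

# Hypothetical sketch: fp32 params with Adam moments cast to bf16 produce the
# params-vs-exp_avgs dtype mismatch that selects the FusedAdamMathFunctorMP branch.
import torch

model = torch.nn.Linear(16, 16, device="cuda")  # fp32 params
opt = torch.optim.AdamW(model.parameters(), lr=1e-3, fused=True)

model(torch.randn(4, 16, device="cuda")).sum().backward()
opt.step()  # materializes exp_avg / exp_avg_sq in fp32

for state in opt.state.values():
    # assumption: moments stored in bf16 to save memory
    state["exp_avg"] = state["exp_avg"].to(torch.bfloat16)
    state["exp_avg_sq"] = state["exp_avg_sq"].to(torch.bfloat16)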

test/test_optim.py
Lines changed: 4 additions & 5 deletions

@@ -2237,17 +2237,16 @@ def test_non_empty_state(self, device, dtype, optim_info):

     @onlyCUDA
     @optims(
-        [o for o in optim_db if o.optim_cls.__name__ == "Adam"], dtypes=[torch.float32]
+        [o for o in optim_db if o.optim_cls.__name__ in ["Adam", "AdamW"]],
+        dtypes=[torch.float32],
     )
-    def test_bf16_fused_adam(self, device, dtype, optim_info):
+    def test_bf16_fused(self, device, dtype, optim_info):
         optim_inputs = optim_info.optim_inputs_func(device=device, dtype=dtype)
         optim_cls = optim_info.optim_cls
         for optim_input in optim_inputs:
             kwargs = optim_input.kwargs
             # currently not supported
-            if kwargs.get("amsgrad", False) or kwargs.get(
-                "decoupled_weight_decay", False
-            ):
+            if kwargs.get("amsgrad", False):
                 continue
             kwargs["fused"] = True
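
Note: a rough standalone sketch (assumptions only, not the test body) of the path
test_bf16_fused now exercises for both Adam and AdamW: bf16 parameters stepped
through the fused CUDA kernel.

# Sketch, assuming a CUDA device is available.
import torch

param = torch.randn(8, 8, device="cuda", dtype=torch.bfloat16, requires_grad=True)
opt = torch.optim.AdamW([param], lr=1e-2, fused=True)

param.grad = torch.randn_like(param)
opt.step()
opt.zero_grad()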

torch/optim/adam.py
Lines changed: 4 additions & 10 deletions

@@ -837,16 +837,10 @@ def _fused_adam(
     lr_dict: Optional[DeviceDict] = (
         {lr.device: lr} if isinstance(lr, Tensor) and str(lr.device) != "cpu" else None
     )
-    # grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
-    #     [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]  # type: ignore[list-item]
-    # )
-    # replace this with better implementation
-    grouped_tensors = {
-        (params[0].device, params[0].dtype): (
-            (params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps),
-            None,
-        )
-    }
+    # TODO: currently the check that the state are properly correspondent to their param dtype + device is removed!!!!
+    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
+        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]  # type: ignore[list-item]
+    )
     for (device, _), (
         (
             device_params_,
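
Note: the restored call hands grouping back to the optimizer utility. The sketch
below shows how that grouping is typically consumed, inferred from the unpacking
in the loop above; the helper is private, so its exact return shape is treated as
an assumption here.

# Sketch: assumes the private helper returns a dict keyed by (device, dtype) whose
# values unpack to ((grouped tensor lists...), indices).
import torch
from torch.optim.optimizer import Optimizer

params = [
    torch.zeros(4, device="cuda", dtype=torch.bfloat16),
    torch.zeros(4, device="cuda", dtype=torch.float32),
]
grads = [torch.ones_like(p) for p in params]

grouped = Optimizer._group_tensors_by_device_and_dtype([params, grads])
for (device, dtype), ((g_params, g_grads), _) in grouped.items():
    # one homogeneous group per (device, dtype), i.e. one fused kernel launch each
    print(device, dtype, len(g_params), len(g_grads))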
