POC for mixed prec optim frontend · pytorch/pytorch@010cb06 · GitHub

Commit 010cb06

POC for mixed prec optim frontend

ghstack-source-id: 251b9c3
Pull Request resolved: #146640

1 parent 99dd846 · commit 010cb06

File tree: 3 files changed, +113 −11 lines

torch/_meta_registrations.py

Lines changed: 4 additions & 4 deletions

@@ -7060,10 +7060,10 @@ def _fn(self, *args, **kwargs):
 @register_meta(aten.lerp)
 @out_wrapper()
 def lerp(start, end, weight):
-    torch._check(
-        start.dtype == end.dtype,
-        lambda: f"expected dtype {start.dtype} for `end`, but got dtype {end.dtype}",
-    )
+    # torch._check(
+    #     start.dtype == end.dtype,
+    #     lambda: f"expected dtype {start.dtype} for `end`, but got dtype {end.dtype}",
+    # )
     args = [start, end]
     if isinstance(weight, TensorLike):
         if weight.ndim != 0:
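The guard that this change comments out is torch._check, which raises when its predicate is false and builds its message lazily; with it disabled, the meta kernel for aten.lerp no longer rejects mismatched start/end dtypes at trace time. A small standalone illustration of the helper's behavior (the tensors and dtypes below are made up for this example and are not part of the commit):

import torch

start = torch.zeros(2, dtype=torch.float32)
end = torch.ones(2, dtype=torch.bfloat16)

# torch._check(cond, message_fn) raises RuntimeError when cond is False;
# the message callable is only evaluated on failure.
try:
    torch._check(
        start.dtype == end.dtype,
        lambda: f"expected dtype {start.dtype} for `end`, but got dtype {end.dtype}",
    )
except RuntimeError as err:
    print(err)  # expected dtype torch.float32 for `end`, but got dtype torch.bfloat16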

torch/optim/adam.py

Lines changed: 41 additions & 6 deletions

@@ -166,24 +166,46 @@ def _init_group(
                     state["step"] = (
                         torch.zeros(
                             (),
-                            dtype=_get_scalar_dtype(is_fused=group["fused"]),
+                            dtype=(
+                                _get_scalar_dtype(is_fused=group["fused"])
+                                if "step" not in self._dtype_policy
+                                else self._dtype_policy["step"](p)
+                            ),
                             device=p.device,
                         )
                         if group["capturable"] or group["fused"]
                         else torch.tensor(0.0, dtype=_get_scalar_dtype())
                     )
                     # Exponential moving average of gradient values
                     state["exp_avg"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
+                        p,
+                        dtype=(
+                            p.dtype
+                            if "exp_avg" not in self._dtype_policy
+                            else self._dtype_policy["exp_avg"](p)
+                        ),
+                        memory_format=torch.preserve_format,
                     )
                     # Exponential moving average of squared gradient values
                     state["exp_avg_sq"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
+                        p,
+                        dtype=(
+                            p.dtype
+                            if "exp_avg_sq" not in self._dtype_policy
+                            else self._dtype_policy["exp_avg_sq"](p)
+                        ),
+                        memory_format=torch.preserve_format,
                     )
                     if group["amsgrad"]:
                         # Maintains max of all exp. moving avg. of sq. grad. values
                         state["max_exp_avg_sq"] = torch.zeros_like(
-                            p, memory_format=torch.preserve_format
+                            p,
+                            dtype=(
+                                p.dtype
+                                if "max_exp_avg_sq" not in self._dtype_policy
+                                else self._dtype_policy["max_exp_avg_sq"](p)
+                            ),
+                            memory_format=torch.preserve_format,
                         )
 
                 exp_avgs.append(state["exp_avg"])

@@ -384,8 +406,16 @@ def _single_tensor_adam(
 
     for i, param in enumerate(params):
        grad = grads[i] if not maximize else -grads[i]
-        exp_avg = exp_avgs[i]
-        exp_avg_sq = exp_avg_sqs[i]
+        exp_avg = (
+            exp_avgs[i]
+            if exp_avgs[i].dtype == grad.dtype
+            else exp_avgs[i].to(grad.dtype)
+        )
+        exp_avg_sq = (
+            exp_avg_sqs[i]
+            if exp_avg_sqs[i].dtype == grad.dtype
+            else exp_avg_sqs[i].to(grad.dtype)
+        )
         step_t = state_steps[i]
 
         # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable]

@@ -530,6 +560,11 @@ def _single_tensor_adam(
         if amsgrad and torch.is_complex(params[i]):
             max_exp_avg_sqs[i] = torch.view_as_complex(max_exp_avg_sqs[i])
 
+        if exp_avgs[i].dtype != exp_avg.dtype:
+            exp_avgs[i].copy_(exp_avg)
+        if exp_avg_sqs[i].dtype != exp_avg_sq.dtype:
+            exp_avg_sqs[i].copy_(exp_avg_sq)
+
 
 def _multi_tensor_adam(
     params: list[Tensor],
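The single-tensor path above keeps the persistent state in whatever dtype the policy chose but performs the update math in the gradient's dtype: the state is upcast on entry and copied back at the end of the loop body. A minimal sketch of that pattern in isolation (the shape, beta value, and bf16 state dtype are assumptions for illustration, not part of the commit):

import torch

beta1 = 0.9
grad = torch.randn(4)                                 # fp32 gradient
exp_avg_state = torch.zeros(4, dtype=torch.bfloat16)  # persistent bf16 momentum state

# Work in the gradient's dtype so in-place ops such as lerp_ see matching dtypes.
exp_avg = (
    exp_avg_state
    if exp_avg_state.dtype == grad.dtype
    else exp_avg_state.to(grad.dtype)
)
exp_avg.lerp_(grad, 1 - beta1)  # Adam first-moment update, computed in fp32

# Persist the result back into the lower precision state tensor.
if exp_avg_state.dtype != exp_avg.dtype:
    exp_avg_state.copy_(exp_avg)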

torch/optim/optimizer.py

Lines changed: 68 additions & 1 deletion

@@ -338,6 +338,8 @@ class Optimizer:
     _optimizer_load_state_dict_pre_hooks: 'OrderedDict[int, Callable[["Optimizer", StateDict], Optional[StateDict]]]'
     _optimizer_load_state_dict_post_hooks: 'OrderedDict[int, Callable[["Optimizer"], None]]'
 
+    _dtype_policy: dict[str, Callable[[torch.Tensor], torch.dtype]]
+
     def __init__(self, params: ParamsT, defaults: dict[str, Any]) -> None:  # noqa: D107
         torch._C._log_api_usage_once("python.optimizer")
         self.defaults = defaults

@@ -347,6 +349,7 @@ def __init__(self, params: ParamsT, defaults: dict[str, Any]) -> None:  # noqa:
         self._optimizer_state_dict_post_hooks = OrderedDict()
         self._optimizer_load_state_dict_pre_hooks = OrderedDict()
         self._optimizer_load_state_dict_post_hooks = OrderedDict()
+        self._dtype_policy = OrderedDict()
 
         self._patch_step_function()
 

@@ -864,7 +867,7 @@ def load_state_dict(self, state_dict: StateDict) -> None:
 
         if len(groups) != len(saved_groups):
             raise ValueError(
-                "loaded state dict has a different number of parameter groups"
+                "loaded state dict has a different number of " "parameter groups"
             )
         param_lens = (len(g["params"]) for g in groups)
         saved_lens = (len(g["params"]) for g in saved_groups)

@@ -1000,6 +1003,70 @@ def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]
         """
         raise NotImplementedError
 
+    def dtype_policy(self) -> dict[str, Callable[[torch.Tensor], torch.dtype]]:
+        r"""Gets the dtype policy for the optimizer.
+
+        Returns the optimizer's dtype_policy. See the docs for set_dtype_policy for more details.
+
+        """
+        return self._dtype_policy
+
+    def set_dtype_policy(
+        self, policy: dict[str, Callable[[torch.Tensor], torch.dtype]]
+    ) -> None:
+        r"""Set the dtype policy for the optimizer.
+
+        By default, the optimizer initializes state to be the same dtype as the parameter. This
+        function allows the user to enable mixed precision training for the optimizer by specifying
+        lower or higher precision dtypes for state corresponding to a parameter.
+
+        A dtype policy is a dictionary mapping optimizer state to a desired dtype given a parameter.
+        For example, Adam(W) has state ``exp_avg`` and ``exp_avg_sq`` mapping to momentum and
+        variance respectively. The default policy would semantically be the following:
+
+        .. code-block:: python
+
+            default_dtype_policy = {
+                "exp_avg": lambda p: p.dtype,
+                "exp_avg_sq": lambda p: p.dtype,
+            }
+
+
+        If we wanted momentum (exp_avg) to match the param but variance (exp_avg_sq) to be BF16 when
+        the parameter is a float, then the policy would look like:
+
+        .. code-block:: python
+
+            mixed_precision_dtype_policy = {
+                "exp_avg_sq": lambda p: torch.bfloat16 if p.dtype == torch.float else p.dtype
+                # no need to specify "exp_avg" since the default will fall back to p's dtype already
+            }
+
+            model = ...
+            optim = torch.optim.AdamW(model.named_parameters())
+            optim.set_dtype_policy(mixed_precision_dtype_policy)
+
+            # at this point, state has not been initialized
+
+            # run forward and backward
+            loss = model(...)
+            loss.backward()
+
+            # at first step, state will be initialized according to the set policy
+            optim.step()
+            optim.zero_grad()
+
+
+        The new policy will only be applied to any new state initialized after the policy has been
+        set. State loaded from an existing state_dict will not be affected. Previously initialized
+        state will also not be affected.
+
+        Args:
+            policy (Dict[str, Callable]): A dictionary mapping optimizer state keys (str) to a Callable
+                that takes the parameter and returns the desired dtype.
+        """
+        self._dtype_policy = policy
+
     @torch._disable_dynamo
     def add_param_group(self, param_group: dict[str, Any]) -> None:
         r"""Add a param group to the :class:`Optimizer` s `param_groups`.
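Taken together, end-to-end use of this POC frontend could look roughly like the sketch below. It assumes the set_dtype_policy API lands as written above and that the optimizer takes the single-tensor path (the only implementation adapted in this commit), so foreach is disabled explicitly; the model and dtype choices are placeholders.

import torch
import torch.nn as nn

model = nn.Linear(16, 16)
# foreach=False: only _single_tensor_adam handles mismatched state dtypes in this POC.
optim = torch.optim.Adam(model.parameters(), lr=1e-3, foreach=False)

# Store exp_avg_sq in bf16 for fp32 params; exp_avg falls back to the param dtype.
optim.set_dtype_policy(
    {"exp_avg_sq": lambda p: torch.bfloat16 if p.dtype == torch.float else p.dtype}
)

# State is created lazily on the first step, using the policy.
loss = model(torch.randn(4, 16)).sum()
loss.backward()
optim.step()
optim.zero_grad()

p = next(model.parameters())
print(optim.state[p]["exp_avg"].dtype)     # torch.float32
print(optim.state[p]["exp_avg_sq"].dtype)  # torch.bfloat16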
