pytorch
diff --git a/‎test/distributed/checkpoint/test_compatibility.py
Lines changed: 25 additions & 0 deletions b/‎test/distributed/checkpoint/test_compatibility.py
Lines changed: 25 additions & 0 deletions
diff --git a/‎torch/distributed/checkpoint/_nested_dict.py
Lines changed: 18 additions & 2 deletions b/‎torch/distributed/checkpoint/_nested_dict.py
Lines changed: 18 additions & 2 deletions
diff --git a/‎torch/distributed/checkpoint/_traverse.py
Lines changed: 46 additions & 6 deletions b/‎torch/distributed/checkpoint/_traverse.py
Lines changed: 46 additions & 6 deletions
diff --git a/‎torch/distributed/checkpoint/_version.py
Lines changed: 6 additions & 0 deletions b/‎torch/distributed/checkpoint/_version.py
Lines changed: 6 additions & 0 deletions
diff --git a/‎torch/distributed/checkpoint/default_planner.py
Lines changed: 35 additions & 0 deletions b/‎torch/distributed/checkpoint/default_planner.py
Lines changed: 35 additions & 0 deletions
@@ -70,6 +70,31 @@ def test_storage_meta(self) -> None:
         self.assertEqual(storage_meta.save_id, writer.save_id)
         self.assertEqual(storage_meta.load_id, reader.load_id)
 
+    @with_temp_dir
+    def test_with_v_2_3(self) -> None:
+        """Save with the v2.3 traversal, then load with the current code.
+
+        ``sd`` is written while the flattening code is forced into its v2.3
+        mode; ``load_sd`` (same structure, different values) is then loaded
+        with the default settings and must end up equal to ``sd``.
+        """
+        sd = {
+            "a": torch.zeros(4, 4),
+            "dict": {
+                "dict_a": {"dict_a_1": 1, "dict_a_2": 2},
+                "dict_b": {"dict_b_1": 1, "dict_b_2": 2},
+            },
+            "list": [0, 1, 2, 3, 4, 5],
+        }
+        load_sd = {
+            "a": torch.ones(4, 4),
+            "dict": {
+                "dict_a": {"dict_a_1": 2, "dict_a_2": 4},
+                "dict_b": {"dict_b_1": 2, "dict_b_2": 4},
+            },
+            "list": [10, 11, 12, 13, 14, 15],
+        }
+
+        # ``flatten_state_dict`` only consults ``_version._derived_version``,
+        # so that is the flag that must be set to get the v2.3 key layout.
+        # Restore it in ``finally`` so a failing save cannot leak the override
+        # into the load below or into other tests.
+        dcp._version._derived_version = "2_3"
+        try:
+            dcp.save(sd, checkpoint_id=self.temp_dir)
+        finally:
+            dcp._version._derived_version = None
+        dcp.load(load_sd, checkpoint_id=self.temp_dir)
+        self.assertEqual(sd, load_sd)
+
 
 if __name__ == "__main__":
     run_tests()
@@ -3,7 +3,14 @@
 
 from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
 
-from ._traverse import OBJ_PATH, set_element, STATE_DICT_ITEM, traverse_state_dict
+from . import _version
+from ._traverse import (
+    OBJ_PATH,
+    set_element,
+    STATE_DICT_ITEM,
+    traverse_state_dict,
+    traverse_state_dict_v_2_3,
+)
 
 
 """
@@ -40,7 +47,16 @@ def flat_copy(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
         flattened[new_fqn] = value
         mappings[new_fqn] = path
 
-    traverse_state_dict(state_dict, flat_copy)
+    # Nested dictionaries have been flattened since v2.4. To avoid breaking
+    # checkpoints that were saved before v2.4, the old traversal is kept so
+    # that those checkpoints can still be reconstructed.
+    use_v_2_3 = (
+        _version._derived_version is not None and _version._derived_version == "2_3"
+    )
+    if use_v_2_3:
+        traverse_state_dict_v_2_3(state_dict, flat_copy)
+    else:
+        traverse_state_dict(state_dict, flat_copy)
     return flattened, mappings
 
 
 
@@ -40,14 +40,11 @@ def traverse_state_dict(
 ) -> None:
     """
     Invoke ``visitor`` for each value recursively in ``state_dict``.
-    Mapping, list, and tuple will be flattened and other value types are treated
-    as the terminal values and will invoke ``visitor``.
-    Mapping is treated as non terminal node and will be flattened.
-    List and tuple, on the other hand, will not be flattened unless containing other
-    mapping containers or tensors.
+    Mapping will be traversed and ``visitor`` will be applied to the leaf elements.
+    ``visitor`` will only be applied to elements in a list or a tuple, if the
+    container contains tensors or mappings.
     """
 
-    # a value is terminal if it has no other containers values inside it
     def _is_terminal(value: STATE_DICT_ITEM) -> bool:
         values: Collection[STATE_DICT_ITEM]
         if isinstance(value, Mapping):
@@ -78,6 +75,49 @@ def _traverse_obj(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
         _traverse_obj((str(key),), value)
 
 
+def traverse_state_dict_v_2_3(
+    state_dict: STATE_DICT_TYPE,
+    visitor: Callable[[OBJ_PATH, STATE_DICT_ITEM], None],
+    keep_traversing: Callable[[STATE_DICT_ITEM], bool] = _keep_visiting_tensors,
+) -> None:
+    """
+    Walk ``state_dict`` the way v2.3 did and call ``visitor`` on every terminal value.
+
+    Only mappings and lists are ever descended into; any other value
+    (tuples included) is terminal. A mapping or list is itself terminal,
+    and is handed to ``visitor`` as a whole, when none of its nested
+    mappings/lists is non-terminal and ``keep_traversing`` is false for each
+    of its direct elements.
+
+    ``visitor`` receives the path as a tuple: mapping keys converted with
+    ``str`` and list positions as ``int`` indices.
+
+    The default ``keep_traversing`` is ``_keep_visiting_tensors``, defined
+    elsewhere in this module (presumably true for ``torch.Tensor`` values --
+    confirm against its definition).
+    """
+
+    def _needs_descent(child: STATE_DICT_ITEM) -> bool:
+        # A nested container that is not terminal forces its parent open.
+        if isinstance(child, (Mapping, list)) and not _is_leaf(child):
+            return True
+        # Otherwise the caller-supplied predicate decides.
+        if keep_traversing is not None and keep_traversing(child):
+            return True
+        return False
+
+    def _is_leaf(node: STATE_DICT_ITEM) -> bool:
+        children: Collection[STATE_DICT_ITEM]
+        if isinstance(node, Mapping):
+            children = node.values()
+        elif isinstance(node, list):
+            children = node
+        else:
+            # Anything that is neither a mapping nor a list is never opened.
+            return True
+        return not any(_needs_descent(child) for child in children)
+
+    def _walk(prefix: OBJ_PATH, node: STATE_DICT_ITEM) -> None:
+        if _is_leaf(node):
+            visitor(prefix, node)
+            return
+        if isinstance(node, Mapping):
+            for name, child in node.items():
+                _walk(prefix + (str(name),), child)
+            return
+        if isinstance(node, list):
+            for index, child in enumerate(node):
+                _walk(prefix + (index,), child)
+
+    for top_key, top_value in state_dict.items():
+        _walk((str(top_key),), top_value)
+
+
 def set_element(
     root_dict: STATE_DICT_TYPE, path: OBJ_PATH, value: STATE_DICT_ITEM
 ) -> None:
 
@@ -0,0 +1,6 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+from typing import Optional
+
+
+# Checkpoint-format version the flattening code should emulate, or None for
+# the current format. ``flatten_state_dict`` switches to the v2.3 traversal
+# while this equals "2_3"; the default planner's ``create_local_plan`` sets it
+# temporarily when re-flattening for a possibly pre-v2.4 checkpoint.
+_derived_version: Optional[str] = None
@@ -46,6 +46,8 @@
 )
 from torch.distributed.checkpoint.utils import find_state_dict_object
 
+from . import _version
+
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -195,6 +197,39 @@ def set_up_planner(
 
     def create_local_plan(self) -> LoadPlan:
+        """Build the local load plan, falling back to the v2.3 key layout if needed.
+
+        When ``flatten_state_dict`` is enabled and the checkpoint metadata
+        contains keys absent from the flattened state_dict, the original
+        state_dict is re-flattened with the v2.3 traversal. If that yields any
+        of the missing keys, the v2.3 flattening replaces ``self.state_dict``
+        and ``self.mappings`` before the plan is created.
+        """
         assert self.metadata is not None
+        if self.flatten_state_dict:
+            # To support checkpoints that were saved before v2.4, we have to
+            # tell whether missing keys are caused by the old key format.
+            # The contracts are:
+            # 1. There are 4 cases when we find a missing key.
+            #    1.1 Actual missing key, allow_partial_load is False
+            #    1.2 Actual missing key, allow_partial_load is True
+            #    1.3 Old checkpoint, allow_partial_load is False
+            #    1.4 Old checkpoint, allow_partial_load is True
+            # 2. If we find a missing key, we first convert the keys back to
+            #    the key format of v2.3.
+            # 3. If any of the missing keys are among the v2.3 keys, we assume
+            #    this is an old checkpoint.
+            # 4. Pass the state_dict to `create_default_local_load_plan()`,
+            #    which has the logic to check missing keys for
+            #    allow_partial_load.
+            # So for cases 1.2 and 1.4, the allow_partial_load check is
+            # delegated to `create_default_local_load_plan()`. The logic here
+            # only determines whether the checkpoint belongs to 2.3 (or
+            # before) or 2.4 (or after).
+            current_keys = set(self.state_dict.keys())
+            load_keys = set(self.metadata.state_dict_metadata.keys())
+            missing_keys = load_keys - current_keys
+            if missing_keys:
+                # _derived_version is a module-level flag that is only read by
+                # flatten_state_dict. Reset it in `finally` so that an error
+                # during flattening cannot leave the process in v2.3 mode,
+                # which would make later saves use the old format.
+                _version._derived_version = "2_3"
+                try:
+                    old_state_dict, old_mappings = flatten_state_dict(
+                        self.original_state_dict
+                    )
+                finally:
+                    _version._derived_version = None
+                old_keys = set(old_state_dict.keys())
+                if old_keys & missing_keys:
+                    self.state_dict, self.mappings = old_state_dict, old_mappings
+
         return create_default_local_load_plan(
             self.state_dict, self.metadata, not self.allow_partial_load
         )