pytorch · awaelchli · Jul 13, 2024 · Jul 15, 2024 · Jul 15, 2024 · Jul 15, 2024
diff --git a/torch/distributed/checkpoint/default_planner.py b/torch/distributed/checkpoint/default_planner.py
@@ -210,10 +210,12 @@ def load_bytes(self, read_item: ReadItem, value: io.BytesIO) -> None:
             set_element(
                 self.original_state_dict,
                 self.mappings[read_item.dest_index.fqn],
-                torch.load(value),
+                torch.load(value, weights_only=False),
             )
         else:
-            self.state_dict[read_item.dest_index.fqn] = torch.load(value)
+            self.state_dict[read_item.dest_index.fqn] = torch.load(
+                value, weights_only=False
+            )
 
     def resolve_tensor(self, read_item: ReadItem):
         tensor = self.lookup_tensor(read_item.dest_index)

diff --git a/torch/distributed/checkpoint/filesystem.py b/torch/distributed/checkpoint/filesystem.py
@@ -654,7 +654,11 @@ def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
                     else:
                         tensor = cast(
                             Tensor,
-                            torch.load(cast(IO[bytes], file_slice), map_location="cpu"),
+                            torch.load(
+                                cast(IO[bytes], file_slice),
+                                map_location="cpu",
+                                weights_only=True,
+                            ),
                         )
                         tensor = narrow_tensor_by_index(
                             tensor, req.storage_offsets, req.lengths

diff --git a/torch/distributed/checkpoint/format_utils.py b/torch/distributed/checkpoint/format_utils.py
@@ -84,7 +84,9 @@ def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
         # TODO: read on each host, instead of only the coordinator
         if self.is_coordinator:
             assert self.checkpoint_id is not None
-            torch_state_dict = torch.load(self.checkpoint_id, map_location="cpu")
+            torch_state_dict = torch.load(
+                self.checkpoint_id, map_location="cpu", weights_only=False
+            )
             if planner.flatten_state_dict:
                 torch_state_dict, _ = flatten_state_dict(torch_state_dict)
         else:
@@ -231,7 +233,7 @@ def torch_save_to_dcp(
         To avoid OOM, it's recommended to only run this function on a single rank.
     """
 
-    state_dict = torch.load(torch_save_path)
+    state_dict = torch.load(torch_save_path, weights_only=False)
     # we don't need stateful behavior here because the expectation is anything loaded by
     # torch.load would not contain stateful objects.
     _save_state_dict(

diff --git a/torch/distributed/checkpoint/planner.py b/torch/distributed/checkpoint/planner.py
@@ -330,7 +330,7 @@ class LoadPlanner:
     >>>
     >>>     def load_bytes(self, read_item, value):
     >>>         # Remove the "foo_" prefix
-    >>>         self.original_state_dict[read_item.dest_index.fqn[4:]] = torch.load(value)
+    >>>         self.original_state_dict[read_item.dest_index.fqn[4:]] = torch.load(value, weights_only=False)
 
 
     Modifying resolve_tensor and commit_tensor to handle load time transformation.

diff --git a/torch/distributed/optim/zero_redundancy_optimizer.py b/torch/distributed/optim/zero_redundancy_optimizer.py
@@ -108,7 +108,7 @@ def _broadcast_object(
         )
         dist.broadcast(data_recv_tensor, src=src_rank, group=group, async_op=False)
         buffer = io.BytesIO(data_recv_tensor.cpu().numpy())
-        obj = torch.load(buffer, map_location=device)
+        obj = torch.load(buffer, map_location=device, weights_only=False)
     return obj