DistributedModelParallel resharding Interface (#2945) · pytorch/torchrec@7b44948 · GitHub
[go: up one dir, main page]

Skip to content

Commit 7b44948

Browse files
committed
DistributedModelParallel resharding Interface (#2945)
Summary: Pull Request resolved: #2945 Finally! DMP interface for resharding, most of the changes here are to enable proper testing of DMP. ## Main changes: ### 1. DMP reshard API: * which calls the underlying sharder for sharded module to reshard ### 2. Proper Testing: * A multi-rank test which generates a full Model and utilizes DMP interface. Currently only tests TW. * This test is called from `test_dynamic_sharding.py` -> `test_model_parallel.py` -> `test_sharding.py`, which follows the same structure as current DMP unit tests * This is how the test tests for correctness: ``` 1. Generate global model and inputs 2. Create 2 identical local models based on global model 3. Use planner to generate sharding plan for local model 4. Based on planner output, generate a second, different sharding plan 5. Shard both local models 1 and 2 through DMP with plan 1 and 2 respectively 6. Reshard (dynamic sharding API) model 1 with plan 2 7. Generate predictions for local models and compare them to global model prediction. Expect to be the same. ``` * This tests for `optimizer` being correctly saved in resharding * The test is setup with other variables to-be-set once more functionalities are enabled with dynamic sharding, e.g. `variable_batch_size` etc. ### 3. Helper functions for testing * `get_sharding_constructor_from_type` to enable setting sharding_type for each unit test. * `compare_model_pred_one_step` only used for debugging to get more information on whether models are identical after resharding/running initial step * `compare_model_weights` also for debugging ### 4. Small refactoring in `update_shards` call. Differential Revision: D73049934
1 parent 5b1dbe7 commit 7b44948

File tree

6 files changed

+721
-29
lines changed

6 files changed

+721
-29
lines changed

torchrec/distributed/embeddingbag.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,15 +1516,9 @@ def update_shards(
15161516
current_state = self.state_dict()
15171517
# TODO: Save Optimizers
15181518

1519-
saved_weights = {}
15201519
# TODO: Saving lookups tensors to CPU to eventually avoid recreating them completely again
1521-
for i, lookup in enumerate(self._lookups):
1522-
for attribute, tbe_module in lookup.named_modules():
1523-
if type(tbe_module) is DenseTableBatchedEmbeddingBagsCodegen:
1524-
saved_weights[str(i) + "." + attribute] = tbe_module.weights.cpu()
1525-
# Note: lookup.purge should delete tbe_module and weights
1526-
# del tbe_module.weights
1527-
# del tbe_module
1520+
# TODO: Ensure lookup tensors are actually being deleted
1521+
for _, lookup in enumerate(self._lookups):
15281522
# pyre-ignore
15291523
lookup.purge()
15301524

torchrec/distributed/model_parallel.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from torchrec.distributed.types import (
3636
EnumerableShardingSpec,
3737
ModuleSharder,
38+
ParameterSharding,
3839
ShardedModule,
3940
ShardingEnv,
4041
ShardingEnv2D,
@@ -612,6 +613,43 @@ def _reset_parameters(module: nn.Module) -> None:
612613
if hasattr(m, "reset_parameters"):
613614
m.reset_parameters()
614615

616+
def reshard(
617+
self,
618+
path_to_sharded_module: str,
619+
changed_shard_to_params: Dict[str, ParameterSharding],
620+
) -> None:
621+
"""
622+
Reshards a module in the DMP. This is useful when the sharding plan for a module
623+
changes during training.
624+
625+
Args:
626+
path_to_sharded_module (str): The path to the sharded module in the DMP.
627+
changed_shard_to_params (Dict[str, ParameterSharding]): The delta between original sharding plan
628+
and new sharding plan for the module.
629+
"""
630+
steps = path_to_sharded_module.split(".")
631+
sharded_module = self.module
632+
for s in steps:
633+
sharded_module = getattr(sharded_module, s)
634+
635+
assert isinstance(sharded_module, ShardedModule)
636+
assert changed_shard_to_params is not None
637+
sharder_key = sharded_module.unsharded_module_type
638+
sharder = self._sharder_map[sharder_key]
639+
assert hasattr(
640+
sharder, "reshard"
641+
), "reshard is not implemented for this sharder"
642+
sharded_module = sharder.reshard( # pyre-ignore
643+
sharded_module,
644+
changed_shard_to_params,
645+
self._env,
646+
self.device,
647+
)
648+
649+
self._optim: CombinedOptimizer = self._init_optim(self._dmp_wrapped_module)
650+
self._plan.plan[path_to_sharded_module] = sharded_module.module_sharding_plan
651+
return sharded_module
652+
615653

616654
class DMPCollection(DistributedModelParallel):
617655
"""

torchrec/distributed/sharding_plan.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,20 @@ def _get_parameter_sharding(
410410
]
411411

412412

413+
def get_sharding_constructor_from_type(
414+
sharding_type: ShardingType,
415+
) -> Callable[..., ParameterShardingGenerator]:
416+
sharding_type_to_constructor = {
417+
ShardingType.TABLE_WISE: table_wise,
418+
ShardingType.ROW_WISE: row_wise,
419+
ShardingType.COLUMN_WISE: column_wise,
420+
ShardingType.TABLE_ROW_WISE: table_row_wise,
421+
ShardingType.GRID_SHARD: grid_shard,
422+
ShardingType.DATA_PARALLEL: data_parallel,
423+
}
424+
return sharding_type_to_constructor[sharding_type]
425+
426+
413427
def data_parallel() -> ParameterShardingGenerator:
414428
"""
415429
Returns a generator of ParameterShardingPlan for `ShardingType::DATA_PARALLEL` for construct_module_sharding_plan.

torchrec/distributed/test_utils/test_model_parallel.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from torchrec.distributed.test_utils.test_model import TestSparseNN, TestSparseNNBase
2323
from torchrec.distributed.test_utils.test_sharding import (
2424
create_test_sharder,
25+
dynamic_sharding_test,
2526
SharderType,
2627
sharding_single_rank_test,
2728
)
@@ -186,6 +187,78 @@ def _test_sharding(
186187
lengths_dtype=lengths_dtype,
187188
)
188189

190+
def _test_dynamic_sharding(
191+
self,
192+
sharders: List[ModuleSharder[nn.Module]],
193+
backend: str = "gloo",
194+
world_size: int = 2,
195+
local_size: Optional[int] = None,
196+
world_size_2D: Optional[int] = None,
197+
node_group_size: Optional[int] = None,
198+
model_class: Type[TestSparseNNBase] = TestSparseNN,
199+
qcomms_config: Optional[QCommsConfig] = None,
200+
apply_optimizer_in_backward_config: Optional[
201+
Dict[str, Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]]
202+
] = None,
203+
variable_batch_size: bool = False,
204+
variable_batch_per_feature: bool = False,
205+
has_weighted_tables: bool = True,
206+
global_constant_batch: bool = False,
207+
pooling: PoolingType = PoolingType.SUM,
208+
data_type: DataType = DataType.FP32,
209+
use_inter_host_allreduce: bool = False,
210+
allow_zero_batch_size: bool = False,
211+
custom_all_reduce: bool = False,
212+
use_offsets: bool = False,
213+
indices_dtype: torch.dtype = torch.int64,
214+
offsets_dtype: torch.dtype = torch.int64,
215+
lengths_dtype: torch.dtype = torch.int64,
216+
sharding_type: ShardingType = None, # pyre-ignore
217+
random_seed: int = 0,
218+
) -> None:
219+
"""
220+
Tests the reshard API with dynamic_sharding_test, which creates 2 identical models
221+
one of which is resharded, and then compares the predictions of the 2 models.
222+
"""
223+
self._build_tables_and_groups(data_type=data_type)
224+
constraints = {}
225+
if sharding_type is not None:
226+
for table in self.tables:
227+
name = table.name
228+
# Default sharding type constraints
229+
constraints[name] = ParameterConstraints(
230+
sharding_types=[sharding_type.value],
231+
)
232+
233+
self._run_multi_process_test(
234+
callable=dynamic_sharding_test,
235+
world_size=world_size,
236+
local_size=local_size,
237+
world_size_2D=world_size_2D,
238+
node_group_size=node_group_size,
239+
model_class=model_class,
240+
tables=self.tables if pooling == PoolingType.SUM else self.mean_tables,
241+
weighted_tables=self.weighted_tables if has_weighted_tables else None,
242+
embedding_groups=self.embedding_groups,
243+
sharders=sharders,
244+
backend=backend,
245+
optim=EmbOptimType.EXACT_SGD,
246+
constraints=constraints,
247+
qcomms_config=qcomms_config,
248+
variable_batch_size=variable_batch_size,
249+
apply_optimizer_in_backward_config=apply_optimizer_in_backward_config,
250+
variable_batch_per_feature=variable_batch_per_feature,
251+
global_constant_batch=global_constant_batch,
252+
use_inter_host_allreduce=use_inter_host_allreduce,
253+
allow_zero_batch_size=allow_zero_batch_size,
254+
custom_all_reduce=custom_all_reduce,
255+
use_offsets=use_offsets,
256+
indices_dtype=indices_dtype,
257+
offsets_dtype=offsets_dtype,
258+
lengths_dtype=lengths_dtype,
259+
random_seed=random_seed,
260+
)
261+
189262

190263
@skip_if_asan_class
191264
class ModelParallelBase(ModelParallelTestShared):

0 commit comments

Comments (0)