 from torchrec.distributed.embedding_kernel import (
     BaseEmbedding,
     create_virtual_sharded_tensors,
+    create_virtual_table_local_metadata,
     get_state_dict,
 )
 from torchrec.distributed.embedding_types import (
@@ -206,7 +207,9 @@ def _populate_zero_collision_tbe_params(
     bucket_sizes: List[int] = [size for _, _, size in sharded_local_buckets]

     tbe_params["kv_zch_params"] = KVZCHParams(
-        bucket_offsets=bucket_offsets, bucket_sizes=bucket_sizes
+        bucket_offsets=bucket_offsets,
+        bucket_sizes=bucket_sizes,
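+        # NOTE: optimizer state offloading to the KV backend is explicitly disabled for now.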
+        enable_optimizer_offloading=False,
     )


@@ -283,6 +286,53 @@ def __init__( # noqa C901
             table_name_to_weight_count_per_rank
         )

+        # pyre-ignore [33]
+        state: Dict[Any, Any] = {}
+        param_group: Dict[str, Any] = {
+            "params": [],
+            "lr": emb_module.get_learning_rate(),
+        }
+
+        params: Dict[str, Union[torch.Tensor, ShardedTensor]] = {}
+
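+        # Collect the locally materialized weight-id tensors (if any) to pass as
+        # sorted_id_tensor when fetching the per-table optimizer state below.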
+        sorted_id_tensors = (
+            [
+                sharded_t._local_shards[0].tensor
+                for sharded_t in self._sharded_embedding_weight_ids
+            ]
+            if self._sharded_embedding_weight_ids is not None
+            else None
+        )
+
+        all_optimizer_states = emb_module.get_optimizer_state(
+            sorted_id_tensor=sorted_id_tensors
+        )
+        opt_param_list = [param["momentum1"] for param in all_optimizer_states]
+        emb_table_config_copy = copy.deepcopy(self._config.embedding_tables)
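+        # Mark shard placement as CPU on the copied table configs used to build the
+        # virtual sharded tensors for the optimizer state.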
+        for emb_table in emb_table_config_copy:
+            emb_table.local_metadata.placement._device = torch.device("cpu")
+        opt_sharded_t_list = create_virtual_sharded_tensors(
+            emb_table_config_copy, opt_param_list, self._pg
+        )
+
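+        # Register one parameter per table and attach its "momentum1" state under
+        # the key "<table_name>.momentum1".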
+        for (
+            emb_config,
+            sharded_weight,
+            opt_sharded_t,
+        ) in zip(
+            emb_table_config_copy,
+            sharded_embedding_weights_by_table,
+            opt_sharded_t_list,
+        ):
+            param_key = emb_config.name + ".weight"
+            state[sharded_weight] = {}
+            param_group["params"].append(sharded_weight)
+            params[param_key] = sharded_weight
+
+            state[sharded_weight][f"{emb_config.name}.momentum1"] = opt_sharded_t
+
+        super().__init__(params, state, [param_group])
+
     def zero_grad(self, set_to_none: bool = False) -> None:
         # pyre-ignore [16]
         self._emb_module.set_learning_rate(self.param_groups[0]["lr"])
@@ -292,6 +342,61 @@ def step(self, closure: Any = None) -> None:
         # pyre-ignore [16]
         self._emb_module.set_learning_rate(self.param_groups[0]["lr"])

+    def set_sharded_embedding_weight_ids(
+        self, sharded_embedding_weight_ids: Optional[List[ShardedTensor]]
+    ) -> None:
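+        # Stash the weight-id ShardedTensors used when fetching optimizer state from the emb module.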
+        self._sharded_embedding_weight_ids = sharded_embedding_weight_ids
+
+    def _post_state_dict_hook(self, curr_state: Dict[str, Any]) -> None:
+        logger.info("update optimizer state dict in state_dict_post_hook")
+        embedding_weight_ids = (
+            [
+                sharded_t._local_shards[0].tensor
+                for sharded_t in self._sharded_embedding_weight_ids
+            ]
+            if self._sharded_embedding_weight_ids is not None
+            else None
+        )
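+        # Fetch the per-table optimizer state for the locally held weight ids.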
+        all_optimizer_states = self._emb_module.get_optimizer_state(
+            embedding_weight_ids,
+            no_snapshot=False,
+            should_flush=False,  # embedding weights were already flushed, no need to flush again here
+        )
+        emb_table_config_copy = copy.deepcopy(self._config.embedding_tables)
+        for emb_table in emb_table_config_copy:
+            emb_table.local_metadata.placement._device = torch.device("cpu")
+
+        # The order of table_config is deterministic, so use it as the outer loop for a consistent traversal order across ranks.
+        for table_config, opt_states in zip(
+            emb_table_config_copy,
+            all_optimizer_states,
+        ):
+            for key, sharded_t_dict in curr_state.items():
+                # update the zero-collision table's optimizer state
+                if f".{table_config.name}.weight" in key:
+                    for (_, opt_state_t), (sharded_t_k, sharded_t) in zip(
+                        opt_states.items(), sharded_t_dict.items()
+                    ):
+                        logger.info(
+                            f"update optimizer state for table {table_config.name} with state shape {opt_state_t.shape}, rank={self._my_rank}, weight_count_per_rank={self._table_name_to_weight_count_per_rank.get(table_config.name, None)}"
+                        )
+                        sharded_t.local_shards()[0].tensor = opt_state_t
+                        create_virtual_table_local_metadata(
+                            # pyre-ignore [6]
+                            table_config.local_metadata,
+                            opt_state_t,
+                            self._my_rank,
+                        )
+                        for shard in sharded_t.local_shards():
+                            shard.metadata = table_config.local_metadata
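+                        # Rebuild the ShardedTensor from the refreshed local shards and metadata.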
+                        new_sharded_t = ShardedTensor._init_from_local_shards(
+                            sharded_t.local_shards(),
+                            None,
+                            None,
+                            process_group=self._pg,
+                        )
+                        sharded_t_dict[sharded_t_k] = new_sharded_t
+

 class EmbeddingFusedOptimizer(FusedOptimizer):
     def __init__(  # noqa C901
@@ -756,7 +861,6 @@ def _gen_named_parameters_by_table_fused(
         table_count = table_name_to_count.pop(table_name)
         if emb_module.weights_precision == SparseType.INT8:
             dim += emb_module.int8_emb_row_dim_offset
-        # pyre-fixme[29]: `Union[(self: TensorBase, indices: Union[None, _NestedSeque...
         offset = emb_module.weights_physical_offsets[t_idx]
         weights: torch.Tensor
         if location == EmbeddingLocation.DEVICE.value:
@@ -1330,7 +1434,7 @@ def get_named_split_embedding_weights_snapshot(self, prefix: str = "") -> Iterat
             return

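+        # Request a flush so the snapshot below reflects the latest cached embedding updates.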
         pmt_list, weight_ids_list, bucket_cnt_list = self.split_embedding_weights(
-            no_snapshot=False
+            no_snapshot=False, should_flush=True
         )
         emb_table_config_copy = copy.deepcopy(self._config.embedding_tables)
         for emb_table in emb_table_config_copy:
@@ -1381,12 +1485,16 @@ def purge(self) -> None:
         self.emb_module.lxu_cache_state.fill_(-1)

     # pyre-ignore [15]
-    def split_embedding_weights(self, no_snapshot: bool = True) -> Tuple[
+    def split_embedding_weights(
+        self, no_snapshot: bool = True, should_flush: bool = False
+    ) -> Tuple[
         List[PartiallyMaterializedTensor],
         Optional[List[torch.Tensor]],
         Optional[List[torch.Tensor]],
     ]:
-        return self.emb_module.split_embedding_weights(no_snapshot)
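+        # should_flush is forwarded to the backing emb_module so callers can request
+        # a flush before reading the split weights.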
+        return self.emb_module.split_embedding_weights(
+            no_snapshot, should_flush=should_flush
+        )

     def forward(self, features: KeyedJaggedTensor) -> torch.Tensor:
         # reset split weights during training