skip empty rank lookup (#2293)

gnahzg · PaulZhang12 · commit fade86a9a0cc · 2024-08-19T15:02:25.000Z
Summary: Pull Request resolved: #2293 We have observed an issue when running on the predictor side with empty ranks (ranks without TBEs). In this diff, we skip the creation of lookups for empty ranks to remove all invalid operations for those ranks. Reviewed By: dstaay-fb Differential Revision: D61020328 fbshipit-source-id: 2278a1f13d2981030a01919c080d7ef010bfa010
diff --git a/torchrec/distributed/dist_data.py b/torchrec/distributed/dist_data.py
@@ -1050,7 +1050,7 @@ def forward(self, tensors: List[torch.Tensor]) -> torch.Tensor:
         Returns:
             Awaitable[torch.Tensor]: awaitable of the merged embeddings.
         """
-        assert len(tensors) == self._world_size
+        assert len(tensors) <= self._world_size
 
         is_target_device_cpu: bool = self._device.type == "cpu"
 
diff --git a/torchrec/distributed/embedding_lookup.py b/torchrec/distributed/embedding_lookup.py
@@ -1014,21 +1014,43 @@ def __init__(
                 "meta" if device is not None and device.type == "meta" else "cuda"
             )
 
+        self._is_empty_rank: List[bool] = []
         for rank in range(world_size):
-            self._embedding_lookups_per_rank.append(
-                # TODO add position weighted module support
-                MetaInferGroupedPooledEmbeddingsLookup(
-                    grouped_configs=grouped_configs_per_rank[rank],
-                    device=rank_device(device_type, rank),
-                    fused_params=fused_params,
+            empty_rank = len(grouped_configs_per_rank[rank]) == 0
+            self._is_empty_rank.append(empty_rank)
+            if not empty_rank:
+                self._embedding_lookups_per_rank.append(
+                    # TODO add position weighted module support
+                    MetaInferGroupedPooledEmbeddingsLookup(
+                        grouped_configs=grouped_configs_per_rank[rank],
+                        device=rank_device(device_type, rank),
+                        fused_params=fused_params,
+                    )
                 )
-            )
 
     def get_tbes_to_register(
         self,
     ) -> Dict[IntNBitTableBatchedEmbeddingBagsCodegen, GroupedEmbeddingConfig]:
         return get_tbes_to_register_from_iterable(self._embedding_lookups_per_rank)
 
+    def forward(
+        self,
+        input_dist_outputs: InputDistOutputs,
+    ) -> List[torch.Tensor]:
+        embeddings: List[torch.Tensor] = []
+        sparse_features = [
+            input_dist_outputs.features[i]
+            for i, is_empty in enumerate(self._is_empty_rank)
+            if not is_empty
+        ]
+        # syntax for torchscript
+        for i, embedding_lookup in enumerate(
+            self._embedding_lookups_per_rank,
+        ):
+            sparse_features_rank = sparse_features[i]
+            embeddings.append(embedding_lookup.forward(sparse_features_rank))
+        return embeddings
+
 
 class InferGroupedEmbeddingsLookup(
     InferGroupedLookupMixin,
diff --git a/torchrec/distributed/test_utils/infer_utils.py b/torchrec/distributed/test_utils/infer_utils.py
@@ -825,11 +825,15 @@ def shard_qebc(
     expected_shards: Optional[List[List[Tuple[Tuple[int, int, int, int], str]]]] = None,
     plan: Optional[ShardingPlan] = None,
     ebc_fqn: str = "_module.sparse.ebc",
+    shard_score_ebc: bool = False,
 ) -> torch.nn.Module:
     sharder = TestQuantEBCSharder(
         sharding_type=sharding_type.value,
         kernel_type=EmbeddingComputeKernel.QUANT.value,
-        shardable_params=[table.name for table in mi.tables],
+        shardable_params=(
+            [table.name for table in mi.tables]
+            + ([table.name for table in mi.weighted_tables] if shard_score_ebc else [])
+        ),
     )
     if not plan:
         # pyre-ignore
diff --git a/torchrec/distributed/tests/test_infer_shardings.py b/torchrec/distributed/tests/test_infer_shardings.py
@@ -204,6 +204,79 @@ def test_tw(self, weight_dtype: torch.dtype, device_type: str) -> None:
             ShardingType.TABLE_WISE.value,
         )
 
+    @unittest.skipIf(
+        torch.cuda.device_count() <= 1,
+        "Not enough GPUs available",
+    )
+    # pyre-ignore
+    @given(
+        weight_dtype=st.sampled_from([torch.qint8]),
+        device_type=st.sampled_from(["cuda"]),
+    )
+    @settings(max_examples=4, deadline=None)
+    def test_tw_ebc_full_rank_weighted_ebc_with_empty_rank(
+        self, weight_dtype: torch.dtype, device_type: str
+    ) -> None:
+        set_propogate_device(True)
+        num_embeddings = 256
+        emb_dim = 16
+        world_size = 2
+        batch_size = 4
+        local_device = torch.device(f"{device_type}:0")
+        mi = create_test_model(
+            num_embeddings,
+            emb_dim,
+            world_size,
+            batch_size,
+            dense_device=local_device,
+            sparse_device=local_device,
+            quant_state_dict_split_scale_bias=True,
+            weight_dtype=weight_dtype,
+            num_features=6,  # 6 sparse features on ebc
+            num_weighted_features=1,  # only 1 weighted sparse feature on weighted_ebc
+        )
+
+        non_sharded_model = mi.quant_model
+
+        sharded_model = shard_qebc(
+            mi,
+            sharding_type=ShardingType.TABLE_WISE,
+            device=local_device,
+            expected_shards=None,
+            shard_score_ebc=True,
+        )
+
+        self.assertEqual(
+            len(
+                sharded_model._module.sparse.ebc._lookups[0]._embedding_lookups_per_rank
+            ),
+            2,
+        )
+        self.assertEqual(
+            len(
+                sharded_model._module.sparse.weighted_ebc._lookups[
+                    0
+                ]._embedding_lookups_per_rank
+            ),
+            1,
+        )
+
+        inputs = [
+            model_input_to_forward_args(inp.to(local_device))
+            for inp in prep_inputs(mi, world_size, batch_size, long_indices=False)
+        ]
+
+        sharded_model.load_state_dict(non_sharded_model.state_dict())
+
+        sharded_output = sharded_model(*inputs[0])
+        non_sharded_output = non_sharded_model(*inputs[0])
+        assert_close(sharded_output, non_sharded_output)
+
+        gm: torch.fx.GraphModule = symbolic_trace(sharded_model)
+        gm_script = torch.jit.script(gm)
+        gm_script_output = gm_script(*inputs[0])
+        assert_close(sharded_output, gm_script_output)
+
     @unittest.skipIf(
         torch.cuda.device_count() <= 1,
         "Not enough GPUs available",