
Commit 5ee179c

TroyGarden authored and facebook-github-bot committed
fix flaky test due to input_kjt.weight dtype (#2763)
Summary:
Pull Request resolved: #2763

# context
* The [test_model_parallel_nccl](https://fb.workplace.com/groups/970281557043698/posts/1863456557726189/?comment_id=1867254224013089) test has been reported to be flaky: [paste](https://www.internalfb.com/intern/everpaste/?color=0&handle=GJBrgxaEWkfR-ycEAP_fNV5sl_l1br0LAAAz)
* After an in-depth investigation, the root cause is that the dtype of the generated input KJT._weights is always `torch.float` (i.e., `torch.float32`), while the test embedding table's dtype can be `DataType.FP16` (i.e., `torch.float16`).

# changes
* Convert the input_kjt._weights dtype to be consistent with `EmbeddingBag.weight.dtype` in the (unsharded) EBC.

Reviewed By: dstaay-fb

Differential Revision: D70126859

fbshipit-source-id: 52fc46ced5a3119f168dc4e41eff949ed4a9ec66
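To make the root cause concrete: PyTorch's `nn.EmbeddingBag` requires `per_sample_weights` to match its weight dtype, so an FP16 table combined with the always-FP32 generated weights fails at forward time. A minimal standalone sketch of the failure mode (an illustration, not code from this commit; half-precision `EmbeddingBag` on CPU may additionally require a recent PyTorch build):

    import torch

    # hypothetical repro mirroring the flaky test: FP16 table, FP32 per-sample weights
    bag = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=4, mode="sum").half()
    values = torch.tensor([1, 2, 3])
    offsets = torch.tensor([0])
    kjt_weights = torch.rand(3)  # always torch.float32, like the generated KJT._weights

    try:
        bag(values, offsets, per_sample_weights=kjt_weights)  # Half weight vs. Float weights
    except RuntimeError as err:
        print(f"dtype mismatch: {err}")

    # the fix: cast the per-sample weights to the bag's weight dtype
    out = bag(values, offsets, per_sample_weights=kjt_weights.to(bag.weight.dtype))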
1 parent ac739f4 commit 5ee179c

3 files changed: +24 −7 lines changed

torchrec/distributed/test_utils/test_model.py

Lines changed: 1 addition & 2 deletions
@@ -223,8 +223,7 @@ def _validate_pooling_factor(
         else:
             raise ValueError(f"For IdList features, unknown input type {input_type}")
 
-        for idx in range(len(idscore_ind_ranges)):
-            ind_range = idscore_ind_ranges[idx]
+        for idx, ind_range in enumerate(idscore_ind_ranges):
             lengths_ = torch.abs(
                 torch.randn(batch_size * world_size, device=device)
                 + (
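The hunk above is a behavior-preserving refactor; a tiny sketch of the equivalence (placeholder values, not from the test):

    idscore_ind_ranges = [(0, 10), (0, 5)]  # hypothetical placeholder values

    # before: index first, then an explicit lookup
    for idx in range(len(idscore_ind_ranges)):
        ind_range = idscore_ind_ranges[idx]
        print(idx, ind_range)

    # after: enumerate yields the index and the element in one step
    for idx, ind_range in enumerate(idscore_ind_ranges):
        print(idx, ind_range)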

torchrec/distributed/test_utils/test_sharding.py

Lines changed: 18 additions & 4 deletions
@@ -59,7 +59,11 @@
     ShardingPlan,
     ShardingType,
 )
-from torchrec.modules.embedding_configs import BaseEmbeddingConfig, EmbeddingBagConfig
+from torchrec.modules.embedding_configs import (
+    BaseEmbeddingConfig,
+    DataType,
+    EmbeddingBagConfig,
+)
 from torchrec.optim.keyed import CombinedOptimizer, KeyedOptimizerWrapper
 from torchrec.optim.optimizers import in_backward_optimizer_filter

@@ -520,9 +524,7 @@ def _custom_hook(input: List[torch.Tensor]) -> None:
     )
 
     # Compare predictions of sharded vs unsharded models.
-    if qcomms_config is None:
-        torch.testing.assert_close(global_pred, torch.cat(all_local_pred))
-    else:
+    if qcomms_config is not None:
         # With quantized comms, we can relax constraints a bit
         rtol = 0.003
         if CommType.FP8 in [
@@ -534,6 +536,18 @@ def _custom_hook(input: List[torch.Tensor]) -> None:
         torch.testing.assert_close(
             global_pred, torch.cat(all_local_pred), rtol=rtol, atol=atol
         )
+    elif (
+        weighted_tables is not None
+        and weighted_tables[0].data_type == DataType.FP16
+    ):  # https://www.internalfb.com/intern/diffing/?paste_number=1740410921
+        torch.testing.assert_close(
+            global_pred,
+            torch.cat(all_local_pred),
+            atol=1e-4,  # relaxed atol due to FP16 in weights
+            rtol=1e-4,  # relaxed rtol due to FP16 in weights
+        )
+    else:
+        torch.testing.assert_close(global_pred, torch.cat(all_local_pred))
 
 
 def create_device_mesh_for_2D(
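For intuition on the relaxed 1e-4 tolerances: merely rounding values through FP16 introduces per-element errors far above `torch.testing.assert_close`'s float32 defaults (rtol=1.3e-6, atol=1e-5). A quick illustration (not part of the commit):

    import torch

    # FP16 keeps a 10-bit mantissa, so values in [0, 1) lose up to ~2e-4
    # when round-tripped -- well above the float32 default tolerances
    w = torch.rand(100_000)
    print((w - w.half().float()).abs().max())  # on the order of 1e-4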

torchrec/modules/embedding_modules.py

Lines changed: 5 additions & 1 deletion
@@ -248,7 +248,11 @@ def forward(
             res = embedding_bag(
                 input=f.values(),
                 offsets=f.offsets(),
-                per_sample_weights=f.weights() if self._is_weighted else None,
+                per_sample_weights=(
+                    f.weights().to(embedding_bag.weight.dtype)
+                    if self._is_weighted
+                    else None
+                ),
             ).float()
             pooled_embeddings.append(res)
         return KeyedTensor(
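With the cast in place, a weighted (unsharded) EBC whose table is configured as `DataType.FP16` accepts the usual float32 KJT weights. A hedged end-to-end sketch (table and feature names are made up; torchrec APIs assumed from the imports in this commit):

    import torch
    from torchrec.modules.embedding_configs import DataType, EmbeddingBagConfig
    from torchrec.modules.embedding_modules import EmbeddingBagCollection
    from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

    ebc = EmbeddingBagCollection(
        tables=[
            EmbeddingBagConfig(
                name="t1",  # hypothetical table name
                embedding_dim=8,
                num_embeddings=100,
                feature_names=["f1"],  # hypothetical feature name
                data_type=DataType.FP16,  # table weight stored as torch.float16
            )
        ],
        is_weighted=True,
    )

    kjt = KeyedJaggedTensor.from_lengths_sync(
        keys=["f1"],
        values=torch.tensor([1, 2, 3]),
        lengths=torch.tensor([2, 1]),
        weights=torch.rand(3),  # torch.float32; forward now casts it to FP16
    )
    print(ebc(kjt)["f1"])  # no dtype mismatch after this fix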
