Commit 5cea44a

Ali-Tehrani authored and facebook-github-bot committed
Set intra_group_size from env var inside comm.py. (meta-pytorch#3697)
Summary:

Context
-------
TorchRec comms needs a way to obtain the pod size (the topology-domain multiple) and the total number of process groups in a topology group for TWRW/Grid sharding. We obtain the number of intra-node processes within a pod by reading `TOPOLOGY_DOMAIN_MULTIPLE` from the environment variables (see diff stack).

Implementation
--------------
- Created a `get_intra_group_size` function that obtains the intra-node group size from the environment variable, defaulting to the usual `get_local_size` when the variable is not set.
- Updated `intra_and_cross_node_pg` to use `get_intra_group_size` instead.

Differential Revision: D91617889
1 parent 302da75 commit 5cea44a
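
The arithmetic behind the change is small enough to show in isolation. The sketch below is not part of the commit; it only assumes a standard `torchrun`-style environment where `LOCAL_WORLD_SIZE` is set, and shows how `TOPOLOGY_DOMAIN_MULTIPLE` combines with the per-host size to give the topology-group size, falling back to the local world size when the variable is absent.

```python
# Sketch only: mirrors the fallback and multiplication performed by the new
# helpers in comm.py, using plain os.environ instead of torchrec internals.
import os

# Hypothetical 32-rank job: 8 GPUs per host, 2 hosts per pod/domain.
os.environ["LOCAL_WORLD_SIZE"] = "8"
os.environ["TOPOLOGY_DOMAIN_MULTIPLE"] = "2"

local_world_size = int(os.environ["LOCAL_WORLD_SIZE"])
domain_multiple = int(os.environ.get("TOPOLOGY_DOMAIN_MULTIPLE", "-1"))

# Fall back to the local world size when TOPOLOGY_DOMAIN_MULTIPLE is unset.
topology_group_size = (
    domain_multiple * local_world_size if domain_multiple > 0 else local_world_size
)

world_size = 32
assert world_size % topology_group_size == 0
print(topology_group_size)                # 16 ranks per topology group
print(world_size // topology_group_size)  # 2 topology groups
```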

File tree

4 files changed: +77 -6 lines changed

torchrec/distributed/comm.py

Lines changed: 62 additions & 4 deletions
@@ -115,12 +115,68 @@ def get_num_groups(world_size: Optional[int] = None) -> int:
     return world_size // get_local_size(world_size)
 
 
+def get_topology_domain_multiple() -> Optional[int]:
+    """The number of hosts/servers/nodes per pod/domain."""
+    topology_domain_multiple = _env2int(
+        [
+            "TOPOLOGY_DOMAIN_MULTIPLE",
+        ],
+        -1,
+    )
+    if topology_domain_multiple == -1:
+        return None
+    return topology_domain_multiple
+
+
+def get_topology_group_world_size(world_size: Optional[int] = None) -> int:
+    """
+    Gets the topology group world size, the total number of processes linked within a topology group.
+
+    This is the largest number of processes linked together by high-bandwidth communication.
+    If it isn't specified, it falls back to LOCAL_WORLD_SIZE.
+    """
+    topology_domain_multiple = get_topology_domain_multiple()
+    local_world_size = get_local_size(world_size)
+
+    if topology_domain_multiple is None:
+        logger.warn(
+            "Could not determine TOPOLOGY_DOMAIN_MULTIPLE from environment,"
+            " utilizing LOCAL_WORLD_SIZE instead."
+        )
+        return local_world_size
+
+    # Total number of GPUs in a domain = topology_domain_multiple * number of GPUs per node
+    total_numb_proc = topology_domain_multiple * local_world_size
+    world_size = dist.get_world_size()
+    if world_size % total_numb_proc != 0:
+        raise ValueError(
+            f"World size {world_size} is not a multiple of the topology group: {total_numb_proc}"
+        )
+    return total_numb_proc
+
+
 def intra_and_cross_node_pg(
     device: Optional[torch.device] = None,
     backend: Optional[str] = None,
 ) -> Tuple[Optional[dist.ProcessGroup], Optional[dist.ProcessGroup]]:
     """
     Creates sub process groups (intra and cross node)
+
+    e.g. world_size = 12 needs to be split into groups of size `local_size = 6`
+    process groups = [0, 1, 2, ... 11]
+
+    intra-group:
+        [0] -> [0, 1, .., 5]
+        [1] -> [0, 1, .., 5]
+        ...
+        [6] -> [6, ..., 11]
+        ...
+        [11] -> [6, ..., 11]
+    cross-group:
+        [0] -> [[0, 6]]
+        [1] -> [[1, 7]]
+        ...
+        [5] -> [[5, 11]]
     """
     if device is not None and device.type == "meta":
         return None, None
@@ -130,10 +186,12 @@ def intra_and_cross_node_pg(
 
     my_size = dist.get_world_size()
     my_rank = dist.get_rank()
-    my_local_rank = get_local_rank(my_size, my_rank)
-    local_size = get_local_size(my_size)
-    my_group_rank = get_group_rank(my_size, my_rank)
-    group_count = get_num_groups(my_size)
+    local_size = get_topology_group_world_size(my_size)
+    # TODO: Alireza look into incorporating topology group WS in
+    # get_group_rank, get_num_groups, get_local_rank
+    my_group_rank = my_rank // local_size
+    group_count = my_size // local_size
+    my_local_rank = my_rank % local_size  # Not the same as the actual local_rank
     if backend is None:
         backend = dist.get_backend()
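
To make the docstring example above concrete, here is a small standalone sketch (plain Python, no process groups are created) of the membership math the updated function relies on:

```python
# Reproduces the intra-/cross-group membership from the docstring example:
# world_size = 12 split into topology groups of local_size = 6.
world_size = 12
local_size = 6  # in the real code this comes from get_topology_group_world_size()
group_count = world_size // local_size

# Ranks within the same topology group communicate in the intra-node group.
intra_groups = [
    list(range(g * local_size, (g + 1) * local_size)) for g in range(group_count)
]
# Ranks with the same local index across groups form the cross-node group.
cross_groups = [
    [local_rank + g * local_size for g in range(group_count)]
    for local_rank in range(local_size)
]

print(intra_groups)  # [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
print(cross_groups)  # [[0, 6], [1, 7], [2, 8], [3, 9], [4, 10], [5, 11]]

# Per-rank indices computed by the new code path:
rank = 7
my_group_rank = rank // local_size  # 1
my_local_rank = rank % local_size   # 1 (not the same as the device-local rank)
```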

torchrec/distributed/test_utils/test_model_parallel.py

Lines changed: 3 additions & 0 deletions
@@ -140,6 +140,7 @@ def _test_sharding(
         backend: str = "gloo",
         world_size: int = 2,
         local_size: Optional[int] = None,
+        pod_size: Optional[int] = None,
         world_size_2D: Optional[int] = None,
         node_group_size: Optional[int] = None,
         constraints: Optional[Dict[str, ParameterConstraints]] = None,
@@ -173,6 +174,7 @@ def _test_sharding(
                 rank=0,
                 world_size=world_size,
                 local_size=local_size,
+                pod_size=pod_size,
                 world_size_2D=world_size_2D,
                 node_group_size=node_group_size,
                 model_class=model_class,  # pyre-ignore[6]
@@ -205,6 +207,7 @@ def _test_sharding(
             callable=sharding_single_rank_test,
             world_size=world_size,
             local_size=local_size,
+            pod_size=pod_size,
             world_size_2D=world_size_2D,
             node_group_size=node_group_size,
             model_class=model_class,

torchrec/distributed/test_utils/test_sharding.py

Lines changed: 4 additions & 1 deletion
@@ -770,6 +770,7 @@ def sharding_single_rank_test_single_process(
     weighted_tables: Optional[List[EmbeddingTableConfig]] = None,
     constraints: Optional[Dict[str, ParameterConstraints]] = None,
     local_size: Optional[int] = None,
+    pod_size: Optional[int] = None,
     qcomms_config: Optional[QCommsConfig] = None,
     apply_optimizer_in_backward_config: Optional[
         Dict[str, Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]]
@@ -869,11 +870,11 @@ def sharding_single_rank_test_single_process(
             world_size=world_size_2D if world_size_2D else world_size,
             compute_device=device.type,
             local_world_size=node_group_size if node_group_size else local_size,
+            pod_size=pod_size,
         ),
         constraints=constraints,
     )
     plan: ShardingPlan = planner.collective_plan(local_model, sharders, pg)
-
     if submodule_configs is not None:
         # Dynamic 2D parallel, create a new plan for each submodule
         for config in submodule_configs:
@@ -1057,6 +1058,7 @@ def sharding_single_rank_test(
     weighted_tables: Optional[List[EmbeddingTableConfig]] = None,
     constraints: Optional[Dict[str, ParameterConstraints]] = None,
     local_size: Optional[int] = None,
+    pod_size: Optional[int] = None,
     qcomms_config: Optional[QCommsConfig] = None,
     apply_optimizer_in_backward_config: Optional[
         Dict[str, Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]]
@@ -1098,6 +1100,7 @@ def sharding_single_rank_test(
         weighted_tables=weighted_tables,
         constraints=constraints,
         local_size=local_size,
+        pod_size=pod_size,
         qcomms_config=qcomms_config,
         apply_optimizer_in_backward_config=apply_optimizer_in_backward_config,
         variable_batch_size=variable_batch_size,
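
The functional change in this file is threading `pod_size` into the planner's `Topology`. As a hedged, standalone sketch of that construction (the `pod_size` keyword on `Topology` is introduced elsewhere in this diff stack, so treat it as an assumption; the other values are hypothetical):

```python
from torchrec.distributed.planner.types import Topology

# Hypothetical configuration: 4 ranks, 2 GPUs per host, 1 host per pod.
topology = Topology(
    world_size=4,
    compute_device="cuda",
    local_world_size=2,
    pod_size=1,  # assumed keyword added elsewhere in this diff stack
)
```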

torchrec/distributed/tests/test_model_parallel_hierarchical.py

Lines changed: 8 additions & 1 deletion
@@ -60,6 +60,7 @@ class ModelParallelHierarchicalTest(ModelParallelTestShared):
                 EmbeddingComputeKernel.FUSED.value,
             ]
         ),
+        topology_domain=st.sampled_from([None, 1]),
         local_size=st.sampled_from([2]),
         qcomms_config=st.sampled_from(
             [
@@ -92,6 +93,7 @@ def test_sharding_nccl_twrw(
         sharder_type: str,
         sharding_type: str,
         kernel_type: str,
+        topology_domain: int,
         local_size: int,
         qcomms_config: Optional[QCommsConfig],
         apply_optimizer_in_backward_config: Optional[
@@ -111,6 +113,10 @@ def test_sharding_nccl_twrw(
         )
         # Make sure detail debug will work with non-even collective
         os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
+        world_size = 4
+        if topology_domain:
+            # Need this to test topology group for TWRW
+            os.environ["TOPOLOGY_DOMAIN_MULTIPLE"] = str(topology_domain)
 
         self._test_sharding(
             # pyre-ignore[6]
@@ -123,8 +129,9 @@ def test_sharding_nccl_twrw(
                     device=torch.device("cuda"),
                 ),
             ],
+            pod_size=topology_domain,
             backend="nccl",
-            world_size=4,
+            world_size=world_size,
             local_size=local_size,
             qcomms_config=qcomms_config,
             apply_optimizer_in_backward_config=apply_optimizer_in_backward_config,
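
For reference, the parameter combinations this test samples map onto group sizes as follows (a sketch that reuses the fallback logic of `get_topology_group_world_size` shown above):

```python
# Sketch only: group sizing implied by the test's sampled parameters.
world_size = 4
local_size = 2
for topology_domain in (None, 1):  # values sampled via st.sampled_from([None, 1])
    topology_group_size = (
        topology_domain * local_size if topology_domain else local_size
    )
    assert world_size % topology_group_size == 0
    print(topology_domain, topology_group_size, world_size // topology_group_size)
# Both settings give topology groups of size 2 (two groups in total), so TWRW
# is exercised both with and without TOPOLOGY_DOMAIN_MULTIPLE in the environment.
```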
