Commit d14d02c
Enable Consistent SHA256 Hashing with reduced Planner Context (pytorch#3091)
Summary: Pull Request resolved: pytorch#3091

Even though SHA256 hashing is used, we were still not seeing the expected identical hash generated from the original planner context inputs. The problem is that the Enumerator and StorageReservation objects we were originally trying to hash contain attributes that differ between processes/instances. To resolve this, we reduced the hashing context to only the specific attributes we need from the enumerator and storage reservation, namely:

* The output of enumerator.enumerate(...), which is used as the `search_space` in both the LP and OSS planners.
  * We store the output of enumerate as the attribute `last_stored_search_space`. **This assumes enumerate will have been called before we hash the planner context inputs.**
* The StorageReservation policy (i.e. whether `HeuristicalStorageReservation` or `FixedStorageReservation` is used).
* The StorageReservation initialization attributes:
  * `_percentage`
  * `_parameter_multiplier` for `HeuristicalStorageReservation`
  * `_dense_tensor_estimate` for `HeuristicalStorageReservation`

Created helper functions:

* `hash_planner_context_inputs`, to be called both in planner.hash_planner_context_inputs and at the manifold loading call site (see D75723272).
* `hash_sha256_to_int`, to be passed in as the default hash function in `hash_planner_context_inputs`.

Also created a multiprocess unit test to quickly check that consistent hashes are generated across different processes given the same input.

Differential Revision: D76303748
1 parent 65b82f9 commit d14d02c
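The torchrec/distributed/planner/types.py hunk that adds these two helpers is not shown on this page. The sketch below is only a rough guess at their shape, inferred from the summary above and from the call site in planners.py further down; the exact signatures, the way the reservation attributes are extracted, and the serialization step are all assumptions rather than the committed implementation.

```python
import hashlib
from typing import Any, Callable, List, Optional


def hash_sha256_to_int(hashable_list: List[Any]) -> int:
    """Serialize the inputs, SHA-256 them, and return the digest as an int (sketch)."""
    serialized = str(hashable_list).encode("utf-8")
    return int(hashlib.sha256(serialized).hexdigest(), 16)


def hash_planner_context_inputs(
    topology: Any,
    batch_size: int,
    enumerator: Any,
    storage_reservation: Any,
    constraints: Optional[Any] = None,
    hash_function: Callable[[List[Any]], int] = hash_sha256_to_int,
) -> int:
    """Hash only the reduced, process-stable planner context (sketch)."""
    # Assumes enumerate(...) already ran, so the cached search space exists.
    search_space = enumerator.last_stored_search_space
    assert search_space is not None, "call enumerate(...) before hashing"
    hashable_list = [
        topology,
        batch_size,
        search_space,
        # Reservation policy plus its initialization attributes.
        type(storage_reservation).__name__,
        getattr(storage_reservation, "_percentage", None),
        getattr(storage_reservation, "_parameter_multiplier", None),
        getattr(storage_reservation, "_dense_tensor_estimate", None),
        frozenset(constraints.items()) if constraints else None,
    ]
    return hash_function(hashable_list)
```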

File tree

6 files changed: +237 -35 lines changed

torchrec/distributed/planner/enumerators.py
Lines changed: 24 additions & 0 deletions

```diff
@@ -7,6 +7,7 @@
 
 # pyre-strict
 
+import copy
 import logging
 from typing import Dict, List, Optional, Set, Tuple, Union
 
@@ -102,6 +103,11 @@ def __init__(
             EmbeddingStorageEstimator(topology=topology, constraints=constraints),
         ]
 
+        # Initializing caching for enumerate
+        self._last_stored_search_space: Optional[List[ShardingOption]] = None
+        self._last_stored_module: Optional[nn.Module] = None
+        self._last_stored_sharders: Optional[List[ModuleSharder[nn.Module]]] = None
+
     def enumerate(
         self,
         module: nn.Module,
@@ -118,6 +124,12 @@ def enumerate(
             List[ShardingOption]: valid sharding options with values populated.
         """
 
+        if (
+            self._last_stored_module == module
+            and self._last_stored_sharders == sharders
+        ):
+            return self._last_stored_search_space  # pyre-ignore
+
         self._sharder_map = {
             sharder_name(sharder.module_type): sharder for sharder in sharders
         }
@@ -230,8 +242,20 @@ def enumerate(
 
         self.populate_estimates(sharding_options)
 
+        self._last_stored_module = module
+        self._last_stored_sharders = sharders
+
+        # Caching the search space with a copy of sharding options, to avoid unexpected modifications to the list
+        self._last_stored_search_space = copy.deepcopy(sharding_options)
         return sharding_options
 
+    @property
+    def last_stored_search_space(self) -> Optional[List[ShardingOption]]:
+        # NOTE: This is the last search space stored by enumerate(...); do not use
+        # this field in place of actually calling enumerate(...), as it will vary for each
+        # module/sharders passed in.
+        return self._last_stored_search_space
+
     def populate_estimates(self, sharding_options: List[ShardingOption]) -> None:
         for estimator in self._estimators:
             estimator.estimate(sharding_options, self._sharder_map)
```
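A hedged usage sketch of the caching behavior added above (the table, sharder, and topology setup mirrors the tests later in this commit; the exact values are illustrative):

```python
import torch
from torchrec import EmbeddingBagCollection, EmbeddingBagConfig
from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder
from torchrec.distributed.planner.enumerators import EmbeddingEnumerator
from torchrec.distributed.planner.types import Topology

topology = Topology(local_world_size=8, world_size=1, compute_device="cuda")
enumerator = EmbeddingEnumerator(topology=topology, batch_size=128)

module = EmbeddingBagCollection(
    tables=[
        EmbeddingBagConfig(
            name="table_0",
            embedding_dim=160,
            num_embeddings=10_000,
            feature_names=["f1"],
        )
    ],
    device=torch.device("meta"),  # meta device: we only need the search space
)
sharders = [EmbeddingBagCollectionSharder()]

# First call computes the search space and caches a deep copy of it.
search_space = enumerator.enumerate(module, sharders)

# Subsequent calls with the same module/sharders return the cached copy,
# and the property exposes it for hashing without re-enumerating.
assert enumerator.last_stored_search_space is not None
```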

torchrec/distributed/planner/planners.py
Lines changed: 5 additions & 8 deletions

```diff
@@ -39,6 +39,7 @@
 )
 from torchrec.distributed.planner.types import (
     Enumerator,
+    hash_planner_context_inputs,
     ParameterConstraints,
     Partitioner,
     PerfModel,
@@ -280,25 +281,21 @@ def collective_plan(
             sharders,
         )
 
-    def hash_planner_context_inputs(self) -> str:
+    def hash_planner_context_inputs(self) -> int:
         """
         Generates a hash for all planner inputs except for partitioner, proposer, performance model, and stats.
         These are all the inputs needed to verify whether a previously generated sharding plan is still valid in a new context.
 
         Returns:
             Generates a hash capturing topology, batch size, enumerator, storage reservation, stats and constraints.
         """
-        hashable_list = [
+        return hash_planner_context_inputs(
             self._topology,
             self._batch_size,
             self._enumerator,
             self._storage_reservation,
-            frozenset(self._constraints.items()) if self._constraints else None,
-        ]
-        serialized_list = str(hashable_list).encode("utf-8")
-        hash_object = hashlib.sha256(serialized_list)
-        hash_digest = hash_object.hexdigest()
-        return hash_digest
+            self._constraints,
+        )
 
     def plan(
         self,
```
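Because the method now returns an int computed by the shared helper, the same value can be compared against a hash persisted alongside an earlier plan. A minimal hedged sketch; the `plan_is_reusable` helper and `stored_hash` parameter are illustrative, not part of this commit:

```python
from torchrec.distributed.planner import EmbeddingShardingPlanner


def plan_is_reusable(planner: EmbeddingShardingPlanner, stored_hash: int) -> bool:
    """Return True if a previously generated plan, saved with `stored_hash`,
    is still valid for this planner's current context.

    Assumes the planner's enumerator has already run enumerate(...), since the
    reduced context hashes the cached search space rather than the enumerator
    object itself.
    """
    return planner.hash_planner_context_inputs() == stored_hash
```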

torchrec/distributed/planner/storage_reservations.py
Lines changed: 15 additions & 0 deletions

```diff
@@ -163,6 +163,7 @@ class FixedPercentageStorageReservation(StorageReservation):
     def __init__(self, percentage: float) -> None:
         assert percentage >= 0 and percentage <= 1
         self._percentage: float = percentage
+        self._last_reserved_toplogy: Optional[Topology] = None
 
     def reserve(
         self,
@@ -174,8 +175,14 @@ def reserve(
     ) -> Topology:
         reserved_topology = copy.deepcopy(topology)
         _reserve_storage_percentage(reserved_topology, self._percentage)
+        self._last_reserved_toplogy = reserved_topology
         return reserved_topology
 
+    @property
+    def last_reserved_toplogy(self) -> Optional[Topology]:
+        "Cached value of the most recent output from the reserve() method."
+        return self._last_reserved_toplogy
+
 
 class HeuristicalStorageReservation(StorageReservation):
     """
@@ -206,6 +213,7 @@ def __init__(
 
         self._dense_storage: Optional[Storage] = None
         self._kjt_storage: Optional[Storage] = None
+        self._last_reserved_toplogy: Optional[Topology] = None
 
     def reserve(
         self,
@@ -215,6 +223,7 @@ def reserve(
         sharders: List[ModuleSharder[nn.Module]],
         constraints: Optional[Dict[str, ParameterConstraints]] = None,
     ) -> Topology:
+        # TODO: enable proper caching of topology values through _last_reserved_toplogy
         reserved_topology = copy.deepcopy(topology)
 
         batch_inputs, shardable_modules = _get_batch_inputs_and_shardable_parameters(
@@ -262,8 +271,14 @@ def reserve(
             message=negative_storage_solution,
         )
 
+        self._last_reserved_toplogy = reserved_topology
         return reserved_topology
 
+    @property
+    def last_reserved_toplogy(self) -> Optional[Topology]:
+        "Cached value of the most recent output from the reserve() method."
+        return self._last_reserved_toplogy
+
 
 class InferenceStorageReservation(StorageReservation):
     """
```

torchrec/distributed/planner/tests/test_planners.py
Lines changed: 27 additions & 1 deletion

```diff
@@ -12,7 +12,7 @@
 
 import torch
 from torch import nn
-from torchrec import EmbeddingConfig
+from torchrec import EmbeddingBagCollection, EmbeddingConfig
 from torchrec.distributed.embedding import EmbeddingCollectionSharder
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
 from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder
@@ -306,6 +306,22 @@ def test_passing_info_through_constraints(self) -> None:
 class TestEmbeddingShardingHashPlannerContextInputs(unittest.TestCase):
 
     def setUp(self) -> None:
+        eb_config = EmbeddingBagConfig(
+            name="table_0",
+            embedding_dim=160,
+            num_embeddings=10000,
+            feature_names=["f1"],
+            data_type=DataType.FP16,
+        )
+        module = EmbeddingBagCollection(
+            tables=[eb_config],
+            is_weighted=False,
+            device=torch.device(
+                "meta"
+            ),  # Using meta device for now since only getting search space
+        )
+        sharders = [EmbeddingBagCollectionSharder()]
+
         self.topology = Topology(
             local_world_size=8,
             world_size=1,
@@ -315,10 +331,20 @@ def setUp(self) -> None:
         self.enumerator = EmbeddingEnumerator(
             topology=self.topology, batch_size=self.batch_size
         )
+        self.enumerator.enumerate(module, sharders)  # pyre-ignore
+
         self.storage_reservation = HeuristicalStorageReservation(percentage=0.15)
         self.perf_model = NoopPerfModel(topology=self.topology)
         self.constraints = {"table1": ParameterConstraints()}
 
+        self.storage_reservation.reserve(
+            topology=self.topology,
+            batch_size=self.batch_size,
+            module=module,
+            sharders=sharders,  # pyre-ignore
+            constraints=self.constraints,
+        )
+
     def test_hash_equality(self) -> None:
         planner1 = EmbeddingShardingPlanner(
             topology=self.topology,
```

torchrec/distributed/planner/tests/test_types.py
Lines changed: 85 additions & 1 deletion

```diff
@@ -8,18 +8,30 @@
 # pyre-strict
 
 import unittest
-from typing import cast
+from typing import cast, Dict, Optional
 from unittest.mock import MagicMock
 
 import torch
+from torch import multiprocessing
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
+from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder
+from torchrec.distributed.planner import EmbeddingShardingPlanner
+from torchrec.distributed.planner.enumerators import EmbeddingEnumerator
+from torchrec.distributed.planner.perf_models import NoopPerfModel
+from torchrec.distributed.planner.storage_reservations import (
+    HeuristicalStorageReservation,
+)
 
 from torchrec.distributed.planner.types import (
     ParameterConstraints,
     Shard,
     ShardingOption,
     Topology,
 )
+from torchrec.distributed.test_utils.multi_process import (
+    MultiProcessContext,
+    MultiProcessTestBase,
+)
 from torchrec.distributed.types import (
     BoundsCheckMode,
     CacheAlgorithm,
@@ -348,3 +360,75 @@ def test_hash_inequality(self) -> None:
         self.assertNotEqual(
             hash(pc1), hash(pc2), "Hashes should be different for different instances"
         )
+
+
+def _test_hashing_consistency(
+    rank: int,
+    world_size: int,
+    backend: str,
+    return_hash_dict: Dict[str, int],
+    local_size: Optional[int] = None,
+) -> None:
+    with MultiProcessContext(rank, world_size, backend, local_size) as ctx:
+        topology = Topology(
+            local_world_size=8,
+            world_size=1,
+            compute_device="cuda",
+        )
+        batch_size = 128
+        enumerator = EmbeddingEnumerator(topology=topology, batch_size=batch_size)
+        eb_config = EmbeddingBagConfig(
+            name="table_0",
+            embedding_dim=160,
+            num_embeddings=10000,
+            feature_names=["f1"],
+            data_type=DataType.FP16,
+        )
+        module = EmbeddingBagCollection(
+            tables=[eb_config],
+            is_weighted=False,
+            device=torch.device(
+                "meta"
+            ),  # Using meta device for now since only getting search space
+        )
+        sharders = [EmbeddingBagCollectionSharder()]
+        enumerator.enumerate(module, sharders)  # pyre-ignore
+        storage_reservation = HeuristicalStorageReservation(percentage=0.15)
+        constraints = {"table1": ParameterConstraints()}
+
+        storage_reservation.reserve(
+            topology=topology,
+            batch_size=batch_size,
+            module=module,
+            sharders=sharders,  # pyre-ignore
+            constraints=constraints,
+        )
+        perf_model = NoopPerfModel(topology=topology)
+
+        planner1 = EmbeddingShardingPlanner(
+            topology=topology,
+            batch_size=batch_size,
+            enumerator=enumerator,
+            storage_reservation=storage_reservation,
+            performance_model=perf_model,
+            constraints=constraints,
+        )
+
+        h = planner1.hash_planner_context_inputs()
+        return_hash_dict[str(rank)] = h
+
+
+class TestConsistentHashingBetweenProcesses(MultiProcessTestBase):
+
+    def test_hash_consistency(self) -> None:
+        # planner
+        world_size = 2
+        return_hash_dict = multiprocessing.Manager().dict()
+        self._run_multi_process_test(
+            callable=_test_hashing_consistency,
+            world_size=world_size,
+            backend="nccl" if torch.cuda.is_available() else "gloo",
+            return_hash_dict=return_hash_dict,
+        )
+        hashes = return_hash_dict.values()
+        assert hashes[0] == hashes[1], "hash values are different."
```
