
Commit 7fbc766

aliafzal authored and facebook-github-bot committed
Delta tracker DMP integration (pytorch#3064)
Summary:
Pull Request resolved: pytorch#3064

## This Diff

Adds ModelDeltaTracker integration with DMP (DistributedModelParallel) and sharded modules. This integration enables tracking of embedding IDs, embeddings, and optimizer states during model execution, which is particularly useful for online training scenarios.

### Key Components:

**ModelTrackerConfig Integration**:
* Added a ModelTrackerConfig parameter to the DMP constructor
* When provided, automatically initializes ModelDeltaTracker
* Configurable options include tracking_mode, delete_on_read, auto_compact, and fqns_to_skip

**Custom Callables for Tracking**:
* Added a custom post_lookup_tracker_fn in ShardedModule to capture IDs and embeddings after lookup operations. This provides ID/state tracking natively in TorchRec without registering any nn.Module-specific hooks.
* Added post_odist_tracker_fn for auto-compaction of tracked data. This custom callable provides native support for overlapping compaction with odist.
* Implemented pre_forward callables in DMP for operations like batch index incrementation

**Model Parallel API Enhancements**:
* Added a `get_model_tracker()` method to DistributedModelParallel for direct access to the ModelDeltaTracker instance. This API gives the flexibility to integrate the model tracker into the required components directly, without needing to access the dmp_module.
* Added a `get_delta()` method as a convenience API to retrieve delta rows from the dmp_module.

**Embedding Module Changes**:
* Enhanced ShardedEmbeddingBag and ShardedEmbedding to support tracking callables
* Added callable registration methods in embedding modules
* Implemented tracking support for different optimizer states (momentum, Adam states)

## ModelDeltaTracker Context

ModelDeltaTracker is a utility for tracking and retrieving unique IDs and their corresponding embeddings or states from embedding modules in a model built with TorchRec. It is particularly useful for:
1. Identifying which embedding rows were accessed during model execution
2. Retrieving the latest delta or unique rows for a model
3. Computing top-k changed embeddings
4. Supporting streaming of updated embeddings between systems during online training

For more details see diff D75853147 or PR pytorch#3057

Differential Revision: D76202371
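As a rough usage sketch of the flow described above (hedged: `build_model()` and `batch` are hypothetical placeholders, only the config fields named in this summary are shown, and their defaults may differ):

```python
import torch.nn as nn
from torchrec.distributed.model_parallel import DistributedModelParallel
from torchrec.distributed.model_tracker.types import ModelTrackerConfig, TrackingMode

# `build_model()` and `batch` are hypothetical placeholders for a real TorchRec
# model (containing e.g. an EmbeddingBagCollection) and an input batch.
model: nn.Module = build_model()

dmp = DistributedModelParallel(
    module=model,
    model_tracker_config=ModelTrackerConfig(
        tracking_mode=TrackingMode.ID_ONLY,  # or TrackingMode.EMBEDDING
        delete_on_read=True,
        auto_compact=True,
    ),
)

out = dmp(batch)  # lookups in this step are recorded via the registered callables

delta_rows = dmp.get_delta()       # Dict[str, DeltaRows] keyed by module FQN
tracker = dmp.get_model_tracker()  # direct handle on the ModelDeltaTracker
```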
1 parent be4e6d7 commit 7fbc766

File tree

8 files changed (+895, -49 lines)

torchrec/distributed/embedding.py

Lines changed: 4 additions & 0 deletions
@@ -1514,13 +1514,17 @@ def compute_and_output_dist(
                 EmbeddingEvent.LOOKUP, self._module_fqn, sharding_type
             ):
                 embs = lookup(features)
+                if self.post_lookup_tracker_fn is not None:
+                    self.post_lookup_tracker_fn(features, embs)
 
             with maybe_annotate_embedding_event(
                 EmbeddingEvent.OUTPUT_DIST, self._module_fqn, sharding_type
             ):
                 awaitables_per_sharding.append(
                     odist(embs.view(-1, embedding_dim), sharding_ctx)
                 )
+                if self.post_odist_tracker_fn is not None:
+                    self.post_odist_tracker_fn()
 
             features_before_all2all_per_sharding.append(
                 # pyre-fixme[6]: For 1st argument expected `KeyedJaggedTensor` but

torchrec/distributed/embedding_types.py

Lines changed: 41 additions & 0 deletions
@@ -9,10 +9,12 @@
 
 import abc
 import copy
+import logging as logger
 from dataclasses import dataclass
 from enum import Enum, unique
 from typing import (
     Any,
+    Callable,
     Dict,
     Generic,
     Iterable,
@@ -370,6 +372,10 @@ def __init__(
         self._input_dists: List[nn.Module] = []
         self._lookups: List[nn.Module] = []
         self._output_dists: List[nn.Module] = []
+        self.post_lookup_tracker_fn: Optional[
+            Callable[[KeyedJaggedTensor, torch.Tensor], None]
+        ] = None
+        self.post_odist_tracker_fn: Optional[Callable[..., None]] = None
 
     def prefetch(
         self,
@@ -418,6 +424,41 @@ def train(self, mode: bool = True):  # pyre-ignore[3]
 
         return self
 
+    def register_post_lookup_tracker_fn(
+        self,
+        record_fn: Callable[[KeyedJaggedTensor, torch.Tensor], None],
+    ) -> None:
+        """
+        Register a function to be called after lookup is done. This is used for
+        tracking the lookup results and optimizer states.
+
+        Args:
+            record_fn (Callable[[KeyedJaggedTensor, torch.Tensor], None]): A custom record function to be called after lookup is done.
+
+        """
+        if self.post_lookup_tracker_fn is not None:
+            logger.warning(
+                "[ModelDeltaTracker] Custom record function already defined, overriding with new callable"
+            )
+        self.post_lookup_tracker_fn = record_fn
+
+    def register_post_odist_tracker_fn(
+        self,
+        record_fn: Callable[..., None],
+    ) -> None:
+        """
+        Register a function to be called after the odist awaitable is registered.
+
+        Args:
+            record_fn (Callable[..., None]): A custom record function to be called after odist is registered.
+
+        """
+        if self.post_odist_tracker_fn is not None:
+            logger.warning(
+                "[ModelDeltaTracker] Compaction function already defined, overriding with new callable"
+            )
+        self.post_odist_tracker_fn = record_fn
+
     @property
     def unsharded_module_type(self) -> Type[nn.Module]:
         """

torchrec/distributed/embeddingbag.py

Lines changed: 4 additions & 0 deletions
@@ -1458,13 +1458,17 @@ def compute_and_output_dist(
                 sharding_type,
             ):
                 embs = lookup(features)
+                if self.post_lookup_tracker_fn is not None:
+                    self.post_lookup_tracker_fn(features, embs)
 
             with maybe_annotate_embedding_event(
                 EmbeddingEvent.OUTPUT_DIST,
                 self._module_fqn,
                 sharding_type,
             ):
                 awaitables.append(dist(embs, sharding_context))
+                if self.post_odist_tracker_fn is not None:
+                    self.post_odist_tracker_fn()
 
             if sharding_context:
                 batch_size_per_feature_pre_a2a.extend(

torchrec/distributed/model_parallel.py

Lines changed: 43 additions & 0 deletions
@@ -29,6 +29,8 @@
 from torch.nn.modules.module import _IncompatibleKeys
 from torch.nn.parallel import DistributedDataParallel
 from torchrec.distributed.comm import get_local_size
+from torchrec.distributed.model_tracker.model_delta_tracker import ModelDeltaTracker
+from torchrec.distributed.model_tracker.types import DeltaRows, ModelTrackerConfig
 
 from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
 from torchrec.distributed.sharding_plan import get_default_sharders
@@ -208,6 +210,7 @@ class DistributedModelParallel(nn.Module, FusedOptimizerModule):
         init_parameters (bool): initialize parameters for modules still on meta device.
         data_parallel_wrapper (Optional[DataParallelWrapper]): custom wrapper for data
             parallel modules.
+        model_tracker_config (Optional[ModelTrackerConfig]): config for model tracker.
 
     Example::
 
@@ -234,6 +237,7 @@ def __init__(
        init_data_parallel: bool = True,
        init_parameters: bool = True,
        data_parallel_wrapper: Optional[DataParallelWrapper] = None,
+       model_tracker_config: Optional[ModelTrackerConfig] = None,
    ) -> None:
        super().__init__()
        torch._C._log_api_usage_once(f"torchrec.distributed.{self.__class__.__name__}")
@@ -242,6 +246,8 @@
 
        self._ddp_wrapped: bool = False
 
+       self.has_model_tracker: bool = model_tracker_config is not None
+
        if env is None:
            pg = dist.GroupMember.WORLD
            assert pg is not None, "Process group is not initialized"
@@ -286,6 +292,11 @@
        if init_data_parallel:
            self.init_data_parallel()
 
+       if model_tracker_config is not None:
+           self.model_delta_tracker: ModelDeltaTracker = self._init_delta_tracker(
+               model_tracker_config, self._dmp_wrapped_module
+           )
+
    @property
    def module(self) -> nn.Module:
        """
@@ -344,6 +355,19 @@ def copy(
    def _init_dmp(self, module: nn.Module) -> nn.Module:
        return self._shard_modules_impl(module)
 
+   def _init_delta_tracker(
+       self, model_tracker_config: ModelTrackerConfig, module: nn.Module
+   ) -> ModelDeltaTracker:
+       # Init delta tracker if config is provided
+       return ModelDeltaTracker(
+           model=module,
+           consumers=model_tracker_config.consumers,
+           delete_on_read=model_tracker_config.delete_on_read,
+           auto_compact=model_tracker_config.auto_compact,
+           mode=model_tracker_config.tracking_mode,
+           fqns_to_skip=model_tracker_config.fqns_to_skip,
+       )
+
    def _init_optim(self, module: nn.Module) -> CombinedOptimizer:
        # pyre-ignore [6]
        return CombinedOptimizer(self._fused_optim_impl(module, []))
@@ -421,6 +445,25 @@ def init_parameters(module: nn.Module) -> None:
 
        module.apply(init_parameters)
 
+   def get_model_tracker(self) -> ModelDeltaTracker:
+       """
+       Returns the model tracker if it exists.
+       """
+
+       assert (
+           self.has_model_tracker
+       ), "Model tracker is not initialized. Add ModelTrackerConfig at DistributedModelParallel init."
+       return self.model_delta_tracker
+
+   def get_delta(self, consumer: Optional[str] = None) -> Dict[str, DeltaRows]:
+       """
+       Returns the delta rows for the given consumer.
+       """
+       assert (
+           self.has_model_tracker
+       ), "Model tracker is not initialized. Add ModelTrackerConfig at DistributedModelParallel init."
+       return self.model_delta_tracker.get_delta(consumer)
+
    def sparse_grad_parameter_names(
        self, destination: Optional[List[str]] = None, prefix: str = ""
    ) -> List[str]:

torchrec/distributed/model_tracker/__init__.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+"""Torchrec Model Tracker
+
+The model tracker module provides functionality to track and retrieve unique IDs and
+embeddings for supported modules during training. This is useful for identifying and
+retrieving the latest delta or unique rows for a model, which can help compute top-k
+or stream updated embeddings from predictors to trainers during online training.
+
+Key features include:
+- Tracking unique IDs and embeddings for supported modules
+- Support for multiple consumers with independent tracking
+- Configurable tracking modes (ID_ONLY, EMBEDDING)
+- Compaction of tracked data to reduce memory usage
+"""
+
+from torchrec.distributed.model_tracker.delta_store import DeltaStore  # noqa
+from torchrec.distributed.model_tracker.model_delta_tracker import (
+    ModelDeltaTracker,  # noqa
+    SUPPORTED_MODULES,  # noqa
+)
+from torchrec.distributed.model_tracker.types import (
+    DeltaRows,  # noqa
+    EmbdUpdateMode,  # noqa
+    IndexedLookup,  # noqa
+    ModelTrackerConfig,  # noqa
+    TrackingMode,  # noqa
+)
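To illustrate the multi-consumer support mentioned in the module docstring, a hedged sketch; the consumer names are made up, and the exact type and defaults of the `consumers` field are assumptions here:

```python
from torchrec.distributed.model_tracker.types import ModelTrackerConfig, TrackingMode

# Two independent consumers, each with its own read cursor into the tracked deltas.
config = ModelTrackerConfig(
    consumers=["publisher", "evaluator"],
    tracking_mode=TrackingMode.ID_ONLY,
    delete_on_read=False,  # keep rows around so both consumers can read them
    auto_compact=True,
)

# With a DMP built from this config, each consumer would read its own delta:
# publisher_rows = dmp.get_delta(consumer="publisher")
# evaluator_rows = dmp.get_delta(consumer="evaluator")
```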

torchrec/distributed/model_tracker/model_delta_tracker.py

Lines changed: 55 additions & 8 deletions
@@ -85,15 +85,15 @@ def __init__(
         self.curr_batch_idx: int = 0
         self.curr_compact_index: int = 0
 
-        self.store: DeltaStore = DeltaStore(UPDATE_MODE_MAP[self._mode])
-
         # from module FQN to ShardedEmbeddingCollection/ShardedEmbeddingBagCollection
         self.tracked_modules: Dict[str, nn.Module] = {}
         self.feature_to_fqn: Dict[str, str] = {}
         # Generate the mapping from FQN to feature names.
         self.fqn_to_feature_names()
-        # Validate the mode is supported for the given module
-        self._validate_mode()
+        # Validate that the mode is supported for the given module and initialize tracker functions
+        self._validate_and_init_tracker_fns()
+
+        self.store: DeltaStore = DeltaStore(UPDATE_MODE_MAP[self._mode])
 
         # Mapping feature name to corresponding FQNs. This is used for retrieving
         # the FQN associated with a given feature name in record_lookup().
@@ -105,6 +105,38 @@ def __init__(
             self.feature_to_fqn[feature_name] = fqn
         logger.info(f"feature_to_fqn: {self.feature_to_fqn}")
 
+    def increment_batch_idx(self) -> None:
+        self.curr_batch_idx += 1
+
+    def trigger_compaction(self) -> None:
+        if self.curr_compact_index >= self.curr_batch_idx:
+            # only trigger compaction once per iteration
+            return
+
+        # TODO: May need to revisit the compaction logic with multiple consumers.
+        # At present we take the max per_consumer_batch_idx to ensure we only compact
+        # newly received lookups
+
+        # The trigger_compaction() function is expected to overlap with comms to hide
+        # compaction compute overhead. Currently, we overlap compaction with odist
+        # because ID tracking occurs during local embedding lookup, which takes place
+        # before odist. This way, auto_compact always merges all past IDs tensors since
+        # the last get_delta call into a single IDs tensor per FQN.
+        #
+        # For delete_on_read=True, get_delta() should delete up to per_consumer_batch_idx
+        # (exclusive). So the compaction should start from per_consumer_batch_idx.
+        #
+        # For delete_on_read=False, get_delta() won't delete tensors, but it does advance
+        # per_consumer_batch_idx accordingly, where all ids prior to per_consumer_batch_idx (exclusive)
+        # should have been compacted into one tensor regardless of auto_compact=True/False.
+        # Therefore, all future compactions should start from per_consumer_batch_idx.
+        start_idx = max(self.per_consumer_batch_idx.values())
+        end_idx = self.curr_batch_idx
+        # Update the current compact index to the end index to avoid duplicate compaction.
+        self.curr_compact_index = end_idx
+        if start_idx < end_idx:
+            self.compact(start_idx=start_idx, end_idx=end_idx)
+
     def record_lookup(self, kjt: KeyedJaggedTensor, states: torch.Tensor) -> None:
         """
         Records the IDs from a given KeyedJaggedTensor and their corresponding embeddings/parameter states.
@@ -120,6 +152,11 @@ def record_lookup(self, kjt: KeyedJaggedTensor, states: torch.Tensor) -> None:
             states (torch.Tensor): The embeddings or states corresponding to the IDs in the kjt.
         """
 
+        # FIXME: Validate if record_lookup is always the first entry point for each batch and mode.
+        # Need to re-adjust this approach in future as it may not scale if record_lookup is called
+        # multiple times per batch or if we add more entry points.
+        self.increment_batch_idx()
+
         # In ID_ONLY mode, we only track feature IDs received in the current batch.
         if self._mode == TrackingMode.ID_ONLY:
             self.record_ids(kjt)
@@ -333,13 +370,23 @@ def compact(self, start_idx: int, end_idx: int) -> None:
         self.store.compact(start_idx, end_idx)
 
     def _clean_fqn_fn(self, fqn: str) -> str:
-        # strip DMP internal module FQN prefix to match state dict FQN
-        return fqn.replace("_dmp_wrapped_module.module.", "")
-
-    def _validate_mode(self) -> None:
+        # strip FQN prefixes added by DMP and other TorchRec operations to match state dict FQN
+        # handles both "_dmp_wrapped_module.module." and "module." prefixes
+        prefixes_to_strip = ["_dmp_wrapped_module.module.", "module."]
+        for prefix in prefixes_to_strip:
+            if fqn.startswith(prefix):
+                return fqn[len(prefix) :]
+        return fqn
+
+    def _validate_and_init_tracker_fns(self) -> None:
         "To validate the mode is supported for the given module"
         for module in self.tracked_modules.values():
             assert not (
                 isinstance(module, ShardedEmbeddingBagCollection)
                 and self._mode == TrackingMode.EMBEDDING
             ), "EBC's lookup returns pooled embeddings and currently, we do not support tracking raw embeddings."
+            # register post lookup function
+            module.register_post_lookup_tracker_fn(self.record_lookup)
+            # register auto compaction function at odist
+            if self._auto_compact:
+                module.register_post_odist_tracker_fn(self.trigger_compaction)
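The index arithmetic behind `trigger_compaction()` can be shown standalone; `compaction_window` below is an explanatory sketch (not part of this diff) whose parameter names mirror the tracker's attributes:

```python
from typing import Dict, Optional, Tuple


def compaction_window(
    per_consumer_batch_idx: Dict[str, int],
    curr_batch_idx: int,
    curr_compact_index: int,
) -> Optional[Tuple[int, int]]:
    # Compact at most once per iteration.
    if curr_compact_index >= curr_batch_idx:
        return None
    # Everything before the furthest consumer cursor has already been compacted
    # (or deleted) by get_delta, so a new compaction starts from there.
    start_idx = max(per_consumer_batch_idx.values())
    end_idx = curr_batch_idx
    return (start_idx, end_idx) if start_idx < end_idx else None


# Example: a consumer last read at batch 3 and the tracker is now at batch 7,
# so the lookups recorded between those indices get merged into one IDs tensor per FQN.
print(compaction_window({"c0": 3}, curr_batch_idx=7, curr_compact_index=3))  # -> (3, 7)
```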
