Address 10% performance issue (~16 ms per call) when using batch_size_stages in TorchRec Metrics with GPU (#2814)

ilyas409 · facebook-github-bot · commit 4c203eb32c3e · 2025-03-18T08:19:16.000-07:00
Summary: Pull Request resolved: #2814 When num_batch was stored **and used** as a tensor that was part of the module, it was moved to GPU when the parent module was moved to GPU. Accessing a tensor with a GPU value within a CPU workload causes a CUDA StreamSync because the tensor value needs to be moved from HBM to RAM via ATen. Instead, we are using a separate Python integer attribute `_num_batch` for calculation, and synchronizing its value to the state dict as a tensor using state dict hooks. **From profiler:** *** Title torchrec/metrics/throughput.py(172): _get_batch_size Start 399.423 ms Wall Duration 16.001 ms Self Time 0.019 ms *** Title aten::is_nonzero Category cpu_op Start 399.464 ms Wall Duration 15.954 ms Self Time 0.003 ms *** Title aten::_local_scalar_dense Category cpu_op Start 399.468 ms Wall Duration 15.948 ms Self Time 0.015 ms *** Title cudaStreamSynchronize Category cuda_runtime Start 399.506 ms Wall Duration 15.904 ms *** {F1976014130} {F1976133311} Reviewed By: xunnanxu, iamzainhuda Differential Revision: D71079949 fbshipit-source-id: 5b21198e2579611904056f899a8c237e517ab22a
diff --git a/torchrec/metrics/tests/test_metric_module.py b/torchrec/metrics/tests/test_metric_module.py
@@ -29,6 +29,7 @@
 )
 from torchrec.metrics.metrics_config import (
     _DEFAULT_WINDOW_SIZE,
+    BatchSizeStage,
     DefaultMetricsConfig,
     DefaultTaskInfo,
     EmptyMetricsConfig,
@@ -536,3 +537,52 @@ def test_adjust_compute_interval_1_30(self) -> None:
             min_interval=1.0,
             max_interval=30.0,
         )
+
+    def test_save_and_load_state_dict(self) -> None:
+        # Case 1: no batch_size_stages, so no num_batch entry may be emitted
+        metric_module = generate_metric_module(
+            TestMetricModule,
+            metrics_config=DefaultMetricsConfig,
+            batch_size=128,
+            world_size=1,
+            my_rank=0,
+            state_metrics_mapping={},
+            device=torch.device("cpu"),
+        )
+        metric_module.update(gen_test_batch(128))
+
+        state_dict_without_bss = metric_module.state_dict()
+        # Make sure state loading works and doesn't throw an error
+        metric_module.load_state_dict(state_dict_without_bss)
+        # Make sure num_batch in the throughput module is not in state_dict
+        self.assertNotIn("throughput_metric.num_batch", state_dict_without_bss)
+
+        # Case 2: with batch_size_stages, num_batch must round-trip via the hooks
+        metric_module = generate_metric_module(
+            TestMetricModule,
+            metrics_config=DefaultMetricsConfig,
+            batch_size=128,
+            world_size=1,
+            my_rank=0,
+            state_metrics_mapping={},
+            device=torch.device("cpu"),
+            batch_size_stages=[BatchSizeStage(256, 100), BatchSizeStage(512, None)],
+        )
+
+        # Update metric 100 times
+        for _ in range(100):
+            metric_module.update(gen_test_batch(128))
+
+        # Simulate a checkpoint save
+        state_dict = metric_module.state_dict()
+        # Make sure num_batch is updated correctly to 100
+        self.assertEqual(state_dict["throughput_metric.num_batch"], 100)
+
+        # Simulate a checkpoint load
+        metric_module.load_state_dict(state_dict)
+        # Make sure num_batch is correctly restored
+        throughput_metric = metric_module.throughput_metric
+        self.assertIsNotNone(throughput_metric)
+        self.assertEqual(throughput_metric._num_batch, 100)
+        # Make sure a freshly produced state_dict is synchronized with the restored value
+        self.assertEqual(throughput_metric.state_dict()["num_batch"], 100)
diff --git a/torchrec/metrics/tests/test_throughput.py b/torchrec/metrics/tests/test_throughput.py
@@ -10,6 +10,8 @@
 # pyre-ignore-all-errors[56]
 
 import unittest
+from collections import OrderedDict
+from typing import Any, Dict
 from unittest.mock import Mock, patch
 
 import torch
@@ -213,3 +215,125 @@ def test_batch_size_schedule(self, time_mock: Mock) -> None:
                 "throughput-throughput|batch_size": 512,
             },
         )
+
+    def test_num_batch_without_batch_size_stages(self) -> None:
+        # Create the module without the batch_size_stages
+        throughput_metric = ThroughputMetric(
+            batch_size=self.batch_size,
+            world_size=self.world_size,
+            window_seconds=100,
+            batch_size_stages=None,
+        )
+
+        # Make sure num_batch is not present as an attribute of the instance
+        self.assertFalse(hasattr(throughput_metric, "num_batch"))
+
+        throughput_metric.update()
+        state_dict: Dict[str, Any] = throughput_metric.state_dict()
+        # Ensure num_batch is not included in the state_dict for the module without batch_size_stages
+        self.assertNotIn("num_batch", state_dict)
+
+    def test_state_dict_load_module_lifecycle(self) -> None:
+        """
+        A test to ensure that the load_state_dict and state_dict hooks correctly handle the num_batch attribute
+        through the module lifecycle.
+        """
+
+        throughput_metric = ThroughputMetric(
+            batch_size=32,
+            world_size=4,
+            window_seconds=100,
+            batch_size_stages=[BatchSizeStage(256, 1), BatchSizeStage(512, None)],
+        )
+
+        self.assertTrue(hasattr(throughput_metric, "_num_batch"))
+
+        # Stage 1: create metric and update the state_dict before persisting it
+        # Update metric, expecting num_batch to be incremented to 1
+        throughput_metric.update()
+        # Ensure num_batch is 1
+        self.assertEqual(throughput_metric._num_batch, 1)
+        # Ensure num_batch is included in the state_dict and has the correct value
+        state_dict: Dict[str, Any] = throughput_metric.state_dict()
+        self.assertIn("num_batch", state_dict)
+        # Ensure num_batch was saved to state_dict with the correct value
+        self.assertEqual(state_dict["num_batch"].item(), throughput_metric._num_batch)
+
+        # Stage 2: load the state_dict and ensure num_batch is loaded correctly
+
+        # Create a new metric instance
+        new_throughput_metric = ThroughputMetric(
+            batch_size=32,
+            world_size=4,
+            window_seconds=100,
+            batch_size_stages=[BatchSizeStage(256, 1), BatchSizeStage(512, None)],
+        )
+        # Ensure num_batch is 0
+        self.assertEqual(new_throughput_metric._num_batch, 0)
+        # Load the state_dict
+        new_throughput_metric.load_state_dict(state_dict)
+        # Ensure num_batch is loaded from the state_dict with the correct value
+        self.assertEqual(new_throughput_metric._num_batch, 1)
+
+        # Stage 3: resave the state_dict right after loading (no further update)
+
+        # Save the state_dict
+        state_dict = new_throughput_metric.state_dict()
+        # Ensure num_batch is included in the state_dict
+        self.assertIn("num_batch", state_dict)
+        # Ensure num_batch was saved to state_dict with the correct value
+        self.assertEqual(
+            state_dict["num_batch"].item(), new_throughput_metric._num_batch
+        )
+
+    def test_state_dict_hook_adds_key(self) -> None:
+        """
+        Ensures that the state_dict_hook adds the prefixed 'num_batch' key to the
+        state_dict when batch_size_stages is set (not None).
+        """
+        throughput_metric = ThroughputMetric(
+            batch_size=32,
+            world_size=4,
+            window_seconds=100,
+            batch_size_stages=[BatchSizeStage(256, 1), BatchSizeStage(512, None)],
+        )
+        for _ in range(5):
+            throughput_metric.update()
+        state_dict: OrderedDict[str, torch.Tensor] = OrderedDict()
+        prefix: str = "test_prefix_"
+        ThroughputMetric.state_dict_hook(throughput_metric, state_dict, prefix, {})
+        self.assertIn(f"{prefix}num_batch", state_dict)
+        self.assertEqual(state_dict[f"{prefix}num_batch"].item(), 5)
+
+    def test_state_dict_hook_no_batch_size_stages(self) -> None:
+        """
+        Verifies that calling state_dict_hook directly leaves the state_dict without
+        a prefixed 'num_batch' key when batch_size_stages is None.
+        """
+        throughput_metric = ThroughputMetric(
+            batch_size=32,
+            world_size=4,
+            window_seconds=100,
+            batch_size_stages=None,
+        )
+        state_dict: OrderedDict[str, torch.Tensor] = OrderedDict()
+        prefix: str = "test_prefix_"
+        ThroughputMetric.state_dict_hook(throughput_metric, state_dict, prefix, {})
+        self.assertNotIn(f"{prefix}num_batch", state_dict)
+
+    def test_load_state_dict_hook_restores_value(self) -> None:
+        """
+        Checks that load_state_dict_hook restores the prefixed 'num_batch' tensor
+        from the state_dict into the integer `_num_batch` attribute.
+        """
+        throughput_metric = ThroughputMetric(
+            batch_size=32,
+            world_size=4,
+            window_seconds=100,
+            batch_size_stages=[BatchSizeStage(256, 1), BatchSizeStage(512, None)],
+        )
+        state_dict: OrderedDict[str, torch.Tensor] = OrderedDict()
+        prefix: str = "test_prefix_"
+        state_dict[f"{prefix}num_batch"] = torch.tensor(10, dtype=torch.long)
+        throughput_metric.load_state_dict_hook(state_dict, prefix, {}, True, [], [], [])
+        self.assertEqual(throughput_metric._num_batch, 10)
diff --git a/torchrec/metrics/throughput.py b/torchrec/metrics/throughput.py
@@ -13,8 +13,8 @@
 import logging
 import math
 import time
-from collections import deque
-from typing import Deque, Dict, List, Optional
+from collections import deque, OrderedDict
+from typing import Any, Deque, Dict, List, Optional
 
 import torch
 import torch.nn as nn
@@ -112,12 +112,14 @@ def __init__(
             batch_size_stages
         )
 
+        if self._batch_size_stages is not None:
+            # Keep track of the number of batches if using batch_size_stages
+            self._num_batch: int = 0
+            self._register_load_state_dict_pre_hook(self.load_state_dict_hook)
+            self.register_state_dict_post_hook(self.state_dict_hook)
+
         self.register_buffer("total_examples", torch.tensor(0, dtype=torch.long))
         self.register_buffer("warmup_examples", torch.tensor(0, dtype=torch.long))
-        if batch_size_stages is not None:
-            # only load num_batch when batch_size_stages is set.
-            # So ckpt can be backward compatible -> non-existing key won't be loaded and crash
-            self.register_buffer("num_batch", torch.tensor(0, dtype=torch.long))
         self.register_buffer(
             "time_lapse_after_warmup", torch.tensor(0, dtype=torch.double)
         )
@@ -181,6 +183,7 @@ def _get_batch_size(self) -> int:
             return self._batch_size
 
         # Get batch size from batch_size_stages
+        assert self._num_batch is not None, "num_batch should not be None"
         batch_size_stages = none_throws(self._batch_size_stages)
         while self._batch_size_stages:
             stage = self._batch_size_stages[0]
@@ -189,7 +192,7 @@ def _get_batch_size(self) -> int:
                 assert len(batch_size_stages) == 1
                 return stage.batch_size
             # This stage finished
-            if stage.max_iters < self.num_batch:
+            if stage.max_iters < self._num_batch:
                 batch_size_stages.pop(0)
                 # Move to the next stage
                 continue
@@ -208,7 +211,7 @@ def update(self) -> None:
         ts = time.monotonic()
         self._steps += 1
         if self._batch_size_stages is not None:
-            self.num_batch += 1
+            self._num_batch += 1
         batch_examples = self._batch_examples()
         self.total_examples += batch_examples
         self.attempt_examples += batch_examples
@@ -276,3 +279,33 @@ def compute(self) -> Dict[str, torch.Tensor]:
             )
 
         return ret
+
+    @staticmethod
+    def state_dict_hook(
+        module: nn.Module,
+        state_dict: OrderedDict[str, torch.Tensor],
+        prefix: str,
+        local_metadata: Dict[str, Any],
+    ) -> None:
+        if module._batch_size_stages is not None:
+            # Write the Python int counter into the state dict as a long tensor under `{prefix}num_batch`
+            num_batch_key = f"{prefix}num_batch"
+            state_dict[num_batch_key] = torch.tensor(
+                module._num_batch, dtype=torch.long
+            )
+
+    def load_state_dict_hook(
+        self,
+        state_dict: OrderedDict[str, torch.Tensor],
+        prefix: str,
+        local_metadata: Dict[str, Any],
+        strict: bool,
+        missing_keys: List[str],
+        unexpected_keys: List[str],
+        error_msgs: List[str],
+    ) -> None:
+        key = f"{prefix}num_batch"
+        if key in state_dict and self._batch_size_stages is not None:
+            # Pop the tensor (presumably so loading never sees the key) and restore the int counter
+            num_batch_tensor = state_dict.pop(key)
+            self._num_batch = int(num_batch_tensor.item())