1 parent ee27b73 commit 39eb56f
train.py
@@ -1142,7 +1142,10 @@ def _backward(_loss):
 if args.distributed:
     # scale gradient btw distributed ranks, each one can have different batch size
-    global_batch_size = utils.reduce_tensor(torch.tensor(batch_size, device=device), 1)  # SUM
+    global_batch_size = utils.reduce_tensor(
+        torch.tensor(batch_size, device=device, dtype=torch.float32),
+        1  # SUM
+    )
     dist_scale = args.world_size * batch_size / global_batch_size
 else:
     dist_scale = None
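
For context, below is a minimal sketch of how a reduce_tensor helper and the resulting dist_scale could work with torch.distributed. This is an illustration under assumptions, not the repository's actual utils code: the (tensor, divisor) signature is inferred from the call site above, and the float32 cast is presumably there to keep the reduction and the subsequent division in floating point.

import torch
import torch.distributed as dist

def reduce_tensor(tensor, n):
    # Assumed behavior: sum the tensor across all ranks, then divide by n.
    # With n == 1 this is a plain SUM, matching the "# SUM" comment above.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    return rt / n

def compute_dist_scale(batch_size, world_size, device):
    # Hypothetical helper mirroring the diff: casting to float32 avoids
    # doing the all-reduce and division in integer arithmetic.
    global_batch_size = reduce_tensor(
        torch.tensor(batch_size, device=device, dtype=torch.float32),
        1  # SUM
    )
    # Ranks with different local batch sizes get a proportional weight:
    # a rank holding more samples contributes more to the averaged gradient.
    return world_size * batch_size / global_batch_size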