Imprecision in pytorch-lightning's Gradient Accumulation? #18743
Unanswered
leehawk787 asked this question in Lightning Trainer API: Trainer, LightningModule, LightningDataModule
In my understanding of Gradient Accumulation, the result of computing with
batch_size = x
vs. accumulate_gradient = h, batch_size = x/h
should be the same. So these three samples should compute the same thing in pytorch-lightning:
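For concreteness, a minimal sketch of the three settings in terms of the Trainer's `accumulate_grad_batches` option (the effective batch size of 64 is just an illustrative choice, not the original values):

```python
# Illustrative sketch: three Trainer configurations with the same effective
# batch size (64 here), differing only in how the gradient is accumulated.
import pytorch_lightning as pl

trainer_a = pl.Trainer(accumulate_grad_batches=1)  # DataLoader batch_size=64
trainer_b = pl.Trainer(accumulate_grad_batches=2)  # DataLoader batch_size=32
trainer_c = pl.Trainer(accumulate_grad_batches=4)  # DataLoader batch_size=16
```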
However, they do not. The weights of the model differ slightly (~1e-5) across the three cases after just a few hundred batches.
Here is a full reproducible example:
[python 3.11.5, pytorch-lightning 2.0.9]
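A minimal sketch of the kind of setup this boils down to, assuming a toy linear model on fixed random data (`ToyModel`, `run`, and all hyperparameters below are illustrative placeholders, not the original code):

```python
# Minimal sketch of a gradient-accumulation comparison (illustrative only):
# a tiny linear regression trained three times with the same effective batch
# size of 64 but different accumulation settings.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl


class ToyModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(10, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)


def run(batch_size, accumulate):
    # Re-seed before building data and model so every run starts identically.
    pl.seed_everything(0, workers=True)
    x = torch.randn(1024, 10)
    y = torch.randn(1024, 1)
    loader = DataLoader(TensorDataset(x, y), batch_size=batch_size, shuffle=False)
    model = ToyModel()
    trainer = pl.Trainer(
        max_epochs=1,
        accumulate_grad_batches=accumulate,
        deterministic=True,
        accelerator="cpu",
        logger=False,
        enable_checkpointing=False,
        enable_progress_bar=False,
    )
    trainer.fit(model, loader)
    return model.layer.weight.detach().clone()


# Effective batch size of 64 in all three runs.
w1 = run(batch_size=64, accumulate=1)
w2 = run(batch_size=32, accumulate=2)
w3 = run(batch_size=16, accumulate=4)

# Compare the final weights of the three runs.
print((w1 - w2).abs().max(), (w1 - w3).abs().max())
```

`shuffle=False` and the per-run `seed_everything` call keep initialization and data order identical across the three runs, so any remaining difference should come from the accumulation itself rather than from randomness.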
I tried switching the learning rate, optimizer, dataset, and model; the difference between the three versions persists. I also tried it on CPU and on two different GPUs; the difference persists there as well.
I'd like to understand where this difference comes from and whether it can be eliminated. Any ideas?