Linear regression with PyTorch not converging
I am trying to perform a simple linear regression using PyTorch Lightning (a network with only one neuron). The network is supposed to learn a simple function: y = -4x.
My dataset has 1000 points sampled from the line y = -4x with a small amount of Gaussian noise added. The dataset looks like this:
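(The scatter plot isn't embedded here, but roughly, you can reproduce it with a few lines, using the same parameters as the full code below: 1000 evenly spaced points on [0, 1], noise scale 0.05.)

import numpy as np
import matplotlib.pyplot as plt

# Reproduce the training data: y = -4x plus small Gaussian noise.
x = np.linspace(0, 1, 1000)
y = -4 * x + np.random.normal(scale=0.05, size=x.shape)

plt.scatter(x, y, s=2)
plt.xlabel('x')
plt.ylabel('y')
plt.show()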
I am facing a strange problem: the model only converges when the batch size is small enough and the data is not shuffled.
The graph below shows the slope, intercept, and final training loss that the model converged to after 100 epochs, as a function of the batch size, when the data is not shuffled:
We can see that the model converges to the right solution (slope = -4, intercept = 0) only when the batch size is small enough.
Shown below is the same experiment, but this time with shuffling enabled in the DataLoader:
Here the model doesn't converge to the right solution at all, regardless of the batch size!
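(As a sanity check, the data itself admits the right solution: a closed-form least-squares fit, e.g. with np.polyfit, should recover slope ≈ -4 and intercept ≈ 0, so the issue appears to be in the training setup rather than in the data.)

import numpy as np

# Closed-form least-squares fit on the same kind of data as a sanity check;
# this should recover slope ~ -4 and intercept ~ 0.
x = np.linspace(0, 1, 1000)
y = -4 * x + np.random.normal(scale=0.05, size=x.shape)
slope, intercept = np.polyfit(x, y, deg=1)
print(slope, intercept)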
Below is the code used to generate the results of the shuffled experiment (the unshuffled runs use the same code with shuffle = False):
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
import numpy as np
import matplotlib.pyplot as plt


class LineFunction(Dataset):
    def __init__(self, a, b, n, x0=0, x1=1, noise=0):
        self._a = a
        self._b = b
        self._x = torch.linspace(x0, x1, n, requires_grad=False)
        self._x = self._x.unsqueeze(1)
        self._noise = torch.distributions.normal.Normal(torch.tensor([0.0]), torch.tensor([1.0])).sample((n,)) * noise
        self._noise = self._noise.unsqueeze(1)

    def __len__(self):
        return len(self._x)

    def __getitem__(self, idx):
        x = self._x[idx]
        y = self._a * x + self._b + self._noise[idx]
        return x, y


class LinReg(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer_1 = torch.nn.Linear(1, 1)

    def forward(self, x):
        return self.layer_1(x)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        preds = self.forward(x)
        loss = F.mse_loss(preds, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        preds = self.forward(x)
        loss = F.mse_loss(preds, y)
        self.log('val_loss', loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-2)
        return optimizer


batch_sizes = [int(x) for x in np.linspace(10, 1000, 100)]
slopes = []
intercepts = []
train_loss = []
shuffle = True

for batch_size in batch_sizes:
    train_dataloader = DataLoader(LineFunction(-4, 0, 1000, noise=0.05), batch_size=batch_size, shuffle=shuffle)
    test_dataloader = DataLoader(LineFunction(-4, 0, 1000, noise=0.05), batch_size=batch_size, shuffle=shuffle)

    model = LinReg()
    trainer = pl.Trainer(max_epochs=100)
    trainer.fit(model, train_dataloader, test_dataloader)

    slopes.append(float(model.layer_1.weight.data))
    intercepts.append(float(model.layer_1.bias.data))
    train_loss.append(trainer.logged_metrics['train_loss'])

fig, axes = plt.subplots(nrows=3, figsize=(15, 10))
for ax, measure, measure_name in zip(axes, [slopes, intercepts, train_loss], ['slope', 'intercept', 'train_loss']):
    ax.set_xlabel('batch size')
    ax.set_ylabel(measure_name)
    ax.plot(batch_sizes, measure)
plt.show()
I am kind of stumped here. Why does this simple model not always converge? I also tried experimenting with different learning rates, but that didn't solve the problem.
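For reference, the learning-rate experiments were just a variation of the loop above, along the lines of this sketch (it reuses the LineFunction and LinReg classes defined above; the lr values shown are only examples, not the exact ones I tried):

# Sketch of the learning-rate experiments (reuses LineFunction / LinReg from above).
class LinRegLR(LinReg):
    def __init__(self, lr):
        super().__init__()
        self.lr = lr

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

for lr in [1e-3, 1e-2, 1e-1]:  # example values only
    train_dataloader = DataLoader(LineFunction(-4, 0, 1000, noise=0.05), batch_size=100, shuffle=True)
    model = LinRegLR(lr)
    trainer = pl.Trainer(max_epochs=100)
    trainer.fit(model, train_dataloader)
    print(lr, float(model.layer_1.weight.data), float(model.layer_1.bias.data))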
Topic mlp pytorch implementation linear-regression neural-network
Category Data Science