Multi-step forecasts of factory production data using a Seq2Seq Encoder-Decoder Model with Attention
I am attempting to forecast factory production data with a Seq2Seq Encoder-Decoder model augmented with attention. I have become a little stuck because the model's output appears to be constant and has the same sequence length as the input, whereas I would like to be able to specify the forecast horizon, e.g. 3 (or any number of) months into the future.
Here are two diagrams of the Seq2Seq architecture and the attention mechanism I am looking to construct:
The Target
To my understanding, I want to predict the production volume of a given material from this factory into the future, so the target's dimensionality is $1$ and it is of course an integer.
The Encoder
The encoder takes as input a sequence of length $168$, where each element consists of the previous $20$ days' data as well as $37$ factory-level features such as the number of workers, etc.
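For concreteness, here is a minimal shape sketch of what I believe the encoder should consume and produce, using the EncoderRNN class from the code below. The batch size of 64 and the input size of 57 (20 lagged values + 37 features) are just illustrative assumptions, and the layout is (seq_len, batch, features) since the LSTM is not built with batch_first=True:

import torch

seq_len, batch_size, input_size, hidden_size = 168, 64, 57, 128

source = torch.randn(seq_len, batch_size, input_size)   # (seq_len, batch, features)

encoder = EncoderRNN(input_size, hidden_size, num_layers=1, p=0.0)
encoder_states, hidden, cell = encoder(source)

print(encoder_states.shape)  # torch.Size([168, 64, 256]) -- bidirectional, so hidden_size * 2
print(hidden.shape)          # torch.Size([1, 64, 128])   -- after the fc_hidden projection
print(cell.shape)            # torch.Size([1, 64, 128])   -- after the fc_cell projection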
The Decoder
This is where I get confused and where I am running into issues with my code. Again, to my understanding, the decoder should take the previous time-step's production level as input (i.e. dimension $1$), as well as the previous hidden and cell states.
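In other words, at each decoding step I expect to feed in one scalar per series, plus the carried-over hidden and cell states, roughly like this (the batch size of 64 is illustrative, and this only describes my intent, not what the code below currently does):

import torch

batch_size, hidden_size = 64, 128

x_prev = torch.randn(batch_size)                   # previous step's production level, one value per series
decoder_input = x_prev.reshape(1, batch_size, 1)   # (seq_len=1, batch, feature_dim=1)

hidden = torch.zeros(1, batch_size, hidden_size)   # carried over from the previous decoding step
cell = torch.zeros(1, batch_size, hidden_size)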
Code
import random

import numpy as np
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, p):
        super(EncoderRNN, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size,
                            num_layers, dropout=p,
                            bidirectional=True)
        # Project the concatenated forward/backward states down to hidden_size
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, input):
        print(f"Encoder input shape is {input.shape}")
        encoder_states, (hidden, cell_state) = self.lstm(input)
        print(f"Encoder Hidden: {hidden.shape}")
        print(f"Encoder Cell: {cell_state.shape}")
        # Concatenate the final forward and backward states along the feature dimension
        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell = self.fc_cell(torch.cat((cell_state[0:1], cell_state[1:2]), dim=2))
        print(f"Encoder Hidden: {hidden.shape}")
        print(f"Encoder Cell: {cell.shape}")
        return encoder_states, hidden, cell
class Decoder_LSTMwAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, p):
        super(Decoder_LSTMwAttention, self).__init__()
        # The LSTM consumes the context vector (hidden_size * 2) concatenated with the input
        self.rnn = nn.LSTM(hidden_size * 2 + input_size, hidden_size,
                           num_layers)
        self.energy = nn.Linear(hidden_size * 3, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=0)
        self.dropout = nn.Dropout(p)
        self.relu = nn.ReLU()
        self.attention_combine = nn.Linear(hidden_size, hidden_size)

    def forward(self, input, encoder_states, hidden, cell):
        input = input.unsqueeze(0)
        input = input.unsqueeze(0)
        input = self.dropout(input)
        sequence_length = encoder_states.shape[0]
        # Repeat the decoder hidden state across the encoder time steps
        h_reshaped = hidden.repeat(sequence_length, 1, 1)
        concatenated = torch.cat((h_reshaped, encoder_states), dim=2)
        print(f"Concatenated size: {concatenated.shape}")
        # Attention energies and weights over the encoder time steps
        energy = self.relu(self.energy(concatenated))
        attention = self.softmax(energy)
        attention = attention.permute(1, 0, 2)
        encoder_states = encoder_states.permute(1, 0, 2)
        # Weighted sum of encoder states -> context vector
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)
        rnn_input = torch.cat((context_vector, input), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        output = self.fc(output).squeeze(0)
        return output, hidden, cell
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        outputs = torch.zeros(target_len, batch_size).to(device)
        encoder_states, hidden, cell = self.encoder(source)
        # First input plays the role of an SOS token
        x = target[0]
        for t in range(1, target_len):
            # At every time step use encoder_states and update hidden, cell
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            # Store prediction for current time step
            outputs[t] = output
            # Get the best value the decoder predicted
            best_guess = output.argmax(1)
            # Teacher forcing: with probability teacher_force_ratio feed the
            # ground-truth next value, otherwise feed the decoder's own prediction,
            # so the model sees similar inputs at training and test time.
            x = target[t] if random.random() < teacher_force_ratio else best_guess
        return outputs
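For reference, this is roughly the inference behaviour I am aiming for: the number of decoding steps is set by a forecast_len argument (my own hypothetical parameter, not something the code above supports yet), and the decoder feeds its own previous prediction back in at each step:

# Sketch only: assumes the decoder returns one prediction per series at each step.
def forecast(model, source, first_value, forecast_len):
    model.eval()
    with torch.no_grad():
        encoder_states, hidden, cell = model.encoder(source)
        predictions = []
        x = first_value  # last observed production level, shape (batch,)
        for _ in range(forecast_len):
            output, hidden, cell = model.decoder(x, encoder_states, hidden, cell)
            predictions.append(output)
            x = output.squeeze(-1)  # feed the prediction back as the next decoder input
        return torch.stack(predictions)  # (forecast_len, batch, output_size)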
Training Routine
def Seq2seq_trainer(model, optimizer, train_input, train_target,
                    test_input, test_target, criterion, num_epochs):
    train_losses = np.zeros(num_epochs)
    validation_losses = np.zeros(num_epochs)
    for it in range(num_epochs):
        # Zero the parameter gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(train_input, train_target)
        loss = criterion(outputs, train_target)
        # Backprop
        loss.backward()
        # Clip to avoid exploding gradient issues
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        # Gradient descent step
        optimizer.step()
        # Save losses
        train_losses[it] = loss.item()
        # Validation loss
        with torch.no_grad():
            test_outputs = model(test_input, test_target)
            validation_loss = criterion(test_outputs, test_target)
        validation_losses[it] = validation_loss.item()
        if (it + 1) % 25 == 0:
            print(f'Epoch {it+1}/{num_epochs}, Train Loss: {loss.item():.4f}, '
                  f'Validation Loss: {validation_loss.item():.4f}')
    return train_losses, validation_losses
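For completeness, this is roughly how I wire everything together and call the trainer. All hyperparameter values and the data tensors below are illustrative placeholders rather than my actual settings:

# Placeholder wiring -- input_size = 57 (20 lagged values + 37 features) is an assumption,
# and the random tensors stand in for my real training/validation data.
input_size = 57
hidden_size = 128
num_layers = 1
decoder_input_size = 1   # previous production value
output_size = 1          # single production value per step
dropout_p = 0.0

encoder = EncoderRNN(input_size, hidden_size, num_layers, dropout_p).to(device)
decoder = Decoder_LSTMwAttention(decoder_input_size, hidden_size, num_layers,
                                 output_size, dropout_p).to(device)
model = Seq2Seq(encoder, decoder).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

train_input = torch.randn(168, 64, input_size).to(device)   # (seq_len, batch, features)
train_target = torch.randn(12, 64).to(device)               # e.g. a 12-step horizon
test_input = torch.randn(168, 16, input_size).to(device)
test_target = torch.randn(12, 16).to(device)

train_losses, validation_losses = Seq2seq_trainer(
    model, optimizer, train_input, train_target,
    test_input, test_target, criterion, num_epochs=500)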
Results I get
The issue seems to be that the decoder predicts a constant value at every time step and does not pick up on the variation (noise) in the data.
Topic sequence-to-sequence time-series machine-learning
Category Data Science