Multi-step forecasts of factory production data using a Seq2Seq Encoder-Decoder Model with Attention

I am attempting to use a Seq2Seq encoder-decoder model augmented with attention to forecast factory production data. I have become a little stuck: the output of the model seems to be a constant, and the output sequence has the same length as the input, whereas I would like to be able to specify that I want to forecast, say, 3 (or any number of) months into the future.

Here are two diagrams of the Seq2Seq architecture and the attention mechanism I am looking to construct:

The Target

To my understanding, I want to be predicting the production volume of a given material from this factory into the future. Its dimensionality is therefore $1$, and it is of course an integer.

The Encoder

The encoder takes as input a sequence of length $168$, with each element of the sequence consisting of the previous $20$ days' data as well as $37$ factory-level features such as the number of workers.

The Decoder

This is where I get confused and where I am running into issues with my code. Again, to my understanding, the decoder should take the previous time-step's production level as input (meaning dimension $1$), as well as the previous hidden and cell state.
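
To make the shapes concrete, here is a minimal sketch of the tensors I have in mind, assuming the default nn.LSTM layout of (seq_len, batch, input_size); the batch size and the exact feature count per time step are illustrative placeholders, not values from my real data:

import torch

# Illustrative only: nn.LSTM with batch_first=False expects
# (seq_len, batch, input_size) as its input shape.
seq_len, batch_size, input_size = 168, 32, 37
encoder_input = torch.randn(seq_len, batch_size, input_size)

# A single decoder step would take the previous production level,
# i.e. one scalar per series in the batch.
decoder_step_input = torch.randn(batch_size)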

Code

import random

import numpy as np
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, p):
        super(EncoderRNN, self).__init__()
        
        self.lstm = nn.LSTM(input_size, hidden_size,
                            num_layers, dropout = p, 
                            bidirectional = True)

        self.fc_hidden = nn.Linear(hidden_size*2, hidden_size) 
        self.fc_cell = nn.Linear(hidden_size*2, hidden_size)

    def forward(self, input):
        print(f"Encoder input shape is {input.shape}")
        
        encoder_states, (hidden, cell_state) = self.lstm(input)

        print(f"Encoder Hidden: {hidden.shape}")
        print(f"Encoder Cell: {cell_state.shape}")

        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim = 2))
        cell = self.fc_cell(torch.cat((cell_state[0:1], cell_state[1:2]), dim = 2))

        print(f"Encoder Hidden: {hidden.shape}")
        print(f"Encoder Cell: {cell.shape}")
        
        return encoder_states, hidden, cell
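
For reference, a quick shape check of the encoder above with illustrative values (batch of 4, hidden size 64, a single bidirectional layer; these are assumptions, not my real hyper-parameters):

encoder = EncoderRNN(input_size=37, hidden_size=64, num_layers=1, p=0.0)
src = torch.randn(168, 4, 37)                 # (seq_len, batch, features)
encoder_states, hidden, cell = encoder(src)
print(encoder_states.shape)                   # (168, 4, 128): both directions
print(hidden.shape, cell.shape)               # (1, 4, 64) each after the Linear layers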


class Decoder_LSTMwAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, p):
        super(Decoder_LSTMwAttention, self).__init__()
       
        self.rnn = nn.LSTM(hidden_size*2 + input_size, hidden_size,
                           num_layers)

        self.energy = nn.Linear(hidden_size * 3, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=0)
        self.dropout = nn.Dropout(p)
        self.relu = nn.ReLU()  

        self.attention_combine = nn.Linear(hidden_size, hidden_size)


    def forward(self, input, encoder_states, hidden, cell):


        input = input.unsqueeze(0)
        input = input.unsqueeze(0)

        input = self.dropout(input)

        sequence_length = encoder_states.shape[0]
        h_reshaped = hidden.repeat(sequence_length, 1, 1)

        concatenated = torch.cat((h_reshaped, encoder_states), dim = 2)
        print(f"Concatenated size: {concatenated.shape}")

        energy = self.relu(self.energy(concatenated))
        attention = self.softmax(energy)
        attention = attention.permute(1, 0, 2)

        encoder_states = encoder_states.permute(1, 0, 2)

        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)
        
        rnn_input = torch.cat((context_vector, input), dim = 2)

        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        output = self.fc(output).squeeze(0)
        
        return output, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        #target_vocab_size = len(english.vocab)

        outputs = torch.zeros(target_len, batch_size).to(device)
        encoder_states, hidden, cell = self.encoder(source)

        # First input will be SOS token
        x = target[0]

        for t in range(1, target_len):
            # At every time step use encoder_states and update hidden, cell
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)

            # Store prediction for current time step
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to. This was a long comment.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs
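
To make the goal concrete: the loop above runs for target.shape[0] steps, so the forecast horizon currently comes from the length of the target tensor. What I am after is roughly the pattern below, sketched with a dummy LSTMCell purely for illustration (none of these names or sizes come from my real model), where the horizon is chosen independently of the input length:

import torch
import torch.nn as nn

hidden_size, horizon, batch = 16, 3, 4
cell = nn.LSTMCell(input_size=1, hidden_size=hidden_size)
head = nn.Linear(hidden_size, 1)

h = torch.zeros(batch, hidden_size)   # would come from the encoder
c = torch.zeros(batch, hidden_size)
x = torch.zeros(batch, 1)             # last observed production value

preds = []
for _ in range(horizon):              # horizon chosen freely, e.g. 3 months
    h, c = cell(x, (h, c))
    x = head(h)                       # feed the prediction back in as input
    preds.append(x)

forecast = torch.cat(preds, dim=1)    # shape: (batch, horizon) = (4, 3)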

Training Routine

def Seq2seq_trainer(model, optimizer, train_input, train_target,
                  test_input, test_target, criterion, num_epochs):

    train_losses = np.zeros(num_epochs)
    validation_losses = np.zeros(num_epochs)

    for it in range(num_epochs):
        # zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(train_input, train_target)  
        loss = criterion(outputs, train_target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Save losses
        train_losses[it] = loss.item()

        # Test loss
        
        test_outputs = model(test_input, test_target)
        validation_loss = criterion(test_outputs, test_target)
        validation_losses[it] = validation_loss.item()
            
        if (it + 1) % 25 == 0:
            print(f'Epoch {it+1}/{num_epochs}, Train Loss: {loss.item():.4f}, Validation Loss: {validation_loss.item():.4f}')

    return train_losses, validation_losses

Results I get

The issue seems to be that the decoder predicts a constant value at each time step and does not pick up on the noise in the data.

Topic sequence-to-sequence time-series machine-learning

Category Data Science


Breaking this down into a number of questions. Firstly:

I want to forecast 3 months into the future.

You need at least 3+ months' worth of data to do this. That means your "forecast horizon" needs to be a subset of your data set, within which you can define how far ahead you want to be making predictions. See for example the image below.
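
To make that splitting concrete, here is a minimal sketch (the function and variable names are my own, purely illustrative) of slicing a series into fixed-length history windows paired with a 3-step forecast horizon:

import numpy as np

def make_windows(series, history_len, horizon):
    # Slice a 1-D series into (history, future) training pairs.
    X, Y = [], []
    for start in range(len(series) - history_len - horizon + 1):
        X.append(series[start : start + history_len])
        Y.append(series[start + history_len : start + history_len + horizon])
    return np.stack(X), np.stack(Y)

# e.g. 36 months of data, 24 months of history per window, predict 3 months ahead
series = np.arange(36, dtype=np.float32)
X, Y = make_windows(series, history_len=24, horizon=3)
print(X.shape, Y.shape)    # (10, 24) (10, 3)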

the Decoder should be taking the previous time-steps production levels as input (meaning dimension 1), as well as the previous hidden and cell state.

I think you are mixing up the attention-pooling architecture with self-attention/transformer concepts.

  1. In a typical encoder/decoder RNN, the decoder needs to consume the hidden state output from each LSTM cell/time step. Unless you are trying something more experimental, the cell state and the raw input data should not be passed to the decoder. If you are using Luong (multiplicative) or Bahdanau (additive) attention pooling, again only the hidden state is needed for its calculation. I am not sure your attention method is entirely correct. I paste below an example of how your decoder with additive attention could look:
import torch
import torch.nn as nn
import torch.nn.functional as F


class AttnDecoderRNN(nn.Module):
    """
    Courtesy of https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
    """


    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Here is the attention pooling calculation
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
  2. Rightfully so, you claim that the input and the hidden state need to be passed to the decoder, and I presume you are referring to a self-attention architecture. In that model there are no encoder/decoder recurrent units; rather, each input is projected into query, key and value latent spaces, as in the following diagram. You can read more about the self-attention architecture here https://towardsdatascience.com/self-attention-5b95ea164f61 before you decide how you want to proceed with your problem; a minimal sketch is given after this list.
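
If you do want to explore that route, below is a minimal sketch of single-head scaled dot-product self-attention (purely illustrative, not tied to your data or to the code above):

import math

import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    # Single-head scaled dot-product self-attention over a sequence.
    def __init__(self, d_model):
        super().__init__()
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)

    def forward(self, x):                         # x: (batch, seq_len, d_model)
        Q, K, V = self.q(x), self.k(x), self.v(x)
        scores = Q @ K.transpose(-2, -1) / math.sqrt(x.size(-1))
        weights = torch.softmax(scores, dim=-1)   # (batch, seq_len, seq_len)
        return weights @ V                        # (batch, seq_len, d_model)

x = torch.randn(4, 168, 37)                       # (batch, seq_len, features)
out = SelfAttention(d_model=37)(x)
print(out.shape)                                  # torch.Size([4, 168, 37])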
