How to add a Decoder & Attention Layer to Bidirectional Encoder with tensorflow 2.0
I am a beginner in machine learning and I'm trying to create a spelling correction model that spell checks for a small amount of vocab (approximately 1000 phrases). Currently, I am refering to the tensorflow 2.0 tutorials for 1. NMT with Attention, and 2. Text Generation. I have completed up to an encoding layer but currently I am having some issue matching up the shape of the following layers (decoder and attention) with the previous (encoder). The encoder in the tutorial is not bidirectional whereas I am trying to implement a bidirectional encoder. Below is my code for the encoder and attention layer.
class Encoder(tf.keras.Model):
def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
super(Encoder, self).__init__()
self.batch_sz = batch_sz
self.enc_units = enc_units
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
self.bigru = tf.keras.layers.Bidirectional(
def call(self, x, hidden):
x = self.embedding(x)
output, forward_state, backward_state = self.bigru(x, initial_state = hidden)
hidden_state = tf.convert_to_tensor([forward_state, backward_state])
return output, hidden_state
def initialize_hidden_state(self):
init_state = [tf.zeros((self.batch_sz, self.enc_units)) for i in range(2)]
return init_state
embedding_dim = 10
enc_units = 100
batch_size = 64
encoder = Encoder(vocab_size, embedding_dim, enc_units, batch_size)
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))
WARNING:tensorflow:Layer gru_20 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer gru_20 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer gru_20 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
Encoder output shape: (batch size, sequence length, units) (64, 27, 200)
Encoder Hidden state shape: (batch size, units) (2, 64, 100)
Encoder Hidden (backward) state shape: (batch size, units) (64, 100)
class BahdanauAttention(tf.keras.layers.Layer):
def __init__(self, units):
super(BahdanauAttention, self).__init__()
self.W1 = tf.keras.layers.Dense(units)
self.W2 = tf.keras.layers.Dense(units)
self.V = tf.keras.layers.Dense(1)
def call(self, query, values):
# query hidden state shape == (batch_size, hidden size)
# query_with_time_axis shape == (batch_size, 1, hidden size)
# values shape == (batch_size, max_len, hidden size)
# we are doing this to broadcast addition along the time axis to calculate the score
query_with_time_axis = tf.expand_dims(query, 1)
# score shape == (batch_size, max_length, 1)
# we get 1 at the last axis because we are applying score to self.V
# the shape of the tensor before applying self.V is (batch_size, max_length, units)
score = self.V(tf.nn.tanh(
self.W1(query_with_time_axis) + self.W2(values)))
# attention_weights shape == (batch_size, max_length, 1)
attention_weights = tf.nn.softmax(score, axis=1)
# context_vector shape after sum == (batch_size, hidden_size)
context_vector = attention_weights * values
context_vector = tf.reduce_sum(context_vector, axis=1)
return context_vector, attention_weights
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))
InvalidArgumentError Traceback (most recent call last) in () 1 attention_layer = BahdanauAttention(10) ----> 2 attention_result, attention_weights = attention_layer(sample_hidden, sample_output) 3 4 print("Attention result shape: (batch size, units) {}".format(attention_result.shape)) 5 print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))
6 frames /usr/local/lib/python3.6/dist-packages/ in raise_from(value, from_value)
InvalidArgumentError: Incompatible shapes: [2,1,64,10] vs. [64,27,10] [Op:AddV2]
