MNIST - Vanilla Neural Network - Why Cost Function is Increasing?

I've been combing through this code for a week now trying to figure out why my cost function is increasing, as in the image below. Reducing the learning rate does help, but only a little. Can anyone spot why the cost function isn't behaving as expected?

I realise a CNN would be preferable, but I still want to understand why this simple network is failing. Please help :)

import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import matplotlib.pyplot as plt

mnist = input_data.read_data_sets("MNIST_DATA/",one_hot=True)

def createPlaceholders():
    xph = tf.placeholder(tf.float32, (784, None))
    yph = tf.placeholder(tf.float32, (10, None))
    return xph, yph

def init_param(layers_dim):
    weights = {}
    L = len(layers_dim)

    for l in range(1,L):
        weights['W' + str(l)] = tf.get_variable('W' + str(l), shape=(layers_dim[l],layers_dim[l-1]), initializer= tf.contrib.layers.xavier_initializer())
        weights['b' + str(l)] = tf.get_variable('b' + str(l), shape=(layers_dim[l],1), initializer= tf.zeros_initializer())

    return weights

def forward_prop(X,L,weights):
    parameters = {}
    parameters['A0'] = tf.cast(X,tf.float32)

    for l in range(1,L-1):
        parameters['Z' + str(l)] = tf.add(tf.matmul(weights['W' + str(l)], parameters['A' + str(l-1)]), weights['b' + str(l)])
        parameters['A' + str(l)] = tf.nn.relu(parameters['Z' + str(l)])

    parameters['Z' + str(L-1)] = tf.add(tf.matmul(weights['W' + str(L-1)], parameters['A' + str(L-2)]), weights['b' + str(L-1)])
    return parameters['Z' + str(L-1)]

def compute_cost(ZL,Y):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = tf.cast(Y,tf.float32), logits = ZL))
    return cost

def randomMiniBatches(X,Y,minibatch_size):
    m = X.shape[1]
    shuffle = np.random.permutation(m)
    temp_X = X[:,shuffle]
    temp_Y = Y[:,shuffle]

    num_complete_minibatches = int(np.floor(m/minibatch_size))

    mini_batches = []

    for batch in range(num_complete_minibatches):
        mini_batches.append((temp_X[:,batch*minibatch_size: (batch+1)*minibatch_size], temp_Y[:,batch*minibatch_size: (batch+1)*minibatch_size]))

    mini_batches.append((temp_X[:,num_complete_minibatches*minibatch_size:], temp_Y[:,num_complete_minibatches*minibatch_size:]))

    return mini_batches

def model(X, Y, layers_dim, learning_rate = 0.001, num_epochs = 20, minibatch_size = 64):
    tf.reset_default_graph()
    costs = []

    xph, yph = createPlaceholders()
    weights = init_param(layers_dim)
    ZL = forward_prop(xph, len(layers_dim), weights)
    cost = compute_cost(ZL,yph)
    optimiser = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        for epoch in range(num_epochs):
            minibatches = randomMiniBatches(X,Y,minibatch_size)
            epoch_cost = 0

            for b, mini in enumerate(minibatches,1):
                mini_x, mini_y = mini
                _,c = sess.run([optimiser,cost],feed_dict={xph:mini_x,yph:mini_y})
                epoch_cost += c
            print('epoch: ',epoch+1,'/ ',num_epochs)

            epoch_cost /= len(minibatches)
            costs.append(epoch_cost)

    plt.plot(costs)
    plt.show()
    print(costs)



X_train = mnist.train.images.T
n_x = X_train.shape[0]
Y_train = mnist.train.labels.T
n_y = Y_train.shape[0]
layers_dim = [n_x,10,n_y]

model(X_train, Y_train, layers_dim)

Topic: mnist tensorflow neural-network

Category: Data Science


Can you try the same after normalizing the data? Many of us don't pay enough attention to that step when starting out. One way this can happen is that you normalize the training set in one place but then feed the raw, unnormalized data to the network when fitting.
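For instance, a minimal sketch of input scaling (the array names here are hypothetical; note that input_data.read_data_sets already rescales the MNIST images to [0, 1] by default, so this matters mainly if you load raw pixel data yourself):

import numpy as np

# Hypothetical raw MNIST batch: pixel values in [0, 255], shape (784, m).
X_raw = np.random.randint(0, 256, size=(784, 8)).astype(np.float32)

# Scale features to [0, 1] so the optimiser sees well-conditioned inputs.
X_scaled = X_raw / 255.0

# Or standardise each feature to zero mean and unit variance.
mu = X_scaled.mean(axis=1, keepdims=True)
sigma = X_scaled.std(axis=1, keepdims=True) + 1e-8  # avoid division by zero
X_standardised = (X_scaled - mu) / sigma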


TensorFlow's softmax_cross_entropy_with_logits applies the softmax along the last dimension, so it expects examples in the rows and class scores in the columns. Your tensors are laid out the other way round, as (10, batch), which means the softmax is being taken across the batch instead of across the ten classes, so the loss you are minimising is not the one you intend. You need to transpose the tensors in the cost function.
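A minimal sketch of that fix, changing only compute_cost and leaving the rest of your code as-is:

def compute_cost(ZL, Y):
    # ZL and Y arrive as (classes, batch); the softmax is applied along the
    # last axis, so transpose both to (batch, classes) before taking the loss.
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.transpose(tf.cast(Y, tf.float32)),
            logits=tf.transpose(ZL)))
    return cost

Equivalently, the TF 1.x version of softmax_cross_entropy_with_logits also takes a dim argument, so passing dim=0 applies the softmax down the class axis without transposing.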
