CNN: sudden drops in accuracy between epochs — what steps can I take to improve it?
I am working on a text recognition problem, in which essentially I am trying to read images similar to captchas.
I implemented a ResNet in Keras and I run it on Colab with a GPU.
Because I cannot upload a million pictures, I have created a loop in which I train the model on a subset (10,000 pictures), save the model, and then load the next subset of pictures and continue training.
I did some hyperparameter tuning by trial and error, but mostly I am using the original ResNet. My batch size is 128 and the number of epochs is 30.
The model quickly reaches a validation accuracy of 14% (after training it on 50k images), but I see no further improvement after that. In theory, I would expect the accuracy to keep improving the more I train it.
Question 1: What should I do next to improve the model's accuracy?
Secondly, when I train it, I get some sudden drops in the validation accuracy, and I can't understand why this happens.
Finally, most of the time I reach a peak in validation accuracy at an early stage (a low number of epochs), and later it goes a bit lower.
Question 2: Why does the model behave like that?
Any ideas on the topic would be very welcome :) Thank you.
The code is the following:
Note: I know it is quite a lot; I am adding it just in case.
# NOTE(review): presumably switches between loading a previously saved ResNet
# and building a fresh one — the body of this branch (and any `else:`) is not
# visible here because the original indentation was lost; confirm against the
# original notebook before relying on this flag.
load_res_net = False
if load_res_net:
import keras
from keras.layers import Dense, Conv2D, BatchNormalization, Activation
from keras.layers import AveragePooling2D, Input, Flatten
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from keras import backend as K
from keras.models import Model
from keras.datasets import cifar10
import numpy as np
import os
# Training parameters
# NOTE(review): `batch_size` and `epochs` are defined here, but the fit call
# further down hard-codes its own batch size and epoch count — confirm which
# values are intended.
batch_size = 32  # orig paper trained all networks with batch_size=128
epochs = 200

# Size of the output layer: ten output units per alphabet symbol.
# NOTE(review): the factor 10 is presumably the maximum number of characters
# per image — confirm against `one_hot_encoding_for_word`.
num_classes = len(alphabet) * 10

# Single-channel input; the two entries of `pic_dim_for_model` are swapped
# here (presumably (width, height) -> (rows, cols) — TODO confirm).
input_shape = (pic_dim_for_model[1], pic_dim_for_model[0], 1)

# Subtracting pixel mean improves accuracy
subtract_pixel_mean = True

# Number of residual blocks per stage; determines the network depth below.
n = 3

# Model version
# Orig paper: version = 1 (ResNet v1), Improved ResNet: version = 2 (ResNet v2)
version = 1

# Computed depth from supplied model parameter n
if version == 1:
    depth = n * 6 + 2
elif version == 2:
    depth = n * 9 + 2
else:
    # Without this branch `depth` would be undefined and the failure would
    # surface later as an unrelated NameError.
    raise ValueError('version must be 1 or 2, got %r' % (version,))

# Model name, depth and version
model_type = 'ResNet%dv%d' % (depth, version)
def lr_schedule(epoch):
    """Learning Rate Schedule

    Learning rate is scheduled to be reduced after 80, 120, 160, 180 epochs.
    Intended to be called once per epoch by a learning-rate callback during
    training. The chosen rate is also printed.

    # Arguments
        epoch (int): The number of epochs

    # Returns
        lr (float): learning rate
    """
    lr = 1e-3
    # Thresholds are checked from the largest down so that only the strongest
    # applicable reduction is applied to the base rate.
    if epoch > 180:
        lr *= 0.5e-3
    elif epoch > 160:
        lr *= 1e-3
    elif epoch > 120:
        lr *= 1e-2
    elif epoch > 80:
        lr *= 1e-1
    print('Learning rate: ', lr)
    return lr
def resnet_layer(inputs,
                 num_filters=16,
                 kernel_size=3,
                 strides=1,
                 activation='relu',
                 batch_normalization=True,
                 conv_first=True):
    """2D Convolution-Batch Normalization-Activation stack builder

    NOTE(review): the parameter defaults and the Conv2D keyword arguments
    were truncated in the available source. The values used here follow the
    Keras CIFAR-10 ResNet example this code is based on — confirm them
    against the original notebook, since the author mentions hyperparameter
    tuning.

    # Arguments
        inputs (tensor): input tensor from input image or previous layer
        num_filters (int): Conv2D number of filters
        kernel_size (int): Conv2D square kernel dimensions
        strides (int): Conv2D square stride dimensions
        activation (string): activation name, or None to skip the activation
        batch_normalization (bool): whether to include batch normalization
        conv_first (bool): conv-bn-activation (True) or
            bn-activation-conv (False)

    # Returns
        x (tensor): tensor as input to the next layer
    """
    conv = Conv2D(num_filters,
                  kernel_size=kernel_size,
                  strides=strides,
                  padding='same',
                  kernel_initializer='he_normal',
                  kernel_regularizer=l2(1e-4))

    x = inputs
    if conv_first:
        # conv -> BN -> activation
        x = conv(x)
        if batch_normalization:
            x = BatchNormalization()(x)
        if activation is not None:
            x = Activation(activation)(x)
    else:
        # BN -> activation -> conv
        if batch_normalization:
            x = BatchNormalization()(x)
        if activation is not None:
            x = Activation(activation)(x)
        x = conv(x)
    return x
def resnet_v1(input_shape, depth, num_classes):
    """ResNet Version 1 Model builder [a]

    Stacks of 2 x (3 x 3) Conv2D-BN-ReLU
    Last ReLU is after the shortcut connection.
    At the beginning of each stage, the feature map size is halved (downsampled)
    by a convolutional layer with strides=2, while the number of filters is
    doubled. Within each stage, the layers have the same number of filters and
    the same feature map sizes.
    Features maps sizes (for a 32x32 input):
    stage 0: 32x32, 16
    stage 1: 16x16, 32
    stage 2:  8x8,  64
    The Number of parameters is approx the same as Table 6 of [a]:
    ResNet20 0.27M
    ResNet32 0.46M
    ResNet44 0.66M
    ResNet56 0.85M
    ResNet110 1.7M

    NOTE(review): the keyword arguments of the inner `resnet_layer(...)` and
    `Dense(...)` calls were truncated in the available source. They are
    restored here from the Keras CIFAR-10 ResNet example and from the
    surrounding comments — confirm against the original notebook.

    # Arguments
        input_shape (tensor): shape of input image tensor
        depth (int): number of core convolutional layers
        num_classes (int): number of classes (CIFAR10 has 10)

    # Returns
        model (Model): Keras model instance

    # Raises
        ValueError: if `depth` is not of the form 6n+2
    """
    if (depth - 2) % 6 != 0:
        raise ValueError('depth should be 6n+2 (eg 20, 32, 44 in [a])')
    # Start model definition.
    num_filters = 16
    num_res_blocks = (depth - 2) // 6

    inputs = Input(shape=input_shape)
    x = resnet_layer(inputs=inputs)
    # Instantiate the stack of residual units
    for stack in range(3):
        for res_block in range(num_res_blocks):
            strides = 1
            if stack > 0 and res_block == 0:  # first layer but not first stack
                strides = 2  # downsample
            y = resnet_layer(inputs=x,
                             num_filters=num_filters,
                             strides=strides)
            # No activation here: the ReLU is applied after the shortcut add.
            y = resnet_layer(inputs=y,
                             num_filters=num_filters,
                             activation=None)
            if stack > 0 and res_block == 0:  # first layer but not first stack
                # linear projection residual shortcut connection to match
                # changed dims
                x = resnet_layer(inputs=x,
                                 num_filters=num_filters,
                                 kernel_size=1,
                                 strides=strides,
                                 activation=None,
                                 batch_normalization=False)
            x = keras.layers.add([x, y])
            x = Activation('relu')(x)
        num_filters *= 2

    # Add classifier on top.
    # v1 does not use BN after last shortcut connection-ReLU
    # NOTE(review): pool_size=8 matches the 8x8 final feature map of a 32x32
    # input; for other input sizes the pooled output will not be 1x1.
    x = AveragePooling2D(pool_size=8)(x)
    y = Flatten()(x)
    # NOTE(review): the activation was truncated in the source; 'softmax'
    # follows the Keras example. If the target encodes several characters at
    # once, a single softmax over all units may not be what is intended —
    # TODO confirm.
    outputs = Dense(num_classes,
                    activation='softmax',
                    kernel_initializer='he_normal')(y)

    # Instantiate model.
    model = Model(inputs=inputs, outputs=outputs)
    return model
subtract_pixel_mean = True

# Pixel mean used for centring. It is computed on the first chunk only and
# then reused, so that every chunk (and any later inference) is normalised
# with the same statistics instead of a mean that shifts from chunk to chunk.
x_train_mean = None

# Train incrementally: each pass generates a fresh chunk of captcha images,
# continues training the same model on it, and saves the model to disk.
for tries in range(10):
    data, labels = generate_captchas(
        number_of_generated_images_to_create=8000,
        fonts_path=data_path + '/fonts',
        lines=1)
    data2, labels2 = generate_captchas(
        number_of_generated_images_to_create=8000,
        fonts_path=data_path + '/fonts',
        lines=2)
    # NOTE(review): `+` concatenates only if generate_captchas returns lists;
    # with NumPy arrays it would add element-wise — confirm.
    data = data + data2
    labels = labels + labels2
    del data2, labels2

    # Scale pixel values by 1/255.
    data = np.array(data, dtype="float") / 255.0
    labels = np.array(labels)
    print('try : ' + str(tries))
    print('images loaded: ' + str(len(labels)))

    # Split the training data into separate train and test sets
    # NOTE(review): the validation set is regenerated for every chunk, so
    # validation accuracy is not measured on the same images across chunks.
    (X_train, X_test, Y_train, Y_test) = train_test_split(
        data, labels, test_size=0.25, random_state=0)
    Y_test = np.array([one_hot_encoding_for_word(x) for x in Y_test])
    Y_train = np.array([one_hot_encoding_for_word(x) for x in Y_train])
    del data

    # If subtract pixel mean is enabled
    if subtract_pixel_mean:
        if x_train_mean is None:
            x_train_mean = np.mean(X_train, axis=0)
        X_train -= x_train_mean
        X_test -= x_train_mean

    # load next part of data, and the model and continue training
    # NOTE(review): the start of this call was lost in the source; `model`
    # is expected to be defined (built or loaded) before this loop — confirm.
    model.fit(X_train, Y_train,
              validation_data=(X_test, Y_test),
              batch_size=2**7, epochs=40, verbose=2)

    # Save the trained model to disk
    # NOTE(review): the start of this call was lost in the source; the path
    # prefix is assumed to be `data_path` — confirm.
    model.save(data_path + '/' + name_of_model + '.h5')

    # Release the chunk before the next one is generated to limit peak memory.
    del X_train, Y_train
