Chess deep learning siamese network overfitting when it shouldn't in theory

TL;DR: My network trains on pairs, so instead of 10^6 samples it effectively has 10^12 samples (the number of samples squared). With a data set that large it shouldn't overfit, but it does after very few epochs. I can't find the reason; any help is appreciated. Thanks.

I'm trying to implement a chess deep learning model like the one described in the paper "DeepChess: End-to-End Deep Neural Network for Automatic Learning in Chess" (https://www.cs.tau.ac.il/~wolf/papers/deepchess.pdf).

It uses a siamese neural network to compare two chess positions and predict which one is better, and an autoencoder to extract features. I implemented everything in Keras with TensorFlow as the backend, but my problem is that the siamese model is overfitting: it gets up to 87% accuracy after a couple of epochs (in the paper it reaches 97%) and then starts to overfit (the loss starts going up on the validation set), while loss and accuracy on the train set still improve.

In the paper it says: "Since the number of potential training pairs is 6.5 × 10^12, virtually all training samples in each epoch are new, thus guaranteeing that no overfitting would take place."
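
For a rough sense of where that figure comes from (my own back-of-the-envelope arithmetic; the ~1.8 × 10^6 positions per side is my assumption, and each white-win/black-win pair can be presented in both orderings):

    # Pair-count sketch; the per-side position count is an assumption, not from the paper
    white_positions = black_positions = 1.8e6
    num_pairs = 2 * white_positions * black_positions  # two orderings per pair
    print('%.1e' % num_pairs)                          # ~6.5e+12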

So in theory it shouldn't overfit, and no regularization should be needed.
Does anyone know how to deal with this problem?

It's my first project in deep learning, so I don't really know what to do right now. Maybe someone can point me in the right direction: what should I research, or where should I check for problems? (My assumption is that something is wrong in the siamese model or in the data generator, but everything looks fine to me.)

Thanks in advance!

I added the code of the project below.

If you want to compile and try it yourself, you will need to download the chess PGN games database (http://ccrl.chessdom.com/ccrl/4040/games.html).

Generating data:


    import random

    import chess
    import chess.pgn
    import numpy as np

    dataPath = "ChessDataBase.pgn"

    num_white_moves = 1000000
    num_black_moves = 1000000
    num_white_moves_per_arr = 100000
    num_black_moves_per_arr = 100000


    def get_valid_moves(game):
        valid_moves = []

        for i, move in enumerate(game.mainline_moves()):
            if not game.board().is_capture(move) and i >= 5:
                # Append the move index to the valid_moves list
                valid_moves.append(i)

        return valid_moves

    # Get bit representation of chess board
    def get_bitboard(board):
        bitboard = np.zeros(2 * 6 * 64 + 5, dtype='float32')

        piece_indices = {
            'p': 0,
            'n': 1,
            'b': 2,
            'r': 3,
            'q': 4,
            'k': 5}

        for i in range(64):
            if board.piece_at(i):
                color = int(board.piece_at(i).color)
                bitboard[(6 * color + piece_indices[board.piece_at(i).symbol().lower()] + 12 * i)] = 1

        bitboard[-1] = int(board.turn)
        bitboard[-2] = int(board.has_kingside_castling_rights(True))
        bitboard[-3] = int(board.has_kingside_castling_rights(False))
        bitboard[-4] = int(board.has_queenside_castling_rights(True))
        bitboard[-5] = int(board.has_queenside_castling_rights(False))

        return bitboard


    # Adds up to 8 randomly selected moves from game to move_array, starting at move_index
    def add_moves(game, move_array, move_index):
        valid_moves = get_valid_moves(game)
        moves_count = 0

        selected_moves = []
        for i in range(8):
            if not valid_moves:
                break

            move = random.choice(valid_moves)
            valid_moves.remove(move)
            selected_moves.append(move)
            moves_count = moves_count + 1


        board = chess.Board()
        for i, move in enumerate(game.mainline_moves()):
            board.push(move)

            if move_index >= move_array.shape[0]:
                break

            if i in selected_moves:
                move_array[move_index] = get_bitboard(board)
                move_index += 1

        return move_index, moves_count

    def iterate_over_data():
        white_moves = np.zeros((num_white_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
        black_moves = np.zeros((num_black_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')

        # white_moves_count and black_moves_count store how many white and black moves have been stored
        white_move_index = 0
        black_move_index = 0
        black_moves_count = 0
        white_moves_count = 0
        count = 0
        white_count = 1
        black_count = 1
        white_empty = True
        black_empty = True

        pgn = open(dataPath)

        while True:
            # Debug printing
            if count % 1000 == 0:
                print("Game Number: {count}\twhite moves: {white_moves}\tblack moves: {black_moves}".format(
                    count=count,
                    black_moves=black_moves_count,
                    white_moves=white_moves_count))
            game = chess.pgn.read_game(pgn)

            if not game or (white_moves_count >= num_white_moves and black_moves_count >= num_black_moves):
                break
            if game.headers["Result"] == "1-0" and white_moves_count  num_white_moves:
                white_move_index, moves_count = add_moves(game, white_moves, white_move_index % num_white_moves_per_arr)
                white_moves_count = white_moves_count + moves_count
            if game.headers["Result"] == "0-1" and black_moves_count  num_black_moves:
                black_move_index, moves_count = add_moves(game, black_moves, black_move_index % num_black_moves_per_arr)
                black_moves_count = black_moves_count + moves_count

            if white_moves_count >= num_white_moves_per_arr:
                print(len(white_moves))
                w_str = str(white_count)
                print("Saving white" + w_str + " array")
                np.save('data4/white' + w_str + '.npy', white_moves[:num_white_moves_per_arr])
                white_count = white_count + 1
                white_moves = np.zeros((num_white_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
                white_move_index = 0
                white_moves_count = 0

            if black_moves_count >= num_black_moves_per_arr:
                b_str = str(black_count)
                print("Saving black" + b_str + " array")
                np.save('data4/black' + b_str + '.npy', black_moves[:num_black_moves_per_arr])
                black_count = black_count + 1
                black_moves = np.zeros((num_black_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
                black_moves_count = 0
                black_move_index = 0

            count += 1


    iterate_over_data()
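
As a quick sanity check of the bitboard encoding (my own snippet, not part of the original project): encoding the starting position should give a 773-long vector with exactly 37 bits set, since there are 32 pieces, one side-to-move bit, and 4 castling-rights bits.

    board = chess.Board()
    bb = get_bitboard(board)
    print(bb.shape)       # (773,)
    print(int(bb.sum()))  # 37 = 32 pieces + 1 turn bit + 4 castling bits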

Splitting the data into train and validation set files:



    import numpy as np

    num_of_positions = 2800000
    num_of_positions_per_file = 100000
    num_of_train_positions = int(num_of_positions-100000)
    whites = np.zeros((num_of_positions, 773), dtype='float32')
    blacks = np.zeros((num_of_positions, 773), dtype='float32')
    for i in range(28):
        print(i + 1)
        whites[i * 100000:(i + 1) * 100000] = np.load('./data4/white' + str(i + 1) + '.npy')
        blacks[i * 100000:(i + 1) * 100000] = np.load('./data4/black' + str(i + 1) + '.npy')

    print("Shuffling white positions")
    np.random.shuffle(whites)
    print("Shuffling black positions")
    np.random.shuffle(blacks)

    train_whites = whites[:num_of_train_positions]
    train_blacks = blacks[:num_of_train_positions]

    val_whites = whites[num_of_train_positions:]
    val_blacks = blacks[num_of_train_positions:]

    for i in range(int(num_of_train_positions/num_of_positions_per_file)):
        np.save('./data5/white_train' + str(i+1) + '.npy', train_whites[i*num_of_positions_per_file:(i+1)*num_of_positions_per_file])
        np.save('./data5/black_train' + str(i+1) + '.npy', train_blacks[i*num_of_positions_per_file:(i+1)*num_of_positions_per_file])
        if i < int((num_of_positions-num_of_train_positions)/num_of_positions_per_file):
            np.save('./data5/white_val' + str(i + 1) + '.npy', val_whites[i * num_of_positions_per_file:(i + 1) * num_of_positions_per_file])
            np.save('./data5/black_val' + str(i + 1) + '.npy', val_blacks[i * num_of_positions_per_file:(i + 1) * num_of_positions_per_file])
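
A quick shape check (my own snippet) that the saved split files line up with the constants above:

    print(np.load('./data5/white_train1.npy').shape)  # (100000, 773)
    print(np.load('./data5/white_val1.npy').shape)    # (100000, 773)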

Training autoencoder:



    from keras.layers import Dense, Input
    from keras.models import Model
    import keras
    import numpy as np
    import gc
    from util import DenseTied


    class AutoEncoder:
        def __init__(self):
            self.positions = []
            self.positions_val = []
            self.model = None
            self.encoder = None
            self.decoder = None

        def __encoder(self):
            input_layer = Input(shape=(773,))
            hidden_1 = Dense(600, activation='relu')(input_layer)
            hidden_2 = Dense(400, activation='relu')(hidden_1)
            hidden_3 = Dense(200, activation='relu')(hidden_2)
            code = Dense(100, activation='relu')(hidden_3)

            encoder = Model(input_layer, code, name='encoder')
            encoder.summary()
            self.encoder = encoder
            return encoder

        def __decoder(self):
            code_input = Input(shape=(100,))
            hidden_1 = DenseTied(200, activation='relu', tied_to=self.encoder.layers[4])(code_input)
            hidden_2 = DenseTied(400, activation='relu', tied_to=self.encoder.layers[3])(hidden_1)
            hidden_3 = DenseTied(600, activation='relu', tied_to=self.encoder.layers[2])(hidden_2)
            output_layer = DenseTied(773, activation='sigmoid', tied_to=self.encoder.layers[1])(hidden_3)

            decoder = Model(code_input, output_layer, name='decoder')
            decoder.summary()
            self.decoder = decoder
            return decoder

        def encoder_decoder(self, load=0):
            input_layer = Input(shape=(773,))
            if load:
                self.encoder = keras.models.load_model('./Pos2Vec/encoder_v1/encoder_epoch66')
            else:
                self.__encoder()
            self.__decoder()

            ec_out = self.encoder(input_layer)
            dc_out = self.decoder(ec_out)

            autoencoder = Model(input_layer, dc_out, name='autoencoder')
            self.model = autoencoder
            self.model.summary()
            return autoencoder

        def train(self, batch_size=256, epochs=20):
            self.model.compile(optimizer='adam', loss='binary_crossentropy')
            self.load_data3()
            for epoch in range(epochs):
                self.shuffle_positions()
                gc.collect()
                train = self.positions[:2000000]
                self.model.fit(train, train, validation_data=(self.positions_val, self.positions_val), epochs=1, batch_size=batch_size)
                train = []
                gc.collect()

                print('Saving ./Pos2Vec/encoder_v1/encoder_epoch' + str(epoch+1))
                self.encoder.save('./Pos2Vec/encoder_v1/encoder_epoch' + str(epoch+1))

        def save(self):
            self.encoder.save('./weights3/encoder_v8.h5')
            # self.decoder.save('./weights3/decoder_v7.h5')
            # self.model.save('./weights3/autoencoder_v8.h5')

        def load_data3(self):
            positions = 2000000
            val_positions = 200000
            num_per_file = 100000
            self.positions = np.zeros((2*positions, 773), dtype='float32')
            self.positions_val = np.zeros((2*val_positions, 773), dtype='float32')
            for i in range(int(positions/num_per_file)):
                print(i + 1)
                start = i*num_per_file
                self.positions[start:start + num_per_file] = np.load('./data3/white_train' + str(i + 1) + '.npy')
                self.positions[positions + start:positions + start + num_per_file] = np.load('./data3/black_train' + str(i + 1) + '.npy')
                if i < val_positions/num_per_file:
                    self.positions_val[start:start + num_per_file] = np.load('./data3/white_val' + str(i + 1) + '.npy')
                    self.positions_val[val_positions + start:val_positions + start + num_per_file] = np.load('./data3/black_val' + str(i + 1) + '.npy')

        def shuffle_positions(self):
            # print("---Shuffling white positions---")
            # random.shuffle(self.white_positions)
            # gc.collect()
            # print("---Shuffling black positions---")
            # random.shuffle(self.black_positions)
            # gc.collect()
            print("Shuffling positions")
            np.random.shuffle(self.positions)
            gc.collect()

        def predict(self, data):
            return self.encoder.predict(data)


    if __name__ == '__main__':
        ae = AutoEncoder()
        ae.encoder_decoder(load=0)
        ae.train(batch_size=256, epochs=100)
        ae.save()
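
Once trained, the saved encoder can be used on its own as the Pos2Vec feature extractor. A minimal usage sketch (my own; the checkpoint name assumes the last saved epoch, and the encoder contains only standard Dense layers, so no custom_objects are needed):

    encoder = keras.models.load_model('./Pos2Vec/encoder_v1/encoder_epoch100')
    dummy_batch = np.random.randint(0, 2, size=(32, 773)).astype('float32')  # stand-in bitboards
    codes = encoder.predict(dummy_batch)
    print(codes.shape)  # (32, 100)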

DenseTied layer class:




    import keras
    from keras import backend as K


    class DenseTied(keras.layers.Layer):
        def __init__(self, units,
                     activation=None,
                     use_bias=True,
                     kernel_initializer='glorot_uniform',
                     bias_initializer='zeros',
                     kernel_regularizer=None,
                     bias_regularizer=None,
                     activity_regularizer=None,
                     kernel_constraint=None,
                     bias_constraint=None,
                     tied_to=None,
                     **kwargs):
            self.tied_to = tied_to
            if 'input_shape' not in kwargs and 'input_dim' in kwargs:
                kwargs['input_shape'] = (kwargs.pop('input_dim'),)
            super().__init__(**kwargs)
            self.units = units
            self.activation = keras.activations.get(activation)
            self.use_bias = use_bias
            self.kernel_initializer = keras.initializers.get(kernel_initializer)
            self.bias_initializer = keras.initializers.get(bias_initializer)
            self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
            self.bias_regularizer = keras.regularizers.get(bias_regularizer)
            self.activity_regularizer = keras.regularizers.get(activity_regularizer)
            self.kernel_constraint = keras.constraints.get(kernel_constraint)
            self.bias_constraint = keras.constraints.get(bias_constraint)
            self.input_spec = keras.layers.InputSpec(min_ndim=2)
            self.supports_masking = True

        def build(self, input_shape):
            assert len(input_shape) >= 2
            input_dim = input_shape[-1]

            if self.tied_to is not None:
                self.kernel = K.transpose(self.tied_to.kernel)
                self._non_trainable_weights.append(self.kernel)
            else:
                self.kernel = self.add_weight(shape=(input_dim, self.units),
                                              initializer=self.kernel_initializer,
                                              name='kernel',
                                              regularizer=self.kernel_regularizer,
                                              constraint=self.kernel_constraint)
            if self.use_bias:
                self.bias = self.add_weight(shape=(self.units,),
                                            initializer=self.bias_initializer,
                                            name='bias',
                                            regularizer=self.bias_regularizer,
                                            constraint=self.bias_constraint)
            else:
                self.bias = None
            self.input_spec = keras.layers.InputSpec(min_ndim=2, axes={-1: input_dim})
            self.built = True

        def compute_output_shape(self, input_shape):
            assert input_shape and len(input_shape) >= 2
            output_shape = list(input_shape)
            output_shape[-1] = self.units
            return tuple(output_shape)

        def call(self, inputs, **kwargs):
            output = K.dot(inputs, self.kernel)
            if self.use_bias:
                output = K.bias_add(output, self.bias, data_format='channels_last')
            if self.activation is not None:
                output = self.activation(output)
            return output
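
A minimal check (my own sketch) that the tying works as intended: the tied layer's kernel should be the transpose of the kernel it is tied to, so the decoder adds no extra kernel parameters.

    import numpy as np
    from keras.layers import Input, Dense
    from keras.models import Model

    inp = Input(shape=(8,))
    enc = Dense(3, activation='linear')
    out = DenseTied(8, activation='linear', tied_to=enc)(enc(inp))
    model = Model(inp, out)
    print(np.allclose(K.eval(model.layers[-1].kernel),
                      K.eval(enc.kernel).T))  # True: the kernels are shared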

Siamese model training:



    import chess
    import numpy as np
    import gc
    import tensorflow as tf
    import keras
    from keras.models import Model
    from keras.layers import Input, Dense, concatenate
    from keras.utils import Sequence
    import random
    import matplotlib.pyplot as plt


    def batch_to_data(white_batch, black_batch):
        x1 = []
        x2 = []
        y = []
        min_len = min(len(white_batch), len(black_batch))
        for i in range(min_len):
            if random.randint(0, 1) == 1:
                x1.append(white_batch[i])
                x2.append(black_batch[i])
                y.append([1, 0])
            else:
                x1.append(black_batch[i])
                x2.append(white_batch[i])
                y.append([0, 1])
        x1 = np.array(x1).reshape((-1, 773))
        x2 = np.array(x2).reshape((-1, 773))
        y = np.array(y).reshape((-1, 2))

        return [x1, x2], y


    # A class for training the models using keras fit_generator
    class DeepChessDataGenerator(Sequence):
        def __init__(self, batch_size, whites, blacks, train=1):
            print("---Initializing data generator---")
            self.batch_size = batch_size
            self.train = train
            self.whites = whites
            self.blacks = blacks
            if train:
                self.num_of_positions = 1000000
            else:
                self.num_of_positions = len(self.whites)

        def __len__(self):
            return int(np.floor(self.num_of_positions / self.batch_size))

        def __getitem__(self, index):
            white_batch = self.whites[index * self.batch_size:(index+1) * self.batch_size]
            black_batch = self.blacks[index * self.batch_size:(index+1) * self.batch_size]

            return batch_to_data(white_batch, black_batch)

        # Shuffle the order of the white and blacks
        def on_epoch_end(self):
            np.random.shuffle(self.whites)
            np.random.shuffle(self.blacks)


    class DeepChess:
        def __init__(self):
            self.model = None               # DeepChess model
            self.encoder = None             # Encoder model; encodes the 773-bit chess board into 100 features

        # Sets up the deep neural network chess model. If load=1, loads a saved model; otherwise creates a new one
        def neural_chess(self, load=0):
            input_size = 773
            layer1_size = 400
            layer2_size = 200
            layer3_size = 100
            if load == 1:
                # model = keras.models.load_model('dc_models/my_model11.h5')
                # encoder = keras.models.load_model('dc_models/encoder11.h5')
                model = keras.models.load_model('dc_models/dropout/deepchess-18-0.281-0.846.h5')
                encoder = None
            else:
                # encoder = keras.models.load_model('./Pos2Vec/encoder_v1/encoder_epoch94')

                input_layer0 = Input(shape=(input_size, ))
                layer1 = Dense(600, activation='relu')(input_layer0)
                layer2 = Dense(400, activation='relu')(layer1)
                layer3 = Dense(200, activation='relu')(layer2)
                layer4 = Dense(100, activation='relu')(layer3)

                encoder = Model(input_layer0, layer4)

                input_layer1 = Input(shape=(input_size, ))
                input_layer2 = Input(shape=(input_size,))

                e1 = encoder(input_layer1)
                e2 = encoder(input_layer2)

                combined = concatenate([e1, e2])

                layer1 = Dense(layer1_size, activation='relu')(combined)

                layer2 = Dense(layer2_size, activation='relu')(layer1)

                layer3 = Dense(layer3_size, activation='relu')(layer2)

                output_layer = Dense(2, activation='softmax')(layer3)

                model = Model(inputs=[input_layer1, input_layer2], outputs=output_layer)
            self.model = model
            self.encoder = encoder
            model.summary()
            return model

        # Trains the model for the given number of epochs
        def fit(self, epochs=50, batch_size=256):

            num_of_positions = 2800000
            num_of_positions_per_file = 100000
            num_of_train_positions = int(num_of_positions-100000)
            num_of_val_positions = num_of_positions - num_of_train_positions

            train_whites = np.zeros((num_of_train_positions, 773), dtype='float32')
            train_blacks = np.zeros((num_of_train_positions, 773), dtype='float32')

            val_whites = np.zeros((num_of_val_positions, 773), dtype='float32')
            val_blacks = np.zeros((num_of_val_positions, 773), dtype='float32')

            for i in range(int(num_of_train_positions/num_of_positions_per_file)):
                print('Loading data ' + str(i + 1) + '/27', end='\r')
                train_whites[i * 100000:(i + 1) * 100000] = np.load('./data5/white_train' + str(i + 1) + '.npy')
                train_blacks[i * 100000:(i + 1) * 100000] = np.load('./data5/black_train' + str(i + 1) + '.npy')
                if i < int(num_of_val_positions/num_of_positions_per_file):
                    val_whites[i * 100000:(i + 1) * 100000] = np.load('./data5/white_val' + str(i + 1) + '.npy')
                    val_blacks[i * 100000:(i + 1) * 100000] = np.load('./data5/black_val' + str(i + 1) + '.npy')
            print()
            # Data generators
            train_generator = DeepChessDataGenerator(batch_size, whites=train_whites, blacks=train_blacks, train=1)
            val_generator = DeepChessDataGenerator(batch_size, whites=val_whites, blacks=val_blacks, train=0)

            self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
            self.model.summary()

            history = self.model.fit_generator(train_generator, validation_data=val_generator, epochs=epochs, shuffle=True)

            # Plot training & validation accuracy values
            plt.plot(history.history['acc'])
            plt.plot(history.history['val_acc'])
            plt.title('Model accuracy')
            plt.ylabel('Accuracy')
            plt.xlabel('Epoch')
            plt.legend(['Train', 'Test'], loc='upper left')
            plt.show()

            # Plot training & validation loss values
            plt.plot(history.history['loss'])
            plt.plot(history.history['val_loss'])
            plt.title('Model loss')
            plt.ylabel('Loss')
            plt.xlabel('Epoch')
            plt.legend(['Train', 'Test'], loc='upper left')
            plt.show()

    if __name__ == '__main__':
        dc = DeepChess()
        dc.neural_chess(load=0)
        dc.fit(epochs=20, batch_size=256)
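
For completeness, this is how the trained model could be queried to compare two positions (my own sketch; board_a and board_b are hypothetical python-chess boards, and get_bitboard is the helper from the data-generation script above):

    left = get_bitboard(board_a).reshape(1, 773)
    right = get_bitboard(board_b).reshape(1, 773)
    probs = dc.model.predict([left, right])[0]
    print('P(first better), P(second better):', probs)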

[Graph of accuracy and loss after 20 epochs]
