Chess deep learning siamese network overfitting when it shouldn't in theory
TL;DR: My network trains on pairs of positions, so instead of ~10^6 samples it effectively has ~10^12 samples (the number of samples squared). With a dataset that large it shouldn't overfit, but it does after very few epochs. I can't find the reason; any help is appreciated. Thanks.
I'm trying to implement a chess deep learning model as described in the paper "DeepChess: End-to-End Deep Neural Network for Automatic Learning in Chess" (https://www.cs.tau.ac.il/~wolf/papers/deepchess.pdf).
It uses a siamese neural network to compare two chess positions and predict which one is better, plus an autoencoder to extract features. I implemented everything in Keras with TensorFlow as the backend, but my problem is that the siamese model is overfitting: it reaches up to 87% accuracy after a couple of epochs (the paper reports up to 97%) and then starts to overfit (the loss on the validation set starts going up) while loss and accuracy on the training set still improve.
In the paper it says: "Since the number of potential training pairs is 6.5 × 10^12, virtually all training samples in each epoch are new, thus guaranteeing that no overfitting would take place."
So in theory it shouldn't overfit, and no regularization should be needed.
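To put a number on that claim, here is my back-of-the-envelope check (using my dataset sizes, not the paper's):
n_white = 10 ** 6                    # positions from white wins
n_black = 10 ** 6                    # positions from black wins
potential_pairs = n_white * n_black  # every (white-win, black-win) pairing is a distinct sample: 10^12
pairs_per_epoch = 10 ** 6            # my generator yields roughly one pair per white/black position pair per epoch
print(20 * pairs_per_epoch / potential_pairs)  # 2e-05 -> 20 epochs see only a tiny fraction of all pairs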
Does anyone know how to deal with this problem?
It's my first deep learning project, so I don't really know what to do right now. Maybe someone can point me in the right direction: what should I research, or where should I check for problems? (My assumption is that something is wrong in the siamese model or in the data generator, but everything looks fine to me.)
Thanks in advance!
I added the code of the project below.
If you want to run it yourself, you will need to download the chess PGN games database (http://ccrl.chessdom.com/ccrl/4040/games.html).
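As a quick check that the download parses correctly with python-chess, something like this should print a result tag:
import chess.pgn

with open("ChessDataBase.pgn") as pgn:
    game = chess.pgn.read_game(pgn)
print(game.headers["Result"])  # e.g. "1-0"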
Generating data:
import random

import chess
import chess.pgn
import numpy as np

dataPath = "ChessDataBase.pgn"
num_white_moves = 1000000
num_black_moves = 1000000
num_white_moves_per_arr = 100000
num_black_moves_per_arr = 100000


# Returns indices of moves that are not captures and are past the first 5 moves
def get_valid_moves(game):
    valid_moves = []
    board = game.board()
    for i, move in enumerate(game.mainline_moves()):
        if not board.is_capture(move) and i >= 5:
            # Append the move index to the valid_moves list
            valid_moves.append(i)
        board.push(move)
    return valid_moves
# Get bit representation of chess board (773 bits: 2 colors x 6 pieces x 64 squares + 5 state bits)
def get_bitboard(board):
    bitboard = np.zeros(2 * 6 * 64 + 5, dtype='float32')
    piece_indices = {
        'p': 0,
        'n': 1,
        'b': 2,
        'r': 3,
        'q': 4,
        'k': 5}
    for i in range(64):
        if board.piece_at(i):
            color = int(board.piece_at(i).color)
            bitboard[(6 * color + piece_indices[board.piece_at(i).symbol().lower()] + 12 * i)] = 1
    bitboard[-1] = int(board.turn)
    bitboard[-2] = int(board.has_kingside_castling_rights(True))
    bitboard[-3] = int(board.has_kingside_castling_rights(False))
    bitboard[-4] = int(board.has_queenside_castling_rights(True))
    bitboard[-5] = int(board.has_queenside_castling_rights(False))
    return bitboard
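# Quick sanity check for get_bitboard (a sketch; values are for the standard
# starting position):
#   bb = get_bitboard(chess.Board())
#   bb.shape  -> (773,)
#   bb.sum()  -> 37.0  (32 piece bits + side-to-move bit + 4 castling-rights bits)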
# Adds up to 8 randomly chosen valid moves from game to move_array, starting at move_index
def add_moves(game, move_array, move_index):
    valid_moves = get_valid_moves(game)
    moves_count = 0
    selected_moves = []
    for i in range(8):
        if not valid_moves:
            break
        move = random.choice(valid_moves)
        valid_moves.remove(move)
        selected_moves.append(move)
        moves_count = moves_count + 1
    board = chess.Board()
    for i, move in enumerate(game.mainline_moves()):
        board.push(move)
        if move_index >= move_array.shape[0]:
            break
        if i in selected_moves:
            move_array[move_index] = get_bitboard(board)
            move_index += 1
    return move_index, moves_count
def iterate_over_data():
    white_moves = np.zeros((num_white_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
    black_moves = np.zeros((num_black_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
    # white_moves_count and black_moves_count track how many positions have been stored
    white_move_index = 0
    black_move_index = 0
    black_moves_count = 0
    white_moves_count = 0
    count = 0
    white_count = 1
    black_count = 1
    pgn = open(dataPath)
    while True:
        # Debug printing
        if count % 1000 == 0:
            print("Game Number: {count}\twhite moves: {white_moves}\tblack moves: {black_moves}".format(
                count=count,
                black_moves=black_moves_count,
                white_moves=white_moves_count))
        game = chess.pgn.read_game(pgn)
        if not game or (white_moves_count >= num_white_moves and black_moves_count >= num_black_moves):
            break
        if game.headers["Result"] == "1-0" and white_moves_count < num_white_moves:
            white_move_index, moves_count = add_moves(game, white_moves, white_move_index % num_white_moves_per_arr)
            white_moves_count = white_moves_count + moves_count
        if game.headers["Result"] == "0-1" and black_moves_count < num_black_moves:
            black_move_index, moves_count = add_moves(game, black_moves, black_move_index % num_black_moves_per_arr)
            black_moves_count = black_moves_count + moves_count
        if white_moves_count >= num_white_moves_per_arr:
            w_str = str(white_count)
            print("Saving white" + w_str + " array")
            np.save('data4/white' + w_str + '.npy', white_moves[:num_white_moves_per_arr])
            white_count = white_count + 1
            white_moves = np.zeros((num_white_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
            white_move_index = 0
            white_moves_count = 0
        if black_moves_count >= num_black_moves_per_arr:
            b_str = str(black_count)
            print("Saving black" + b_str + " array")
            np.save('data4/black' + b_str + '.npy', black_moves[:num_black_moves_per_arr])
            black_count = black_count + 1
            black_moves = np.zeros((num_black_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
            black_moves_count = 0
            black_move_index = 0
        count += 1


iterate_over_data()
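After this script runs I spot-check one of the saved arrays before moving on (a quick sketch):
import numpy as np
arr = np.load('data4/white1.npy')
print(arr.shape)     # (100000, 773)
print(arr[0].sum())  # number of pieces on the board plus up to 5 state bits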
Splitting the data into train and validation set files:
import numpy as np

num_of_positions = 2800000
num_of_positions_per_file = 100000
num_of_train_positions = int(num_of_positions - 100000)

whites = np.zeros((num_of_positions, 773), dtype='float32')
blacks = np.zeros((num_of_positions, 773), dtype='float32')
for i in range(28):
    print(i + 1)
    whites[i * 100000:(i + 1) * 100000] = np.load('./data4/white' + str(i + 1) + '.npy')
    blacks[i * 100000:(i + 1) * 100000] = np.load('./data4/black' + str(i + 1) + '.npy')

print("Shuffling white positions")
np.random.shuffle(whites)
print("Shuffling black positions")
np.random.shuffle(blacks)

train_whites = whites[:num_of_train_positions]
train_blacks = blacks[:num_of_train_positions]
val_whites = whites[num_of_train_positions:]
val_blacks = blacks[num_of_train_positions:]

for i in range(int(num_of_train_positions / num_of_positions_per_file)):
    np.save('./data5/white_train' + str(i + 1) + '.npy', train_whites[i * num_of_positions_per_file:(i + 1) * num_of_positions_per_file])
    np.save('./data5/black_train' + str(i + 1) + '.npy', train_blacks[i * num_of_positions_per_file:(i + 1) * num_of_positions_per_file])
    # Only the first (num_of_positions - num_of_train_positions) / num_of_positions_per_file files hold validation data
    if i < int((num_of_positions - num_of_train_positions) / num_of_positions_per_file):
        np.save('./data5/white_val' + str(i + 1) + '.npy', val_whites[i * num_of_positions_per_file:(i + 1) * num_of_positions_per_file])
        np.save('./data5/black_val' + str(i + 1) + '.npy', val_blacks[i * num_of_positions_per_file:(i + 1) * num_of_positions_per_file])
Training autoencoder:
from keras.layers import Dense, Input
from keras.models import Model
import keras
import numpy as np
import gc
from util import DenseTied


class AutoEncoder:
    def __init__(self):
        self.positions = []
        self.positions_val = []
        self.model = None
        self.encoder = None
        self.decoder = None

    def __encoder(self):
        input_layer = Input(shape=(773,))
        hidden_1 = Dense(600, activation='relu')(input_layer)
        hidden_2 = Dense(400, activation='relu')(hidden_1)
        hidden_3 = Dense(200, activation='relu')(hidden_2)
        code = Dense(100, activation='relu')(hidden_3)
        encoder = Model(input_layer, code, name='encoder')
        encoder.summary()
        self.encoder = encoder
        return encoder

    def __decoder(self):
        # Each decoder layer shares (the transpose of) the weights of the mirrored encoder layer
        code_input = Input(shape=(100,))
        hidden_1 = DenseTied(200, activation='relu', tied_to=self.encoder.layers[4])(code_input)
        hidden_2 = DenseTied(400, activation='relu', tied_to=self.encoder.layers[3])(hidden_1)
        hidden_3 = DenseTied(600, activation='relu', tied_to=self.encoder.layers[2])(hidden_2)
        output_layer = DenseTied(773, activation='sigmoid', tied_to=self.encoder.layers[1])(hidden_3)
        decoder = Model(code_input, output_layer, name='decoder')
        decoder.summary()
        self.decoder = decoder
        return decoder

    def encoder_decoder(self, load=0):
        input_layer = Input(shape=(773,))
        if load:
            self.encoder = keras.models.load_model('./Pos2Vec/encoder_v1/encoder_epoch66')
        else:
            self.__encoder()
        self.__decoder()
        ec_out = self.encoder(input_layer)
        dc_out = self.decoder(ec_out)
        autoencoder = Model(input_layer, dc_out, name='autoencoder')
        self.model = autoencoder
        self.model.summary()
        return autoencoder

    def train(self, batch_size=256, epochs=20):
        self.model.compile(optimizer='adam', loss='binary_crossentropy')
        self.load_data3()
        for epoch in range(epochs):
            self.shuffle_positions()
            gc.collect()
            # Train on a random 2M subset of the 4M loaded positions each epoch
            train = self.positions[:2000000]
            self.model.fit(train, train, validation_data=(self.positions_val, self.positions_val), epochs=1, batch_size=batch_size)
            train = []
            gc.collect()
            print('Saving ./Pos2Vec/encoder_v1/encoder_epoch' + str(epoch + 1))
            self.encoder.save('./Pos2Vec/encoder_v1/encoder_epoch' + str(epoch + 1))

    def save(self):
        self.encoder.save('./weights3/encoder_v8.h5')
        # self.decoder.save('./weights3/decoder_v7.h5')
        # self.model.save('./weights3/autoencoder_v8.h5')

    def load_data3(self):
        positions = 2000000
        val_positions = 200000
        num_per_file = 100000
        self.positions = np.zeros((2 * positions, 773), dtype='float32')
        self.positions_val = np.zeros((2 * val_positions, 773), dtype='float32')
        for i in range(int(positions / num_per_file)):
            print(i + 1)
            start = i * num_per_file
            self.positions[start:start + num_per_file] = np.load('./data3/white_train' + str(i + 1) + '.npy')
            self.positions[positions + start:positions + start + num_per_file] = np.load('./data3/black_train' + str(i + 1) + '.npy')
            if i < val_positions / num_per_file:
                self.positions_val[start:start + num_per_file] = np.load('./data3/white_val' + str(i + 1) + '.npy')
                self.positions_val[val_positions + start:val_positions + start + num_per_file] = np.load('./data3/black_val' + str(i + 1) + '.npy')

    def shuffle_positions(self):
        print("Shuffling positions")
        np.random.shuffle(self.positions)
        gc.collect()

    def predict(self, data):
        return self.encoder.predict(data)


if __name__ == '__main__':
    ae = AutoEncoder()
    ae.encoder_decoder(load=0)
    ae.train(batch_size=256, epochs=100)
    ae.save()
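For reference, this is how I use the trained encoder afterwards to turn a 773-bit position into its 100-dimensional feature vector (a sketch; it assumes get_bitboard from the data-generation script above and an existing checkpoint):
import chess
ae = AutoEncoder()
ae.encoder_decoder(load=1)  # loads ./Pos2Vec/encoder_v1/encoder_epoch66
vec = ae.predict(get_bitboard(chess.Board()).reshape(1, 773))
print(vec.shape)            # (1, 100)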
DenseTied layer class:
import keras
from keras import backend as K


class DenseTied(keras.layers.Layer):
    def __init__(self, units,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 tied_to=None,
                 **kwargs):
        self.tied_to = tied_to
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super().__init__(**kwargs)
        self.units = units
        self.activation = keras.activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.bias_initializer = keras.initializers.get(bias_initializer)
        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
        self.bias_regularizer = keras.regularizers.get(bias_regularizer)
        self.activity_regularizer = keras.regularizers.get(activity_regularizer)
        self.kernel_constraint = keras.constraints.get(kernel_constraint)
        self.bias_constraint = keras.constraints.get(bias_constraint)
        self.input_spec = keras.layers.InputSpec(min_ndim=2)
        self.supports_masking = True

    def build(self, input_shape):
        assert len(input_shape) >= 2
        input_dim = input_shape[-1]
        if self.tied_to is not None:
            # Reuse the transposed kernel of the layer we are tied to (not trained here)
            self.kernel = K.transpose(self.tied_to.kernel)
            self._non_trainable_weights.append(self.kernel)
        else:
            self.kernel = self.add_weight(shape=(input_dim, self.units),
                                          initializer=self.kernel_initializer,
                                          name='kernel',
                                          regularizer=self.kernel_regularizer,
                                          constraint=self.kernel_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        self.input_spec = keras.layers.InputSpec(min_ndim=2, axes={-1: input_dim})
        self.built = True

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) >= 2
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)

    def call(self, inputs, **kwargs):
        output = K.dot(inputs, self.kernel)
        if self.use_bias:
            output = K.bias_add(output, self.bias, data_format='channels_last')
        if self.activation is not None:
            output = self.activation(output)
        return output
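A minimal check that the weight tying behaves as expected (a sketch; assumes Keras 2.x with the TensorFlow backend):
from keras import backend as K
from keras.layers import Input, Dense
from keras.models import Model

inp = Input(shape=(8,))
enc_layer = Dense(4)
code = enc_layer(inp)
out = DenseTied(8, tied_to=enc_layer)(code)  # builds against the already-built Dense layer
model = Model(inp, out)
print(K.int_shape(enc_layer.kernel))  # (8, 4); the tied kernel is its transpose, shape (4, 8)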
Siamese model training:
import chess
import numpy as np
import gc
import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import Input, Dense, concatenate
from keras.utils import Sequence
import random
import matplotlib.pyplot as plt


# Pairs up white-win and black-win positions in random order and labels which side won
def batch_to_data(white_batch, black_batch):
    x1 = []
    x2 = []
    y = []
    min_len = min(len(white_batch), len(black_batch))
    for i in range(min_len):
        if random.randint(0, 1) == 1:
            x1.append(white_batch[i])
            x2.append(black_batch[i])
            y.append([1, 0])  # first input is the white-win position
        else:
            x1.append(black_batch[i])
            x2.append(white_batch[i])
            y.append([0, 1])  # first input is the black-win position
    x1 = np.array(x1).reshape((-1, 773))
    x2 = np.array(x2).reshape((-1, 773))
    y = np.array(y).reshape((-1, 2))
    return [x1, x2], y
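# Shape check for batch_to_data (a sketch, with dummy data):
#   w = np.random.rand(4, 773).astype('float32')
#   b = np.random.rand(4, 773).astype('float32')
#   (x1, x2), y = batch_to_data(w, b)
#   x1.shape, x2.shape, y.shape  -> (4, 773), (4, 773), (4, 2)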
# A class for feeding the models through keras fit_generator
class DeepChessDataGenerator(Sequence):
    def __init__(self, batch_size, whites, blacks, train=1):
        print("---Initializing data generator---")
        self.batch_size = batch_size
        self.train = train
        self.whites = whites
        self.blacks = blacks
        if train:
            self.num_of_positions = 1000000
        else:
            self.num_of_positions = len(self.whites)

    def __len__(self):
        return int(np.floor(self.num_of_positions / self.batch_size))

    def __getitem__(self, index):
        white_batch = self.whites[index * self.batch_size:(index + 1) * self.batch_size]
        black_batch = self.blacks[index * self.batch_size:(index + 1) * self.batch_size]
        return batch_to_data(white_batch, black_batch)

    # Shuffle the order of the whites and blacks (independently) after each epoch
    def on_epoch_end(self):
        np.random.shuffle(self.whites)
        np.random.shuffle(self.blacks)
class DeepChess:
    def __init__(self):
        self.model = None    # DeepChess model
        self.encoder = None  # Encoder model, encodes 773 bits (chess board) to 100 features

    # Builds the deep neural network chess model; if load=1, loads a saved model instead
    def neural_chess(self, load=0):
        input_size = 773
        layer1_size = 400
        layer2_size = 200
        layer3_size = 100
        if load == 1:
            # model = keras.models.load_model('dc_models/my_model11.h5')
            # encoder = keras.models.load_model('dc_models/encoder11.h5')
            model = keras.models.load_model('dc_models/dropout/deepchess-18-0.281-0.846.h5')
            encoder = None
        else:
            # encoder = keras.models.load_model('./Pos2Vec/encoder_v1/encoder_epoch94')
            input_layer0 = Input(shape=(input_size,))
            layer1 = Dense(600, activation='relu')(input_layer0)
            layer2 = Dense(400, activation='relu')(layer1)
            layer3 = Dense(200, activation='relu')(layer2)
            layer4 = Dense(100, activation='relu')(layer3)
            encoder = Model(input_layer0, layer4)
            # Siamese part: the same encoder is applied to both input positions
            input_layer1 = Input(shape=(input_size,))
            input_layer2 = Input(shape=(input_size,))
            e1 = encoder(input_layer1)
            e2 = encoder(input_layer2)
            combined = concatenate([e1, e2])
            layer1 = Dense(layer1_size, activation='relu')(combined)
            layer2 = Dense(layer2_size, activation='relu')(layer1)
            layer3 = Dense(layer3_size, activation='relu')(layer2)
            output_layer = Dense(2, activation='softmax')(layer3)
            model = Model(inputs=[input_layer1, input_layer2], outputs=output_layer)
        self.model = model
        self.encoder = encoder
        model.summary()
        return model
    # Trains the model for a number of epochs
    def fit(self, epochs=50, batch_size=256):
        num_of_positions = 2800000
        num_of_positions_per_file = 100000
        num_of_train_positions = int(num_of_positions - 100000)
        num_of_val_positions = num_of_positions - num_of_train_positions
        train_whites = np.zeros((num_of_train_positions, 773), dtype='float32')
        train_blacks = np.zeros((num_of_train_positions, 773), dtype='float32')
        val_whites = np.zeros((num_of_val_positions, 773), dtype='float32')
        val_blacks = np.zeros((num_of_val_positions, 773), dtype='float32')
        for i in range(int(num_of_train_positions / num_of_positions_per_file)):
            print('Loading data ' + str(i + 1) + '/27', end='\r')
            train_whites[i * 100000:(i + 1) * 100000] = np.load('./data5/white_train' + str(i + 1) + '.npy')
            train_blacks[i * 100000:(i + 1) * 100000] = np.load('./data5/black_train' + str(i + 1) + '.npy')
            if i < int(num_of_val_positions / num_of_positions_per_file):
                val_whites[i * 100000:(i + 1) * 100000] = np.load('./data5/white_val' + str(i + 1) + '.npy')
                val_blacks[i * 100000:(i + 1) * 100000] = np.load('./data5/black_val' + str(i + 1) + '.npy')
        print()
        # Data generators
        train_generator = DeepChessDataGenerator(batch_size, whites=train_whites, blacks=train_blacks, train=1)
        val_generator = DeepChessDataGenerator(batch_size, whites=val_whites, blacks=val_blacks, train=0)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
        self.model.summary()
        history = self.model.fit_generator(train_generator, validation_data=val_generator, epochs=epochs, shuffle=True)
        # Plot training/validation accuracy values
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('Model accuracy')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()
        # Plot training/validation loss values
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()


if __name__ == '__main__':
    dc = DeepChess()
    dc.neural_chess(load=0)
    dc.fit(epochs=20, batch_size=256)
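And for completeness, how I would query the trained siamese model to compare two positions (a sketch; board_a and board_b are hypothetical chess.Board objects, and get_bitboard comes from the data-generation script above):
dc = DeepChess()
dc.neural_chess(load=1)
p1 = get_bitboard(board_a).reshape(1, 773)
p2 = get_bitboard(board_b).reshape(1, 773)
probs = dc.model.predict([p1, p2])[0]
print(probs)  # probs[0] = predicted probability that the first position is the white-win one, per the [1, 0] labeling above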
Graph of accuracy and loss after 20 epochs