Using "Demon Adam" as optimizer in Tensorflow

I am working with a simple neural network in Google Colab, using Python with TensorFlow. So far I've only tried the optimizers already available in Keras, such as Nadam, Adam, Adadelta, Adagrad, etc., and the best results were achieved with Adam. I found an interesting paper, Demon: Improved Neural Network Training with Momentum Decay, and thought I'd try it to see if my results could get even better. The first lines of the source code read

class DemonAdam(optimizer.Optimizer):

    def __init__(self, iterations, learning_rate=0.0001, momentum=0.9, rho=0.999, use_locking=False, epsilon=1e-8, name="DemonAdam"):

When changing my optimizer from 'adam' to DemonAdam(250), where 250 = iterations:

model.compile(loss='mse',
              optimizer=DemonAdam(250),
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

I get an error on the final line, which runs the NN (I'm not sure whether iterations is the same as the number of epochs, but anyway):

hist = run.fit(X_train_normalized, y_train_normalized, batch_size=100, validation_data=(X_test_normalized, y_test_normalized),epochs=250, verbose=2, callbacks = [learning_decay])

I get this error message:

NotImplementedError: in user code:

File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1021, in train_function  *
    return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1010, in step_function  **
    outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1000, in run_step  **
    outputs = model.train_step(data)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 863, in train_step
    self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v1.py", line 792, in minimize
    self.apply_gradients(grads_and_vars)
File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v1.py", line 795, in apply_gradients
    self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations)

NotImplementedError: 

I tried changing class DemonAdam(optimizer.Optimizer): to class DemonAdam(tf.keras.optimizers.Optimizer): but this gives the following error:

AttributeError: in user code:

File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1021, in train_function  *
    return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1010, in step_function  **
    outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1000, in run_step  **
    outputs = model.train_step(data)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 863, in train_step
    self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/optimizer_v2.py", line 532, in minimize
    return self.apply_gradients(grads_and_vars, name=name)
File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/optimizer_v2.py", line 639, in apply_gradients
    self._create_all_weights(var_list)
File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/optimizer_v2.py", line 825, in _create_all_weights
    self._create_slots(var_list)
File "<ipython-input-150-5a674b6df690>", line 29, in _create_slots
    self._zeros_slot(v, "m1", self._name)
File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/optimizer_v2.py", line 840, in __getattribute__
    raise e
File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/optimizer_v2.py", line 830, in __getattribute__
    return super(OptimizerV2, self).__getattribute__(name)

AttributeError: 'DemonAdam' object has no attribute '_zeros_slot'

I have no idea what this means. Is there a simple way to get it to work with my neural network?

Edit: Source code

class DemonAdam(tf.keras.optimizers.Optimizer):

    def __init__(self, iterations, learning_rate=0.0001, momentum=0.9, rho=0.999, use_locking=False, epsilon=1e-8, name="DemonAdam"):
        super(DemonAdam, self).__init__(use_locking, name)
        self._lr = learning_rate
        self._momentum = momentum
        self._rho = rho
        self._iterations = iterations
        self.t = tf.Variable(1.0, trainable=False)

        # Tensor versions of the constructor arguments, created in _prepare().
        self._lr_t = None
        self._momentum_t = None
        self._rho_t = None
        self._beta1_power = None
        self._beta2_power = None

    def _prepare(self):
        self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate")
        self._momentum_t = ops.convert_to_tensor(self._momentum, name="momentum")
        self._rho_t = ops.convert_to_tensor(self._rho, name="rho")

    def _create_slots(self, var_list):
        first_var = min(var_list, key=lambda x: x.name)

        # Create slots for the first and second moments.
        for v in var_list:
            self._zeros_slot(v, "m1", self._name)
            self._zeros_slot(v, "v1", self._name)

    def _apply_resource_dense(self, grad, var):
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        rho_t = math_ops.cast(self._rho_t, var.dtype.base_dtype)
        eps = 1e-8
        t = self.t

        v = self.get_slot(var, "v1")
        v_t = v.assign(rho_t * v + (1. - rho_t) * grad * grad)

        m = self.get_slot(var, "m1")
        z = (self._iterations - t) / self._iterations
        cur_momentum = self._momentum * (z / (1 - self._momentum + self._momentum * z))
        m_t = m.assign(cur_momentum * m + grad)

        var_update = state_ops.assign_sub(var, lr_t * (m_t / ((v_t / (1 - rho_t**t)) ** 0.5 + eps)))
        return control_flow_ops.group(*[var_update, v_t, m_t])

    def _apply_sparse(self, grad, var):
        raise NotImplementedError("Sparse gradient updates are not supported.")

    def _finish(self, update_ops, name_scope):
        t = self.t.assign_add(1.0)

        return control_flow_ops.group(*update_ops + [t], name=name_scope)
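The class also relies on a few TensorFlow internals whose import block isn't shown above; presumably it looks something like this (the exact module paths are an assumption on my part, based on the names used in the code):

import tensorflow as tf
from tensorflow.python.framework import ops                              # ops.convert_to_tensor
from tensorflow.python.ops import control_flow_ops, math_ops, state_ops  # group, cast, assign_sub
from tensorflow.python.training import optimizer                         # optimizer.Optimizer (original base class)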



The DemonAdam class you posted implements the hooks of the legacy TF1 optimizer API (_prepare, _create_slots via _zeros_slot, _finish, and so on), which tf.keras.optimizers.Optimizer (OptimizerV2) does not provide, hence the AttributeError about _zeros_slot. Instead of porting that class, you could potentially apply the Demon momentum decay to beta_1 using a callback on the built-in Adam optimizer. An example of this would be like so:


import tensorflow as tf
from tensorflow import keras

class DemonAdamUpdate(keras.callbacks.Callback):
    """Decays Adam's beta_1 following the Demon momentum-decay schedule."""

    def __init__(self, beta_1: tf.Variable, total_steps: int, beta_init: float = 0.9):
        super(DemonAdamUpdate, self).__init__()
        self.beta_1 = beta_1
        self.beta_1.assign(beta_init)
        self.beta_init = beta_init
        self.total_steps = total_steps

    def on_batch_end(self, batch, logs=None):
        # Demon decay: beta_t = beta_init * z / ((1 - beta_init) + beta_init * z), with z = 1 - t/T
        step = tf.cast(self.model.optimizer.iterations, tf.float32)
        decay = tf.maximum(0., 1. - step / self.total_steps)
        beta = self.beta_init * decay / ((1. - self.beta_init) + self.beta_init * decay)
        self.beta_1.assign(beta)
        # Expose the current beta_1 in the training logs.
        logs["beta_1"] = self.beta_1


# Pass beta_1 as a tf.Variable so the callback can update it in place.
beta_1 = tf.Variable(0.9, trainable=False)
opt = keras.optimizers.Adam(learning_rate=1e-3, beta_1=beta_1)

# ... define dataset, model, and compile model

model.compile(optimizer=opt)
model.fit(train_ds, callbacks=[DemonAdamUpdate(beta_1=beta_1, total_steps=10000)])
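Here total_steps is the total number of optimizer updates over the whole run (the T in the Demon decay schedule), not the number of epochs. With the batch size and epoch count from the fit call in your question it would be roughly the following (a sketch, assuming X_train_normalized and the rest of your setup from the question):

import math

batch_size = 100
epochs = 250
steps_per_epoch = math.ceil(len(X_train_normalized) / batch_size)  # gradient updates per epoch
total_steps = epochs * steps_per_epoch                             # T in the Demon schedule

model.fit(X_train_normalized, y_train_normalized,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_test_normalized, y_test_normalized),
          callbacks=[DemonAdamUpdate(beta_1=beta_1, total_steps=total_steps)])

That should also answer the iterations-versus-epochs question: both DemonAdam's iterations argument and total_steps here count gradient updates (one per batch), not epochs.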
