Why could my DDQN get significantly worse after beating the game repeatedly?
I've been trying to train a DDQN to play OpenAI Gym's CartPole-v1, but found that although it starts off well and repeatedly reaches the full score of 500 (at around 600 episodes in the pic below), it then seems to go off the rails and does worse the more it plays.
I'm pretty new to ML, so I'm not really sure what could cause this, and I'm not sure how to start debugging it (I've tried tweaking some of the hyper-parameters, but nothing seems to stop this trend).
If it helps, here are the (probably) relevant parts of my agent:
def _build_model(self):
    model = Sequential()
    model.add(Dense(24, input_dim=self.state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(self.action_size, activation='linear'))
    model.compile(optimizer=Adam(lr=self.learning_rate), loss='mse')
    return model
def get_action(self, state):
    # Use random exploration for the current rate.
    if np.random.rand() < self.epsilon:
        return random.randrange(self.action_size)
    # Otherwise use the model to predict the Q-values and select the max.
    q_values = self.model.predict(state)
    return np.argmax(q_values[0])
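So, as I understand it, the agent is following an $\epsilon$-greedy policy:

$$\pi(s) = \begin{cases} \text{uniform random action} & \text{with probability } \epsilon \\ \arg\max_a Q_\theta(s, a) & \text{otherwise} \end{cases}$$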
def replay(self, batch_size):
    if len(self.memory) < batch_size:
        return
    # Decay the exploration rate.
    self.epsilon *= self.epsilon_decay
    self.epsilon = max(self.epsilon_min, self.epsilon)
    minibatch = random.sample(self.memory, batch_size)
    state_batch, q_values_batch = [], []
    for state, action, reward, next_state, done in minibatch:
        # Get predictions for all actions for the current state.
        q_values = self.model.predict(state)
        # If we're not done, add on the future predicted reward at the discounted rate.
        if done:
            q_values[0][action] = reward
        else:
            future_reward = max(self.target_model.predict(next_state)[0])
            q_values[0][action] = reward + self.gamma * future_reward
        state_batch.append(state[0])
        q_values_batch.append(q_values[0])
    # Re-fit the model to move it closer to this newly calculated reward.
    self.model.fit(np.array(state_batch), np.array(q_values_batch), batch_size=batch_size, epochs=1, verbose=0)
    self.update_weights()
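As I understand my own update, replay is fitting the online network towards the standard one-step target computed from the target network (writing $Q_{\theta^-}$ for the target model):

$$y = \begin{cases} r & \text{if done} \\ r + \gamma \max_{a'} Q_{\theta^-}(s', a') & \text{otherwise} \end{cases}$$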
def update_weights(self):
    weights = self.model.get_weights()
    target_weights = self.target_model.get_weights()
    for i in range(len(target_weights)):
        target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
    self.target_model.set_weights(target_weights)
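So the target network gets nudged towards the online network with a soft update after every replay, i.e. for each weight tensor:

$$\theta^{-} \leftarrow \tau\,\theta + (1 - \tau)\,\theta^{-}$$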
And the full notebook is here.