DQN cannot learn or converge
I have implemented a DQN using Keras. The task is to collect the circles and avoid the red circle and the crosses. The associated rewards are +5, -5, and 0 otherwise. If the agent goes out of the board, the game is reset (also with a reward of -5).
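To make the reward scheme concrete, the per-step logic I described is roughly the following (a hypothetical sketch for illustration only; the helper name compute_reward is made up and this is not the actual SnakeEnv code):

# Hypothetical sketch of the reward scheme described above, not the real SnakeEnv code.
# Returns (reward, done).
def compute_reward(collected_circle, hit_red_or_cross, out_of_board):
    if out_of_board:
        return -5, True   # leaving the board also resets the game
    if hit_red_or_cross:
        return -5, False  # red circle or cross
    if collected_circle:
        return 5, False   # collectable circle
    return 0, False       # anything else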
The average reward fluctuates a lot and I cannot observe any learning. I tried to use settings similar to the Atari DQN ones, except that I don't concatenate the last 4 frames; instead I train the neural network directly on the RGB image.
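For reference, the Atari-style frame stacking that I am not doing would look roughly like this (a sketch under the usual assumptions: greyscale conversion, 84x84 resize, and a stack of the 4 most recent frames; none of this appears in my code below):

import numpy as np
import cv2
from collections import deque

frame_stack = deque(maxlen=4)  # holds the 4 most recent preprocessed frames

def preprocess(frame):
    # convert the rendered RGB frame to greyscale and resize to 84x84
    grey = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    return cv2.resize(grey, (84, 84), interpolation=cv2.INTER_NEAREST)

def stacked_state(frame):
    frame_stack.append(preprocess(frame))
    while len(frame_stack) < 4:            # pad with copies at episode start
        frame_stack.append(frame_stack[-1])
    return np.stack(frame_stack, axis=-1)  # network input of shape (84, 84, 4)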
Is there a bug in my code, or do you have any idea how an agent could learn in this environment?
My code:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam,RMSprop
import sys
sys.path.insert(0,'..')
from snake_env import SnakeEnv
from keras import models
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Flatten
from keras.models import Model
import matplotlib.pyplot as plt
import cv2
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        # number of pretraining steps before replay starts
        self.init_learn = 100
        self.epsilon_decay = 100000
        self.memory = deque(maxlen=self.epsilon_decay)
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        #self.epsilon_decay = 0.995
        self.learning_rate = 0.00025
        self.model = self._build_model()
        # linear epsilon decay per training step
        self.epsilon_rate = (self.epsilon - self.epsilon_min) / self.epsilon_decay
        print(self.epsilon_rate)
        '''print("prediction",self.model.predict(np.array([np.zeros((84,84,3))])))
        print(np.zeros((1,84,84,3)).shape,np.zeros((84,84,3)).shape,np.array([np.zeros((84,84,3))]).shape)
        raise("ok")'''
    def _build_model(self):
        model = models.Sequential()
        #input_img = Input(shape=(84,84,3))
        model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                         input_shape=(84, 84, 3)))
        model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(4))
        #model = Model(input_img, x)
        model.compile(loss='mse',
                      optimizer=RMSprop(lr=self.learning_rate))
        return model
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # epsilon-greedy action selection
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(np.array([state]))
        return np.argmax(act_values[0])  # returns the greedy action
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(np.array([next_state]))[0]))
            target_f = self.model.predict(np.array([state]))
            target_f[0][action] = target
            self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_rate)
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


def processor(image):
    return cv2.resize(image, (84, 84), interpolation=cv2.INTER_NEAREST)


def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)
def main():
    env = SnakeEnv()
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    batch_size = 32
    graphics = True
    if graphics:
        plt.ion()
        fig = plt.figure(figsize=(8, 4))
        ax1 = plt.subplot(1, 2, 1)
        ax2 = plt.subplot(1, 2, 2)
    rewards = []
    steps = []
    step_total = 0
    for ep in range(500000):
        # reset environment
        env.reset(test=True)
        state = env.render()
        state = processor(state)
        reward_cuml = 0
        step = 0
        while True:
            step += 1
            step_total += 1
            # take a step
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward_cuml += reward
            next_state = processor(next_state)
            agent.remember(state, action, reward, next_state, done)
            #rewards.append(reward)
            if done:
                print("Number of steps ", step_total, " Epsilon ", agent.epsilon)
                rewards.append(reward_cuml)
                steps.append(step)
                print("Episode ", ep, " reward ", np.sum(rewards), " mean reward ", np.mean(rewards))
                if graphics:
                    ax1.cla()
                    ax1.plot(running_mean(rewards, 10), '.--')
                    ax2.cla()
                    ax2.plot(running_mean(steps, 10), '.--')
                    plt.pause(0.001)
                    if ep % 50 == 0:
                        print("saving")
                        plt.savefig("rewards.png")
                break
            state = next_state
            if len(agent.memory) > agent.init_learn:
                agent.replay(batch_size)


main()
What the environment looks like:
Learning curve (average reward / episode)