How can I add an entropy term to enhance A3C?
I'm trying to implement this A3C code in my custom environment, and I have a basic understanding of the algorithm. The algorithm works, but it does not give me good performance. I looked into multiple implementations (like this one, for example), and each one seemed different to me. The algorithm I wrote is as follows:
- a3c
import torch as T
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.multiprocessing as mp

class ActorCritics(nn.Module):
    def __init__(self, input, n_actions, env, gamma=0.99):
        super(ActorCritics, self).__init__()
        self.gamma = gamma
        self.env = env
        self.n_actions = n_actions
        # separate towers for the policy (pi) and the value function (v)
        self.pi1 = nn.Linear(input, 128)
        self.v1 = nn.Linear(input, 128)
        self.pi2 = nn.Linear(128, 64)
        self.v2 = nn.Linear(128, 64)
        self.pi3 = nn.Linear(64, 32)
        self.v3 = nn.Linear(64, 32)
        self.pi4 = nn.Linear(32, 16)
        self.v4 = nn.Linear(32, 16)
        self.pi5 = nn.Linear(16, 8)
        self.v5 = nn.Linear(16, 8)
        self.pi6 = nn.Linear(8, 4)
        self.v6 = nn.Linear(8, 4)
        self.pi7 = nn.Linear(4, 2)
        self.v7 = nn.Linear(4, 2)
        self.pi = nn.Linear(2, n_actions)
        self.v = nn.Linear(2, 1)
        # rollout memory for the current n-step segment
        self.rewards = []
        self.actions = []
        self.states = []
    # store one transition (state, action, reward) in the rollout memory
    def remember(self, state, action, reward):
        self.actions.append(action)
        self.rewards.append(reward)
        self.states.append(state)

    # reset the memory after each learning step
    def clear_memory(self):
        self.states = []
        self.actions = []
        self.rewards = []
    def forward(self, state):
        pi1 = F.relu(self.pi1(state))
        v1 = F.relu(self.v1(state))
        pi2 = F.relu(self.pi2(pi1))
        v2 = F.relu(self.v2(v1))
        pi3 = F.relu(self.pi3(pi2))
        v3 = F.relu(self.v3(v2))
        pi4 = F.relu(self.pi4(pi3))
        v4 = F.relu(self.v4(v3))
        pi5 = F.relu(self.pi5(pi4))
        v5 = F.relu(self.v5(v4))
        pi6 = F.relu(self.pi6(pi5))
        v6 = F.relu(self.v6(v5))
        pi7 = F.relu(self.pi7(pi6))
        v7 = F.relu(self.v7(v6))
        pi = self.pi(pi7)
        v = self.v(v7)
        return pi, v
    # compute discounted n-step returns, bootstrapping from the value of the last state
    def calc_returns(self, done, vstates):
        p, v = self.forward(vstates)
        R = v[-1] * (1 - int(done))
        batch_return = []
        for reward in self.rewards[::-1]:
            R = reward + self.gamma * R
            batch_return.append(R)
        batch_return.reverse()
        batch_return = T.tensor(batch_return, dtype=T.float)
        return batch_return
    def calc_loss(self, done):
        # convert the stored raw states into feature vectors for the network
        list_state = []
        if len(self.states) > 1:
            for lstate in self.states:
                source, end = self.env.state_dec(lstate)
                state_v = self.env.state_to_vector(source, end)
                list_state.append(state_v)
            states = T.tensor(list_state, dtype=T.float)
        else:
            source, end = self.env.state_dec(self.states[0])
            state_v = self.env.state_to_vector(source, end)
            list_state.append(state_v)
            states = T.tensor(list_state, dtype=T.float)  # shape (1, n_features)
        actions = T.tensor(self.actions, dtype=T.float)
        returns = self.calc_returns(done, states)
        p, values = self.forward(states)
        values = values.squeeze()
        critic_loss = (returns - values) ** 2
        probs = T.softmax(p, dim=1)
        dist = Categorical(probs)
        log_probs = dist.log_prob(actions)
        actor_loss = -log_probs * (returns - values)
        total_loss = (critic_loss + actor_loss).mean()
        return total_loss
    def choose_action(self, node, action):
        state_vector = self.env.state_to_vector(node, action)
        state = T.tensor([state_vector], dtype=T.float)
        pi, v = self.forward(state)
        probs = T.softmax(pi, dim=1)
        dist = Categorical(probs)
        action = dist.sample().numpy()[0]  # sample an action index from the categorical distribution
        return action
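From the other implementations I looked at, the entropy bonus seems to go into the loss roughly like this. This is only my sketch of the common pattern, reusing the same names as in my calc_loss above; entropy_coef = 0.01 is just a typical value I saw in other repos, not something I have tuned or that exists in my code yet:

# Sketch only (not in my code yet): adding an entropy bonus at the end of calc_loss.
# probs, log_probs, returns and values are the same tensors as in calc_loss above;
# entropy_coef is a new name, 0.01 is a typical value from other repos, not tuned here.
probs = T.softmax(p, dim=1)
dist = Categorical(probs)
log_probs = dist.log_prob(actions)
entropy = dist.entropy()                       # per-step policy entropy
actor_loss = -log_probs * (returns - values)
critic_loss = (returns - values) ** 2
entropy_coef = 0.01
total_loss = (critic_loss + actor_loss - entropy_coef * entropy).mean()
return total_loss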
- Agent
class Agent(mp.Process):
    def __init__(self, global_actor_critic, optimizer, input, n_actions,
                 gamma, lr, worker_name, global_episode_index, env, gather,
                 games, T_max, res_queue, loss_queue):
        super(Agent, self).__init__()
        self.local_actor_critic = ActorCritics(input, n_actions, env, gamma)
        self.global_actor_critic = global_actor_critic
        self.worker_name = 'w%02i' % worker_name
        self.episode_idx = global_episode_index
        self.env = env
        self.gather_eps = gather
        self.optimizer = optimizer
        self.N_games = games
        self.T_max = T_max
        self.res_queue = res_queue
        self.loss_queue = loss_queue
        self.dict_list = {'number_of_episodes': [], 'score': [], 'loss': []}

    # keep a per-worker log of episode number, score and loss
    def list_remember(self, d_episode, d_score, d_loss):
        self.dict_list['number_of_episodes'].append(d_episode)
        self.dict_list['score'].append(d_score)
        self.dict_list['loss'].append(d_loss.item())
    def run(self):
        t_step = 1
        max_itr = 1000
        # self.episode_idx is a global counter (mp.Value) shared between workers
        while self.episode_idx.value < self.N_games:
            itr = 0
            done = False
            observation = self.env.reset()
            score = 0
            penalties = 0
            self.local_actor_critic.clear_memory()
            while not done:
                source, end = self.env.state_dec(observation)
                action = self.local_actor_critic.choose_action(source, end)
                observation_, reward, done = self.env.step(observation, action)
                if reward == -1000:
                    penalties += 1
                score += reward
                self.local_actor_critic.remember(observation, action, reward)
                if t_step % self.T_max == 0 or done:
                    loss = self.local_actor_critic.calc_loss(done)
                    self.optimizer.zero_grad()
                    loss.backward()
                    # copy the local gradients into the global network's parameters
                    for local_param, global_param in zip(self.local_actor_critic.parameters(),
                                                         self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    # sync the local network with the updated global weights
                    self.local_actor_critic.load_state_dict(self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()
                t_step += 1
                itr += 1
                observation = observation_
            print(self.worker_name, 'episode', self.episode_idx.value, 'reward', score,
                  'penalties', penalties, 'goal', done, 'itr_to_done', itr,
                  'loss', loss.item(), '\n', flush=True)
            self.list_remember(self.episode_idx.value, score, loss)
            self.gather_eps.append_data(self.episode_idx.value, score, loss.item())
            self.res_queue.put(score)
            self.loss_queue.put(loss.item())
            with self.episode_idx.get_lock():
                self.episode_idx.value += 1
        self.res_queue.put(None)
        self.loss_queue.put(None)
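For completeness, here is roughly how I wire everything up and launch the workers. This is only a simplified sketch: SharedAdam stands for the shared-statistics Adam optimizer from the reference implementation I followed, and MyEnv, Gather, observation_size and action_size are placeholders for my custom environment and logging helper, not real library names.

# Simplified driver sketch. SharedAdam is the shared-statistics Adam from the
# reference A3C implementation I followed; MyEnv, Gather, observation_size and
# action_size are placeholders for my custom environment and logging helper.
if __name__ == '__main__':
    env = MyEnv()
    input_dims = env.observation_size       # length of the vector from state_to_vector
    n_actions = env.action_size
    global_ac = ActorCritics(input_dims, n_actions, env, gamma=0.99)
    global_ac.share_memory()                # share the global weights across processes
    optimizer = SharedAdam(global_ac.parameters(), lr=1e-4)
    global_ep = mp.Value('i', 0)            # shared episode counter
    res_queue, loss_queue = mp.Queue(), mp.Queue()
    workers = [Agent(global_ac, optimizer, input_dims, n_actions, 0.99, 1e-4, i,
                     global_ep, env, Gather(), games=3000, T_max=5,
                     res_queue=res_queue, loss_queue=loss_queue)
               for i in range(mp.cpu_count())]
    [w.start() for w in workers]
    [w.join() for w in workers]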
Now, my questions: did I implement the algorithm correctly, is entropy handled correctly in my code, and how can I enhance the code?
Topic actor-critic pytorch reinforcement-learning deep-learning
Category Data Science