for state in range(self.obs_dim):
    # The policy is already known, so we can query it directly:
    # given the current state, self.policy returns the action to take.
    act = self.policy(state)
    transition_list = self._get_transitions(state, act)
    state_value = 0
    for transition in transition_list:
        prob = transition['prob']
        reward = transition['reward']
        next_state = transition['next_state']
        done = transition['done']
        # [TODO] what is the right state value?
        # hint: you should use reward, self.gamma, old_table, prob,
        # and next_state to compute the state value
        state_value += prob * (reward + self.gamma * old_table[next_state])
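        # For example (hypothetical numbers): a transition with prob=0.5,
        # reward=1.0 and old_table[next_state]=2.0 contributes
        # 0.5 * (1.0 + self.gamma * 2.0) to state_value.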
    # update the state value
    self.table[state] = state_value
# [TODO] Compare the old_table and current table to
# decide whether to break the value update process.
# hint: you should use self.eps, old_table and self.table
# If two successive iterations differ by less than self.eps for every
# state, the value function has converged.
should_break = (np.abs(self.table - old_table) < self.eps).all()
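# Note: this is the usual sup-norm test, max_s |V_new(s) - V_old(s)| < eps,
# for deciding that iterative policy evaluation has converged.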
def update_policy(self):
    """You need to define a new policy function, given the current
    value function. The best action for a given state is the one with
    the greatest expected return.
    To optimize computing efficiency, we introduce a policy table,
    which takes a state as index and returns the action for that state.
    """
    policy_table = np.zeros([self.obs_dim, ], dtype=int)
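    # Each entry of policy_table will hold the greedy action for the
    # corresponding state, so querying the policy becomes a table lookup.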
    for state in range(self.obs_dim):
        state_action_values = [0] * self.action_dim
        # [TODO] assign the action with greatest "value"
        # to policy_table[state]
        # hint: what is the proper "value" here?
        # you should use table, gamma, reward, prob,
        # next_state and the self._get_transitions() function,
        # as we did in self.update_value_function().
        # The Bellman equation may help.
        for act in range(self.action_dim):
            transition_list = self._get_transitions(state, act)
            action_value = 0
            for transition in transition_list:
                prob = transition['prob']
                reward = transition['reward']
                next_state = transition['next_state']
                done = transition['done']
                action_value += prob * (reward + self.gamma * self.table[next_state])
            state_action_values[act] = action_value
        best_action = np.argmax(state_action_values)
        policy_table[state] = best_action
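        # Note: np.argmax breaks ties by returning the first maximizing
        # action, so the extracted greedy policy is deterministic.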
old_policy_result = {obs: -1 for obs in range(trainer.obs_dim)}
for i in range(config['max_iteration']):
    # train the agent
    trainer.train()  # [TODO] please uncomment this line
    # evaluate the result
    if i % config['evaluate_interval'] == 0:
        print("[INFO]\tIn iteration {}, the current "
              "mean episode reward is {}.".format(
                  i, trainer.evaluate()))
    # [TODO] compare the new policy with the old policy to check whether
    # we should stop.
    # [HINT] If the new and old policy give the same output for every
    # observation, then we consider the algorithm converged and it
    # should be stopped.
    new_policy_result = {
        obs: trainer.policy(obs) for obs in range(trainer.obs_dim)
    }
    should_stop = True
    for obs in range(trainer.obs_dim):
        if new_policy_result[obs] != old_policy_result[obs]:
            should_stop = False
            break
    old_policy_result = new_policy_result
    if should_stop:
        print("The policy is no longer changing at iteration {}. "
              "Current mean episode reward is {}. "
              "Stop training.".format(i, trainer.evaluate()))
        break
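# Note: with exact policy evaluation on a finite MDP, a greedy policy that is
# unchanged by an improvement step is already optimal (policy improvement
# theorem), so "policy unchanged" is a sound stopping test here, up to the
# self.eps tolerance used during evaluation.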