QLearning paper

# import packages
import numpy as np
import pandas as pd
import time

# define parameters
np.random.seed(2)
N_STATES = 6                  # initial distance to the treasure (length of the 1-D world)
ACTIONS = ['left', 'right']   # available actions
EPSILON = 0.9                 # action-selection probability: pick the optimal action 90% of the time, a random one 10%
ALPHA = 0.1                   # learning rate
LAMBDA = 0.9                  # discount factor for future rewards
MAX_EPISODES = 13             # number of episodes
FRESH_TIME = 0.3              # seconds per step, so the refresh is visible

# build the Q-table
def build_q_table(n_state, actions):
    table = pd.DataFrame(
        np.zeros((n_state, len(actions))),
        columns=actions,
    )
    print(table)
    return table

# choose an action
def choice_action(state, q_tabel):
    # look up the Q-values for the current state
    state_action = q_tabel.iloc[state, :]
    # draw a random number: act randomly if it exceeds EPSILON,
    # or if every Q-value of this state is still 0
    if (np.random.uniform() > EPSILON) or (state_action == 0).all():
        action_name = np.random.choice(ACTIONS)
    else:
        # otherwise pick the action with the largest Q-value
        action_name = ACTIONS[state_action.argmax()]
    return action_name

# environment and its feedback
def get_env_feedback(S, A):
    if A == 'right':
        if S == N_STATES - 2:
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:
        R = 0
        if S == 0:
            S_ = S
        else:
            S_ = S - 1
    return S_, R

def update_env(S, episode, step_counter):
    env_list = ['-'] * (N_STATES - 1) + ['T']
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (str(episode + 1), step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r', end='')
    else:
        env_list[S] = 'O'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)

def rl():
    q_tabel = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choice_action(S, q_tabel)
            S_, R = get_env_feedback(S, A)
            # estimated value
            q_predict = q_tabel.loc[S, A]
            if S_ != 'terminal':
                # target (real) value
                q_target = R + LAMBDA * q_tabel.iloc[S_, :].max()
            else:
                q_target = R
                is_terminated = True
            q_tabel.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_
            update_env(S, episode, step_counter + 1)
            step_counter += 1
    return q_tabel

if __name__ == '__main__':
    q_tabel = rl()
    print('\r\nQ-table:\n')
    print(q_tabel)
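For reference, the increment inside rl() is the standard tabular Q-learning update: Q(S, A) <- Q(S, A) + ALPHA * (R + LAMBDA * max_a Q(S_, a) - Q(S, A)), where R + LAMBDA * max_a Q(S_, a) is q_target and Q(S, A) is q_predict; at the terminal state the target collapses to R, since there is no successor state.

Below is a minimal sketch for replaying the greedy policy once training has finished. It assumes the script above has already run; greedy_path and max_steps are hypothetical names, not part of the original.

# replay the greedy policy from a learned table (hypothetical helper)
def greedy_path(q_tabel, max_steps=50):
    S, path = 0, [0]
    # state N_STATES - 1 is where the treasure 'T' sits
    while S != N_STATES - 1 and len(path) <= max_steps:
        A = q_tabel.iloc[S, :].idxmax()              # greedy action for this state
        S = S + 1 if A == 'right' else max(S - 1, 0)
        path.append(S)
    return path

# usage, e.g. after q_tabel = rl():
#     print(greedy_path(q_tabel))   # on this toy task it usually settles to [0, 1, 2, 3, 4, 5]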
