【強化學習】python 實現 sarsa lambda 例一
阿新 • • 發佈:2018-12-20
本文作者:hhh5460
本文地址:https://www.cnblogs.com/hhh5460/p/10147265.html
將例一用sarsa lambda演算法重新擼了一遍,沒有參照任何其他人的程式碼。僅僅根據虛擬碼,就擼出來了。感覺已真正理解了sarsa lambda演算法。記錄如下
0. sarsa lambda演算法虛擬碼
圖片來源:https://morvanzhou.github.io/static/results/reinforcement-learning/3-3-1.png(莫凡)
1. sarsa lambda演算法真實程式碼
# e_table is a copy of q_table: e_table = q_table.copy()
# ...
# SARSA(lambda) algorithm
# Reference pseudocode:
# https://morvanzhou.github.io/static/results/reinforcement-learning/3-3-1.png
#
# NOTE: the original used the `.ix` indexer, which was deprecated in
# pandas 0.20 and removed in pandas 1.0 — `.loc` is the replacement.
for i in range(13):
    # 0. reset all eligibility traces at the start of each episode
    e_table *= 0
    # 1. start from state 0
    current_state = 0
    # 2. choose a legal action
    current_action = choose_action(current_state, epsilon)
    # 3. explore/learn until the terminal state is reached
    while current_state != states[-1]:
        # 4. next state
        next_state = get_next_state(current_state, current_action)
        # 5. next reward
        next_reward = rewards[next_state]
        # 6. next action
        next_action = choose_action(next_state, epsilon)
        # 7. TD error (delta)
        delta = (next_reward
                 + gamma * q_table.loc[next_state, next_action]
                 - q_table.loc[current_state, current_action])
        # 8. "Replacing" eligibility trace: zero the visited state's row,
        #    then set the visited (state, action) pair to 1.
        #    (The standard accumulating variant would be += 1, but Morvan
        #    reports the replacing variant works better here.)
        e_table.loc[current_state] *= 0
        e_table.loc[current_state, current_action] = 1
        # 9. sweep EVERY state/action pair (not only legal actions)
        for state in states:
            for action in actions:
                # 10. update q_table and decay e_table entry-by-entry
                q_table.loc[state, action] += alpha * delta * e_table.loc[state, action]
                e_table.loc[state, action] *= gamma * lambda_
        # 11. advance to the next state/action
        current_state, current_action = next_state, next_action
第9步,剛開始我這麼寫:for action in get_valid_actions(state):,執行後發現效果不如這樣寫好:for action in actions:
2. 完整程式碼
'''
-o---T
# T is the treasure's position; o is the explorer.
'''
# Author: hhh5460
# Date: 2018-12-20

'''SARSA(lambda) algorithm implementation.'''

import pandas as pd
import random
import time

epsilon = 0.9   # greediness: probability of exploiting the greedy action
alpha = 0.1     # learning rate
gamma = 0.8     # reward discount factor
lambda_ = 0.9   # eligibility-trace decay factor

states = range(6)              # state set: positions 0..5
actions = ['left', 'right']    # action set ('none' could be added to stay put)
rewards = [0, 0, 0, 0, 0, 1]   # only the treasure (last state) yields reward 1

# Initialize with FLOAT zeros: with an integer dtype, scalar .loc assignment
# of the fractional Q-value updates would be truncated (or rejected) by
# pandas, silently breaking learning.
q_table = pd.DataFrame(data=[[0.0 for _ in actions] for _ in states],
                       index=states, columns=actions)
e_table = q_table.copy()   # eligibility traces, same shape as q_table


def update_env(state):
    '''Render the environment with the explorer at `state` and print it.'''
    env = list('-----T')   # the corridor
    env[state] = 'o'       # place the explorer
    print('\r{}'.format(''.join(env)), end='')
    time.sleep(0.1)


def get_next_state(state, action):
    '''Return the state reached by taking `action` in `state`.'''
    # left = -1, right = +1, blocked moves keep the state unchanged
    if action == 'right' and state != states[-1]:   # cannot move right at the end
        next_state = state + 1
    elif action == 'left' and state != states[0]:   # cannot move left at the start
        next_state = state - 1
    else:
        next_state = state
    return next_state


def get_valid_actions(state):
    '''Return the legal actions in `state` (independent of rewards).'''
    valid_actions = set(actions)
    if state == states[0]:     # first position: cannot go left
        valid_actions -= set(['left'])
    if state == states[-1]:    # last position: cannot go right
        valid_actions -= set(['right'])
    return list(valid_actions)


def choose_action(state, epsilon_=0.9):
    '''Epsilon-greedy action selection for `state`.

    With probability (1 - epsilon_) explore a random legal action;
    otherwise exploit the greedy action, breaking ties at random.
    '''
    if random.uniform(0, 1) > epsilon_:   # explore
        action = random.choice(get_valid_actions(state))
    else:                                 # exploit (greedy)
        # Restrict the Q-row to legal actions, then pick randomly among
        # the maxima (there may be several; a single one is better).
        # NOTE: `.ix` was removed in pandas 1.0; `.loc` is the replacement.
        s = q_table.loc[state].filter(items=get_valid_actions(state))
        action = random.choice(s[s == s.max()].index)
    return action


def main():
    '''Run 13 episodes of SARSA(lambda) and print the learned q_table.

    Reference pseudocode:
    https://morvanzhou.github.io/static/results/reinforcement-learning/3-3-1.png
    '''
    for i in range(13):
        e_table.loc[:, :] = 0   # reset all eligibility traces
        current_state = 0
        current_action = choose_action(current_state, epsilon)

        update_env(current_state)   # rendering only
        total_steps = 0             # rendering only

        while current_state != states[-1]:
            next_state = get_next_state(current_state, current_action)
            next_reward = rewards[next_state]
            next_action = choose_action(next_state, epsilon)

            # TD error (delta)
            delta = (next_reward
                     + gamma * q_table.loc[next_state, next_action]
                     - q_table.loc[current_state, current_action])

            # "Replacing" eligibility trace: zero the visited state's row,
            # then set the visited (state, action) pair to 1. (The standard
            # accumulating variant would be += 1, but Morvan reports the
            # replacing variant works better here.)
            e_table.loc[current_state] *= 0
            e_table.loc[current_state, current_action] = 1

            # Sweep EVERY state/action pair (not only the legal actions)
            for state in states:
                for action in actions:
                    q_table.loc[state, action] += alpha * delta * e_table.loc[state, action]
                    e_table.loc[state, action] *= gamma * lambda_

            current_state, current_action = next_state, next_action

            update_env(current_state)   # rendering only
            total_steps += 1            # rendering only

        print('\rEpisode {}: total_steps = {}'.format(i, total_steps), end='')
        time.sleep(2)
        print('\r                                ', end='')

    print('\nq_table:')
    print(q_table)


if __name__ == '__main__':
    main()