
[Reinforcement Learning] Implementing Sarsa(λ) in Python: Example 1

Author: hhh5460

Original post: https://www.cnblogs.com/hhh5460/p/10147265.html

I re-implemented Example 1 with the Sarsa(λ) algorithm, without referring to anyone else's code; I wrote it purely from the pseudocode. I feel I now truly understand Sarsa(λ). My notes follow.

0. Sarsa(λ) pseudocode

Image source: https://morvanzhou.github.io/static/results/reinforcement-learning/3-3-1.png (Morvan Zhou)
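In case the image does not load, here is a rough transcription of the standard tabular Sarsa(λ) algorithm (not copied from the figure; symbol names are chosen to match the code below):

initialize Q(s, a) arbitrarily for all s, a
for each episode:
    E(s, a) = 0 for all s, a
    initialize S; choose A from S (epsilon-greedy w.r.t. Q)
    while S is not terminal:
        take action A, observe R and S'
        choose A' from S' (epsilon-greedy w.r.t. Q)
        delta = R + gamma * Q(S', A') - Q(S, A)
        E(S, A) = E(S, A) + 1
        for all s, a:
            Q(s, a) = Q(s, a) + alpha * delta * E(s, a)
            E(s, a) = gamma * lambda * E(s, a)
        S, A = S', A'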

1. The actual Sarsa(λ) code

# e_table is a copy of q_table; it holds the eligibility traces
e_table = q_table.copy()

# ...
# Sarsa(lambda) algorithm
# See: https://morvanzhou.github.io/static/results/reinforcement-learning/3-3-1.png
for i in range(13):
    # 0. reset e_table to zero
    e_table *= 0
    # 1. start from state 0
    current_state = 0
    # 2. choose a valid action
    current_action = choose_action(current_state, epsilon)
    # 3. enter the loop: explore and learn
    while current_state != states[-1]:
        # 4. get the next state
        next_state = get_next_state(current_state, current_action)
        # 5. get the next reward
        next_reward = rewards[next_state]
        # 6. get the next action
        next_action = choose_action(next_state, epsilon)
        # 7. compute delta (the TD error)
        delta = next_reward + gamma * q_table.loc[next_state, next_action] - q_table.loc[current_state, current_action]
        # 8. add 1 to the e_table entry of the current state/action
        #e_table.loc[current_state, current_action] += 1 # the standard update, but Morvan points out that the two lines below work better!
        e_table.loc[current_state] *= 0
        e_table.loc[current_state, current_action] = 1
        # 9. sweep every action of every state (not only the valid actions)
        for state in states:
            for action in actions:
                # 10. update the corresponding entries of q_table and e_table one by one
                q_table.loc[state, action] += alpha * delta * e_table.loc[state, action]
                e_table.loc[state, action] *= gamma * lambda_
        # 11. move on to the next state and action
        current_state, current_action = next_state, next_action

For step 9, I at first wrote it as: for action in get_valid_actions(state):, but after running it I found it does not work as well as: for action in actions:.
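For step 8, a minimal sketch (not from the original post) of the two eligibility-trace updates on a toy 2x2 table may help:

import pandas as pd

e = pd.DataFrame(0.0, index=[0, 1], columns=['left', 'right'])

# Accumulating trace: the standard update; the trace of the visited
# (state, action) pair keeps growing if it is revisited before it decays.
e.loc[0, 'right'] += 1

# Replacing trace: the variant Morvan suggests and the one used in the code
# here; all traces of the visited state are cleared first, then the taken
# action's trace is set to exactly 1.
e.loc[0] *= 0
e.loc[0, 'right'] = 1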

2. Complete code

'''
-o---T
# T is the treasure's position, o is the explorer's position
'''
# Author: hhh5460
# Date: 20181220

'''Sarsa(lambda) implementation'''

import pandas as pd
import random
import time


epsilon = 0.9   # greediness: probability of exploiting (choosing the greedy action)
alpha = 0.1     # learning rate
gamma = 0.8     # reward discount factor
lambda_ = 0.9   # trace-decay parameter

states = range(6)           # state set: positions 0 to 5
actions = ['left', 'right'] # action set; a 'none' action (stay put) could also be added
rewards = [0,0,0,0,0,1]     # reward set: only the final treasure position yields reward 1, all others 0

q_table = pd.DataFrame(data=[[0. for _ in actions] for _ in states],
                       index=states, columns=actions)

e_table = q_table.copy()    # eligibility-trace table, same shape as q_table


def update_env(state):
    '''Update the environment and print it'''
    env = list('-----T') # the environment
    
    env[state] = 'o' # place the explorer
    print('\r{}'.format(''.join(env)), end='')
    time.sleep(0.1)
                       
def get_next_state(state, action):
    '''Return the next state after taking the action in the given state'''
    global states
    # l,r,n = -1,+1,0
    if action == 'right' and state != states[-1]: # except in the last state (position), 'right' moves +1
        next_state = state + 1
    elif action == 'left' and state != states[0]: # except in the first state (position), 'left' moves -1
        next_state = state - 1
    else:
        next_state = state
    return next_state
                       
def get_valid_actions(state):
    '''Return the set of valid actions in the current state; unrelated to the reward!'''
    global actions # ['left', 'right']
    valid_actions = set(actions)
    if state == states[0]:              # first state (position): cannot move left
        valid_actions -= set(['left'])
    if state == states[-1]:             # last state (position): cannot move right
        valid_actions -= set(['right'])
    return list(valid_actions)
    
def choose_action(state, epsilon_=0.9):
    '''Choose an action for the given state (epsilon-greedy)'''
    if random.uniform(0,1) > epsilon_: # explore
        action = random.choice(get_valid_actions(state))
    else:                             # exploit (greedy)
        #current_action = q_table.loc[current_state].idxmax() # this version is problematic!
        s = q_table.loc[state].filter(items=get_valid_actions(state))
        action = random.choice(s[s==s.max()].index) # there may be several maxima; of course, a single one would be better
    return action
    
# Sarsa(lambda) algorithm
# See: https://morvanzhou.github.io/static/results/reinforcement-learning/3-3-1.png
for i in range(13):
    e_table *= 0 # reset the traces to zero
    
    current_state = 0
    current_action = choose_action(current_state, epsilon)
    
    update_env(current_state) # environment-related
    total_steps = 0           # environment-related
    
    while current_state != states[-1]:
        next_state = get_next_state(current_state, current_action)
        next_reward = rewards[next_state]
        
        next_action = choose_action(next_state, epsilon)
        delta = next_reward + gamma * q_table.loc[next_state, next_action] - q_table.loc[current_state, current_action]
        #e_table.loc[current_state, current_action] += 1 # the standard update, but Morvan points out that the two lines below work better!
        e_table.loc[current_state] *= 0
        e_table.loc[current_state, current_action] = 1
        for state in states:
            for action in actions: #get_valid_actions(state):
                q_table.loc[state, action] += alpha * delta * e_table.loc[state, action]
                e_table.loc[state, action] *= gamma * lambda_
        current_state, current_action = next_state, next_action
        
        update_env(current_state) # environment-related
        total_steps += 1          # environment-related
        
    print('\rEpisode {}: total_steps = {}'.format(i, total_steps), end='') # environment-related
    time.sleep(2)                                                          # environment-related
    print('\r                                ', end='')                    # environment-related
    
print('\nq_table:')
print(q_table)
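
As a possible follow-up (not part of the original post), the learned policy can be checked by walking the environment greedily with the trained q_table, reusing the functions defined above:

# Hypothetical sanity check: follow the greedy policy from state 0.
# (Assumes training has made 'right' the greedy action everywhere;
#  otherwise this walk may not terminate.)
state = 0
path = [state]
while state != states[-1]:
    action = q_table.loc[state].filter(items=get_valid_actions(state)).idxmax()
    state = get_next_state(state, action)
    path.append(state)
print('greedy path:', path)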