Learn Algorithms with Me - Reinforcement Learning on a 16-Cell Gridworld (Following Teacher Tang)

Reinforcement learning: updating the value of the current state:

# current value = transition probability * (immediate reward + discount factor * value of the next state)
Iterate this update until the state values no longer change.
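As an illustration of this backup rule before the full program, here is a minimal sketch of one update for a single state with two actions; the states, rewards, and values below are made up for the example and are not taken from the gridworld itself.

import numpy as np

# Hypothetical one-step lookahead for one state with 2 actions (illustrative numbers only).
V_next = np.array([0.0, -1.0, -2.0])      # assumed current values of three successor states
transitions = {0: [(1.0, 1, -1.0)],       # action 0 -> state 1 with reward -1, probability 1
               1: [(1.0, 2, -1.0)]}       # action 1 -> state 2 with reward -1, probability 1
discount = 1.0

A = np.zeros(2)
for a, outcomes in transitions.items():
    for prob, next_state, reward in outcomes:
        # value = probability * (immediate reward + discount factor * value of next state)
        A[a] += prob * (reward + discount * V_next[next_state])

V_s = A.max()    # value iteration keeps the best action's value: max(-2.0, -3.0) = -2.0
print(A, V_s)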
import numpy as np
from gridworld import GridworldEnv


env = GridworldEnv()
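The code below relies on a few attributes of GridworldEnv: env.nS (the number of states, 16 for the 4x4 grid), env.nA (the number of actions, 4), env.shape, and env.P, where env.P[s][a] is a list of (prob, next_state, reward, done) tuples. A quick way to inspect one entry is sketched here; the exact values printed depend on the gridworld implementation.

# Inspect the transitions available from state 0 for each of the 4 actions.
# env.P[s][a] -> list of (probability, next_state, reward, done) tuples.
for a in range(env.nA):
    print(a, env.P[0][a])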

def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.
    
    Args:
        env: OpenAI environment. env.P represents the transition probabilities of the environment.
        theta: Stopping threshold. If the value of all states changes less than theta
            in one iteration we are done.
        discount_factor: gamma discount factor.
        
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    
""" def one_step_lookahead(state, V): """ Helper function to calculate the value for all action in a given state. Args: state: The state to consider (int) V: The value to use as an estimator, Vector of length env.nS Returns: A vector of length env.nA containing the expected value of each action.
""" # 每個位置的4個方向,計算當前位置的獎勵值 A = np.zeros(env.nA) # 迭代四次 for a in range(env.nA): for prob, next_state, reward, done in env.P[state][a]: #當前獎勵 = 當前的概率*(及時獎勵 + 衰減係數 * 下一次的獎勵) A[a] += prob * (reward + discount_factor * V[next_state])
return A V = np.zeros(env.nS) while True: # Stopping condition delta = 0 # Update each state... for s in range(env.nS): # Do a one-step lookahead to find the best action A = one_step_lookahead(s, V) # 選擇獎勵值最高的數 best_action_value = np.max(A) # Calculate delta across all states seen so far delta = max(delta, np.abs(best_action_value - V[s])) # Update the value function #V[s]使用最好的獎勵值表示 V[s] = best_action_value # Check if we can stop # 如果獎勵值不發生變化,跳出迴圈 if delta < theta: break # Create a deterministic policy using the optimal value function # 獲得當前位置對應的最佳移動方向 policy = np.zeros([env.nS, env.nA]) for s in range(env.nS): # One step lookahead to find the best action for this state A = one_step_lookahead(s, V) # 最好的方向 best_action = np.argmax(A) # Always take the best action # s表示位置,best_action表示方向,用於後續的操作 policy[s, best_action] = 1.0 return policy, V policy, v = value_iteration(env) print("Policy Probability Distribution:") print(policy) print("") print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):") print(np.reshape(np.argmax(policy, axis=1), env.shape)) print("")
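For completeness, the converged value function returned by value_iteration can be inspected the same way as the policy. This is a small optional addition, assuming env.shape is the (4, 4) grid shape used above.

print("Reshaped Grid Value Function:")
print(np.reshape(v, env.shape))
print("")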