tensorflow4:建立一個簡單的強化學習遊戲
阿新 • • 發佈:2019-02-20
Deep Q Network(DQN)是DeepMind於2013年提出的深度強化學習方法。一開始AI什麼也不會,只要持續給它提供遊戲介面的畫素和分數,就能慢慢把它訓練成遊戲高手。本文首先給出一個由人操作的基本遊戲例子,然後再給出用強化學習訓練AI玩這個遊戲的程式碼。
1.基本遊戲
#coding=utf-8
import pygame
from pygame.locals import *
import sys
# RGB colours used for drawing.
BLACK =(0,0,0)
WHITE = (255,255,255)
SCREEN_SIZE = [320,400]# window size passed to pygame.display.set_mode; [0] is used for x, [1] for y
BAR_SIZE = [20,5]# paddle size; [0] is its width, [1] its height
BALL_SIZE = [15,15]# ball size; [0] is its width, [1] its height
class Game(object):
    """Mouse-controlled paddle game: keep the bouncing ball on the paddle.

    Holding the left mouse button moves the paddle left, holding the right
    mouse button moves it right. Every frame in which the ball has reached
    the paddle row while overlapping the paddle scores one point; reaching
    the paddle row without overlapping it ends the game.
    """

    def __init__(self):
        """Initialise pygame, open the window and centre the ball and paddle."""
        pygame.init()
        self.clock = pygame.time.Clock()  # used to cap the frame rate
        self.screen = pygame.display.set_mode(SCREEN_SIZE)
        pygame.display.set_caption('Simple Game')
        self.ball_pos_x = SCREEN_SIZE[0]//2 - BALL_SIZE[0]/2
        self.ball_pos_y = SCREEN_SIZE[1]//2 - BALL_SIZE[1]/2
        # Ball direction along each axis.
        self.ball_dir_x = -1  # -1: left, 1: right
        self.ball_dir_y = -1  # -1: up, 1: down
        self.ball_pos = pygame.Rect(self.ball_pos_x, self.ball_pos_y, BALL_SIZE[0], BALL_SIZE[1])
        self.score = 0
        self.bar_pos_x = SCREEN_SIZE[0]//2 - BAR_SIZE[0]//2
        # The paddle's height is BAR_SIZE[1] (BALL_SIZE[1] was used here by mistake).
        self.bar_pos = pygame.Rect(self.bar_pos_x, SCREEN_SIZE[1] - BAR_SIZE[1], BAR_SIZE[0], BAR_SIZE[1])

    def bar_move_left(self):
        """Move the paddle 2 px to the left, stopping at the left screen edge."""
        self.bar_pos_x = max(0, self.bar_pos_x - 2)

    def bar_move_right(self):
        """Move the paddle 2 px to the right, stopping at the right screen edge."""
        self.bar_pos_x = min(SCREEN_SIZE[0] - BAR_SIZE[0], self.bar_pos_x + 2)

    def run(self):
        """Run the game loop, capped at 60 frames per second.

        Returns:
            The final score once the ball reaches the paddle row without
            touching the paddle. Closing the window does not return: it
            shuts pygame down and exits via ``sys.exit()``.
        """
        pygame.mouse.set_visible(0)  # hide the mouse cursor
        moving_left = False
        moving_right = False
        while True:
            for event in pygame.event.get():
                if event.type == QUIT:
                    pygame.quit()
                    sys.exit()  # leave the program on a quit event
                elif event.type == pygame.MOUSEBUTTONDOWN and event.button == 1:  # left button pressed
                    moving_left = True
                elif event.type == pygame.MOUSEBUTTONUP and event.button == 1:  # left button released
                    moving_left = False
                elif event.type == pygame.MOUSEBUTTONDOWN and event.button == 3:  # right button pressed
                    moving_right = True
                elif event.type == pygame.MOUSEBUTTONUP and event.button == 3:  # right button released
                    moving_right = False

            # Holding both buttons (or neither) leaves the paddle where it is.
            if moving_left and not moving_right:
                self.bar_move_left()
            if moving_right and not moving_left:
                self.bar_move_right()

            self.screen.fill(BLACK)
            self.bar_pos.left = self.bar_pos_x
            pygame.draw.rect(self.screen, WHITE, self.bar_pos)

            # Advance the ball (the horizontal step used to be applied to the paddle).
            self.ball_pos.left += self.ball_dir_x * 2
            self.ball_pos.bottom += self.ball_dir_y * 3
            pygame.draw.rect(self.screen, WHITE, self.ball_pos)

            # Bounce off the top edge / paddle row and off the side walls.
            if self.ball_pos.top <= 0 or self.ball_pos.bottom >= (SCREEN_SIZE[1] - BAR_SIZE[1] + 1):
                self.ball_dir_y = self.ball_dir_y * -1
            if self.ball_pos.left <= 0 or self.ball_pos.right >= (SCREEN_SIZE[0]):
                self.ball_dir_x = self.ball_dir_x * -1

            if self.bar_pos.top <= self.ball_pos.bottom:
                # The ball is at the paddle row: it is either a hit or a miss.
                # Edges that merely touch do not overlap, so they count as a miss.
                overlaps = (self.bar_pos.left < self.ball_pos.right
                            and self.bar_pos.right > self.ball_pos.left)
                if overlaps:
                    self.score += 1
                    print("Score: ", self.score, end='\r')
                else:
                    print("Game Over: ", self.score)
                    return self.score

            pygame.display.update()  # push the frame to the window
            self.clock.tick(60)
# Create the game window and start the blocking game loop.
game = Game()
game.run()# start the game
執行結果:
2.強化學習程式碼:
#coding=utf-8
import pygame
import random
from pygame.locals import *
import numpy as np
from collections import deque
import tensorflow as tf
import cv2
# RGB colours used for drawing.
BLACK= (0,0,0)
WHITE = (255, 255, 255)
# Sizes; index 0 is used along x and index 1 along y.
SCREEN_SIZE = [320, 400]
BAR_SIZE = [50, 5]
BALL_SIZE = [15, 15]
# One-hot action encodings; the index of the 1 matches the index of the
# network output that train_neural_network picks with argmax.
MOVE_STAY = [1, 0, 0]
MOVE_LEFT = [0, 1, 0]
MOVE_RIGHT = [0, 0, 1]
class Game(object):
    """Paddle-and-ball game driven one frame at a time by an agent.

    The agent picks an action, ``step`` advances the game by one frame and
    hands back the reward for that frame together with the rendered screen
    pixels, which the agent uses to learn which paddle moves pay off.
    """

    def __init__(self):
        """Initialise pygame, open the window and centre the ball and paddle."""
        pygame.init()
        self.clock = pygame.time.Clock()
        self.screen = pygame.display.set_mode(SCREEN_SIZE)
        pygame.display.set_caption('Simple Game')
        self.ball_pos_x = SCREEN_SIZE[0] // 2 - BALL_SIZE[0] / 2
        self.ball_pos_y = SCREEN_SIZE[1] // 2 - BALL_SIZE[1] / 2
        self.ball_dir_x = -1  # -1 = left, 1 = right
        self.ball_dir_y = -1  # -1 = up, 1 = down
        self.ball_pos = pygame.Rect(self.ball_pos_x, self.ball_pos_y, BALL_SIZE[0], BALL_SIZE[1])
        self.bar_pos_x = SCREEN_SIZE[0] // 2 - BAR_SIZE[0] // 2
        self.bar_pos = pygame.Rect(self.bar_pos_x, SCREEN_SIZE[1] - BAR_SIZE[1], BAR_SIZE[0], BAR_SIZE[1])

    def step(self, action):
        """Apply one action, advance the game by one frame and render it.

        Args:
            action: MOVE_STAY, MOVE_LEFT or MOVE_RIGHT. Any sequence with the
                same elements works (list, tuple, numpy array); anything that
                is not a left or right move leaves the paddle in place.

        Returns:
            A ``(reward, screen_image)`` tuple. ``reward`` is 1 when the ball
            is at the paddle row and overlaps the paddle, -1 when it is at
            the paddle row without overlapping it, and 0 otherwise.
            ``screen_image`` is the pixel array that
            ``pygame.surfarray.array3d`` returns for the display surface.
        """
        # Normalise to a list: comparing a numpy array with `==` is
        # element-wise and cannot be used directly as an `if` condition.
        action = list(action)
        if action == MOVE_LEFT:
            self.bar_pos_x = self.bar_pos_x - 2
        elif action == MOVE_RIGHT:
            self.bar_pos_x = self.bar_pos_x + 2

        # Keep the paddle inside the window.
        self.bar_pos_x = max(0, min(self.bar_pos_x, SCREEN_SIZE[0] - BAR_SIZE[0]))

        self.screen.fill(BLACK)
        self.bar_pos.left = self.bar_pos_x
        pygame.draw.rect(self.screen, WHITE, self.bar_pos)

        # Advance the ball and draw it.
        self.ball_pos.left += self.ball_dir_x * 2
        self.ball_pos.bottom += self.ball_dir_y * 3
        pygame.draw.rect(self.screen, WHITE, self.ball_pos)

        # Bounce off the top edge / paddle row and off the side walls.
        if self.ball_pos.top <= 0 or self.ball_pos.bottom >= (SCREEN_SIZE[1] - BAR_SIZE[1] + 1):
            self.ball_dir_y = self.ball_dir_y * -1
        if self.ball_pos.left <= 0 or self.ball_pos.right >= (SCREEN_SIZE[0]):
            self.ball_dir_x = self.ball_dir_x * -1

        reward = 0
        if self.bar_pos.top <= self.ball_pos.bottom:
            # The ball is at the paddle row: it is either a hit or a miss.
            # Edges that merely touch do not overlap, so they count as a miss.
            overlaps = (self.bar_pos.left < self.ball_pos.right
                        and self.bar_pos.right > self.ball_pos.left)
            reward = 1 if overlaps else -1

        # Grab the rendered frame, then push it to the window.
        screen_image = pygame.surfarray.array3d(pygame.display.get_surface())
        pygame.display.update()
        return reward, screen_image
# NOTE: despite its name this is not the optimizer step size. train_neural_network
# uses it as the discount factor in the target `reward + LEARNING_RATE * max(Q)`;
# the optimizer's own rate is the literal passed to AdamOptimizer there.
LEARNING_RATE = 0.99
# Exploration schedule: train_neural_network starts epsilon at INITIAL_EPSILON
# and lowers it every step while it is still above FINAL_EPSILON.
INITIAL_EPSILON = 1.0
FINAL_EPSILON = 0.05
# EXPLORE: (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE is subtracted from epsilon per step.
# OBSERVE: minibatch updates only run once the step counter exceeds this value.
EXPLORE = 500000
OBSERVE = 50000
# REPLAY_MEMORY: maximum number of stored transitions; BATCH: transitions sampled per update.
REPLAY_MEMORY = 500000
BATCH = 100
output = 3 # number of output units, one per action - MOVE_STAY:[1, 0, 0] MOVE_LEFT:[0, 1, 0] MOVE_RIGHT:[0, 0, 1]
input_image = tf.placeholder("float", [None, 80, 100, 4]) # network input: batches of 80x100 images with 4 stacked frames as channels
action = tf.placeholder("float", [None, output]) # action placeholder; not referenced by the functions visible in this file (train_neural_network builds its own local `action`)
# 定義CNN-卷積神經網路 參考:http://blog.topspeedsnail.com/archives/10451
def convolutional_neural_network(input_image):
    """Build the Q-network: three conv layers, one hidden dense layer, linear output.

    Weights start from a small truncated normal and biases from a small
    positive constant. Starting everything at zero (as this function used to)
    leaves every ReLU output at zero, so only the output bias would ever
    receive a gradient and the prediction could never depend on the input.

    Args:
        input_image: float tensor of stacked frames; the hard-coded flatten
            size below matches the [None, 80, 100, 4] module-level placeholder.

    Returns:
        A [batch, output] tensor with one value per action.
    """
    def weight(shape):
        # Small random values break the symmetry between units.
        return tf.Variable(tf.truncated_normal(shape, stddev=0.01))

    def bias(shape):
        # Slightly positive so the ReLUs start in their active region.
        return tf.Variable(tf.constant(0.01, shape=shape))

    weights = {'w_conv1': weight([8, 8, 4, 32]),
               'w_conv2': weight([4, 4, 32, 64]),
               'w_conv3': weight([3, 3, 64, 64]),
               'w_fc4': weight([3456, 784]),
               'w_out': weight([784, output])}
    biases = {'b_conv1': bias([32]),
              'b_conv2': bias([64]),
              'b_conv3': bias([64]),
              'b_fc4': bias([784]),
              'b_out': bias([output])}

    # With VALID padding an 80x100 input shrinks to 19x24, then 8x11, then 6x9.
    conv1 = tf.nn.relu(
        tf.nn.conv2d(input_image, weights['w_conv1'], strides=[1, 4, 4, 1], padding="VALID") + biases['b_conv1'])
    conv2 = tf.nn.relu(
        tf.nn.conv2d(conv1, weights['w_conv2'], strides=[1, 2, 2, 1], padding="VALID") + biases['b_conv2'])
    conv3 = tf.nn.relu(
        tf.nn.conv2d(conv2, weights['w_conv3'], strides=[1, 1, 1, 1], padding="VALID") + biases['b_conv3'])

    # 6 * 9 * 64 = 3456 features per sample.
    conv3_flat = tf.reshape(conv3, [-1, 3456])
    fc4 = tf.nn.relu(tf.matmul(conv3_flat, weights['w_fc4']) + biases['b_fc4'])
    output_layer = tf.matmul(fc4, weights['w_out']) + biases['b_out']
    return output_layer
# 深度強化學習入門: https://www.nervanasys.com/demystifying-deep-reinforcement-learning/
# 訓練神經網路
def _preprocess_frame(frame):
    """Shrink a raw game frame with cv2.resize(..., (100, 80)), grey-scale and binarise it."""
    gray = cv2.cvtColor(cv2.resize(frame, (100, 80)), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    return binary


def train_neural_network(input_image):
    """Train the Q-network by playing the game with an epsilon-greedy policy.

    Each step picks an action (random with probability epsilon, otherwise the
    network's argmax), advances the game, stores the transition in a replay
    buffer and, once more than OBSERVE steps have passed, fits the network on
    a random minibatch towards `reward + LEARNING_RATE * max Q(next state)`.
    A checkpoint is written to ./game.cpk every 10000 steps.

    Args:
        input_image: the placeholder the network reads its input from.

    The loop has no exit condition; the function runs until the process is
    stopped.
    """
    predict_action = convolutional_neural_network(input_image)

    argmax = tf.placeholder("float", [None, output])
    gt = tf.placeholder("float", [None])

    # Value predicted for the action that was actually taken.
    # tf.multiply replaces tf.mul, which was removed in TensorFlow 1.0.
    action = tf.reduce_sum(tf.multiply(predict_action, argmax), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(action - gt))
    optimizer = tf.train.AdamOptimizer(1e-6).minimize(cost)

    game = Game()
    # A bounded deque discards the oldest transition by itself once full.
    D = deque(maxlen=REPLAY_MEMORY)

    _, image = game.step(MOVE_STAY)
    image = _preprocess_frame(image)
    # The first state is the first frame repeated four times.
    input_image_data = np.stack((image, image, image, image), axis=2)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()

        n = 0
        epsilon = INITIAL_EPSILON
        while True:
            # Service the window's event queue; on some platforms (macOS was
            # reported) the window stays blank if events are never processed.
            pygame.event.pump()

            action_t = predict_action.eval(feed_dict={input_image: [input_image_data]})[0]

            # np.int was removed in NumPy 1.24; the builtin int is equivalent here.
            argmax_t = np.zeros([output], dtype=int)
            # Explore with the *annealed* epsilon. Comparing against
            # INITIAL_EPSILON (1.0) made every action random forever.
            if random.random() <= epsilon:
                maxIndex = random.randrange(output)
            else:
                maxIndex = np.argmax(action_t)
            argmax_t[maxIndex] = 1
            if epsilon > FINAL_EPSILON:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            reward, image = game.step(list(argmax_t))
            image = np.reshape(_preprocess_frame(image), (80, 100, 1))
            # Newest frame goes first; the oldest of the four is dropped.
            input_image_data1 = np.append(image, input_image_data[:, :, 0:3], axis=2)

            D.append((input_image_data, argmax_t, reward, input_image_data1))

            if n > OBSERVE:
                minibatch = random.sample(D, BATCH)
                input_image_data_batch = [d[0] for d in minibatch]
                argmax_batch = [d[1] for d in minibatch]
                reward_batch = [d[2] for d in minibatch]
                input_image_data1_batch = [d[3] for d in minibatch]

                out_batch = predict_action.eval(feed_dict={input_image: input_image_data1_batch})
                # Target: reward plus the discounted best value of the next state.
                gt_batch = np.asarray(reward_batch) + LEARNING_RATE * np.max(out_batch, axis=1)

                optimizer.run(feed_dict={gt: gt_batch, argmax: argmax_batch, input_image: input_image_data_batch})

            input_image_data = input_image_data1
            n = n + 1

            if n % 10000 == 0:
                saver.save(sess, './game.cpk', global_step=n)  # save a checkpoint

            print(n, "epsilon:", epsilon, " ", "action:", maxIndex, " ", "reward:", reward)
# Build the network on the module-level input placeholder and start training;
# the training loop has no exit condition, so this call does not return.
train_neural_network(input_image)
執行結果圖:
剛開始AI什麼都不會,訓練一段時間後就會慢慢變強!如果想使用訓練好的模型,需要重新載入程式儲存的檢查點(每10000步儲存一次,檔名以 ./game.cpk 開頭);另外訓練最好在GPU上執行,否則速度會非常慢。
附上:python-opencv安裝
如果沒有安裝opencv,執行到import cv2時會報錯。
這時需要到「Python Extension Packages」這個網站下載與自己的Python版本和系統位數相對應的whl檔案。
由於我的電腦是64位的,我之前安裝過python3.5,所以我就選擇了opencv_python-3.2.0+contrib-cp35-cp35m-win_amd64.whl這個檔案。下載後,cmd 安裝:
pip install opencv_python-3.2.0+contrib-cp35-cp35m-win_amd64.whl
搞定,完成!