[Deep Learning] Implementing 2-Layer Neural Network Learning with Error Backpropagation in Python

Preface

In the earlier post that explained backpropagation with computational graphs, we walked through the layers needed to build a neural network. We can treat those layers as components; all that remains is to assemble them in order to implement the error backpropagation method.

First, let's review the steps of neural network learning (a minimal sketch of this loop follows the list):

  • Randomly select a portion of the training data (a mini-batch)
  • Compute the gradient of the loss function with respect to each weight parameter
  • Update the weight parameters by a small step in the gradient direction
  • Repeat the steps above
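A minimal, self-contained sketch of this four-step loop on a toy least-squares problem (the data, loss function, and hyperparameters here are purely illustrative and are not part of the network built below):

import numpy as np

# illustrative toy data (not from this post): fit w to minimize the mean squared error of X @ w against y
rng = np.random.default_rng(0)
X, y = rng.standard_normal((100, 3)), rng.standard_normal(100)
w = np.zeros(3)
learning_rate, batch_size = 0.1, 10

for i in range(1000):
    mask = rng.choice(100, batch_size)            # 1. sample a mini-batch
    Xb, yb = X[mask], y[mask]
    grad = 2 * Xb.T @ (Xb @ w - yb) / batch_size  # 2. gradient of the loss w.r.t. w
    w -= learning_rate * grad                     # 3. small update along the (negative) gradient
    # 4. the loop repeats the steps above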

The figure below shows the 2-layer neural network. The layer names are marked in red, and only a fixed number of neurons is drawn for each layer; the point is to show the layer-to-layer relationships so that the layer wiring later on is easier to follow.

[Figure: layer structure of the 2-layer neural network]

Implementation of the Affine, Relu, and SoftmaxWithLoss layers

import numpy as np

# softmax function
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T 

    x = x - np.max(x)  # guard against overflow
    return np.exp(x) / np.sum(np.exp(x))
    
# cross-entropy error
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # if the teacher data is a one-hot vector, convert it to class-label indices
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size


# numerical gradient (included so that we can later compare the gradients obtained
# by numerical differentiation and by error backpropagation)
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)

        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2 * h)

        x[idx] = tmp_val  # restore the original value
        it.iternext()

    return grad


class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.original_x_shape = None
        # gradients of the weight and bias parameters
        self.dW = None
        self.db = None

    def forward(self, x):
        # supports tensor inputs
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x
        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(*self.original_x_shape)  # restore the input shape (for tensors)
        return dx


class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx


class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None  # output of softmax
        self.t = None  # teacher data

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)  # call the softmax function
        self.loss = cross_entropy_error(self.y, self.t)  # call cross_entropy_error
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:  # the teacher data is a one-hot vector
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx
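As a quick sanity check (this snippet is not part of the original code), here is a minimal sketch that pushes a tiny toy batch through Affine → Relu → SoftmaxWithLoss and then propagates the gradient back; the shapes and values are illustrative only:

# illustrative usage example (not from the original post)
np.random.seed(0)
x = np.random.randn(2, 4)              # a toy batch: 2 samples, 4 features
t = np.array([[0, 1, 0], [1, 0, 0]])   # one-hot labels for 3 classes

affine = Affine(0.01 * np.random.randn(4, 3), np.zeros(3))
relu = Relu()
last = SoftmaxWithLoss()

# forward pass: Affine -> Relu -> SoftmaxWithLoss
loss = last.forward(relu.forward(affine.forward(x)), t)
print(loss)  # scalar cross-entropy loss, roughly log(3) for near-zero weights

# backward pass in the reverse order
dout = last.backward(1)
dx = affine.backward(relu.backward(dout))
print(affine.dW.shape, affine.db.shape, dx.shape)  # (4, 3) (3,) (2, 4)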

Implementing the neural network with error backpropagation

# coding: utf-8
import numpy as np
from collections import OrderedDict


# implementation class for the 2-layer neural network
class TwoLayerNet:
    # input_size: input-layer neurons, hidden_size: hidden-layer neurons, output_size: output-layer neurons
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # initialize the weights
        self.params = {}  # parameter dictionary
        # initialize the weights and bias from the input layer to the hidden layer
        # randomly generate a 2D array of shape (input_size, hidden_size)
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        # generate a 1D zero array with as many elements as hidden-layer neurons
        self.params['b1'] = np.zeros(hidden_size)
        # initialize the weights and bias from the hidden layer to the output layer
        # randomly generate a 2D array of shape (hidden_size, output_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # build the layers
        self.layers = OrderedDict()  # an ordered dict remembers insertion order, so backpropagation only needs to call the layers in reverse order
        # connect the layers in the correct order
        # Affine1 layer
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        # Relu1 layer
        self.layers['Relu1'] = Relu()
        # Affine2 layer
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        # SoftmaxWithLoss layer
        self.lastLayer = SoftmaxWithLoss()
    

    # perform recognition (inference)
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x
        
    # compute the value of the loss function; x: input data, t: teacher data
    def loss(self, x, t):
        y = self.predict(x)  # prediction
        
        return self.lastLayer.forward(y, t)
    
    # compute the recognition accuracy
    def accuracy(self, x, t):
        y = self.predict(x)  # inference
        y = np.argmax(y, axis=1)  # index of the maximum value for each sample
        if t.ndim != 1: t = np.argmax(t, axis=1)
        # a prediction is correct when the indices match; accuracy is the fraction of correct predictions
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
        
    # compute the gradients of the weight parameters; x: input data, t: teacher data
    # gradients with respect to the weights via error backpropagation
    def gradient(self, x, t):
        # forward
        self.loss(x, t)
        
        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
        
        # collect the gradients
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        
        return grads  # return the gradient of each parameter
    
    # compute the gradients of the weight parameters; x: input data, t: teacher data
    # computed by numerical differentiation (used for comparison)
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        
        return grads  # return the gradient of each parameter

By connecting the layer components in order like this, a neural network can be built with very little effort.
Whether the network has 5, 10, or 20 layers, all we need to do is chain the components together, as in the sketch below.
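As an illustration (this is not from the original post), here is a hedged sketch of how one extra hidden layer could be wired up with exactly the same components; the layer sizes are arbitrary:

# illustrative 3-layer wiring (layer sizes are assumptions, not from the original post)
layers = OrderedDict()
layers['Affine1'] = Affine(0.01 * np.random.randn(784, 100), np.zeros(100))
layers['Relu1'] = Relu()
layers['Affine2'] = Affine(0.01 * np.random.randn(100, 50), np.zeros(50))
layers['Relu2'] = Relu()
layers['Affine3'] = Affine(0.01 * np.random.randn(50, 10), np.zeros(10))
last_layer = SoftmaxWithLoss()

x = np.random.randn(5, 784)      # a dummy batch of 5 inputs
for layer in layers.values():    # the forward pass is just the chain of forward() calls
    x = layer.forward(x)
print(x.shape)                   # (5, 10) class scores, ready for last_layer.forward(x, t)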

Gradient check

import mnist

# load the data
(x_train, t_train), (x_test, t_test) = mnist.load_mnist(normalize=True, one_hot_label=True)
# build the network
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# use only a small portion of the data
x_batch = x_train[:3] 
t_batch = t_train[:3] 

# compute the gradients
# numerical differentiation
grad_numerical = network.numerical_gradient(x_batch, t_batch)
# error backpropagation
grad_backprop = network.gradient(x_batch, t_batch)

# compare the gradients from the two methods
for key in grad_numerical.keys():
    # take the absolute difference of the corresponding elements of each weight parameter and average it
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))
Output:
W1:3.795184189717579e-10
b1:2.495277803749767e-09
W2:4.712630591287661e-09
b2:1.3932517608367112e-07

From these results we can see that the difference between the gradients computed by the two methods is tiny, practically zero, so the backpropagation implementation can be considered correct.
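As a possible refinement (not used in the original post), a relative-error metric is less sensitive to the absolute scale of the gradients; one way to compute it from the dictionaries above:

# relative error between the two gradients (an assumed variant, not from the original post)
for key in grad_numerical.keys():
    num, bp = grad_numerical[key], grad_backprop[key]
    # element-wise relative error, with a small constant to avoid division by zero
    rel = np.max(np.abs(bp - num) / (np.abs(bp) + np.abs(num) + 1e-8))
    print(key + ":" + str(rel))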

Training the neural network with error backpropagation

import time
import mnist

start = time.time()
# load the data
(x_train, t_train), (x_test, t_test) = mnist.load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # compute the gradients via error backpropagation
    #grad = network.numerical_gradient(x_batch, t_batch)
    grad = network.gradient(x_batch, t_batch)
    
    # update the parameters
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)
        
end = time.time()
print("耗時:", (end-start))
Output:
0.13911666666666667 0.1393
0.90605 0.9098
0.9235666666666666 0.9256
0.9352666666666667 0.9343
0.9438666666666666 0.9433
0.9513 0.9512
0.9576666666666667 0.9546
0.9592333333333334 0.955
0.9645666666666667 0.9608
0.9674833333333334 0.9619
0.96965 0.9635
0.9711833333333333 0.9654
0.9737166666666667 0.9657
0.9735833333333334 0.9661
0.97765 0.9679
0.9779 0.9693
0.97895 0.9686
Elapsed time: 31.826430082321167
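The per-epoch accuracies stored in train_acc_list and test_acc_list are not used above; a possible follow-up (assuming matplotlib is available) is to plot them and check how closely the training and test accuracies track each other:

# optional visualization of the recorded accuracies (assumes matplotlib is installed)
import matplotlib.pyplot as plt

epochs = np.arange(len(train_acc_list))
plt.plot(epochs, train_acc_list, label='train acc')
plt.plot(epochs, test_acc_list, label='test acc', linestyle='--')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()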

Summary

  • Numerical differentiation is computationally expensive; learning with error backpropagation drastically reduces training time
  • Error backpropagation is more complex to implement than numerical differentiation and therefore more prone to mistakes
  • Comparing the results of numerical differentiation with those of error backpropagation lets us verify that the backpropagation implementation is correct (gradient check)
  • Error backpropagation computes the parameter gradients efficiently