
Python implementation of exercise 5.6 in Machine Learning (the Watermelon Book): an improved BP algorithm with an adaptive learning rate

Credit where it is due: https://blog.csdn.net/Snoopy_Yuan/article/details/70846554
This one was too hard, so I chose to copy it more or less verbatim, with essentially no changes...
What is interesting is that the adaptive learning rate version ends up with a test error rate of 0.013, the same as with the fixed learning rate; my guess is that the original author happened to hit an oscillation and I did not.
As I understand it, though, the point of tuning the learning rate is to improve learning efficiency: the plots show the error dropping noticeably faster, but after enough training the final result should no longer depend much on the learning rate.
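
The adaptive rule used in BP_network.py below scales each neuron's learning rate by a factor of 3 when the current and previous gradients share the same sign, divides it by 3 when the sign flips, and clamps the result to [0.005, 0.5]. A minimal vectorised sketch of just that rule (the function name and keyword defaults are my own, not part of the original code):

import numpy as np

def adapt_learning_rate(lr, grad, grad_prev, factor=3.0, lr_min=0.005, lr_max=0.5):
    # grow the rate where consecutive gradients agree in sign, shrink it where they flip
    sign = np.sign(grad * grad_prev)  # +1 same direction, -1 oscillation, 0 otherwise
    return np.clip(lr * factor ** sign, lr_min, lr_max)

# a sign flip cuts the rate, agreement grows it (both stay inside the clamp)
print(adapt_learning_rate(np.array([0.05]), np.array([0.3]), np.array([-0.2])))  # ~[0.0167]
print(adapt_learning_rate(np.array([0.05]), np.array([0.3]), np.array([0.2])))   # [0.15]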

Main program adaptive_learningrate_BPnetwork.py

import pandas as pd
import matplotlib.pyplot as plt

# load the UCI Iris dataset from the web
from urllib.request import urlopen
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
raw_data = urlopen(url)     # download the file
attr = ['sepal_length','sepal_width','petal_length','petal_width','species']
dataset = pd.read_csv(raw_data, delimiter=",", header=None, names=attr)  # names is the list of column names

# read the four input attributes; iloc selects columns by position
X = dataset.iloc[:, :4].values

# label names (after converting the output column to a categorical variable)
dataset['species'] = dataset['species'].astype('category')
label = dataset['species'].cat.categories  # class names

# output 1 (string categories mapped to the numerical values 0, 1, 2)
dataset['species'] = dataset['species'].cat.rename_categories([0, 1, 2])
y = dataset['species'].to_numpy()  # the class column as 0 / 1 / 2

# output 2 (one-hot encoding)
Y = pd.get_dummies(dataset['species']).values  # one-hot encoded targets

# split into training and test sets (50% each); random_state fixes the seed for reproducibility
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y, train_Y, test_Y = train_test_split(X, y, Y, test_size=0.5, random_state=42)

# build the BP network
from BP_network import *
bpn1 = BP_network()  # initialise a BP network instance
bpn1.CreateNN(4, 5, 3, actfun='Sigmoid', learningrate=0.05)  # build the network

'''
# fixed learning rate test
e = []
for i in range(1000):
    err, err_k = bpn1.TrainStandard(train_X, train_Y)
    e.append(err)

# draw the convergence curve of output error by each step of iteration
import matplotlib.pyplot as plt

f1 = plt.figure(1)
plt.xlabel("epochs")
plt.ylabel("error")
plt.ylim(0, 1)
plt.title("training error convergence curve with fixed learning rate")
# plt.title("training error convergence curve\n learning rate = 0.05")
plt.plot(e)
plt.show()

# get the test error in test set
pred = bpn1.PredLabel(test_X)
count = 0
for i in range(len(test_y)):
    if pred[i] == test_y[i]: count += 1

test_err = 1 - count / len(test_y)
print("test error rate: %.3f" % test_err)
'''


# adaptive learning rate test
bpn2 = BP_network()  # initial a BP network class
bpn2.CreateNN(4, 5, 3, actfun='Sigmoid', learningrate=0.05)  # build the network

e = []
for i in range(1000):
    err, err_k = bpn2.TrainStandard_Dynamic_Lr(train_X, train_Y)
    e.append(err)

# draw the convergence curve of output error by each step of iteration
# import matplotlib.pyplot as plt
f2 = plt.figure(2)
plt.xlabel("epochs")
plt.ylabel("error")
plt.ylim(0, 1)
plt.title("training error convergence curve with dynamic learning rate")
plt.plot(e)
plt.show()

# get the test error in test set
pred = bpn2.PredLabel(test_X)
count = 0
for i in range(len(test_y)):
    if pred[i] == test_y[i]: count += 1

test_err = 1 - count / len(test_y)
print("test error rate: %.3f" % test_err)

BP_network.py, which does the actual network construction and training

# quite long; it contains the network class plus a set of helper functions


class BP_network:

    def __init__(self):

        # number of neurons in each layer: i = input, h = hidden, o = output
        self.i_n = 0
        self.h_n = 0
        self.o_n = 0

        # output value for each layer
        self.i_v = []
        self.h_v = []
        self.o_v = []

        # parameters (w, t)
        self.ih_w = []  # weight for each link
        self.ho_w = []
        self.h_t = []  # threshold for each neuron
        self.o_t = []

        # available activation functions and their derivatives
        self.fun = {
            'Sigmoid': Sigmoid,
            'SigmoidDerivate': SigmoidDerivate,  # derivative of the sigmoid
            'Tanh': Tanh,  # hyperbolic tangent; unlike the sigmoid, y = 0 at x = 0
            'TanhDerivate': TanhDerivate,

            # for more, add here
        }

        # initialise the per-neuron learning rates
        self.lr1 = []  # output layer
        self.lr2 = []  # hidden layer

    def CreateNN(self, ni, nh, no, actfun, learningrate):
        '''
        build a BP network structure and initial parameters
        @param ni, nh, no: the neuron number of each layer
        @param actfun: string, the name of activation function
        @param learningrate: learning rate of gradient algorithm
        '''

        # dependent packages
        import numpy as np

        # assignment of node number
        self.i_n = ni
        self.h_n = nh
        self.o_n = no

        # initial value of output for each layer
        self.i_v = np.zeros(self.i_n)
        self.h_v = np.zeros(self.h_n)
        self.o_v = np.zeros(self.o_n)

        # initial weights for each link (random initialization)
        self.ih_w = np.zeros([self.i_n, self.h_n])
        self.ho_w = np.zeros([self.h_n, self.o_n])
        for i in range(self.i_n):
            for h in range(self.h_n):
                self.ih_w[i][h] = rand(0, 1)
        for h in range(self.h_n):
            for j in range(self.o_n):
                self.ho_w[h][j] = rand(0, 1)

        # initial threshold for each neuron
        self.h_t = np.zeros(self.h_n)
        self.o_t = np.zeros(self.o_n)
        for h in range(self.h_n): self.h_t[h] = rand(0, 1)
        for j in range(self.o_n): self.o_t[j] = rand(0, 1)

        # initial activation function
        self.af = self.fun[actfun]
        self.afd = self.fun[actfun + 'Derivate']

        # initial learning rate
        self.lr1 = np.ones(self.o_n) * learningrate
        self.lr2 = np.ones(self.h_n) * learningrate

    def Pred(self, x):
        '''
        predict process through the network
        @param x: the input array for input layer
        '''

        # activate input layer
        for i in range(self.i_n):
            self.i_v[i] = x[i]

        # activate hidden layer
        for h in range(self.h_n):
            total = 0.0
            for i in range(self.i_n):
                total += self.i_v[i] * self.ih_w[i][h]
            self.h_v[h] = self.af(total - self.h_t[h])

        # activate output layer
        for j in range(self.o_n):
            total = 0.0
            for h in range(self.h_n):
                total += self.h_v[h] * self.ho_w[h][j]
            self.o_v[j] = self.af(total - self.o_t[j])

    '''
    for fixed learning rate
    '''

    def BackPropagate(self, x, y):
        '''
        one backpropagation update on a single training sample

        @param x, y: array, input and output of the data sample
        '''

        # dependent packages
        import numpy as np

        # get current network output
        self.Pred(x)

        # calculate the gradient based on output
        o_grid = np.zeros(self.o_n)
        for j in range(self.o_n):
            o_grid[j] = (y[j] - self.o_v[j]) * self.afd(self.o_v[j])

        h_grid = np.zeros(self.h_n)
        for h in range(self.h_n):
            for j in range(self.o_n):
                h_grid[h] += self.ho_w[h][j] * o_grid[j]
            h_grid[h] = h_grid[h] * self.afd(self.h_v[h])

        # update the parameters
        for h in range(self.h_n):
            for j in range(self.o_n):
                self.ho_w[h][j] += self.lr1[j] * o_grid[j] * self.h_v[h]

        for i in range(self.i_n):
            for h in range(self.h_n):
                self.ih_w[i][h] += self.lr2[h] * h_grid[h] * self.i_v[i]

        for j in range(self.o_n):
            self.o_t[j] -= self.lr1[j] * o_grid[j]

        for h in range(self.h_n):
            self.h_t[h] -= self.lr2[h] * h_grid[h]

    def TrainStandard(self, data_in, data_out):
        '''
        standard BP training
        @param data_in, data_out: training inputs and one-hot targets
        @return: e, mean error over this pass
        @return: e_k, error of each training sample
        '''
        e_k = []
        for k in range(len(data_in)):
            x = data_in[k]
            y = data_out[k]
            self.BackPropagate(x, y)

            # error in train set for each step
            y_delta2 = 0.0
            for j in range(self.o_n):
                y_delta2 += (self.o_v[j] - y[j]) * (self.o_v[j] - y[j])
            e_k.append(y_delta2 / 2)

        # total error of training
        e = sum(e_k) / len(e_k)

        return e, e_k

    '''
    for dynamic learning rate
    '''

    def BackPropagate_Dynamic_Lr(self, x, y, d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p, o_grid_p, h_grid_p, alpha):
        '''
        one backpropagation update on a single training sample (adaptive learning rate version)

        @param x, y: array, input and output of the data sample
        @param d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p: adjust values (delta) of last step
        @param o_grid_p, h_grid_p: gradient of last step
        @param alpha: momentum (forgetting) factor applied to the previous update

        @return adjust values (delta) of ho_w, ih_w, o_t, h_t,
                and gradient value of o_grid, h_grid for this step
        '''

        # dependent packages
        import numpy as np

        # get current network output
        self.Pred(x)

        # calculate the gradient based on output
        o_grid = np.zeros(self.o_n)
        for j in range(self.o_n):
            o_grid[j] = (y[j] - self.o_v[j]) * self.afd(self.o_v[j])

        h_grid = np.zeros(self.h_n)
        for h in range(self.h_n):
            for j in range(self.o_n):
                h_grid[h] += self.ho_w[h][j] * o_grid[j]
            h_grid[h] = h_grid[h] * self.afd(self.h_v[h])

        # update the parameters
        lamda = np.sign(o_grid * o_grid_p)  # +1 where the output gradient kept its sign, -1 where it flipped
        o_grid_p = o_grid  # remember the current gradient for the next call
        for h in range(self.h_n):
            for j in range(self.o_n):
                # adjust learning rate
                lr = self.lr1[j] * (3 ** lamda[j])
                self.lr1[j] = 0.5 if lr > 0.5 else (0.005 if lr < 0.005 else lr)
                # updating parameter
                d_ho_w_p[h][j] = self.lr1[j] * o_grid[j] * self.h_v[h] + alpha * d_ho_w_p[h][j]
                self.ho_w[h][j] += d_ho_w_p[h][j]

        lamda = np.sign(h_grid * h_grid_p)
        h_grid_p = h_grid
        for i in range(self.i_n):
            for h in range(self.h_n):
                # adjust learning rate
                lr = self.lr2[h] * (3 ** lamda[h])
                self.lr2[h] = 0.5 if lr > 0.5 else (0.005 if lr < 0.005 else lr)

                # updating parameter
                d_ih_w_p[i][h] = self.lr2[h] * h_grid[h] * self.i_v[i] + alpha * d_ih_w_p[i][h]
                self.ih_w[i][h] += d_ih_w_p[i][h]

        for j in range(self.o_n):
            d_o_t_p[j] = -(self.lr1[j] * o_grid[j] + alpha * d_o_t_p[j])
            self.o_t[j] += d_o_t_p[j]

        for h in range(self.h_n):
            d_h_t_p[h] = -(self.lr2[h] * h_grid[h] + alpha * d_h_t_p[h])
            self.h_t[h] += d_h_t_p[h]

        return d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p, o_grid_p, h_grid_p

    def TrainStandard_Dynamic_Lr(self, data_in, data_out):
        '''
        standard BP training with per-neuron adaptive learning rate
        @param data_in, data_out: training inputs and one-hot targets
        @return: e, mean error over this pass
        @return: e_k, error of each training sample
        '''
        # dependent packages
        import numpy as np

        d_ih_w_p = np.zeros([self.i_n, self.h_n])  # initial delta values = 0.0
        d_ho_w_p = np.zeros([self.h_n, self.o_n])
        d_h_t_p = np.zeros(self.h_n)
        d_o_t_p = np.zeros(self.o_n)

        o_grid_p = np.zeros(self.o_n)  # initial gradients = 0.0
        h_grid_p = np.zeros(self.h_n)

        e_k = []
        for k in range(len(data_in)):
            x = data_in[k]
            y = data_out[k]
            d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p, o_grid_p, h_grid_p \
                = self.BackPropagate_Dynamic_Lr(x, y, d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p,
                                                o_grid_p, h_grid_p, 0.2)

            # error in train set for each step
            y_delta2 = 0.0
            for j in range(self.o_n):
                y_delta2 += (self.o_v[j] - y[j]) * (self.o_v[j] - y[j])
            e_k.append(y_delta2 / 2)

        # total error of training
        e = sum(e_k) / len(e_k)

        return e, e_k

    def PredLabel(self, X):
        '''
        predict process through the network

        @param X: the input sample set for input layer
        @return: y, array, output set (0,1,2... - class) based on [winner-takes-all]
        '''
        import numpy as np

        y = []

        for m in range(len(X)):
            self.Pred(X[m])
            #             if self.o_v[0] > 0.5:  y.append(1)
            #             else : y.append(0)
            max_y = self.o_v[0]
            label = 0
            for j in range(1, self.o_n):
                if max_y < self.o_v[j]:
                    label = j
                    max_y = self.o_v[j]
            y.append(label)

        return np.array(y)






'''
the definition of activation functions
'''
def Sigmoid(x):
    '''
    definition of the sigmoid function
    '''
    from math import exp
    return 1.0 / (1.0 + exp(-x))

def SigmoidDerivate(y):
    # derivative expressed in terms of the sigmoid's output y (not its input)
    return y * (1 - y)

def Tanh(x):
    '''
    definition of the tanh function
    '''
    from math import tanh
    return tanh(x)
def TanhDerivate(y):
    # derivative expressed in terms of the tanh output y
    return 1 - y*y

'''
the definition of random function
'''
def rand(a, b):
    '''
    random value generation for parameter initialization
    @param a, b: the lower and upper bounds of the random value
    '''
    from random import random
    return (b - a) * random() + a