
Deep Learning Study Notes (2): A Python Implementation of a Neural Network

A Python implementation of a multi-layer neural network.

The code is posted first; the programming details themselves are not explained here.

For the SupervisedLearningModel, NNLayer, and SoftmaxRegression classes that appear in the code, see the previous note: Deep Learning Study Notes (1) — Softmax Regression.
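
For reference, the interface the code below expects from those classes looks roughly like this (a sketch reconstructed from how MNN uses them, not the actual definitions from the previous note; SupervisedLearningModel additionally provides the train and performance methods used in the test code at the end):

# Sketch of the assumed interfaces; the real classes are in the previous note.
class NNLayer:
    def __init__(self, inputSize, outputSize, Lambda, actFunc='sigmoid'):
        self.inputSize, self.outputSize = inputSize, outputSize
        self.input = None    # set by the caller before forward()
        self.delta = None    # error term, set by the next layer during backpropagation
    def forward(self): ...          # activation of this layer, computed from self.input
    def backpropagate(self): ...    # error term handed back to the previous layer
    def layerGradient(self): ...    # gradient w.r.t. this layer's weights and intercept
    def rebuildTheta(self, theta): ...   # unpack a flat vector into weights and intercept
    def flatTheta(self): ...             # flatten weights and intercept into one vector

class SoftmaxRegression:
    # constructed as SoftmaxRegression(inputSize, outputSize, Lambda);
    # same interface as NNLayer, plus:
    def setTrainingLabels(self, y): ...  # builds the one-hot indicator matrix self.y_mat
    def predict(self, act): ...          # predicted class labels for the given activations
    # attributes used below: y_mat, activation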

Multi-layer neural network:

import numpy as np
from NNBase import NNLayer
from softmax import SoftmaxRegression
from dp.supervised import NNBase
from time import time
class MNN(NNBase.SupervisedLearningModel):
    '''
    Multi-layer neural network: a stack of NNLayer hidden layers followed by a
    SoftmaxRegression output layer.
    '''
    def __init__(self, params):
        '''
        Constructor
        Parameters:
            params    - the network configuration, a dict with the keys:
            params['inputSize']       - dimension of the input features
            params['outputSize']      - number of output classes
            params['layerSizes']      - a list of the sizes of all layers, including all hidden layers and the output layer
            params['Lambda']          - scaling parameter for the L2 weight regularization penalty
            params['activitionFunc']  - which type of activation function to use in the hidden layers
        '''
        layerSizes =  params['layerSizes']
        self.numLayers = len(layerSizes)
        self.allLayers = []
        self.X=0
        
        #initialize all hidden layers
        inputSize = params['inputSize']
        for i in range(self.numLayers-1):
            layer = NNLayer(inputSize,layerSizes[i],params['Lambda'],actFunc=params['activitionFunc'] )
            self.allLayers.append(layer)
            inputSize=layerSizes[i]
        #initialize the softmax layer - output layer
        outputLayer=SoftmaxRegression(inputSize,params['outputSize'],params['Lambda'])
        self.allLayers.append(outputLayer)
        
     
    def rebuildTheta(self,theta):
        '''
        convert the 1-dim weight to all layers weights and intercepts
        overwrite the method of super class 
        '''  
        starter=0
        for i in range(self.numLayers):
            thetaSize =(self.allLayers[i].inputSize+1)*self.allLayers[i].outputSize            
            th=theta[starter:starter+thetaSize]
            starter=starter+thetaSize
            self.allLayers[i].rebuildTheta(th)     
    
    def flatTheta(self):
        '''
        convert all weights and intercept to 1-dim vector
        overwrite the method of super class
        '''
        theta= self.allLayers[0].flatTheta()
        for i in range(self.numLayers-1):
            temp = self.allLayers[i+1].flatTheta()
            theta =np.hstack((theta,temp)) 
                      
        return theta
    
    def nnForward(self,theta,X,y):
        '''
         the forward method 
        '''
        
        act=X
        self.rebuildTheta(theta)
        self.allLayers[-1].setTrainingLabels(y)        
        
        for i in range(self.numLayers):
            self.allLayers[i].input=act
            act = self.allLayers[i].forward()            
        
        return act

    def cost(self, theta,X,y):
        '''
        The cost function.
        Parameters:
            theta    - 1-dim vector holding the weights and intercepts of all layers,
                       as required by the scipy.optimize functions
        '''
        h = np.log(self.nnForward(theta,X,y))
        #h * self.y_mat, apply the indicator function
        cost = -np.sum(h *self.allLayers[-1].y_mat, axis=(0, 1))/ X.shape[1]
        
        return cost 
    
    def gradient(self,theta,X,y):
        '''
        compute the gradient.
        overwrite the method of super class.
        Parameters:
            theta    - 1-dim vector,containing all weights and intercepts
        '''
        self.nnForward(theta,X,y)
        
        i= self.numLayers-1
        grad = np.empty(0)
        while i>0:
            #get the gradient of one layer                                
            gwb=self.allLayers[i].layerGradient() 
            #backpropagate the error terms 
            self.allLayers[i-1].delta=self.allLayers[i].backpropagate()             
            grad=np.hstack((gwb.ravel(),grad))                      
            i=i-1         
        #get the the gradient of the first hidden layer        
        gwb=self.allLayers[0].layerGradient() 
                   
        grad=np.hstack((gwb.ravel(),grad))      
        return grad
    
    def costFunc(self,theta,X,y):
        '''
        Compute the cost and the gradient in one forward/backward pass.
        Returns the tuple (cost, grad).
        '''
        grad=self.gradient(theta, X, y)
        
        h=np.log(self.allLayers[-1].activation)
        cost = -np.sum(h * self.allLayers[-1].y_mat, axis=(0, 1))/X.shape[1]
        return cost,grad   
            
    def predict(self, Xtest):
        '''
        Prediction.
        overwrite the method of super class.
        Before calling this method, the model should already be trained.
        Parameter:
            Xtest    - The data to be predicted, numFeatures by numData
        '''
        act=Xtest
               
        for i in range(self.numLayers-1):
            self.allLayers[i].input=act
            act = self.allLayers[i].forward() 
        return self.allLayers[-1].predict(act)  

def checkGradient(X,y):
    '''
    Numerically verify the analytic gradient on a small network using central differences.
    '''
    params = dict()
    params['inputSize']=X.shape[0]
    params['outputSize']=10
    params['layerSizes']=[50,20,10]    
    params['Lambda']=0
    params['activitionFunc']='sigmoid'
    
    testnn = MNN(params)
    
    #testnn.setTrainData(X, y)
    theta = testnn.flatTheta()    
    cost,grad = testnn.costFunc(theta,X,y)
    #print(np.size(theta))  
    #print(np.size(grad))  
    numgrad = np.zeros(grad.shape)
    
    e = 1e-6
    
    for i in range(np.size(grad)):         
        theta[i]=theta[i]-e
        loss1,g1 =testnn.costFunc(theta,X,y)
        theta[i]=theta[i]+2*e
        loss2,g2 = testnn.costFunc(theta,X,y)
        theta[i]=theta[i]-e            
        
        numgrad[i] = (-loss1 + loss2) / (2 * e)
        
    print(np.sum(np.abs(grad-numgrad))/np.size(grad))  
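
A quick way to run this check is on a tiny random problem before touching MNIST (a sketch: the 64-feature dimension and the 20 examples are arbitrary, and the labels are integer class indices in 0..9 to match outputSize=10):

X = np.random.rand(64, 20)         # numFeatures x numData
y = np.random.randint(0, 10, 20)   # integer labels 0..9
checkGradient(X, y)                # the printed mean difference should be very small, e.g. below 1e-6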

Stochastic gradient descent (adapted from the UFLDL MATLAB SGD code):

import numpy as np
def minFuncSGD(funcObj,theta,data,labels,options):
    '''
    Runs stochastic gradient descent with momentum to optimize the
    parameters for the given objective.

    Parameters:
      funcObj    -  function handle which takes theta, data, labels as input
                    and returns the cost and the gradient w.r.t. theta
      theta      -  unrolled parameter vector
      data       -  data matrix, numFeatures x numExamples
      labels     -  corresponding labels, a vector of length numExamples
      options    -  dict storing the options for the optimization

    Returns:
      opttheta   -  optimized parameter vector

    Options (* required)
      epochs*     - number of epochs through the data
      alpha*      - initial learning rate
      minibatch*  - size of a minibatch
      momentum    - momentum constant, defaults to 0.9
    '''
    epochs =options['epochs']
    alpha = options['alpha']
    minibatch = options['minibatch']
    if options.get('momentum') is None:
        options['momentum']=0.9
    m= labels.shape[0]
    mom=0.5
    momIncrease = 20
    velocity = np.zeros(theta.shape)
    
    #SGD loop
    it =0
    for e in range(epochs):
        rp=np.random.permutation(m)
        
        for i in range(0,m-minibatch,minibatch):
            it =it+1
            #increase momentum after momIncrease iterations
            if it==momIncrease:
                mom=options['momentum']
            #get next randomly selected minibatch
            mb_data = data[:,rp[i:i+minibatch]]
            mb_labels = labels[rp[i:i+minibatch]]
            # evaluate the objective function on the next minibatch
            cost,grad = funcObj(theta,mb_data,mb_labels)
            '''
             Instructions: Add in the weighted velocity vector to the
             gradient evaluated above scaled by the learning rate.
             Then update the current weights theta according to the
             sgd update rule 
            '''   
            velocity=mom*velocity+alpha*grad
            theta=theta-velocity  
            print('Epoch %d: Cost on iteration %d is %f\n' %(e,it,cost))  
        #anneal the learning rate by a factor of two after each epoch
        alpha = alpha/2.0
    
    return theta
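
To train the network above with this optimizer instead of the superclass train method, the wiring would look roughly like this (a sketch: the epochs/alpha/minibatch values are arbitrary, and params, X, y, Xtest are the same as in the test code below):

options = {'epochs': 3, 'alpha': 0.1, 'minibatch': 256, 'momentum': 0.9}

nn = MNN(params)
theta0 = nn.flatTheta()                                    # initial unrolled parameters
opttheta = minFuncSGD(nn.costFunc, theta0, X, y, options)  # costFunc returns (cost, grad)
nn.rebuildTheta(opttheta)                                  # load the optimized weights back into the layers
pred = nn.predict(Xtest)                                   # predicted labels for the test set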

Testing:

Tested on the MNIST dataset; the accuracy is around 96%.

Test code:

    X = np.load('../../common/trainImages.npy') / 255
    X = X.T
    y = np.load('../../common/trainLabels.npy')
    '''
    X1=X[:,:10]
    y1=y[:10]
    checkGradient(X1,y1) 
    '''
    Xtest = np.load('../../common/testImages.npy') / 255
    Xtest = Xtest.T
    ytest = np.load('../../common/testLabels.npy') 
    params = dict()
    params['inputSize']=X.shape[0]
    params['outputSize']=10
    params['layerSizes']=[256,10]    
    params['Lambda']=0
    params['activitionFunc']='sigmoid'  
    
    nn=MNN(params)
    t0=time()
    nn.train(X, y)
    print('training Time %.5f s' %(time()-t0))
    print('test acc :%.3f%%' % (nn.performance(Xtest,ytest)))
    

   
   

Known issues:

  1. When optimizing with the fmin_cg and fmin_l_bfgs_b functions from scipy.optimize, a network with a single hidden layer works fine and gives the expected result, but with more than one hidden layer the correct result is not obtained: the optimizer stops after only a handful of iterations. With gradient descent or stochastic gradient descent, the multi-hidden-layer model does reach the expected result. I do not know whether the problem is in my neural network implementation or in scipy.optimize.
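
For reference, the optimizer call in question looks roughly like this (a sketch, assuming costFunc returns the (cost, grad) pair as above; the maxiter value is arbitrary). The info dict returned by fmin_l_bfgs_b at least shows why it stops after only a few iterations:

from scipy.optimize import fmin_l_bfgs_b

theta0 = nn.flatTheta()
# costFunc returns (cost, grad), which fmin_l_bfgs_b uses directly when fprime is omitted
opttheta, fval, info = fmin_l_bfgs_b(nn.costFunc, theta0, args=(X, y), maxiter=400)
print(info['warnflag'], info['nit'])  # 0 = converged, 1 = iteration/evaluation limit, 2 = abnormal termination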

  2. The cost function and gradient in the code do not include any penalty (regularization) term. Since the output layer is a softmax (with the output of the last class fixed to 0 and no penalty term used there), I am not sure whether the hidden-layer parameters should be regularized. In practice, though, the results without any penalty term are about the same as those obtained with a quadratic cost function plus a penalty term.
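
If hidden-layer regularization turns out to be necessary, one way to add it would be the following (a hypothetical sketch: it assumes each NNLayer exposes its weight matrix as an attribute, called W here, which may not match the actual class):

# hypothetical addition inside MNN.cost(), after the data term;
# Lambda is the same scaling parameter that is passed to every layer
penalty = 0.0
for layer in self.allLayers[:-1]:   # hidden layers only; the softmax layer stays unpenalized
    penalty += 0.5 * Lambda * np.sum(layer.W ** 2)
cost = cost + penalty

# the matching gradient term for each hidden layer is Lambda * layer.W,
# added to the weight part (not the intercept part) of layerGradient()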