Deep Learning Study Notes (2): A Python Implementation of Neural Networks
阿新 · Published 2019-02-01
A Python implementation of a multi-layer neural network.
The code comes first; I won't walk through the programming details.
For the SupervisedLearningModel, NNLayer, and SoftmaxRegression classes that appear in the code, please refer to the previous note: Deep Learning Study Notes (1): Softmax Regression.
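They are not repeated here, but for readability, below is a minimal sketch of the interface the code in this post assumes of them. The signatures are inferred from how the classes are called further down, not copied from the real implementation in the previous note:

# Interface sketch only -- inferred from usage below, not the actual code.
class NNLayer:
    '''One fully-connected hidden layer (see the previous note).'''
    def __init__(self, inputSize, outputSize, Lambda, actFunc='sigmoid'): ...
    def forward(self): ...           # reads self.input, returns the activations
    def backpropagate(self): ...     # returns the delta for the previous layer
    def layerGradient(self): ...     # gradient w.r.t. this layer's weights/intercepts
    def flatTheta(self): ...         # unroll weights and intercepts into a 1-dim vector
    def rebuildTheta(self, th): ...  # inverse of flatTheta

class SoftmaxRegression:
    '''The softmax output layer (see the previous note); exposes the same
    layer interface as NNLayer, plus label handling and prediction.'''
    def __init__(self, inputSize, numClasses, Lambda): ...
    def setTrainingLabels(self, y): ...  # builds the indicator matrix self.y_mat
    def predict(self, act): ...          # returns the predicted class labels

# NNBase.SupervisedLearningModel is assumed to supply the train() and
# performance() methods used in the test code at the end of this post.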
The multi-layer neural network:
import numpy as np
from NNBase import NNLayer
from softmax import SoftmaxRegression
from dp.supervised import NNBase
from time import time


class MNN(NNBase.SupervisedLearningModel):
    '''
    A multi-layer neural network: a stack of NNLayer hidden layers
    topped by a SoftmaxRegression output layer.
    '''

    def __init__(self, params):
        '''
        Constructor
        Parameters:
            params - the network configuration, dict
            params['inputSize']      - dimension of the input features
            params['outputSize']     - number of output classes
            params['layerSizes']     - sizes of all layers, including all
                                       hidden layers and the output layer
            params['Lambda']         - scaling parameter for the L2 weight
                                       regularization penalty
            params['activitionFunc'] - which type of activation function to
                                       use in the hidden layers
        '''
        layerSizes = params['layerSizes']
        self.numLayers = len(layerSizes)
        self.allLayers = []
        self.X = 0
        # initialize all hidden layers
        inputSize = params['inputSize']
        for i in range(self.numLayers - 1):
            layer = NNLayer(inputSize, layerSizes[i], params['Lambda'],
                            actFunc=params['activitionFunc'])
            self.allLayers.append(layer)
            inputSize = layerSizes[i]
        # initialize the softmax layer - the output layer
        outputLayer = SoftmaxRegression(inputSize, params['outputSize'],
                                        params['Lambda'])
        self.allLayers.append(outputLayer)

    def rebuildTheta(self, theta):
        '''
        Convert the 1-dim weight vector back into the weights and
        intercepts of all layers. Overwrites the method of the super class.
        '''
        starter = 0
        for i in range(self.numLayers):
            thetaSize = (self.allLayers[i].inputSize + 1) * self.allLayers[i].outputSize
            th = theta[starter:starter + thetaSize]
            starter = starter + thetaSize
            self.allLayers[i].rebuildTheta(th)

    def flatTheta(self):
        '''
        Concatenate all weights and intercepts into a 1-dim vector.
        Overwrites the method of the super class.
        '''
        theta = self.allLayers[0].flatTheta()
        for i in range(self.numLayers - 1):
            temp = self.allLayers[i + 1].flatTheta()
            theta = np.hstack((theta, temp))
        return theta

    def nnForward(self, theta, X, y):
        '''
        The forward pass: feed the activations through every layer.
        '''
        act = X
        self.rebuildTheta(theta)
        self.allLayers[-1].setTrainingLabels(y)
        for i in range(self.numLayers):
            self.allLayers[i].input = act
            act = self.allLayers[i].forward()
        return act

    def cost(self, theta, X, y):
        '''
        The cost function.
        Parameters:
            theta - the vector holding all weights and intercepts of all
                    layers, needed by the scipy.optimize functions
        '''
        h = np.log(self.nnForward(theta, X, y))
        # h * y_mat applies the indicator function
        cost = -np.sum(h * self.allLayers[-1].y_mat, axis=(0, 1)) / X.shape[1]
        return cost

    def gradient(self, theta, X, y):
        '''
        Compute the gradient. Overwrites the method of the super class.
        Parameters:
            theta - 1-dim vector containing all weights and intercepts
        '''
        self.nnForward(theta, X, y)
        i = self.numLayers - 1
        grad = np.empty(0)
        while i > 0:
            # get the gradient of one layer
            gwb = self.allLayers[i].layerGradient()
            # backpropagate the error terms
            self.allLayers[i - 1].delta = self.allLayers[i].backpropagate()
            grad = np.hstack((gwb.ravel(), grad))
            i = i - 1
        # get the gradient of the first hidden layer
        gwb = self.allLayers[0].layerGradient()
        grad = np.hstack((gwb.ravel(), grad))
        return grad

    def costFunc(self, theta, X, y):
        '''
        Compute cost and gradient in one pass; gradient() already runs the
        forward pass, so the stored activations can be reused for the cost.
        '''
        grad = self.gradient(theta, X, y)
        h = np.log(self.allLayers[-1].activation)
        cost = -np.sum(h * self.allLayers[-1].y_mat, axis=(0, 1)) / X.shape[1]
        return cost, grad

    def predict(self, Xtest):
        '''
        Prediction. Overwrites the method of the super class.
        The model must be trained before calling this method.
        Parameter:
            Xtest - the data to be predicted, numFeatures by numData
        '''
        act = Xtest
        for i in range(self.numLayers - 1):
            self.allLayers[i].input = act
            act = self.allLayers[i].forward()
        return self.allLayers[-1].predict(act)


def checkGradient(X, y):
    params = dict()
    params['inputSize'] = X.shape[0]
    params['outputSize'] = 10
    params['layerSizes'] = [50, 20, 10]
    params['Lambda'] = 0
    params['activitionFunc'] = 'sigmoid'
    testnn = MNN(params)
    #testnn.setTrainData(X, y)
    theta = testnn.flatTheta()
    cost, grad = testnn.costFunc(theta, X, y)
    #print(np.size(theta))
    #print(np.size(grad))
    numgrad = np.zeros(grad.shape)
    e = 1e-6
    for i in range(np.size(grad)):
        # central difference approximation of the i-th partial derivative
        theta[i] = theta[i] - e
        loss1, g1 = testnn.costFunc(theta, X, y)
        theta[i] = theta[i] + 2 * e
        loss2, g2 = testnn.costFunc(theta, X, y)
        theta[i] = theta[i] - e
        numgrad[i] = (loss2 - loss1) / (2 * e)
    # mean absolute difference between analytic and numerical gradient
    print(np.sum(np.abs(grad - numgrad)) / np.size(grad))
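Before training on the full dataset, it is worth running this gradient check on a tiny random problem first. A minimal sketch (the feature count, example count, and random data here are arbitrary, chosen only to keep the check fast):

import numpy as np

# a tiny random problem: 64 features, 20 examples, labels in 0..9
Xs = np.random.rand(64, 20)
ys = np.random.randint(0, 10, 20)
# prints the mean absolute difference between the analytic and the
# numerical gradient; it should come out very small
checkGradient(Xs, ys)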
Stochastic gradient descent (adapted from the UFLDL Matlab SGD code):
import numpy as np


def minFuncSGD(funcObj, theta, data, labels, options):
    '''
    Runs stochastic gradient descent with momentum to optimize the
    parameters of the given objective.

    Parameters:
        funcObj - function handle which accepts theta, data, labels as
                  input and returns the cost and gradient w.r.t. theta
        theta   - unrolled parameter vector
        data    - stores the data in a numFeatures x numExamples matrix
        labels  - corresponding labels in a numExamples x 1 vector
        options - dict of options for the optimization

    Returns:
        opttheta - optimized parameter vector

    Options (* required):
        epochs*    - number of epochs through the data
        alpha*     - initial learning rate
        minibatch* - size of the minibatch
        momentum   - momentum constant, defaults to 0.9
    '''
    epochs = options['epochs']
    alpha = options['alpha']
    minibatch = options['minibatch']
    if options.get('momentum') is None:
        options['momentum'] = 0.9
    m = labels.shape[0]
    mom = 0.5
    momIncrease = 20
    velocity = np.zeros(theta.shape)
    # SGD loop
    it = 0
    for e in range(epochs):
        # randomly permute the example indices for this epoch
        rp = np.random.permutation(m)
        for i in range(0, m - minibatch, minibatch):
            it = it + 1
            # increase the momentum after momIncrease iterations
            if it == momIncrease:
                mom = options['momentum']
            # get the next randomly selected minibatch
            mb_data = data[:, rp[i:i + minibatch]]
            mb_labels = labels[rp[i:i + minibatch]]
            # evaluate the objective function on the minibatch
            cost, grad = funcObj(theta, mb_data, mb_labels)
            # add the weighted velocity vector to the gradient scaled by the
            # learning rate, then update the weights by the SGD update rule
            velocity = mom * velocity + alpha * grad
            theta = theta - velocity
            print('Epoch %d: Cost on iteration %d is %f' % (e, it, cost))
        # anneal the learning rate by a factor of two after each epoch
        alpha = alpha / 2.0
    return theta
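A possible way to drive the network with this optimizer, assuming an MNN instance nn and training data X, y as in the test code below (the option values here are illustrative, not tuned):

options = {'epochs': 3, 'alpha': 0.1, 'minibatch': 256, 'momentum': 0.9}
theta0 = nn.flatTheta()                    # initial unrolled parameters
opttheta = minFuncSGD(nn.costFunc, theta0, X, y, options)
nn.rebuildTheta(opttheta)                  # load the optimized weights back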
Testing:
Tested on the MNIST dataset; the accuracy is around 96%.
Test code:
X = np.load('../../common/trainImages.npy') / 255
X = X.T
y = np.load('../../common/trainLabels.npy')
'''
X1 = X[:, :10]
y1 = y[:10]
checkGradient(X1, y1)
'''
Xtest = np.load('../../common/testImages.npy') / 255
Xtest = Xtest.T
ytest = np.load('../../common/testLabels.npy')

params = dict()
params['inputSize'] = X.shape[0]
params['outputSize'] = 10
params['layerSizes'] = [256, 10]
params['Lambda'] = 0
params['activitionFunc'] = 'sigmoid'

nn = MNN(params)
t0 = time()
nn.train(X, y)
print('training Time %.5f s' % (time() - t0))
print('test acc :%.3f%%' % (nn.performance(Xtest, ytest)))
Known issues:
1. When optimizing with the fmin_cg and fmin_l_bfgs_b functions from scipy.optimize, a network with a single hidden layer works fine and produces the expected results, but with more than one hidden layer the optimization does not reach a correct result and stops after only a single-digit number of iterations. Plain gradient descent and the stochastic gradient descent above, on the other hand, do reach the expected results for models with multiple hidden layers. I do not know whether the fault lies in my neural network implementation or in scipy.optimize (see the call sketch after this list).
2. The cost function and gradient in the code use no penalty term. Since the output layer is a softmax (the output for the last class is fixed at 0, and no penalty term is used there), I am not sure whether the hidden-layer parameters should be regularized. Judging from the actual results, though, using no penalty term at all gives results about as good as a quadratic cost function plus a penalty term.
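For reference on issue 1, the scipy.optimize call pattern looks roughly like this (a sketch, assuming nn, X, y from the test code; since costFunc returns both the cost and the gradient, no separate fprime is passed, and the returned info dict records why the optimizer stopped):

from scipy.optimize import fmin_l_bfgs_b

theta0 = nn.flatTheta()
opttheta, fmin, info = fmin_l_bfgs_b(nn.costFunc, theta0, args=(X, y),
                                     maxiter=400)
# warnflag 0 means converged; inspecting it (and info['funcalls']) may help
# diagnose why the optimizer stops after so few iterations
print(info['warnflag'], info['funcalls'])
nn.rebuildTheta(opttheta)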