
cs224d Assignment, Problem Set 3 (Part 1): Implementing a Recursive Neural Network

'''
Created on Oct 5, 2017

@author: weizhen
'''
# A simple recursive neural network implementation with a ReLU layer and a softmax layer
# TODO: you must fill in the forward and backward propagation functions below
# You can run a gradient check by executing: python rnn.py
# Insert pdb.set_trace() anywhere you are unsure what will happen
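# For reference, the computation performed by the forward pass below at each node is:
#     leaf node:      h = L[:, word]                          (word-vector lookup)
#     internal node:  h = max(0, W [h_left; h_right] + b)     (ReLU composition)
#     every node:     y_hat = softmax(Ws h + bs),  cost -= log(y_hat[label])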

import numpy as np
import collections
import pdb
import tree as treeM
import pickle

class RNN:
    
    def __init__(self, wvecDim, outputDim, numWords, mbSize=30, rho=1e-4):
        self.wvecDim = wvecDim
        self.outputDim = outputDim
        self.numWords = numWords
        self.mbSize = mbSize
        self.defaultVec = lambda: np.zeros((wvecDim,))
        self.rho = rho

    def initParams(self):
        np.random.seed(12341)

        # Word vectors
        self.L = 0.01 * np.random.randn(self.wvecDim, self.numWords)

        # Hidden layer parameters
        self.W = 0.01 * np.random.randn(self.wvecDim, 2 * self.wvecDim)
        self.b = np.zeros((self.wvecDim))

        # Softmax weights
        # note this is "U" in the notes and the handout...
        # there is a reason for the change in notation
        self.Ws = 0.01 * np.random.randn(self.outputDim, self.wvecDim)
        self.bs = np.zeros((self.outputDim))

        self.stack = [self.L, self.W, self.b, self.Ws, self.bs]

        # Gradients
        self.dW = np.empty(self.W.shape)
        self.db = np.empty((self.wvecDim))
        self.dWs = np.empty(self.Ws.shape)
        self.dbs = np.empty((self.outputDim))

    def costAndGrad(self, mbdata, test=False):
        """
        Each datum in the minibatch is a tree.
        Forward-propagate each tree, then back-propagate through each tree.
        Returns:
            cost, and the gradients w.r.t. W, Ws, b, bs;
            the gradient w.r.t. the word vectors L is kept in sparse (dict) form.
        In test mode it instead returns:
            cost, correctArray, guessArray, total
        """
        cost = 0.0
        correct = []
        guess = []
        total = 0.0

        self.L, self.W, self.b, self.Ws, self.bs = self.stack

        # Initialize all gradients to zero
        self.dW[:] = 0
        self.db[:] = 0
        self.dWs[:] = 0
        self.dbs[:] = 0
        self.dL = collections.defaultdict(self.defaultVec)

        # Forward-propagate every tree in the minibatch
        for tree in mbdata:
            c, tot = self.forwardProp(tree.root, correct, guess)
            cost += c
            total += tot
        if test:
            return (1. / len(mbdata)) * cost, correct, guess, total

        # Back-propagate through every tree in the minibatch
        for tree in mbdata:
            self.backProp(tree.root)

        # Scale the cost and gradients by the minibatch size
        scale = (1. / self.mbSize)
        for v in self.dL.values():
            v *= scale

        # Add the L2 regularization term
        cost += (self.rho / 2) * np.sum(self.W ** 2)
        cost += (self.rho / 2) * np.sum(self.Ws ** 2)

        return scale * cost, [self.dL,
                              scale * (self.dW + self.rho * self.W),
                              scale * self.db,
                              scale * (self.dWs + self.rho * self.Ws),
                              scale * self.dbs]

    def forwardProp(self, node, correct=[], guess=[]):
        """cost is a running total; total counts the scored nodes and is used for accuracy reporting"""
        cost = total = 0.0
        # Recursive forward pass.
        # Updates node.probs, node.hActsl, node.fprop, and cost.
        # node    : the current node in the parse tree
        # correct : a running list of the true labels
        # guess   : a running list of the labels our model predicts
        #           (correct and guess together are used to build the confusion matrix)
        L = self.L
        # Hidden layer parameters
        W = self.W
        b = self.b
        # Softmax weights
        Ws = self.Ws
        bs = self.bs

        if node.isLeaf:
            node.hActsl = L[:, node.word]
        else:
            if not node.left.fprop:
                cost_left, total_left = self.forwardProp(node.left, correct, guess)
                cost += cost_left
                total += total_left
            if not node.right.fprop:
                cost_right, total_right = self.forwardProp(node.right, correct, guess)
                cost += cost_right
                total += total_right
            node.hActsl = W.dot(np.hstack((node.left.hActsl, node.right.hActsl))) + b
            node.hActsl[node.hActsl < 0] = 0

        x = Ws.dot(node.hActsl) + bs
        x -= np.max(x)
        node.probs = np.exp(x) / np.sum(np.exp(x))

        correct += [node.label]
        guess += [np.argmax(node.probs)]

        cost -= np.log(node.probs[node.label])

        node.fprop = True

        return cost, total + 1

    def backProp(self, node, error=None):
        """
        Recursive backward pass.
        Updates self.dWs, self.dbs, self.dW, self.db, and self.dL[node.word] as appropriate.
        node  : the current node in the parse tree
        error : the error passed down from the parent node
        """
        # Clear the node
        node.fprop = False

        L = self.L
        # Hidden layer parameters
        W = self.W
        b = self.b
        # Softmax weights
        Ws = self.Ws
        bs = self.bs

        error_this = node.probs
        error_this[node.label] -= 1.0
        delta = Ws.T.dot(error_this)

        self.dWs += np.outer(error_this, node.hActsl)
        self.dbs += error_this

        if error is not None:
            delta += error

        delta[node.hActsl == 0] = 0

        if node.isLeaf:
            self.dL[node.word] += delta
        else:
            self.dW += np.outer(delta, np.hstack([node.left.hActsl, node.right.hActsl]))
            self.db += delta
            delta = np.dot(self.W.T, delta)
            self.backProp(node.left, delta[:self.wvecDim])
            self.backProp(node.right, delta[self.wvecDim:])

    def updateParams(self, scale, update, log=False):
        """
        Updates parameters as
            p := p + scale * update
        (callers take a descent step by passing a negative scale, e.g. -learning_rate).
        If log is True, print the root-mean-square of each parameter and of its scaled update.
        """
        if log:
            for P, dP in zip(self.stack[1:], update[1:]):
                pRMS = np.sqrt(np.mean(P ** 2))
                dpRMS = np.sqrt(np.mean((scale * dP) ** 2))
                print("weight rms=%f -- update rms=%f" % (pRMS, dpRMS))

        self.stack[1:] = [P + scale * dP for P, dP in zip(self.stack[1:], update[1:])]

        # Handle the word-vector matrix separately with a sparse update
        dL = update[0]
        for j in dL.keys():
            self.L[:, j] += scale * dL[j]

    def toFile(self, fid):
        pickle.dump(self.stack, fid)

    def fromFile(self, fid):
        self.stack = pickle.load(fid)

    def check_grad(self, data, epsilon=1e-6):
        cost, grad = self.costAndGrad(data)

        err1 = 0.0
        count = 0.0
        print("Checking dW...")
        for W, dW in zip(self.stack[1:], grad[1:]):
            W = W[..., None]
            dW = dW[..., None]
            for i in range(W.shape[0]):
                for j in range(W.shape[1]):
                    W[i, j] += epsilon
                    costP, _ = self.costAndGrad(data)
                    W[i, j] -= epsilon
                    numGrad = (costP - cost) / epsilon
                    err = np.abs(dW[i, j] - numGrad)
                    err1 += err
                    count += 1

        if 0.001 > err1 / count:
            print("Grad Check Passed for dW")
        else:
            print("Grad Check Failed for dW: Sum of Error = %.9f" % (err1 / count))

        # check dL separately since it is stored as a dict
        dL = grad[0]
        L = self.stack[0]
        err2 = 0.0
        count = 0.0
        print("Checking dL...")
        for j in dL.keys():
            for i in range(L.shape[0]):
                L[i, j] += epsilon
                costP, _ = self.costAndGrad(data)
                L[i, j] -= epsilon
                numGrad = (costP - cost) / epsilon
                err = np.abs(dL[j][i] - numGrad)
                err2 += err
                count += 1
        if 0.001 > err2 / count:
            print("Grad Check Passed for dL")
        else:
            print("Grad Check Failed for dL: Sum of Error = %.9f" % (err2 / count))

if __name__ == '__main__':
    train = treeM.loadTrees()
    numW = len(treeM.loadWordMap())

    wvecDim = 10
    outputDim = 5

    rnn = RNN(wvecDim, outputDim, numW, mbSize=4)
    rnn.initParams()

    mbData = train[:4]
    print("Numerical gradient check...")
    rnn.check_grad(mbData)
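The file above only defines the model and a numerical gradient check; it does not contain a training loop. Below is a minimal sketch of how costAndGrad and updateParams could be combined into plain minibatch SGD. The train_sgd helper, the learning rate alpha, the epoch count, and the minibatch slicing are illustrative assumptions of mine, not the assignment's own SGD driver; note that updateParams adds scale * update, so the negative learning rate is passed as scale.

# A minimal minibatch SGD sketch built only on the methods defined above.
# The function name train_sgd, the learning rate alpha, the epoch count and
# the minibatch slicing are illustrative assumptions, not the assignment's driver.
import random

def train_sgd(rnn, trees, alpha=0.01, epochs=2, mbSize=4):
    for epoch in range(epochs):
        random.shuffle(trees)
        epoch_cost = 0.0
        num_batches = 0
        for i in range(0, len(trees), mbSize):
            mb = trees[i:i + mbSize]
            cost, grad = rnn.costAndGrad(mb)
            # updateParams applies p := p + scale * update, so pass the
            # negative learning rate to take a gradient-descent step
            rnn.updateParams(-alpha, grad)
            epoch_cost += cost
            num_batches += 1
        print("epoch %d: mean minibatch cost %f" % (epoch, epoch_cost / num_batches))

# Example usage, reusing the same data-loading calls as the __main__ block above:
#     train = treeM.loadTrees()
#     rnn = RNN(10, 5, len(treeM.loadWordMap()), mbSize=4)
#     rnn.initParams()
#     train_sgd(rnn, train, mbSize=rnn.mbSize)

For evaluation, costAndGrad can be called with test=True, which returns the per-node predictions (guess) alongside the true labels (correct), from which accuracy and a confusion matrix can be computed.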