
Deep Learning Language Models (3): the word2vec Negative Sampling model (Keras version)

Contents:
Deep Learning Language Models (1): the development of word2vec
Deep Learning Language Models (2): word vectors and the neural probabilistic network model (Keras version)
Deep Learning Language Models (3): the word2vec Negative Sampling model (Keras version)

The code is based on: https://spaces.ac.cn/archives/4515
That post draws its negative examples by plain random sampling; here I use frequency-based negative sampling instead. A few details are still left unimplemented, but the overall framework is as follows.
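
Before the full script, here is a minimal standalone sketch of the sampling-table idea. The helpers build_sampling_table and sample_negatives are only for illustration and do not appear in the script below. The original word2vec implementation draws negatives from the unigram distribution raised to the 0.75 power; the script below spreads the raw frequencies from dictionary.dfs over a 1000-slot table instead, but the lookup mechanism is the same: each word owns a number of slots proportional to its weight, so sampling a uniform slot index returns frequent words more often. The full script follows.

# Illustration only; not part of the script below.
import numpy as np

def build_sampling_table(word_freqs, table_size=1000, power=0.75):
    """word_freqs: dict of word id -> count. Returns an array mapping slot -> word id."""
    ids = list(word_freqs.keys())
    weights = np.array([word_freqs[i] for i in ids], dtype=np.float64) ** power
    probs = weights / weights.sum()
    # Each word owns a number of slots proportional to its (powered) frequency.
    slots = np.round(probs * table_size).astype(int)
    return np.repeat(ids, slots)

def sample_negatives(table, nb_negative):
    # Uniform slot indices -> frequency-weighted word ids.
    return table[np.random.randint(0, len(table), size=nb_negative)]

toy_freqs = {0: 18, 1: 18, 2: 18, 3: 2}   # toy counts
table = build_sampling_table(toy_freqs)
print(sample_negatives(table, 5))          # e.g. [1 0 2 0 1]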

# coding=utf-8
'''
Created on September 15, 2018

@author: admin
'''

from gensim import corpora, models, similarities
import numpy as np
import keras.backend as K
from keras.engine.topology import Layer


class NegativeLayer(Layer):
    """Draws nb_negative negative-sample word ids for every example in the batch."""

    def __init__(self, nb_negative, M, M_num, **kwargs):
        self.nb_negative = nb_negative
        self.M = M          # lookup table: slot index -> word id, proportional to word frequency
        self.M_num = M_num  # number of slots in the table
        super(NegativeLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        super(NegativeLayer, self).build(input_shape)

    def call(self, x, mask=None):
        # At graph-construction time the batch dimension is symbolic, so fall back to 4,
        # which is the number of training positions each sentence produces below.
        if str(x.shape[0]).isdigit() == False:
            batch = 4
        else:
            batch = x.shape[0]
        # Negative sampling: draw uniform slot indices and map them to word ids through M,
        # so frequent words are picked more often.
        final_output = np.array([[self.M[i] for i in j]
                                 for j in np.random.randint(0, self.M_num + 1, size=(batch, self.nb_negative))])
        # Convert to a tensor
        final_output = K.tensorflow_backend._to_tensor(final_output, dtype=np.int32)
        return final_output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.nb_negative)

if __name__ == '__main__':
    text = [["我","今天","打","籃球"],
            ["我","今天","打","足球"],
            ["我","今天","打","羽毛球"],
            ["我","今天","打","網球"],
            ["我","今天","打","排球"],
            ["我","今天","打","氣球"],
            ["我","今天","打","遊戲"],
            ["我","今天","打","冰球"],
            ["我","今天","打","人"],
            ["我","今天","打","檯球"],
            ["我","今天","打","桌球"],
            ["我","今天","打","水"],
            ["我","今天","打","籃球"],
            ["我","今天","打","足球"],
            ["我","今天","打","羽毛球"],
            ["我","今天","打","網球"],
            ["我","今天","打","排球"],
            ["我","今天","打","氣球"],
            ]

    # Build the vocabulary with gensim
    dictionary = corpora.Dictionary(text, prune_at=2000000)
    # Print every word in the dictionary
    for key in dictionary.iterkeys():
        print(key, dictionary.get(key), dictionary.dfs[key])
    # Save the dictionary
    dictionary.save_as_text('word_dict.dict', sort_by_word=True)
    # Load the dictionary
    dictionary = dictionary.load_from_text('word_dict.dict')

    L = {}
    # Total number of word occurrences; dictionary.dfs maps word id -> number of documents it appears in
    allword_num = np.sum(list(dictionary.dfs.values()))
    print(allword_num)  # 72

    # Build the negative-sampling lookup table:
    # normalise the frequencies so they partition [0, 1], then cut [0, 1] into M_num equal slots
    # and record which word id owns each slot.
    sum = 0
    M = {}
    M_num = 1000
    for id, num in dictionary.dfs.items():
        # Round up to get the first slot owned by this word
        left = int(np.ceil(sum / (1 / M_num)))
        sum = sum + num / allword_num
        L[id] = sum
        # Round down to get the last slot owned by this word
        right = int(sum / (1 / M_num))
        print(id, left, right)
        # 11 0 13
        # 0 14 263
        # 10 264 277
        # 12 278 291
        # 1 292 541
        # 2 542 791
        # 7 792 819
        # 13 820 833
        # 8 834 861
        # 14 862 875
        # 9 875 888
        # 3 889 916
        # 6 917 944
        # 5 945 972
        # 4 973 1000
        for i in range(left, right + 1):
            M[i] = id
    print(L)
    # {11: 0.013888888888888888, 0: 0.25, 10: 0.013888888888888888, 12: 0.013888888888888888,
    #  1: 0.25, 2: 0.25, 7: 0.027777777777777776, 13: 0.013888888888888888, 8: 0.027777777777777776,
    #  14: 0.013888888888888888, 9: 0.013888888888888888, 3: 0.027777777777777776,
    #  6: 0.027777777777777776, 5: 0.027777777777777776, 4: 0.027777777777777776}

    # Vocabulary size
    word_num = len(dictionary.keys())
    # Number of sentences used to build each batch
    sentence_batch_size = 1
    # Context window size (words on each side of the target)
    window = 3

    def data_generator():
        # Training data generator
        while True:
            x, y = [], []
            _ = 0
            for sentence in text:
                # Use word_num as the padding id
                sentence = [word_num] * window + [dictionary.token2id[w] for w in sentence if w in dictionary.token2id] + [word_num] * window
                for i in range(window, len(sentence) - window):
                    x.append(sentence[i - window:i] + sentence[i + 1:i + 1 + window])
                    # The loss is sparse_categorical_crossentropy, so no one-hot encoding is needed
                    y.append([sentence[i]])
                _ += 1
                if _ == sentence_batch_size:
                    x, y = np.array(x), np.array(y)
                    # The positive word is always the first output neuron, so every label is 0
                    # (again because the loss is sparse_categorical_crossentropy)
                    z = np.zeros((len(x), 1))
                    print("input contexts :", x.shape)
                    print("target words :", y.shape)
                    print("labels :", z.shape)
                    yield [x, y], z
                    x, y = [], []
                    _ = 0
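    # (Optional sanity check, not in the original post: with window = 3, each 4-word sentence is
    #  padded to 10 ids and yields 4 training positions, so one batch has x of shape (4, 6),
    #  y of shape (4, 1) and z of shape (4, 1). This is also why NegativeLayer falls back to
    #  batch = 4 when the symbolic batch size is unknown.)
    # [xb, yb], zb = next(data_generator())
    # print(xb.shape, yb.shape, zb.shape)   # expected: (4, 6) (4, 1) (4, 1)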
    from keras.layers import Input, Embedding, Lambda
    from keras.models import Model

    # Word vector dimension
    word_size = 100
    # Number of negative samples
    nb_negative = 16

    input_words = Input(shape=(window * 2,), dtype='int32')
    input_vecs = Embedding(word_num + 1, word_size, name='word2vec')(input_words)
    # CBOW: simply sum the context word vectors
    input_vecs_sum = Lambda(lambda x: K.sum(x, axis=1))(input_vecs)

    # Build random negative samples and put them together with the target word.
    # A negative sample can occasionally coincide with the target, but the probability is small.
    target_word = Input(shape=(1,), dtype='int32')
    negatives = NegativeLayer(nb_negative, M, M_num)(target_word)
    samples = Lambda(lambda x: K.concatenate(x))([target_word, negatives])

    # Use Embedding layers instead of a Dense output layer: only the output weights of the
    # sampled (positive + negative) words are looked up and updated, which greatly reduces
    # memory usage and computation.
    softmax_weights = Embedding(word_num + 1, word_size, name='W')(samples)
    softmax_biases = Embedding(word_num + 1, 1, name='b')(samples)
    # The Embedding layers hold the parameters and batch_dot reproduces what a Dense layer
    # would compute: a score for each sampled word, followed by a softmax.
    softmax = Lambda(lambda x:
                     K.softmax((K.batch_dot(x[0], K.expand_dims(x[1], 2)) + x[2])[:, :, 0])
                     )([softmax_weights, input_vecs_sum, softmax_biases])
    # Note that the target word is placed first among the samples, so the softmax target index
    # is always 0, which is exactly what the all-zero z labels in data_generator encode.

    model = Model(inputs=[input_words, target_word], outputs=softmax)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    model.fit_generator(data_generator(),
                        steps_per_epoch=np.ceil(dictionary.num_docs / sentence_batch_size),
                        epochs=100, max_queue_size=1, workers=1)

    # Save the model weights
    model.save_weights("DNNword-vec2.h5")
    # Load the model weights
    model.load_weights("DNNword-vec2.h5", by_name=True)

    # Get the embedding weights, i.e. the word vectors
    embeddings = model.get_weights()[0]
    # Normalise the vectors
    normalized_embeddings = embeddings / (embeddings ** 2).sum(axis=1).reshape((-1, 1)) ** 0.5

    dictionary.id2token = {j: i for i, j in dictionary.token2id.items()}

    # Return the 15 most similar words
    def most_similar(w, dictionary):
        v = normalized_embeddings[dictionary.token2id[w]]
        # The vectors are already normalised, so a plain dot product is the cosine similarity
        sims = np.dot(normalized_embeddings, v)
        sort = sims.argsort()[::-1]
        sort = sort[sort > 0]
        return [(dictionary.id2token[i], sims[i]) for i in sort[:15] if i in dictionary.id2token]

    for sim in most_similar(u'網球', dictionary):
        print(sim[0], sim[1])
    # 網球 0.99999994
    # 羽毛球 0.9787248
    # 籃球 0.978495
    # 排球 0.9773369
    # 人 0.9761201
    # 水 0.9760275
    # 氣球 0.9753146
    # 桌球 0.9731983
    # 冰球 0.97278094
    # 遊戲 0.9711289
    # 足球 0.9660615
    # 檯球 0.96072686
    # 我 -0.3409065
    # 打 -0.42166257
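
To make the softmax Lambda easier to follow, here is a small NumPy sketch of what the model computes for one example. It is not code from the original post, and the numbers are random (only the shapes matter): the summed context vector is scored against the output embeddings of the 1 + 16 sampled words, a bias is added, and a softmax is taken, with the true target always at index 0.

import numpy as np

word_size, nb_negative = 100, 16
context_sum = np.random.randn(word_size)            # summed CBOW context vector (input_vecs_sum)
W = np.random.randn(1 + nb_negative, word_size)     # output embeddings of [target] + 16 negatives
b = np.random.randn(1 + nb_negative)                # output biases of the same sampled words

scores = W @ context_sum + b                         # one score per sampled word, shape (17,)
probs = np.exp(scores - scores.max())
probs = probs / probs.sum()                          # softmax over the 17 sampled words
# The label is always index 0 (the true target), so the per-example loss is -log(probs[0]).
loss = -np.log(probs[0])
print(probs.shape, loss)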