1. 程式人生 > >tensorflow如何正確加載預訓練詞向量

tensorflow如何正確加載預訓練詞向量

標籤: tensorflow, word2vec, embedding, 預訓練詞向量

使用預訓練詞向量和隨機初始化詞向量的差異還是挺大的,現在說一說我使用預訓練詞向量的流程。

  一、構建本語料的詞匯表,作為我的基礎詞匯

  二、遍歷該詞匯表,從預訓練詞向量中提取出該詞對應的詞向量

  三、初始化embeddings變量,將數據賦值給tensor

樣例代碼:

  

 1 #-*- coding: UTF-8 -*-
 2 import numpy as np
 3 import tensorflow as tf
 4 ‘‘‘本程序只是對word2vec進行了簡單的預處理,應用到復雜模型中還需要根據實際情況做必要的改動‘‘‘
 5 
 6 class Wordlist(object):
7 def __init__(self, filename, maxn = 100000): 8 lines = map(lambda x: x.split(), open(filename).readlines()[:maxn]) 9 self.size = len(lines) 10 11 self.voc = [(item[0][0], item[1]) for item in zip(lines, xrange(self.size))] 12 self.voc = dict(self.voc) 13 14 def
getID(self, word): 15 try: 16 return self.voc[word] 17 except: 18 return 0 19 20 def get_W(word_vecs, k=300): 21 """ 22 Get word matrix. W[i] is the vector for word indexed by i 23 """ 24 vocab_size = len(word_vecs) 25 word_idx_map = dict() 26
W = np.zeros(shape=(vocab_size+1, k), dtype=float32) 27 W[0] = np.zeros(k, dtype=float32) 28 i = 1 29 for word in word_vecs: 30 W[i] = word_vecs[word] 31 word_idx_map[word] = i 32 i += 1 33 return W, word_idx_map 34 35 def load_bin_vec(fname, vocab): 36 """ 37 Loads 300x1 word vecs from Google (Mikolov) word2vec 38 """ 39 i=0 40 word_vecs = {} 41 pury_word_vec = [] 42 with open(fname, "rb") as f: 43 header = f.readline() 44 print header,header 45 vocab_size, layer1_size = map(int, header.split()) 46 print vocabsize:,vocab_size,layer1_size:,layer1_size 47 binary_len = np.dtype(float32).itemsize * layer1_size 48 for line in xrange(vocab_size): 49 word = [] 50 while True: 51 ch = f.read(1) 52 #print ch 53 if ch == : 54 word = ‘‘.join(word) 55 #print ‘single word:‘,word 56 break 57 if ch != \n: 58 word.append(ch) 59 #print word 60 #print word 61 if word in vocab: 62 word_vecs[word] = np.fromstring(f.read(binary_len), dtype=float32) 63 pury_word_vec.append(word_vecs[word]) 64 if i==0: 65 print word,word 66 i=1 67 else: 68 f.read(binary_len) 69 #np.savetxt(‘googleembedding.txt‘,pury_word_vec) 70 return word_vecs,pury_word_vec 71 72 def add_unknown_words(word_vecs, vocab, min_df=1, k=300): 73 """ 74 For words that occur in at least min_df documents, create a separate word vector. 
75 0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones 76 """ 77 for word in vocab: 78 if word not in word_vecs and vocab[word] >= min_df: 79 word_vecs[word] = np.random.uniform(-0.25,0.25,k) 80 81 if __name__=="__main__": 82 w2v_file = "GoogleNews-vectors-negative300.bin"#Google news word2vec bin文件 83 print "loading data...", 84 vocab = Wordlist(vocab.txt)#自己的數據集要用到的詞表 85 w2v,pury_word2vec = load_bin_vec(w2v_file, vocab.voc) 86 add_unknown_words(w2v, vocab.voc) 87 W, word_idx_map = get_W(w2v) 88 89 ‘‘‘embedding lookup簡單應用‘‘‘ 90 Wa = tf.Variable(W) 91 embedding_input = tf.nn.embedding_lookup(Wa, [0,1,2])#正常使用時要替換成相應的doc 92 93 with tf.Session() as sess: 94 sess.run(tf.global_variables_initializer()) 95 input = sess.run(Wa) 96 #print np.shape(Wa)

tensorflow如何正確加載預訓練詞向量