1. 程式人生 > >訓練詞向量

訓練詞向量

 1 def word_vector_gener():
 2     """
 3     幾種不同的方法來生成詞向量
 4     :return:
 5     """
 6     from gensim.models import Word2Vec
 7     from gensim.test.utils import common_texts
 8     # 1.word2vec
 9     # 獲取原始資料
10     DATA_PATH = './word2vec_data.txt'
11     word2evctor = open('./word2vector.txt', '
w', encoding='utf8') 12 word_list = [] 13 finall = [] 14 # jieba分詞 15 with open(DATA_PATH, 'r', encoding='utf8') as file: 16 for each_line in file.readlines(): 17 # 分詞 18 cut_word = list(jieba.cut(each_line.strip())) 19 # 去停用詞 20 stopwords = [w.strip() for
w in open('./stop_words.txt', 'r', encoding='utf8')] 21 temp = [] 22 for each in cut_word: 23 if each not in stopwords and each.strip(): 24 temp.append(each) 25 word_list.append(each) 26 finall.append(temp) 27
# 訓練模型 28 model = Word2Vec(finall, size=100, window=1, min_count=1, workers=4) 29 model.save('./word2vec_model.') 30 # 檢視詞向量 31 for word in list(set(word_list)): 32 content = str(word) + '\t' + str(model[word]) 33 word2evctor.write(content+'\n') 34 print(content) 35 36 37 print('ok') 38 39 40 if __name__ == '__main__': 41 word_vector_gener()