1. 程式人生 > >Python Word2Vec訓練和測試詞向量

Python Word2Vec訓練和測試詞向量

train_word2vec_model.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
 
import logging
import os
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
 
if __name__ == '__main__':
    # Command-line entry point: train a Word2Vec model on the corpus file given
    # as the first argument, then save it both as a gensim model (second
    # argument) and in word2vec text format (third argument).
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Check and process input arguments.
    # This script has no module docstring, so formatting `__doc__` with
    # `% locals()` would raise TypeError; print an explicit usage line instead.
    if len(sys.argv) < 4:
        print("Usage: python %s <corpus.txt> <output.model> <output.vector>"
              % program)
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    model = Word2Vec(
        LineSentence(inp),
        size=800,
        # window: usually around 10 for skip-gram, around 5 for CBOW.
        window=10,
        min_count=5,
        sg=1,
        # hs: if 1, hierarchical softmax is used; if 0 (the default),
        # negative sampling is used.
        hs=1,
        workers=multiprocessing.cpu_count(),
    )

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)

執行 "python train_word2vec_model.py v6_EN.txt v6_EN.model v6_EN.vector"即可訓練詞向量

train_word2vec_model.py 為訓練詞向量的程式碼,v6_EN.txt 是我用來訓練的語料庫檔名,v6_EN.model 為訓練出來的詞向量模型檔名,v6_EN.vector 為以文字格式儲存詞向量的檔案(一般用不到,但程式會檢查參數個數,少於三個參數就直接結束,因此執行指令必須包含這一項)

訓練出的檔案有五個:

中間3個.npy檔案在load詞向量模型時都必須和v6_EN.model放在同一資料夾下

In [1]: import gensim
 
In [2]: model = gensim.models.Word2Vec.load("v6_EN.model")
 
In [3]: result = model.most_similar("足球")
 
In [4]: for e in result:
   ....:     print e[0], e[1]
   ....:
聯賽 0.65538161993
甲級 0.653042972088
籃球 0.596754670143
俱樂部 0.587228953838
乙級 0.58406317234
足球隊 0.556015253067
亞足聯 0.530800580978
allsvenskan 0.52497625351
代表隊 0.521494746208
甲組 0.51778960228

test.py:

import gensim
import numpy as np
import xlwt
# Evaluation script: each English test word's vector is multiplied by the
# matrix `Thta`, the nearest French words to the result are looked up, and the
# word counts as correct when the dictionary translation is among them.
# Results are written to an .xls workbook, one row per test word.
TOP_N = 5          # number of nearest French candidates checked per word
MAX_WORDS = 1000   # upper bound on the number of test pairs evaluated

model_EN = gensim.models.Word2Vec.load("../v6_EN_SG/v6_EN_SG_800.model")
model_FR = gensim.models.Word2Vec.load("../v6_FR_SG/v6_FR_SG.model")
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Result')
# presumably a learned EN->FR linear mapping of shape (800, 200) -- the
# reshape calls below enforce those sizes; confirm against the training code.
Thta = np.load("GT/ThtaEN-FR/Thta0.07/ThtaEN-FR0.07_7000.npy")
# Each row holds an English word at index 0 and its French translation at 1.
test = np.load("GT/test1000EN-FR.npy")

# Font/style used for the header row and the final accuracy label.
font1 = xlwt.Font()
font1.height = 0x00E8
font1.name = '宋體'
style1 = xlwt.XFStyle()
style1.font = font1

# Header row: (column title, column width).
headers = [
    ('英文測試單詞', 3333),
    ('預測的法語譯文', 4000),
    ('詞典給出的法語譯文', 4400),
    ('對錯', 4400),
]
for col, (title, width) in enumerate(headers):
    worksheet.write(0, col, label=title, style=style1)
    worksheet.col(col).width = width

# Slice instead of indexing up to a fixed 1000 so a shorter test set does not
# raise IndexError.
pairs = test[:MAX_WORDS]
true_Word = 0.0
for idx, pair in enumerate(pairs):
    row = idx + 1  # row 0 is the header
    word_EN = pair[0]
    word_FR = pair[1]

    # reshape() returns a new view rather than mutating the array's shape in
    # place; it still fails loudly if the sizes are not 800 / 200.
    vec_Test = model_EN.wv[word_EN].reshape(1, 800)
    b = np.dot(vec_Test, Thta).reshape(200)

    e = model_FR.wv.similar_by_vector(b, topn=TOP_N, restrict_vocab=None)
    candidates = [entry[0] for entry in e]
    print(candidates[0])

    worksheet.write(row, 0, label=word_EN)
    # Write the candidates as one plain string (each followed by two spaces)
    # instead of handing xlwt a list.
    worksheet.write(row, 1, label=''.join(word + '  ' for word in candidates))
    worksheet.write(row, 2, label=word_FR)

    if word_FR in candidates:
        worksheet.write(row, 3, label='✔️')
        true_Word += 1
    else:
        worksheet.write(row, 3, label='×')
    print('測試完成%d個單詞' % row)

# Number of words actually evaluated; the summary goes one row below the last
# data row, as before.
num = len(pairs)
# Guard against an empty test set (would otherwise divide by zero).
accuracy = true_Word / num * 100 if num else 0.0
worksheet.write(num + 1, 0, label='正確率', style=style1)
worksheet.write(num + 1, 1, label=str(accuracy) + '%')
print(str(accuracy) + '%')
# NOTE(review): the file name below contains "[email protected]", which looks
# like an e-mail-obfuscation artifact rather than the intended name -- confirm
# the real output file name before relying on it.
workbook.save('GT/test/testEN-FR/Thta0.07/[email protected]_7000.xls')