1. 程式人生 > >20180923 word2vec相似度改進(不浪費句子)

20180923 word2vec相似度改進(不浪費句子)

本次改進:沒有詞向量的詞直接丟掉,不放進詞集合裡,這樣就不會因為個別詞缺少詞向量而浪費整個句子。

#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import division  #除法
import sys
import codecs   #可以以特定編碼開啟檔案
import jieba
import jieba.posseg as pseg
reload(sys)               #zzh說這種方法不好,不要再用了!!!  可是真的很好用啊 QAQ
sys.setdefaultencoding('utf-8')
import gensim

# Alternative loader kept for reference (natively saved gensim model):
# model = gensim.models.Word2Vec.load("22620491.model")

# Load the pretrained vectors from a word2vec binary-format file and keep only
# the keyed-vector lookup object; the intermediate loader result is never bound
# to a name, so nothing has to be deleted afterwards.
word_vec = gensim.models.KeyedVectors.load_word2vec_format(
    'news_12g_baidubaike_20g_novel_90g_embedding_64.bin', binary=True
).wv

# Print the vector of one word as a quick check that lookups work.
print(word_vec[u'難過'])

# Evaluate a seed-word emotion classifier on the labelled, pre-segmented corpus.
#
# Each non-blank input line is read as two fields separated by two spaces:
#     "<label>[ ...]  <token> <token> ..."
# The label is the first space-separated item of the first field and is
# expected to be "0".."4". A line is assigned to the class whose seed word has
# the highest word_vec.n_similarity() to the line's in-vocabulary tokens.
# Tokens without a vector are dropped instead of discarding the whole line.
#
# Counters:
#   right  - lines whose predicted class equals the label
#   wrong  - lines that could not be classified at all
#   total  - lines that were classified (right or not)
# (An earlier version also wrote per-line predictions to 'fenlei.txt';
# that output is disabled.)

# One single-word seed list per class: joy, sorrow, anger, surprise, disgust.
# The position in this list is the class index compared against the label.
# Built once here because it never changes between lines.
emotion_seeds = [[u'樂'], [u'哀'], [u'怒'], [u'驚'], [u'惡']]

label_counts = [0] * len(emotion_seeds)     # labelled lines per class
correct_counts = [0] * len(emotion_seeds)   # correctly classified lines per class
right, wrong, total = 0, 0, 0

# The context manager closes the file even if reading fails.
with codecs.open("xlj_fenci.txt", 'r', 'utf-8') as f:
    lines = f.readlines()

# enumerate() yields the true position; lines.index(line) was O(n) per line and
# returned the first occurrence for duplicated lines.
for idx, line in enumerate(lines):
    if idx % 500 == 0:   # progress indicator
        print(idx)

    # Blank lines carry neither a label nor text; skip them without counting.
    if not line.strip():
        continue

    fields = line.split("  ")
    label = fields[0].split(" ")[0]
    if label in ("0", "1", "2", "3", "4"):
        label_counts[int(label)] += 1

    # A line without the two-space separator has no text field to classify.
    if len(fields) < 2:
        wrong += 1
        continue

    # Strip the line terminator first so it does not stay glued to the last
    # token (which would make that token look out-of-vocabulary), then keep
    # only tokens that actually have a vector.
    tokens = fields[1].rstrip("\r\n").split(" ")
    known_words = [tok for tok in tokens if tok in word_vec]

    # Nothing left to compare against the seeds: count as unclassifiable.
    if not known_words:
        wrong += 1
        continue

    try:
        scores = [word_vec.n_similarity(known_words, seed)
                  for seed in emotion_seeds]
    except KeyError:
        # A seed word is missing from the vocabulary; keep the original
        # best-effort behaviour and count the line as unclassifiable.
        wrong += 1
        continue

    predicted = scores.index(max(scores))
    if str(predicted) == label:
        right += 1
        correct_counts[predicted] += 1
    total += 1

# Keep the original per-class names available at module level.
cntl, cnta, cntn, cntj, cntw = label_counts
resl, resa, resn, resj, resw = correct_counts

# Per-class accuracy; a class with no labelled lines reports nan instead of
# raising ZeroDivisionError. float() keeps this a true division regardless of
# the __future__ import.
accuracies = tuple(
    float(hit) / seen if seen else float('nan')
    for hit, seen in zip(correct_counts, label_counts)
)

print(right,wrong,total)
print(cntl,cnta,cntn,cntj,cntw)
print(resl,resa,resn,resj,resw)
print(accuracies)
print("end")