1. 程式人生 > word2vec全部彈幕比較句子相似度情感五分類

word2vec全部彈幕比較句子相似度情感五分類

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys
import codecs   #可以以特定編碼開啟檔案
import jieba
import jieba.posseg as pseg
reload(sys)               #zzh說這種方法不好,不要再用了!!!  可是真的很好用啊 QAQ
sys.setdefaultencoding('utf-8')
import gensim

#model = gensim.models.Word2Vec.load("22620491.model")
# Pre-trained word2vec embedding in binary word2vec format.
MODEL_PATH = 'news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
# Input: one comment per line, "<id> <token> <token> ...", UTF-8.
INPUT_PATH = "qinggancidanmu.txt"
# Output: "<id> <label> <original tokens>" per input line, UTF-8.
OUTPUT_PATH = 'fenlei.txt'

# Seed word for each emotion class; the position in this tuple is the label
# written to the output: 0 joy, 1 sorrow, 2 anger, 3 surprise, 4 disgust.
EMOTION_SEEDS = ([u'樂'], [u'哀'], [u'怒'], [u'驚'], [u'惡'])

# Label written when a comment cannot be scored.
UNKNOWN_LABEL = -1


def split_record(line):
    """Split one input line into ``(head, rest, tokens)``.

    ``head`` is the text before the first space (the record id), ``rest`` is
    everything after it exactly as read (including the line ending), and
    ``tokens`` are the space-separated words of ``rest`` with line endings
    stripped and empty entries dropped.

    A line without any space yields the line (minus its line ending) as
    ``head``, the line ending as ``rest`` and no tokens, instead of failing.
    """
    head, sep, rest = line.partition(u" ")
    if not sep:
        head = line.rstrip(u"\r\n")
        rest = line[len(head):]
    # Strip "\n" / "\r\n" from every token so the last word is usable whether
    # or not the line has a trailing space before its line ending.
    stripped = (raw.strip(u"\r\n") for raw in rest.split(u" "))
    tokens = [tok for tok in stripped if tok]
    return head, rest, tokens


def classify_tokens(word_vec, tokens):
    """Return the index of the emotion seed most similar to ``tokens``.

    ``word_vec`` must provide ``n_similarity(words_a, words_b)``.  Returns
    ``UNKNOWN_LABEL`` when there are no tokens or when scoring fails
    (presumably a KeyError for out-of-vocabulary words -- confirm against
    the installed gensim version).  Ties resolve to the lowest index.
    """
    if not tokens:
        return UNKNOWN_LABEL
    try:
        scores = [word_vec.n_similarity(tokens, seed) for seed in EMOTION_SEEDS]
    except Exception:
        # Best effort per record: an unscorable comment is labelled -1 rather
        # than aborting the run.  Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt / SystemExit stop the script.
        return UNKNOWN_LABEL
    return scores.index(max(scores))


def format_record(head, label, rest):
    """Build the output text for one record.

    ``rest`` still carries the input line ending, so the newline appended
    here leaves a blank line after each record; this matches the output
    layout the script has always produced.
    """
    return u"%s %d %s\n" % (head, label, rest)


def classify_file(word_vec, in_path, out_path, progress_every=500):
    """Label every line of ``in_path`` and write the result to ``out_path``.

    Both files are handled as UTF-8 and are closed even if an error occurs.
    The zero-based line index is printed every ``progress_every`` lines.
    """
    with codecs.open(in_path, 'r', 'utf-8') as src, \
            codecs.open(out_path, 'w', 'utf-8') as dst:
        # enumerate gives the true position in O(1); looking the line up in
        # the list was O(n) per line and wrong for duplicated comments.
        for index, line in enumerate(src):
            if index % progress_every == 0:
                print(index)
            head, rest, tokens = split_record(line)
            label = classify_tokens(word_vec, tokens)
            dst.write(format_record(head, label, rest))


def main():
    """Load the embedding and classify the whole comment file."""
    # load_word2vec_format returns the keyed vectors themselves, so no
    # separate model object has to be unwrapped and deleted.
    word_vec = gensim.models.KeyedVectors.load_word2vec_format(
        MODEL_PATH, binary=True)
    classify_file(word_vec, INPUT_PATH, OUTPUT_PATH)
    print("end")


if __name__ == "__main__":
    main()