# -*- coding: utf-8-*-
from gensim.models.word2vec import Word2Vec 
# --- Training -------------------------------------------------------------
# Build the corpus as a list of token lists, train a Word2Vec model on it and
# save the full model to disk.

# Seed sentences carried over from the original script; they illustrate the
# expected corpus shape (one list of tokens per sentence) and remain part of
# the training data.
sentences = [["A1", "A2"], ["A1", "A3", "A2"]]

num = 0  # number of corpus lines read; reported after training
with open("sohu_train.txt", encoding="utf-8") as trainText:
    for line in trainText:
        # NOTE(review): the original assignment target on this line was lost;
        # presumably the article body is the last "^_^"-separated field --
        # confirm against the layout of sohu_train.txt.
        body = line.split("^_^")[-1]
        # The literal token separator was also lost (only whitespace was
        # left).  Splitting on any whitespace covers space/tab separators,
        # drops the trailing newline and never yields empty tokens.
        words = body.split()
        sentences.append(words)
        num += 1

# min_count=1 keeps every token, including those that occur only once.
model = Word2Vec(min_count=1)
print("start train ...")
model.build_vocab(sentences)
# `model.epochs` replaces the `model.iter` alias, which newer gensim removed.
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
print("train finished!", num)

# NOTE(review): presumably the ./sohu_model directory must already exist.
model.save("./sohu_model/Model")
# Alternative export, as noted by the original author:
#   save_word2vec_format('/tmp/mymodel.txt', binary=False)
#   save_word2vec_format('/tmp/mymodel.bin.gz', binary=True)
# Files written by model.save() cannot be inspected in a text editor, but they
# keep all training state, so training can be resumed after loading.  The
# word2vec-format export loses part of that state (e.g. the vocabulary tree)
# and therefore cannot be used to continue training.
print("save finished!")


# --- Model usage ----------------------------------------------------------
# Reload the saved model and print a few similarity / analogy queries.
model = Word2Vec.load("./sohu_model/Model")
print("load model sesuess!")

# Word vectors are queried through `model.wv`, which is available in both
# older and current gensim releases.
word_vectors = model.wv

print("most similar with 北京:")
# The 10 words closest to "北京" by cosine distance.
for word, score in word_vectors.most_similar("北京"):
    print(word, score)

# Analogy queries: (heading to print, words added, words subtracted).
analogy_queries = [
    ("皇帝+女性-男性:", ["皇帝", "女性"], ["男性"]),
    ("手機+移動-智能:", ["手機", "移動"], ["智能"]),
    ("電影+科幻-劇情:", ["電影", "科幻"], ["劇情"]),
]
for heading, positive_words, negative_words in analogy_queries:
    print()
    print(heading)
    matches = word_vectors.most_similar(
        positive=positive_words, negative=negative_words, topn=3
    )
    for word, score in matches:
        print(word, score)

print()
print("北京 vector:")
print(word_vectors["北京"])


load model sesuess!
most similar with 北京:
南京 0.670382142067
上海 0.661236405373
成都 0.639219224453
杭州 0.63784122467
廣州 0.631313323975
深圳 0.624626278877
武漢 0.624594151974
昆明 0.620243370533
長春 0.61394149065
長沙 0.60389906168
哥 0.60431176424
魔術師 0.586149096489
魔女 0.581812143326
智能手機 0.605030536652
互聯網 0.54615008831
蘋果 0.539426982403
紀錄片 0.648482918739
動畫 0.639703273773
迪斯尼 0.61851131916
北京 vector:
[-0.08981118  0.18538047 -4.7453156  -1.7730242   2.0390635   2.6085184
  5.088326    2.8057106   2.6798103  -1.4660915   2.778077    2.4279277
  0.69682086 -3.0003173   2.1341784   0.32419717 -5.2817945   0.18809023
 -1.3016417   3.8344557  -0.87402123 -0.26100433  2.8857462  -2.725345
 -2.5024219  -0.70686543 -0.4838663  -2.2535524   0.23617841  3.329134
  3.9053504  -1.9609474  -3.4581995   1.2530506  -2.079397    1.6266809
  0.23296945  1.4600109  -1.9104419   0.80835503 -0.13650164  3.355157
  2.4561696   0.6016032  -1.0312346   1.6474588   1.320931    1.4579619
  1.8017172  -3.5526018   1.2293625   4.798621   -3.5554793   0.5800354
  3.7429204  -0.4906999  -1.3069346  -1.0603447  -0.95469594 -0.35445935
 -1.7658769  -3.2370284  -2.2224278  -0.56134427 -0.46095294  2.8492029
  2.7202766  -3.3692176   1.1739812  -1.9770668   0.37050596  1.1764477
 -0.27834406  5.033905    0.09570877 -0.5670941  -2.1803875  -0.9094422
  1.0485793   0.03497482 -2.07145    -0.8045679  -1.8192968   2.6160874
  0.5630188  -0.45463613 -0.22750562  2.2233796   3.4276621  -0.8689221
  1.5558586  -0.39026013 -1.1843458  -3.378433   -4.2200727   1.6359595
  2.27458    -1.6011585  -0.89109504  2.3993087 ]
