1. 程式人生 > >gensim 英文文字相似度

gensim 英文文字相似度

# -*- coding: utf-8 -*-
# __jiahuiyu__
"""
對英文的處理
"""

import logging
from gensim import models, similarities, corpora
from collections import defaultdict
import os
# Configure root logging at INFO level with timestamps.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Stop words dropped from every document and from the query.
stoplist = set('for a of the and to in'.split())

# Whitespace and English punctuation; each character acts as a token separator.
punctions = [
    ' ', '\n', '\t', ',', '.', ':', ';', '?', '(', ')',
    '[', ']', '&', '!', '*', '@', '#', '$', '%',
]

# Input corpus (one document per line) and the persisted dictionary / corpus.
TEXT_PATH = 'E:/descfile/en_test/en_text.txt'
DICT_PATH = 'e:/descfile/en_test/desc_en.dict'
CORPUS_PATH = 'e:/descfile/en_test/desc_en.mm'


def tokenize(text):
    """Return the lower-cased tokens of *text* without punctuation or stop words.

    Every character listed in ``punctions`` is replaced by a space before
    splitting, so ``'man.The'`` yields ``['man']`` ('the' is a stop word)
    instead of the single token ``'man.the'``.

    The original filter was ``word not in stoplist and punctions``; the bare
    list is always truthy, so punctuation was never actually removed.
    """
    text = text.lower()
    for mark in punctions:
        text = text.replace(mark, ' ')
    return [word for word in text.split() if word not in stoplist]


def main():
    """Build a TF-IDF -> LSI similarity index and rank documents for a sample query.

    Reads ``TEXT_PATH`` (one document per line), saves the dictionary to
    ``DICT_PATH`` and the bag-of-words corpus to ``CORPUS_PATH``, then prints
    the similarity of every document to a hard-coded English sentence, both
    in document order and sorted by descending similarity.
    """
    # Read all documents and release the file handle immediately.
    with open(TEXT_PATH, 'r') as documents:
        lines = documents.readlines()
    print(lines)

    texts = [tokenize(document) for document in lines]
    print(texts)

    # Count how often each token occurs across the whole corpus.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # NOTE(review): texts1 (tokens occurring more than once) is only printed;
    # the dictionary below is still built from the unfiltered texts -- confirm
    # whether the filtered version was meant to be used instead.
    texts1 = [[token for token in text if frequency[token] > 1] for text in texts]
    print(texts1)

    # Build and persist the dictionary and the bag-of-words corpus.
    dictionary = corpora.Dictionary(texts)
    dictionary.save(DICT_PATH)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)
    print(corpus)

    # Reload the persisted dictionary and corpus when the dictionary file exists.
    if os.path.exists(DICT_PATH):
        dictionary = corpora.Dictionary.load(DICT_PATH)
        corpus = corpora.MmCorpus(CORPUS_PATH)
        print('used english files generated')
    else:
        print('please generate the files again!')

    # Train TF-IDF, then LSI on top of the TF-IDF-weighted corpus.
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    corpus_lsi = lsi_model[corpus_tfidf]
    # The LSI model was trained on TF-IDF vectors, so the index must be built
    # from TF-IDF -> LSI vectors (previously raw bag-of-words was projected).
    index = similarities.MatrixSimilarity(corpus_lsi)

    # Sample English query, preprocessed exactly like the corpus documents.
    en_str = 'There is nothing noble in being superior to some other man.The true nobility is being supior to your previous self.'
    en_str_vec = dictionary.doc2bow(tokenize(en_str))
    print(en_str_vec)
    # Apply the same TF-IDF -> LSI chain to the query as to the indexed corpus.
    lsi_str_vec1 = lsi_model[tfidf[en_str_vec]]
    print(lsi_str_vec1)

    # Similarity of the query against every indexed document.
    sims = index[lsi_str_vec1]
    print(list(enumerate(sims)))

    # Same (document id, similarity) pairs, most similar first.
    simsorted = sorted(enumerate(sims), key=lambda item: -item[1])
    print(simsorted)


if __name__ == '__main__':
    main()