1. 程式人生 > TF-IDF比較文字相似度

TF-IDF比較文字相似度

文字相似度

TF-IDF 演算法

如果某個詞在整個文件集合(語料庫)中很少出現,但是在其中某一篇文章中出現的次數很多,
那麼該詞在很大程度上反映了該文章的特性,我們稱該詞為這篇文章的關鍵字。
參考連結:http://www.ruanyifeng.com/blog/2013/03/tf-idf.html

餘弦相似性

測試案例

程式碼部分

# -*- coding: UTF-8 -*-

# import codecs

import jieba.posseg as pseg
from gensim import corpora, models, similarities


# from hotelmatcher.constant import *
class Tfidf:
    """Compare text similarity with a TF-IDF model.

    Texts are tokenized with jieba's part-of-speech tagger, filtered by a
    stop-word list and a list of stop part-of-speech flags, and then ranked
    against a query document with a gensim TF-IDF similarity index.
    """

    # Stop words: tokens that are dropped after segmentation.
    stop_words = ['酒店', '旅館']
    # Part-of-speech flags (as emitted by jieba) that are dropped after
    # segmentation:
    # [punctuation, conjunction, particle, adverb, preposition,
    #  time morpheme, the particle '的', numeral, locality word, pronoun]
    stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']

    # NOTE: an earlier, disabled variant filled ``stop_words`` from the file
    # PATH_DOC + 'StopWords.txt' (UTF-8, one word per line) when
    # ``stop_words`` was None. The built-in list above is used instead.

    def __init__(self):
        """Create a comparer; there is no per-instance state to set up."""

    def text2words(self, text: str) -> list:
        """Segment *text* into words and remove stop words.

        :param text: the text to segment
        :return: the words whose text is not in ``stop_words`` and whose
            part-of-speech flag is not in ``stop_flag``, in original order
        """
        return [
            word
            for word, flag in pseg.cut(text)
            if word not in self.stop_words and flag not in self.stop_flag
        ]

    def similarity_compare(self, compare_doc: str, refer_doc: list) -> tuple:
        """Find the reference document most similar to *compare_doc*.

        :param compare_doc: the document to compare
        :param refer_doc: the reference documents; must not be empty
        :return: tuple ``(index, similarity)`` where ``index`` is the
            position of the best match in *refer_doc* and ``similarity`` is
            its score; on ties the earliest reference document wins
        :raises ValueError: if *refer_doc* is empty
        """
        if not refer_doc:
            # Fail fast with a clear message instead of an incidental error
            # from deep inside the similarity pipeline.
            raise ValueError('refer_doc must contain at least one document')

        # Tokenized corpus, kept index-aligned with refer_doc.
        refer_words = []
        placeholder_count = 0
        for refer_text in refer_doc:
            words = self.text2words(refer_text)
            if not words:
                # Every token was filtered out: insert a placeholder document
                # so later positions still line up with refer_doc.
                placeholder_count += 1
                words = self.text2words(
                    'placeholder' + str(placeholder_count))
            refer_words.append(words)

        # Bag-of-words model of the corpus.
        dictionary = corpora.Dictionary(refer_words)
        doc_vectors = [dictionary.doc2bow(words) for words in refer_words]

        # TF-IDF model of the corpus.
        tf_idf = models.TfidfModel(doc_vectors)
        tf_idf_vectors = tf_idf[doc_vectors]

        compare_vectors = dictionary.doc2bow(self.text2words(compare_doc))
        index = similarities.MatrixSimilarity(
            tf_idf_vectors, num_features=len(dictionary))
        sims = index[compare_vectors]

        # Only the best match is returned, so pick the maximum directly
        # rather than sorting every score. max() returns the first maximal
        # item, matching a stable descending sort followed by [0].
        return max(enumerate(sims), key=lambda item: item[1])


if __name__ == '__main__':
    tf_idf_model = Tfidf()
    test = '月亮海灘旅館'
    # Reference hotels: title -> [(hotel id, hotel name)].
    refers = {
        '普吉島斷點酒店': [(1, '普吉島斷點酒店')],
        '月亮海灘酒店': [(10386, '月亮海灘酒店')],
        '月亮海酒店': [(1564, '月亮海酒店')],
        '清萊海灘酒店': [(3467, '清萊艾美度假酒店')]
    }
    titles = list(refers.keys())
    similarity = tf_idf_model.similarity_compare(test, titles)
    best_title = titles[similarity[0]]
    msg = "測試酒店 '%s' 和參照酒店中的 '%s' 最相似,相似度為 %f,對應酒店ID為:%s" \
          % (test, best_title, similarity[1], refers[best_title][0][0])
    print(msg)

結果展示

相似度比較結果