
Sentence similarity with tf/idf

import math

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Segment a sentence with jieba and join the tokens with spaces,
# which is the whitespace-delimited input sklearn's vectorizers expect.
def jieba_function(sent):
    return ' '.join(jieba.cut(sent))
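# For example (an illustration only; the exact segmentation depends on
# jieba's dictionary, which is tuned for simplified Chinese):
#   jieba_function('我喜歡看電視')  ->  '我 喜歡 看 電視'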
# Cosine similarity between two equal-length dense vectors.
# Returns 0.0 for mismatched lengths or zero vectors: the original code
# checked isnan() afterwards, but 0.0 / 0.0 raises ZeroDivisionError in
# Python rather than returning NaN, so the guard has to live here.
def count_cos_similarity(vec_1, vec_2):
    if len(vec_1) != len(vec_2):
        return 0.0
    s = sum(x * y for x, y in zip(vec_1, vec_2))
    den1 = math.sqrt(sum(x * x for x in vec_1))
    den2 = math.sqrt(sum(y * y for y in vec_2))
    if den1 == 0 or den2 == 0:
        return 0.0
    return s / (den1 * den2)
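# Quick sanity check (not in the original post):
#   count_cos_similarity([1, 1, 0], [1, 0, 0])
# gives 1 / sqrt(2) ~= 0.7071, since the dot product is 1 and the
# norms are sqrt(2) and 1.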
# Build bag-of-words (term-frequency) vectors from two raw strings
# and print their cosine similarity.
def tf(sent1, sent2):
    sent1 = jieba_function(sent1)
    sent2 = jieba_function(sent2)

    count_vec = CountVectorizer()
    sentences = [sent1, sent2]
    print('sentences', sentences)

    # Fit once and reuse the result (the original refit three times).
    vectors = count_vec.fit_transform(sentences).toarray()
    print('vector', vectors)  # the feature-vector representation
    # The vocabulary, i.e. what each vector dimension means
    # (use get_feature_names() on scikit-learn < 1.0).
    print('cut_word', count_vec.get_feature_names_out())

    # The two rows share the same dimensions because they were fit together.
    similarity = count_cos_similarity(vectors[0], vectors[1])
    print('count_cos_similarity', similarity)
    return similarity
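# Caveat (not in the original post): CountVectorizer's default
# token_pattern, r"(?u)\b\w\w+\b", silently drops one-character tokens,
# which matters for Chinese, where 我, 看, 不 and the like are single
# characters. Passing a looser pattern keeps them:
#   count_vec = CountVectorizer(token_pattern=r"(?u)\b\w+\b")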
# Same as tf(), but with tf-idf weighting instead of raw counts.
def tfidf(sent1, sent2):
    sent1 = jieba_function(sent1)
    sent2 = jieba_function(sent2)

    tfidf_vec = TfidfVectorizer()
    sentences = [sent1, sent2]
    vectors = tfidf_vec.fit_transform(sentences).toarray()
    return count_cos_similarity(vectors[0], vectors[1])
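# As a cross-check (a minimal sketch, not part of the original post),
# scikit-learn ships its own cosine similarity, which works directly on
# the sparse matrix and should agree with count_cos_similarity above.
def tfidf_sklearn(sent1, sent2):
    from sklearn.metrics.pairwise import cosine_similarity
    vecs = TfidfVectorizer().fit_transform(
        [jieba_function(sent1), jieba_function(sent2)])
    return cosine_similarity(vecs[0], vecs[1])[0][0]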


if __name__ == '__main__':
    sent1 = '我喜歡看電視也喜歡看電影,'    # "I like watching TV and also like watching movies,"
    sent2 = '我不喜歡看電視也不喜歡看電影'  # "I don't like watching TV or watching movies"
    print('<<<<tf<<<<<<<')
    tf(sent1, sent2)
    print('<<<<tfidf<<<<<<<')
    print('tfidf_cos_similarity', tfidf(sent1, sent2))
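With jieba's default dictionary the two sample sentences share most of their tokens, so both scores come out high; the exact values depend on how jieba segments the text and on the vectorizers' token pattern (see the caveat above), since whether 不喜歡 survives as its own token is what separates the two sentences.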