1. 程式人生 > >關鍵詞提取/關鍵字提取之TF-IDF演算法


TF-IDF(Term Frequency–Inverse Document Frequency)是一種用於資訊檢索與資訊探勘的常用加權技術。
TF 的意思是詞頻(Term Frequency),IDF 的意思是逆向檔案頻率(Inverse Document Frequency)。


一個詞在文章中出現很多次,通常代表這個詞有著很大的作用。但是實際統計後會發現,TF 最高的大都是「的」、「是」這樣的詞,它們顯然對我們的分析和統計沒有什麼幫助,有的時候反而會干擾統計結果。因此需要再引入 IDF,用來降低這類到處都會出現的常見詞的權重。




    IDF 的計算需要一批與目標領域相關的文件(例如:醫療類就需要多篇醫療相關的文件),這樣算出來的權重才會有相應的側重點。


         $IDF = \log\left(\dfrac{\text{語料庫中的文件總數}}{\text{包含該詞的文件數} + 1}\right)$

         注:分母中 +1 的目的是防止分母為 0。

   TF-IDF 計算:
            tf-idf = TF(詞頻) * IDF(逆文件頻率)




 1 # -*- coding: utf-8 -*-
 2 # @Time   : 2018/12/5 17:34
 3 # @Author : Richer
 4 # @File   : get_idf.py
 5 # 此檔案用於自動計算idf
 7 import jieba
 8 import os,sys
 9 import math
class IDF:
    """Build an inverse-document-frequency (IDF) table from a corpus directory.

    Every regular file in ``idf_input_path`` is treated as one document.  Each
    document is segmented with ``jieba`` (full mode), stop words are removed,
    and for every remaining word the number of documents containing it is
    counted.  ``idf()`` then writes ``log(total_docs / (doc_count + 1))`` for
    each Chinese word to ``<idf_output_path>/idf.txt``, one ``word value``
    pair per line.

    All paths are resolved relative to the current working directory at
    construction time.
    """

    def __init__(self):
        self.base_path = os.getcwd()
        # Directory holding the corpus documents used to compute the IDF table.
        self.idf_input_path = os.path.join(
            self.base_path, 'train_data', 'tf_idf_input', '')
        # Stop-word list, one word per line.
        self.stop_word_file = os.path.join(
            self.base_path, 'train_data', 'stop_words.txt')
        # Directory that receives the generated ``idf.txt``.
        self.idf_output_path = os.path.join(
            self.base_path, 'data', 'idf_out', '')

    def idf(self):
        """Compute the IDF of every Chinese word and write it to ``idf.txt``.

        The output directory is created if it does not exist yet.  The ``+ 1``
        in the denominator is the usual smoothing term; it also means a word
        that occurs in every document gets a slightly negative value.
        """
        doc_freq, total = self._get_file()
        os.makedirs(self.idf_output_path, exist_ok=True)
        out_file = os.path.join(self.idf_output_path, 'idf.txt')
        with open(out_file, 'w', encoding='utf-8') as wf:
            for word, doc_count in doc_freq.items():
                if self._is_chinese(word):
                    value = math.log(total / (doc_count + 1))
                    wf.write(word + ' ' + str(value) + '\n')

    @staticmethod
    def _is_chinese(word):
        """Return True if *word* is non-empty and every character is in U+4E00..U+9FA5."""
        return bool(word) and all(u'\u4e00' <= ch <= u'\u9fa5' for ch in word)

    @staticmethod
    def _count_document_frequency(documents):
        """Count in how many documents each word occurs.

        Args:
            documents: iterable of word collections, one per document.

        Returns:
            ``(doc_freq, total)`` where ``doc_freq`` maps each word to the
            number of documents containing it and ``total`` is the number of
            documents consumed.
        """
        doc_freq = {}
        total = 0
        for words in documents:
            total += 1  # the original ``total =+1`` reset this to 1 every time
            # dict.fromkeys de-duplicates while keeping first-seen order, so a
            # word repeated inside one document is still counted only once.
            for word in dict.fromkeys(words):
                doc_freq[word] = doc_freq.get(word, 0) + 1
        return doc_freq, total

    def _get_file(self):
        """Read every corpus file and return ``(doc_freq, total_documents)``.

        Directory entries that are not regular files are skipped.
        """
        # Load the stop words once instead of once per corpus file.
        stop_words = set(self._stop_words())

        def documents():
            for file_name in os.listdir(self.idf_input_path):
                path = os.path.join(self.idf_input_path, file_name)
                if os.path.isfile(path):
                    yield self._read_file(path, stop_words)

        return self._count_document_frequency(documents())

    def _read_file(self, file, stop_words=None):
        """Return the distinct non-stop-word tokens of one document.

        Args:
            file: path of the document; decoded as UTF-8, undecodable bytes
                are dropped.
            stop_words: optional collection of words to exclude.  When omitted
                the stop-word file is loaded for this call.

        Returns:
            A list of unique tokens (order is unspecified).
        """
        if stop_words is None:
            stop_words = set(self._stop_words())
        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        # Remove newlines, ideographic spaces, no-break spaces and plain spaces
        # before segmentation.
        content = (text.replace("\n", "").replace("\u3000", "")
                   .replace("\u00A0", "").replace(" ", ""))
        tokens = jieba.cut(content, cut_all=True)
        return list({token for token in tokens if token not in stop_words})

    def _stop_words(self):
        """Return the stop words as a list, one stripped entry per file line.

        The file is decoded as UTF-8 (``utf-8-sig`` also accepts a leading
        BOM) so the result does not depend on the platform default encoding.
        """
        with open(self.stop_word_file, 'r', encoding='utf-8-sig') as f:
            return [line.strip() for line in f]
View Code


 1 # -*- coding: utf-8 -*-
 2 # @Time   : 2018/12/8 15:08
 3 # @Author : Richer
 4 # @File   : tfidf.py
 5 # 此檔案是tfidf演算法入口
 7 import os, sys
 8 import jieba
 9 import re
10 from collections import Counter
class TFIDF:
    """Extract keywords from one text file by TF-IDF score.

    The term frequency comes from the file itself; the inverse document
    frequency is read from a pre-computed table at
    ``<cwd>/data/idf_out/idf.txt`` (one ``word value`` pair per line).  Words
    missing from the table are scored with the mean IDF of the table.

    Args:
        file: path of the text to analyse, relative to the current working
            directory.
        topK: number of keywords returned by ``key_abstract``; a falsy value
            (``0`` / ``None``) returns every word.
    """

    # Characters removed from the segmented text before it is split into words.
    _NOISE_PATTERN = re.compile('[a-zA-Z0-9.。::,,))((!!??”“\"]')

    def __init__(self, file, topK=20):
        self.base_path = os.getcwd()
        # File to extract keywords from; resolved against the working directory.
        self.file_path = os.path.join(self.base_path, file)
        # Stop-word list, one word per line.
        self.stop_word_file = os.path.join(
            self.base_path, 'train_data', 'stop_words.txt')
        # Pre-computed IDF table.
        self.idf_file = os.path.join(
            self.base_path, 'data', 'idf_out', 'idf.txt')
        self.idf_freq = {}
        self.topK = topK
        self._load_idf()

    def key_abstract(self):
        """Return the keywords of the file, highest TF-IDF score first.

        Returns:
            A list of words, truncated to ``topK`` entries when ``topK`` is
            truthy.
        """
        stop_words = set(self._get_stop_words())
        words = [word for word in self._fitter_data() if word not in stop_words]
        ranked = self._rank_keywords(words)
        return ranked[:self.topK] if self.topK else ranked

    def _rank_keywords(self, words):
        """Score each distinct word with TF * IDF and return them best-first.

        TF is ``count / len(words)``; IDF falls back to ``self.mean_idf`` for
        words that are not in the table.  Ties keep the most-common-first
        order because ``sorted`` is stable.
        """
        total_count = len(words)
        scores = {
            word: (count / total_count) * self.idf_freq.get(word, self.mean_idf)
            for word, count in Counter(words).most_common()
        }
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return [word for word, _ in ranked]

    def _fitter_data(self):
        """Read the input file, segment it with jieba and return the word list.

        Whitespace is removed before segmentation; afterwards ASCII letters,
        digits and the punctuation in ``_NOISE_PATTERN`` are stripped, and
        tokens that become empty disappear.
        """
        with open(self.file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        content = (text.replace("\n", "").replace(" ", "")
                   .replace("\u3000", "").replace("\u00A0", ""))
        segmented = " ".join(jieba.cut(content, cut_all=False))
        return self._NOISE_PATTERN.sub('', segmented).split()

    def _load_idf(self):
        """Load the IDF table into ``self.idf_freq`` and set ``self.mean_idf``.

        Lines that are not exactly ``word value`` with a numeric value are
        skipped.  The mean is taken over the distinct words loaded; if nothing
        could be loaded ``mean_idf`` is set to ``1.0`` so that scores reduce
        to plain term frequency instead of failing with a division by zero.
        """
        with open(self.idf_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                if len(parts) != 2:
                    continue  # blank or malformed line
                word, raw_value = parts
                try:
                    self.idf_freq[word] = float(raw_value)
                except ValueError:
                    continue  # value is not a number
        loaded = len(self.idf_freq)
        print('Vocabularies loaded: %d' % loaded)
        self.mean_idf = sum(self.idf_freq.values()) / loaded if loaded else 1.0

    def _get_stop_words(self):
        """Return the stop words as a list, one stripped entry per file line.

        The file is decoded as UTF-8 (``utf-8-sig`` also accepts a leading
        BOM) so the result does not depend on the platform default encoding.
        """
        with open(self.stop_word_file, 'r', encoding='utf-8-sig') as f:
            return [line.strip() for line in f]
View Code
