A simple method for extracting Chinese abstracts and keywords
The abstract is extracted by computing pairwise similarity between sentences and running PageRank over the resulting similarity graph; keywords are extracted with the jieba library. There are plenty of points left to optimize, which will be updated over time.
For a detailed explanation of the theory, see https://blog.csdn.net/qq_32458499/article/details/78659372
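The sentence similarity implemented in cal_sim below is the standard TextRank overlap measure: the number of words two sentences share, normalized by the log of each sentence's deduplicated word count:

$$\mathrm{sim}(S_i, S_j) = \frac{|\{\, w : w \in S_i \text{ and } w \in S_j \,\}|}{\log |S_i| + \log |S_j|}$$

For example, two five-word sentences sharing two words score 2 / (log 5 + log 5) ≈ 0.62.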
#!/usr/bin/python
# encoding: UTF-8
import re
import math
import sys

import jieba
import jieba.analyse
import numpy as np
import networkx as nx


class DocumentHandler:
    def __init__(self, file_path):
        self.full_text = ''
        self.read_file(file_path)

    # read data from file
    def read_file(self, file_path):
        with open(file_path, 'r', encoding='UTF-8') as fi:
            self.full_text = fi.read()

    # split text into sentences
    def split_sentence(self, full_text):
        sents = re.split(u'[\n。]', full_text)
        sents = [sent for sent in sents if len(sent) > 0]
        return sents

    # similarity of two sentences: word overlap normalized by sentence lengths
    def cal_sim(self, word_list_1, word_list_2):
        occur_sum = 0.0
        word_set_1 = set(word_list_1)
        word_set_2 = set(word_list_2)
        for word in word_set_1:
            if word in word_set_2:
                occur_sum += 1.0
        if occur_sum < 1e-6:
            return 0.0
        denominator = math.log(len(word_set_1)) + math.log(len(word_set_2))
        if abs(denominator) < 1e-6:
            return 0.0
        return occur_sum / denominator

    # rank sentences with PageRank over the similarity graph
    def text_rank(self, sentences, top_num=5, pagerank_config={'alpha': 0.85}):
        sents_num = len(sentences)
        sorted_sents = []
        sent_word_list = []
        # segment each sentence into a word list
        for sent in sentences:
            sent_word_list.append(list(jieba.cut(sent)))
        # build the symmetric pairwise similarity matrix
        sim_graph = np.zeros((sents_num, sents_num))
        for x in range(sents_num):
            for y in range(x, sents_num):
                similarity = self.cal_sim(sent_word_list[x], sent_word_list[y])
                sim_graph[x, y] = similarity
                sim_graph[y, x] = similarity
        # run PageRank on the similarity graph
        nx_graph = nx.from_numpy_array(sim_graph)
        scores = nx.pagerank(nx_graph, **pagerank_config)
        sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        # collect sentences in descending order of PageRank score
        for index, score in sorted_scores:
            item = {"sent": sentences[index], "score": score, "index": index}
            sorted_sents.append(item)
        return sorted_sents[:top_num]

    # extract the abstract sentences from the text
    def extract_abstracts(self, full_text, top_num=5):
        sents = self.split_sentence(full_text)
        rank_res = self.text_rank(sents, top_num=top_num)
        # restore the original sentence order for readability
        sorted_res = sorted(rank_res, key=lambda x: x['index'], reverse=False)
        return sorted_res

    # get the abstract of the article
    def getAbstract(self, top_num=5):
        res = self.extract_abstracts(self.full_text, top_num=top_num)
        abstract = ""
        for content in res:
            abstract = abstract + content["sent"] + "。"
        return abstract.strip()

    # get keywords of the article, using jieba's TF-IDF extractor for Chinese text
    def getKeywords(self, top_num=5):
        keywords = jieba.analyse.extract_tags(self.full_text)
        # keep only reasonably short keywords, then take the top n
        tmpKeywords = [keyword for keyword in keywords if len(keyword) < 5]
        return tmpKeywords[:top_num]


# main processor
def main(file_path):
    docHandler = DocumentHandler(file_path)
    print(docHandler.getAbstract())
    print(docHandler.getKeywords())


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: python digist_keyword.py <file path>')
        sys.exit()
    file_path = sys.argv[1]  # path to the plain-text file to process
    main(file_path)
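A minimal usage sketch (article.txt is a placeholder name for any UTF-8 plain-text file). From the command line:

python digist_keyword.py article.txt

Or programmatically, assuming the script is importable as digist_keyword:

from digist_keyword import DocumentHandler

handler = DocumentHandler('article.txt')  # placeholder path
print(handler.getAbstract(top_num=5))     # top-5 sentences, in original document order
print(handler.getKeywords(top_num=5))     # top-5 TF-IDF keywords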
Tested with the Baidu Baike entry for "百度" (Baidu); the output is shown below:
從創立之初,百度便將“讓人們最平等便捷地獲取資訊,找到所求”作為自己的使命,成立以來,公司秉承“使用者至上”的理念,不斷堅持技術創新,致力於為使用者提供“簡單可依賴”的網際網路搜尋產品及服務,其中包括:以網路搜尋為主的功能性搜尋;以貼吧為主的社群搜尋,針對各區域、行業所需的垂直搜尋;以及門戶頻道、IM等,全面覆蓋了中文網路世界所有的搜尋需求。作為一家以技術為信仰的高科技公司,百度將技術創新作為立身之本,著力於網際網路核心技術突破與人才培養,在搜尋、人工智慧、雲端計算、大資料等技術領域處於全球領先水平。百度是使用者獲取資訊的最主要入口,隨著移動網際網路的發展,百度網頁搜尋完成了由PC向移動的轉型,由連線人與資訊擴充套件到連線人與服務,使用者可以在PC、Pad、手機上訪問百度主頁,通過文字、語音、影象多種互動方式瞬間找到所需要的資訊和服務。作為百度旗下核心產品,hao123及時收錄包括音樂、視訊、小說、遊戲等熱門分類的網站,與搜尋完美結合,為中國網際網路使用者提供最簡單便捷的網上導航服務,重新定義了上網導航的概念。百度商業服務是原有的百度推廣(以搜尋推廣為主)的基礎上,將資料產品、交易產品、媒體產品、信用產品和諮詢服務進行了深度的整合, 並已將諮詢服務、百度內容聯盟加入到整體的商業服務框架中來。
['百度', '搜尋', '服務', '使用者', '網際網路']