
A Simple Method for Extracting Abstracts and Keywords from Chinese Text

The article abstract is extracted by computing a simple similarity between sentences and running PageRank over the resulting similarity graph; keywords are extracted with the jieba library. There are still many points that could be optimized, which I will update gradually later.

For a detailed explanation of the underlying theory, see https://blog.csdn.net/qq_32458499/article/details/78659372
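For reference, the sentence similarity computed by cal_sim in the code below is the word-overlap measure commonly used with TextRank (a sketch of the formula; $S_i$ and $S_j$ here denote the sets of distinct words in two sentences):

$$\mathrm{Sim}(S_i, S_j) = \frac{\bigl|\{w \mid w \in S_i \text{ and } w \in S_j\}\bigr|}{\log|S_i| + \log|S_j|}$$

The script builds a graph whose nodes are sentences and whose edge weights are these similarities, runs PageRank over it, and takes the highest-scoring sentences as the abstract.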

#!/usr/bin/python
# encoding: UTF-8
import re
import math
import jieba
import jieba.analyse
import numpy as np
import networkx as nx
import sys

class DocumentHandler:
    def __init__(self, file_path):
        self.full_text = ''
        self.read_file(file_path)

    # read data from file
    def read_file(self, file_path):
        with open(file_path, 'r', encoding='UTF-8') as fi:
            self.full_text = fi.read()

    # split text into sentences (on newlines and Chinese full stops)
    def split_sentence(self, full_text):
        sents = re.split(u'[\n。]', full_text)
        sents = [sent for sent in sents if len(sent) > 0]
        return sents

    # calculate similarity
    def cal_sim(self, word_list_1, word_list_2):
        occur_sum = 0
        word_set_1 = list(set(word_list_1))
        word_set_2 = list(set(word_list_2))
        for word in word_set_1:
            if word in word_set_2:
                occur_sum += 1.0
        if occur_sum < 1e-6:
            return 0.0
        denominator = math.log(len(word_set_1)) + math.log(len(word_set_2))
        if abs(denominator) < 1e-6:
            return 0.0
        return occur_sum / denominator
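    # Illustrative example (made-up numbers): if word_set_1 has 4 unique words,
    # word_set_2 has 5, and the two sentences share 2 words, then the similarity
    # is 2 / (ln(4) + ln(5)) = 2 / 2.996 ≈ 0.668.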

    # rank sentences with TextRank (PageRank over the sentence-similarity graph)
    def text_rank(self, sentences, top_num = 5, pagerank_config={'alpha': 0.85}):
        sents_num = len(sentences)
        sorted_sents = []
        sent_word_list = []
        # split each sentence into a word list with jieba
        for sent in sentences:
            sent_word_list.append(list(jieba.cut(sent)))
        # calculate pairwise similarity between sentences
        sim_graph = np.zeros((sents_num, sents_num))
        for x in range(sents_num):
            for y in range(x, sents_num):
                similarity = self.cal_sim(sent_word_list[x], sent_word_list[y])
                sim_graph[x, y] = similarity
                sim_graph[y, x] = similarity
        # do page ranking over the sentence-similarity graph
        # (from_numpy_matrix was removed in recent networkx releases; from_numpy_array is the current equivalent)
        nx_graph = nx.from_numpy_array(sim_graph)
        scores = nx.pagerank(nx_graph, **pagerank_config)
        sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        # collect sentences ordered by PageRank score; the slice below keeps only the top_num of them
        # (alternatively, break out of the loop once top_num items have been collected)
        for index, score in sorted_scores:
            item = {"sent": sentences[index], "score": score, "index": index}
            sorted_sents.append(item)
        return sorted_sents[:top_num]
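    # Note: text_rank returns a list of dicts of the form {"sent", "score", "index"},
    # sorted by score in descending order; extract_abstracts below re-sorts them by index.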

    # extract abstract sentences from the text and restore their original order
    def extract_abstracts(self, full_text, top_num = 5):
        sents = self.split_sentence(full_text)
        rank_res = self.text_rank(sents, top_num=top_num)
        sorted_res = sorted(rank_res, key=lambda x: x['index'], reverse=False)
        return sorted_res

    # get abstract of article
    def getAbstract(self, top_num = 5):
        res = self.extract_abstracts(self.full_text, top_num=top_num)
        abstract = ""
        for content in res:
            abstract = abstract + content["sent"] + "。"
        return abstract.strip()

    # get keywords of the article, using jieba's TF-IDF extraction for Chinese text
    def getKeywords(self, top_num = 5):
        tfidf = jieba.analyse.extract_tags
        keywords = tfidf(self.full_text)
        tmpKeywords = []
        # keep only short keywords (fewer than 5 characters); the slice below returns the top_num of them
        # (alternatively, break out of the loop once top_num keywords have been collected)
        for keyword in keywords:
            if len(keyword) < 5:
                tmpKeywords.append(keyword)

        return tmpKeywords[:top_num]
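    # Note: jieba.analyse.extract_tags also accepts a topK parameter (and withWeight/allowPOS
    # if needed), so tfidf(self.full_text, topK=top_num) could limit the number of keywords
    # directly instead of slicing afterwards.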

# main processor
def main(file_path):
    docHandler = DocumentHandler(file_path)
    print(docHandler.getAbstract())
    print(docHandler.getKeywords())

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: python digist_keyword.py <file path>')
        sys.exit()
    file_path = sys.argv[1] # path of the plain text file to process
    main(file_path)
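Besides running the script from the command line, the class can also be used directly from other code. A minimal sketch (assuming the script above is saved as digist_keyword.py; article.txt is a placeholder for your own plain-text file):

# encoding: UTF-8
from digist_keyword import DocumentHandler

# article.txt is a hypothetical plain-text file containing a Chinese article
handler = DocumentHandler('article.txt')
print(handler.getAbstract(top_num=3))    # top 3 sentences, in original order
print(handler.getKeywords(top_num=10))   # top 10 short keywords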

I tested it with the Baidu Baike entry for "百度" (Baidu); the extracted abstract and keywords are shown below:

從創立之初,百度便將“讓人們最平等便捷地獲取資訊,找到所求”作為自己的使命,成立以來,公司秉承“使用者至上”的理念,不斷堅持技術創新,致力於為使用者提供“簡單可依賴”的網際網路搜尋產品及服務,其中包括:以網路搜尋為主的功能性搜尋;以貼吧為主的社群搜尋,針對各區域、行業所需的垂直搜尋;以及門戶頻道、IM等,全面覆蓋了中文網路世界所有的搜尋需求。作為一家以技術為信仰的高科技公司,百度將技術創新作為立身之本,著力於網際網路核心技術突破與人才培養,在搜尋、人工智慧、雲端計算、大資料等技術領域處於全球領先水平。百度是使用者獲取資訊的最主要入口,隨著移動網際網路的發展,百度網頁搜尋完成了由PC向移動的轉型,由連線人與資訊擴充套件到連線人與服務,使用者可以在PC、Pad、手機上訪問百度主頁,通過文字、語音、影象多種互動方式瞬間找到所需要的資訊和服務。作為百度旗下核心產品,hao123及時收錄包括音樂、視訊、小說、遊戲等熱門分類的網站,與搜尋完美結合,為中國網際網路使用者提供最簡單便捷的網上導航服務,重新定義了上網導航的概念。百度商業服務是原有的百度推廣(以搜尋推廣為主)的基礎上,將資料產品、交易產品、媒體產品、信用產品和諮詢服務進行了深度的整合, 並已將諮詢服務、百度內容聯盟加入到整體的商業服務框架中來。
['百度', '搜尋', '服務', '使用者', '網際網路']