
A fairly good piece of code for generating automatic summaries
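
The snippet below implements a Luhn-style extractive summarizer: it splits the input into sentences, segments them with jieba, takes the N most frequent non-stopword terms as keywords, scores each sentence by clustering keyword positions, and then either keeps the sentences scoring above mean + 0.5 × standard deviation (summaryScoredtxt) or the top-scoring sentences in document order (summaryTopNtxt).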

#!/usr/bin/env python3
# coding:utf-8

import nltk
import numpy
import jieba
import codecs
import os

class SummaryTxt:
    def __init__(self, stopwordspath):
        # number of top keywords to use
        self.N = 100
        # maximum distance between keyword positions within one cluster
        self.CLUSTER_THRESHOLD = 5
        # number of top sentences to return
        self.TOP_SENTENCES = 5
        self.stopwords = set()
        # load the stopword list
        if os.path.exists(stopwordspath):
            with codecs.open(stopwordspath, 'r', encoding='utf8') as f:
                self.stopwords = set(line.strip() for line in f)


    def _split_sentences(self, texts):
        '''
        Split texts into individual sentences, stored in a list, using
        (.!?。!?) as the delimiting punctuation marks.
        :param texts: the input text
        :return: a list of sentences
        '''
        splitstr = '.!?。!?'
        start = 0
        index = 0  # position of the current character
        sentences = []
        for text in texts:
            if text in splitstr:  # the current character is a sentence delimiter
                sentences.append(texts[start:index + 1])  # slice up to and including the delimiter
                start = index + 1  # move start to the beginning of the next sentence
            index += 1
        if start < len(texts):
            sentences.append(texts[start:])  # handle text that does not end with a delimiter

        return sentences
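    # Example: _split_sentences(u'今天下雨。明天晴') returns
    # [u'今天下雨。', u'明天晴'].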

    def _score_sentences(self, sentences, topn_words):
        '''
        Score sentences using the top-N keywords.
        :param sentences: list of sentences
        :param topn_words: list of keywords
        :return: list of (sentence_index, score) tuples
        '''
        scores = []
        for sentence_idx, words in enumerate(list(jieba.cut(s)) for s in sentences):
            word_idx = []
            for w in topn_words:
                try:
                    word_idx.append(words.index(w))  # position of the keyword within this sentence
                except ValueError:  # w is not in this sentence
                    pass
            word_idx.sort()
            if len(word_idx) == 0:
                continue
            # Group keyword positions into clusters: consecutive positions
            # closer than the distance threshold belong to the same cluster.
            clusters = []
            cluster = [word_idx[0]]
            i = 1
            while i < len(word_idx):
                if word_idx[i] - word_idx[i - 1] < self.CLUSTER_THRESHOLD:
                    cluster.append(word_idx[i])
                else:
                    clusters.append(cluster[:])
                    cluster = [word_idx[i]]
                i += 1
            clusters.append(cluster)
            # Score each cluster; the sentence score is its highest cluster score.
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster ** 2 / total_words_in_cluster
                if score > max_cluster_score:
                    max_cluster_score = score
            scores.append((sentence_idx, max_cluster_score))
        return scores
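
    # Worked example of the cluster score (Luhn's heuristic) with the default
    # CLUSTER_THRESHOLD = 5: keyword positions [3, 5, 12] split into clusters
    # [3, 5] and [12], because the gap 12 - 5 = 7 exceeds the threshold.
    # Cluster [3, 5] has 2 significant words spanning 5 - 3 + 1 = 3 positions,
    # so it scores 2 * 2 / 3 ≈ 1.33; cluster [12] scores 1 * 1 / 1 = 1.
    # The sentence score is the maximum, ≈ 1.33.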

    def summaryScoredtxt(self, text):
        # split the text into sentences
        sentences = self._split_sentences(text)

        # segment the sentences into words, dropping stopwords and single characters
        words = [w for sentence in sentences for w in jieba.cut(sentence)
                 if w not in self.stopwords and len(w) > 1 and w != '\t']

        # count word frequencies
        wordfre = nltk.FreqDist(words)

        # take the N most frequent words as keywords
        topn_words = [w[0] for w in sorted(wordfre.items(), key=lambda d: d[1], reverse=True)][:self.N]

        # score the sentences against the top-N keywords
        scored_sentences = self._score_sentences(sentences, topn_words)

        # filter out unimportant sentences using the mean and standard deviation
        avg = numpy.mean([s[1] for s in scored_sentences])  # mean score
        std = numpy.std([s[1] for s in scored_sentences])   # standard deviation
        summarySentences = []
        for (sent_idx, score) in scored_sentences:
            if score > (avg + 0.5 * std):
                summarySentences.append(sentences[sent_idx])
                print(sentences[sent_idx])
        return summarySentences
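
    # Example of the filter: with sentence scores [1.0, 1.33, 4.0], the mean is
    # about 2.11 and the standard deviation about 1.34, so only sentences
    # scoring above 2.11 + 0.5 * 1.34 ≈ 2.78 (here the one scoring 4.0) are kept.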

    def summaryTopNtxt(self, text):
        # split the text into sentences
        sentences = self._split_sentences(text)

        # segment the sentences into words, dropping stopwords and single characters
        words = [w for sentence in sentences for w in jieba.cut(sentence)
                 if w not in self.stopwords and len(w) > 1 and w != '\t']

        # count word frequencies
        wordfre = nltk.FreqDist(words)

        # take the N most frequent words as keywords
        topn_words = [w[0] for w in sorted(wordfre.items(), key=lambda d: d[1], reverse=True)][:self.N]

        # score the sentences against the top-N keywords
        scored_sentences = self._score_sentences(sentences, topn_words)

        # keep the TOP_SENTENCES highest-scoring sentences, then restore document order
        top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-self.TOP_SENTENCES:]
        top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
        summarySentences = []
        for (idx, score) in top_n_scored:
            print(sentences[idx])
            summarySentences.append(sentences[idx])

        return summarySentences



if __name__ == '__main__':
    obj = SummaryTxt(r'D:\work\Solr\solr-python\CNstopwords.txt')

    txt=u'十八大以來的五年,是黨和國家發展程序中極不平凡的五年。面對世界經濟復甦乏力、區域性衝突和動盪頻發、全球性問題加劇的外部環境,面對我國經濟發展進入新常態等一系列深刻變化,我們堅持穩中求進工作總基調,迎難而上,開拓進取,取得了改革開放和社會主義現代化建設的歷史性成就。' \
        u'為貫徹十八大精神,黨中央召開七次全會,分別就政府機構改革和職能轉變、全面深化改革、全面推進依法治國、制定“十三五”規劃、全面從嚴治黨等重大問題作出決定和部署。五年來,我們統籌推進“五位一體”總體佈局、協調推進“四個全面”戰略佈局,“十二五”規劃勝利完成,“十三五”規劃順利實施,黨和國家事業全面開創新局面。' \
        u'經濟建設取得重大成就。堅定不移貫徹新發展理念,堅決端正發展觀念、轉變發展方式,發展質量和效益不斷提升。經濟保持中高速增長,在世界主要國家中名列前茅,國內生產總值從五十四萬億元增長到八十萬億元,穩居世界第二,對世界經濟增長貢獻率超過百分之三十。供給側結構性改革深入推進,經濟結構不斷優化,數字經濟等新興產業蓬勃發展,高鐵、公路、橋樑、港口、機場等基礎設施建設快速推進。農業現代化穩步推進,糧食生產能力達到一萬二千億斤。城鎮化率年均提高一點二個百分點,八千多萬農業轉移人口成為城鎮居民。區域發展協調性增強,“一帶一路”建設、京津冀協同發展、長江經濟帶發展成效顯著。創新驅動發展戰略大力實施,創新型國家建設成果豐碩,天宮、蛟龍、天眼、悟空、墨子、大飛機等重大科技成果相繼問世。南海島礁建設積極推進。開放型經濟新體制逐步健全,對外貿易、對外投資、外匯儲備穩居世界前列。' \
        u'全面深化改革取得重大突破。蹄疾步穩推進全面深化改革,堅決破除各方面體制機制弊端。改革全面發力、多點突破、縱深推進,著力增強改革系統性、整體性、協同性,壓茬拓展改革廣度和深度,推出一千五百多項改革舉措,重要領域和關鍵環節改革取得突破性進展,主要領域改革主體框架基本確立。中國特色社會主義制度更加完善,國家治理體系和治理能力現代化水平明顯提高,全社會發展活力和創新活力明顯增強。'

    # txt ='The information disclosed by the Film Funds Office of the State Administration of Press, Publication, Radio, Film and Television shows that, the total box office in China amounted to nearly 3 billion yuan during the first six days of the lunar year (February 8 - 13), an increase of 67% compared to the 1.797 billion yuan in the Chinese Spring Festival period in 2015, becoming the "Best Chinese Spring Festival Period in History".' \
    #      'During the Chinese Spring Festival period, "The Mermaid" contributed to a box office of 1.46 billion yuan. "The Man From Macau III" reached a box office of 680 million yuan. "The Journey to the West: The Monkey King 2" had a box office of 650 million yuan. "Kung Fu Panda 3" also had a box office of exceeding 130 million. These four blockbusters together contributed more than 95% of the total box office during the Chinese Spring Festival period.' \
    #      'There were many factors contributing to the popularity during the Chinese Spring Festival period. Apparently, the overall popular film market with good box office was driven by the emergence of a few blockbusters. In fact, apart from the appeal of the films, other factors like film ticket subsidy of online seat-selection companies, cinema channel sinking and the film-viewing heat in the middle and small cities driven by the home-returning wave were all main factors contributing to this blowout. A management of Shanghai Film Group told the 21st Century Business Herald.'
    print(txt)
    print("--")
    obj.summaryScoredtxt(txt)

    print("----")
    obj.summaryTopNtxt(txt)
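
For reference, a minimal sketch of running the summarizer over a text file instead of a hard-coded string; 'CNstopwords.txt' and 'article.txt' are placeholder paths, not files from this post:

import codecs

obj = SummaryTxt('CNstopwords.txt')  # stopword list, one word per line (placeholder path)
with codecs.open('article.txt', 'r', encoding='utf8') as f:
    text = f.read()
summary = obj.summaryScoredtxt(text)  # sentences whose score exceeds avg + 0.5 * std
print('\n'.join(summary))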