# 使用中文製作詞雲圖 (Make a word cloud from Chinese text)
'''
Build a word cloud from Chinese text. The background can be replaced with an
arbitrary image mask; no image is used in this example.
'''
import numpy as np
import pandas as pd
from wordcloud import WordCloud    # word-cloud rendering package
import jieba                       # Chinese word-segmentation package
import codecs                      # codecs.open reads a file with an explicit encoding, decoding to unicode
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0,5.5)          # size of the rendered figure


'''
Load the data.
'''
df = pd.read_csv('H:/NLP_project/NLP_project/data/entertainment_news.csv')

'''
Preprocess: drop rows with missing values and tokenize every article with jieba.
'''
df = df.dropna()
content = df.content.values.tolist()                # article texts -> plain list
segment = []
for line in content:
    try:
        segs = jieba.lcut(line)                     # jieba word segmentation
        for seg in segs:
            # keep tokens longer than one character and skip line-break tokens
            if len(seg) > 1 and seg != '\r\n':
                segment.append(seg)
    except Exception:
        # FIX: the original bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit, making the loop uninterruptible. Catch Exception instead,
        # print the offending line for diagnosis, and continue (best-effort).
        print(line)
        continue

'''
Drop stop words from the token list.
'''
# Stop-word list: one word per line, no header; quoting=3 (QUOTE_NONE) so quote
# characters inside the list are kept literally.
stopwords = pd.read_csv('H:/NLP_project/NLP_project/data/stopwords.txt',index_col=False,quoting=3,sep="\t",names=['stopword'])
words_df = pd.DataFrame({'segment': segment})
is_stopword = words_df['segment'].isin(stopwords['stopword'])
words_df = words_df.loc[~is_stopword]

'''
Word-frequency statistics.
'''
# Count occurrences per token. FIX: the original dict-style aggregation
# `.agg({"計數": np.size})` on a SeriesGroupBy was deprecated in pandas 0.20
# and removed in 1.0 (it raises SpecificationError on any modern pandas).
# `groupby(...).size()` is the supported equivalent; `reset_index(name=...)`
# keeps the same output columns ['segment', '計數'] as before.
word_start = words_df.groupby('segment').size().reset_index(name="計數")
word_start = word_start.sort_values(by=["計數"], ascending=False)

'''
Render the word cloud from the 1000 most frequent tokens.
'''
# SimHei font is required so CJK glyphs render instead of empty boxes.
wc = WordCloud(font_path="H:/NLP_project/NLP_project/data/simhei.ttf",background_color="black",max_font_size=80)
top_rows = word_start.head(1000).values            # rows of (token, count)
word_frequence = dict((row[0], row[1]) for row in top_rows)
wordcloud = wc.fit_words(word_frequence)           # fit_words returns the WordCloud itself
plt.imshow(wordcloud)
plt.show()