python爬豆瓣影評&根據詞頻生成詞雲
阿新 • • 發佈:2018-12-22
python爬豆瓣影評&根據詞頻生成詞雲
通過爬取豆瓣上正在上映的電影影評資訊,並根據評論詞頻生成詞雲。
一、需要的包
import warnings # 防止出現future warning warnings.filterwarnings("ignore") from urllib import request # 用於爬取網頁 from bs4 import BeautifulSoup as bs # 用於解析網頁 import re import pandas as pd import numpy as np import jieba # 用於切詞 from wordcloud import WordCloud # 用於生成詞雲 import matplotlib.pyplot as plt import matplotlib
二、獲取電影列表
開啟豆瓣上海(https://movie.douban.com/cinema/nowplaying/shanghai/),觀察原始碼內需要爬取內容的特徵。
1、獲取nowplaying電影,並將每一個電影的內容都存在list內。
# Fetch the Douban "now playing in Shanghai" page and isolate the section
# that lists the movies currently in theatres.
url = 'https://movie.douban.com/nowplaying/shanghai/'
resp = request.urlopen(url)
html_data = resp.read().decode('utf-8')  # decode explicitly to avoid mojibake
soup = bs(html_data, 'html.parser')
# The <div id="nowplaying"> element contains the currently-showing movies.
nowplaying = soup.find_all('div', id='nowplaying')
# Each <li class="list-item"> inside it describes one movie.
nowplaying_list = nowplaying[0].find_all('li', class_='list-item')
2、提取電影名稱和id
# Collect the Douban id and display title of every now-playing movie.
movie_list = [
    {'id': item['id'], 'name': item['data-title']}
    for item in nowplaying_list
]
當前nowplaying電影列表
[{'id': '26683723', 'name': '後來的我們'}, {'id': '26420932', 'name': '巴霍巴利王2:終結'}, {'id': '26774033', 'name': '幕後玩家'}, {'id': '26430636', 'name': '狂暴巨獸'}, {'id': '4920389', 'name': '頭號玩家'}, {'id': '26935777', 'name': '瑪麗與魔女之花'}, {'id': '26924141', 'name': '低壓槽:慾望之城'}, {'id': '26640371', 'name': '犬之島'}, {'id': '25881611', 'name': '戰神紀'}, {'id': '26769474', 'name': '香港大營救'}, {'id': '5330387', 'name': '青年馬克思'}, {'id': '26691361', 'name': '21克拉'}, {'id': '26588783', 'name': '冰雪女王3:火與冰'}, {'id': '30183489', 'name': '小公主艾薇拉與神祕王國'}, {'id': '26868408', 'name': '黃金花'}, {'id': '26942631', 'name': '起跑線'}, {'id': '26384741', 'name': '湮滅'}, {'id': '30187395', 'name': '午夜十二點'}, {'id': '26647117', 'name': '暴裂無聲'}, {'id': '30152451', 'name': '厲害了,我的國'}, {'id': '27075280', 'name': '青年馬克思'}, {'id': '26661189', 'name': '脫單告急'}, {'id': '27077266', 'name': '米花之味'}, {'id': '26603666', 'name': '媽媽咪鴨'}, {'id': '26967920', 'name': '遇見你真好'}, {'id': '30162172', 'name': '出山記'}, {'id': '20435622', 'name': '環太平洋:雷霆再起'}]
三、獲取《後來的我們》影評
《後來的我們》位於第一個,索引為0。根據影評地址爬取第一頁20條影評,並找到評論所在位置。
1、獲取影評所在div塊兒。
# Fetch the first page (20 short reviews) for the first movie in the list.
url_comment = ('https://movie.douban.com/subject/' + movie_list[0]['id']
               + '/comments?start=' + '0' + '&limit=20')
resp = request.urlopen(url_comment)
html_comment = resp.read().decode('utf-8')
soup_comment = bs(html_comment, 'html.parser')
# Each review lives inside a <div class="comment"> element.
comment_list = soup_comment.find_all('div', class_='comment')
2、獲取每個影評的內容
# The review text is the first <p> inside each comment div.
comments = [item.find_all('p')[0].string for item in comment_list]
四、清洗影評
前面步驟得到的影評為list,為了能夠利用jieba包進行切詞,需要將其轉化為字元,並且去除所有標點。
# Clean the reviews: flatten the list into one string, keep only Chinese
# characters, then segment into words for frequency counting.
#
# ''.join avoids the quadratic cost of repeated `+=`, and None entries are
# skipped — BeautifulSoup's .string is None when the <p> contains nested
# markup, which made the original `item.strip()` raise AttributeError.
allComment = ''.join(item.strip() for item in comments if item is not None)
# \u4e00-\u9fa5 spans the common CJK unified ideographs, so this single
# pass strips punctuation, digits and Latin text.
pattern = re.compile(r'[\u4e00-\u9fa5]+')
finalComment = ''.join(re.findall(pattern, allComment))
# Segment with jieba and load into a one-column DataFrame for filtering.
segment = jieba.lcut(finalComment)
words_df = pd.DataFrame({'segment': segment})
五、去除無關字元
利用stopwords檔案(百度即可下載)去除一些無用的片語(如我,你,的.....)。
# Remove stopwords (particles, pronouns, ...) so only meaningful words remain.
# NOTE(review): ".../chineseStopwords.txt" is a placeholder — point it at a
# downloaded Chinese stopword file; GBK matches the common distributions.
stopwords = pd.read_csv(".../chineseStopwords.txt", index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='GBK')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
# Word frequencies.  The original groupby(...).agg({'count': np.size}) used
# the "nested renamer" API removed in pandas 1.0 (raises SpecificationError);
# groupby().size() yields the same counts on every pandas version.
words_fre = words_df.groupby('segment').size().reset_index(name='count')
words_fre = words_fre.sort_values(by='count', ascending=False)
六、畫出影評詞雲圖
# Render the frequency table as a word cloud.  A CJK-capable font such as
# simhei.ttf is required, otherwise every Chinese glyph draws as a box.
matplotlib.rcParams['figure.figsize'] = [10.0, 5.0]
cloud = WordCloud(font_path='simhei.ttf', background_color='white', max_font_size=80)
frequencies = {word: count for word, count in words_fre.values}
cloud = cloud.fit_words(frequencies)
plt.imshow(cloud)
plt.show()
七、完整版程式碼
import warnings # 防止出現future warning
warnings.filterwarnings("ignore")
from urllib import request # 用於爬取網頁
from bs4 import BeautifulSoup as bs # 用於解析網頁
import re
import pandas as pd
import numpy as np
import jieba # 用於切詞
from wordcloud import WordCloud # 用於生成詞雲
import matplotlib.pyplot as plt
import matplotlib
def getMovieList(url, headers, pattern1='div', id1='nowplaying', pattern2='li', class_='list-item'):
    """Scrape the now-playing page at `url` and return the movies found.

    Returns a list of dicts like {'id': ..., 'name': ...}, one per movie.
    `headers` is accepted for interface compatibility but is not used by
    this urlopen-based implementation.
    """
    page = request.urlopen(url).read().decode('utf-8')
    soup = bs(page, 'html.parser')
    container = soup.find_all(pattern1, id=id1)
    items = container[0].find_all(pattern2, class_=class_)
    return [{'id': it['id'], 'name': it['data-title']} for it in items]
def getCommentList(id2, headers, pages=10, pattern='div', class_='comment'):
    """Fetch up to `pages` pages (20 reviews each) for movie `id2` and
    return them as a single string containing only Chinese characters.

    `headers` is accepted for interface compatibility but is not used.
    Fixes over the original: the loop no longer reuses the name `comment`
    for both the result set and each extracted string; text is joined once
    instead of a quadratic `+=` over `str(list)` (which also embedded
    brackets, quotes and 'None'); reviews whose <p> has nested markup
    (`.string` is None) are skipped rather than propagated.
    """
    assert pages > 0
    texts = []
    for page in range(pages):
        start = page * 20
        url = ('https://movie.douban.com/subject/' + id2 + '/comments'
               + '?' + 'start=' + str(start) + '&limit=20')
        html = request.urlopen(url).read().decode('utf-8')
        soup = bs(html, 'html.parser')
        for item in soup.find_all(pattern, class_=class_):
            text = item.find_all('p')[0].string
            if text is not None:
                texts.append(text.strip())
    # Keep only CJK characters; this also discards whatever punctuation or
    # Latin text the reviews contained, matching the original output.
    wordpattern = re.compile(r'[\u4e00-\u9fa5]+')
    return ''.join(re.findall(wordpattern, ''.join(texts)))
def cleanComment(finalComment, path):
    """Segment `finalComment` with jieba, drop stopwords read from `path`,
    and return a DataFrame with columns ['segment', 'count'] sorted by
    descending frequency.

    The stopword file is read as GBK (the encoding of the common Chinese
    stopword lists); adjust `encoding` if your file is UTF-8.
    """
    segment = jieba.lcut(finalComment)
    comment = pd.DataFrame({'segment': segment})
    stopwords = pd.read_csv(path, quoting=3, sep='\t', names=['stopword'],
                            encoding='GBK', index_col=False)
    comment = comment[~comment.segment.isin(stopwords.stopword)]
    # The original groupby(...).agg({'count': np.size}) used the "nested
    # renamer" API removed in pandas 1.0; groupby().size() produces the
    # same per-word counts on every pandas version.
    comment_fre = comment.groupby('segment').size().reset_index(name='count')
    return comment_fre.sort_values(by='count', ascending=False)
def wordcloud(comment_fre):
    """Draw a word cloud of the 1000 most frequent words in `comment_fre`.

    `comment_fre` is the ['segment', 'count'] DataFrame from cleanComment.
    Requires a CJK-capable font (simhei.ttf) or Chinese glyphs render as
    boxes.
    """
    matplotlib.rcParams['figure.figsize'] = [10.0, 5.0]
    cloud = WordCloud(font_path='simhei.ttf', background_color='white', max_font_size=80)
    comment_fre_dic = {row[0]: row[1] for row in comment_fre.head(1000).values}
    cloud = cloud.fit_words(comment_fre_dic)
    plt.imshow(cloud)
    # Bug fix: the original `plt.show` (no parentheses) referenced the
    # function without calling it, so the figure window never appeared.
    plt.show()
def printMoveName(movie_list, id2):
    """Print the name of every movie in `movie_list` whose 'id' is `id2`."""
    for movie in movie_list:
        if movie['id'] == id2:
            print(movie['name'])
def main(url, headers, j, pages, path):
    """End-to-end pipeline: scrape the movie list, fetch `pages` pages of
    reviews for the j-th movie, clean them with the stopword file at
    `path`, print the movie name and display its word cloud.
    """
    movies = getMovieList(url, headers, 'div', 'nowplaying', 'li', 'list-item')
    target_id = movies[j]['id']
    text = getCommentList(target_id, headers, pages, 'div', 'comment')
    frequencies = cleanComment(text, path)
    printMoveName(movies, target_id)
    return wordcloud(frequencies)
# test1: fetch the first ten pages of reviews for 《後來的我們》 (index 0).
url = 'https://movie.douban.com/nowplaying/shanghai/'
# NOTE(review): placeholder path — replace with the real stopword file.
path = ".../chineseStopwords.txt"
# Bug fix: `headers` was never defined in the original, so this call raised
# NameError.  None is consistent with the functions accepting but ignoring it.
headers = None
main(url, headers, 0, 10, path)
test2:獲取前十頁《頭號玩家》影評(《頭號玩家》在上面的電影列表中索引為4,呼叫 main(url, headers, 4, 10, path) 即可)
八、參考
https://mp.weixin.qq.com/s/D5Q4Q6YcQDTOOlfwIytFJw
https://www.cnblogs.com/GuoYaxiang/p/6232831.html