Scraping Douban Movie Reviews with Python & Generating a Word Cloud from Word Frequencies

This post scrapes reviews of the movies now showing on Douban and generates a word cloud from the word frequencies in the comments.

1. Required packages

import warnings  # suppress future warnings
warnings.filterwarnings("ignore")
from urllib import request  # for fetching pages
from bs4 import BeautifulSoup as bs  # for parsing HTML
import re
import pandas as pd
import numpy as np
import jieba  # for Chinese word segmentation
from wordcloud import WordCloud  # for generating the word cloud
import matplotlib.pyplot as plt
import matplotlib
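
If any of these packages are missing, they can all be installed from PyPI (the names below are the standard PyPI package names):

pip install beautifulsoup4 jieba wordcloud pandas numpy matplotlib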

2. Getting the movie list

Open the Douban Shanghai now-playing page (https://movie.douban.com/cinema/nowplaying/shanghai/) and inspect the page source for the features of the content we want to scrape.
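
For orientation, the markup the code below relies on looks roughly like this simplified sketch (reconstructed from the selectors used later, not copied verbatim from Douban): each movie is an li with class list-item carrying id and data-title attributes, all inside the div with id nowplaying.

<div id="nowplaying">
    <ul>
        <li class="list-item" id="26683723" data-title="後來的我們" ...>...</li>
        ...
    </ul>
</div>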

1. Get the now-playing movies and store each movie's node in a list.

'''get url'''
url = 'https://movie.douban.com/cinema/nowplaying/shanghai/'
resp = request.urlopen(url)
html_data = resp.read().decode('utf-8')  # decode explicitly to avoid garbled text

soup = bs(html_data, 'html.parser')  # parse the page
nowplaying = soup.find_all('div', id='nowplaying')  # the div with id 'nowplaying' holds the movies currently showing
nowplaying_list = nowplaying[0].find_all('li', class_='list-item')  # each 'list-item' li is one movie

2. Extract the movie names and ids

'''get movie list''' 
movie_list = []  # collect each movie's id and title
for item in nowplaying_list:
    movie_dic = {}
    movie_dic['id'] = item['id']
    movie_dic['name'] = item['data-title']
    movie_list.append(movie_dic)

The current now-playing movie list:

[{'id': '26683723', 'name': '後來的我們'},
 {'id': '26420932', 'name': '巴霍巴利王2:終結'},
 {'id': '26774033', 'name': '幕後玩家'},
 {'id': '26430636', 'name': '狂暴巨獸'},
 {'id': '4920389', 'name': '頭號玩家'},
 {'id': '26935777', 'name': '瑪麗與魔女之花'},
 {'id': '26924141', 'name': '低壓槽:慾望之城'},
 {'id': '26640371', 'name': '犬之島'},
 {'id': '25881611', 'name': '戰神紀'},
 {'id': '26769474', 'name': '香港大營救'},
 {'id': '5330387', 'name': '青年馬克思'},
 {'id': '26691361', 'name': '21克拉'},
 {'id': '26588783', 'name': '冰雪女王3:火與冰'},
 {'id': '30183489', 'name': '小公主艾薇拉與神祕王國'},
 {'id': '26868408', 'name': '黃金花'},
 {'id': '26942631', 'name': '起跑線'},
 {'id': '26384741', 'name': '湮滅'},
 {'id': '30187395', 'name': '午夜十二點'},
 {'id': '26647117', 'name': '暴裂無聲'},
 {'id': '30152451', 'name': '厲害了,我的國'},
 {'id': '27075280', 'name': '青年馬克思'},
 {'id': '26661189', 'name': '脫單告急'},
 {'id': '27077266', 'name': '米花之味'},
 {'id': '26603666', 'name': '媽媽咪鴨'},
 {'id': '26967920', 'name': '遇見你真好'},
 {'id': '30162172', 'name': '出山記'},
 {'id': '20435622', 'name': '環太平洋:雷霆再起'}]

3. Getting the reviews for 後來的我們

《後來的我們》 is the first entry in the list, at index 0. Fetch the first page of 20 reviews from its comments URL and locate where the comment text sits in the markup.



1. Get the div blocks that contain the reviews.

'''the first movie is 後來的我們; get its comments'''
url_comment = 'https://movie.douban.com/subject/' + movie_list[0]['id'] + '/comments?start=' + '0' + '&limit=20'
resp = request.urlopen(url_comment)
html_comment = resp.read().decode('utf-8')
soup_comment = bs(html_comment, 'html.parser')
comment_list = soup_comment.find_all('div', class_='comment')

2. Extract the text of each review.

'''get comment list'''
comments = []
for item in comment_list:
    comment = item.find_all('p')[0].get_text(strip=True)  # get_text is safer than .string, which returns None if the <p> has child tags
    comments.append(comment)

4. Cleaning the reviews

The previous step gives us the reviews as a list. To segment them with jieba, join them into a single string and strip out everything that isn't a Chinese character (punctuation included).

'''clean comments'''
allComment = ''
for item in comments:
    allComment = allComment + item.strip()

# match runs of one or more Chinese characters; these two code points are the first
# and last CJK unified ideographs in Unicode's basic block
pattern = re.compile(r'[\u4e00-\u9fa5]+')
finalComment = ''.join(re.findall(pattern, allComment))

segment = jieba.lcut(finalComment)
words_df = pd.DataFrame({'segment': segment})
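
A quick sanity check on a made-up sample string shows what the regex and jieba each do (the exact tokens depend on jieba's dictionary version, so treat the segmentation output as approximate):

sample = '這部電影真好看!2018年最佳?'
print(re.findall(pattern, sample))   # ['這部電影真好看', '年最佳'] -- only runs of Chinese characters survive
print(jieba.lcut('這部電影真好看'))  # roughly ['這部', '電影', '真', '好看']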

5. Removing stopwords

Use a Chinese stopwords file (easy to find online) to drop words that carry no meaning on their own (我, 你, 的, and so on).

'''remove useless words'''
stopwords = pd.read_csv(".../chineseStopwords.txt", index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='GBK')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

'''get words frequency'''
words_fre = words_df.groupby('segment').size().reset_index(name='count')  # dict arguments to .agg() were removed in newer pandas
words_fre = words_fre.sort_values(by='count', ascending=False)
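
If you would rather not involve pandas for the counting step, collections.Counter builds the same word-to-count mapping directly; a minimal sketch equivalent to the groupby above:

from collections import Counter

stop = set(stopwords.stopword)
word_fre_dic = dict(Counter(w for w in segment if w not in stop))  # {'word': count, ...}

The resulting dict can be passed straight to fit_words in the next step.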

6. Drawing the word cloud

'''use wordcloud'''
matplotlib.rcParams['figure.figsize'] = [10.0, 5.0]
wordcloud = WordCloud(font_path='simhei.ttf', background_color='white', max_font_size=80)
word_fre_dic = {x[0]: x[1] for x in words_fre.values}
wordcloud = wordcloud.fit_words(word_fre_dic)
plt.imshow(wordcloud)
plt.show()
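
To keep the image rather than just display it, WordCloud can also write the rendered cloud straight to a PNG (the filename here is arbitrary):

wordcloud.to_file('douban_wordcloud.png')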


7. Complete code

import warnings  # suppress future warnings
warnings.filterwarnings("ignore")
from urllib import request  # for fetching pages
from bs4 import BeautifulSoup as bs  # for parsing HTML
import re
import pandas as pd
import numpy as np
import jieba  # for Chinese word segmentation
from wordcloud import WordCloud  # for generating the word cloud
import matplotlib.pyplot as plt
import matplotlib

def getMovieList(url, headers, pattern1='div', id1='nowplaying', pattern2='li', class_='list-item'):
    req = request.Request(url, headers=headers)  # actually use the headers argument so the request carries a real User-Agent
    resp = request.urlopen(req)
    html = resp.read().decode('utf-8')
    soup = bs(html, 'html.parser')
    nowplaying = soup.find_all(pattern1, id=id1)
    nowplaying_list = nowplaying[0].find_all(pattern2, class_=class_)
    
    movie_list = []
    for item in nowplaying_list:
        movie_dic = {}
        movie_dic['id'] = item['id']
        movie_dic['name'] = item['data-title']
        movie_list.append(movie_dic)
    return movie_list

def getCommentList(id2, headers, pages=10, pattern='div', class_='comment'):
    assert pages > 0
    
    all_comments = []
    for i in range(pages):
        start = i * 20
        url = 'https://movie.douban.com/subject/' + id2 + '/comments?start=' + str(start) + '&limit=20'
        req = request.Request(url, headers=headers)
        resp = request.urlopen(req)
        html = resp.read().decode('utf-8')
        soup = bs(html, 'html.parser')
        comment_divs = soup.find_all(pattern, class_=class_)  # don't reuse one name for both the div list and the text
        
        for item in comment_divs:
            all_comments.append(item.find_all('p')[0].get_text(strip=True))
    
    # join all pages into one string, then keep only runs of Chinese characters
    allComment = ''.join(all_comments)
    wordpattern = re.compile(r'[\u4e00-\u9fa5]+')
    finalComment = ''.join(re.findall(wordpattern, allComment))
    
    return finalComment
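
One practical caveat: getCommentList fires its page requests back to back, and Douban is quick to throttle aggressive clients. A small pause between fetches helps; the helper below is a sketch of my own (fetchPage and its delay parameter are not part of the original code) that could stand in for the Request/urlopen pair inside the loop:

import time

def fetchPage(url, headers, delay=1.0):
    # hypothetical helper: fetch one page, then pause briefly so we don't hammer Douban's servers
    html = request.urlopen(request.Request(url, headers=headers)).read().decode('utf-8')
    time.sleep(delay)
    return html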

def cleanComment(finalComment, path):
    segment = jieba.lcut(finalComment)
    comment = pd.DataFrame({'segment': segment})
    
    stopwords = pd.read_csv(path, quoting=3, sep='\t', names=['stopword'], encoding='GBK', index_col=False)
    comment = comment[~comment.segment.isin(stopwords.stopword)]
    
    comment_fre = comment.groupby('segment').size().reset_index(name='count')  # dict arguments to .agg() were removed in newer pandas
    comment_fre = comment_fre.sort_values(by='count', ascending=False)
    return comment_fre

def wordcloud(comment_fre):
    matplotlib.rcParams['figure.figsize'] = [10.0, 5.0]
    wc = WordCloud(font_path='simhei.ttf', background_color='white', max_font_size=80)  # local name 'wc' avoids shadowing this function
    comment_fre_dic = {x[0]: x[1] for x in comment_fre.head(1000).values}
    wc = wc.fit_words(comment_fre_dic)
    plt.imshow(wc)
    plt.show()  # the original had plt.show without parentheses, which silently does nothing

def printMovieName(movie_list, id2):  # fixed typo: was printMoveName
    for item in movie_list:
        if item['id'] == id2:
            print(item['name'])

def main(url, headers, j, pages, path):
    movie_list = getMovieList(url, headers, 'div', 'nowplaying', 'li', 'list-item')
    comment_list = getCommentList(movie_list[j]['id'], headers, pages, 'div', 'comment')
    comment_fre = cleanComment(comment_list, path)
    printMovieName(movie_list, movie_list[j]['id'])
    return wordcloud(comment_fre)

Test 1: fetch the first ten pages of reviews for 後來的我們

url = 'https://movie.douban.com/cinema/nowplaying/shanghai/'
path = ".../chineseStopwords.txt"
headers = {'User-Agent': 'Mozilla/5.0'}  # the original never defined headers; any common browser UA string works here
main(url, headers, 0, 10, path)

Test 2: fetch the first ten pages of reviews for 頭號玩家

main(url, headers, 4, 10, path)  # 頭號玩家 sits at index 4 in the movie list above