1. 程式人生 > >[轉載]Python爬取豆瓣影評並生成詞雲圖程式碼

[轉載]Python爬取豆瓣影評並生成詞雲圖程式碼

# -*- coding:utf-8 -*-
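'''
Scrape the comments of a single movie from Douban Movies.
The example used here is 《我不是潘金蓮》 (I Am Not Madame Bovary).
URL: https://movie.douban.com/subject/26630781/comments
Logging in first is required in order to fetch all of the comments.
'''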
from selenium import webdriver
import time
import codecs
import jieba
import jieba.analyse as analyse
from wordcloud import WordCloud
from scipy.misc import imread
from os import path
from collections import Counter

# ``raw_input`` only exists on Python 2; fall back to ``input`` on Python 3 so
# the interactive captcha prompt works on either interpreter.
try:
    _read_line = raw_input
except NameError:
    _read_line = input


def get_douban_comments(url, user_name='aaa', password='aaa', max_pages=50):
    """Log in to Douban with Firefox and append a movie's comments to a file.

    The function opens the login page, fills in the credentials, asks the
    operator on the console for the captcha text, then walks the comment
    pages starting at ``url`` by clicking the "next" link. The text of every
    comment found is appended, one per line, to ``pjl_comment.txt`` (UTF-8).

    Args:
        url: Address of the first comment page to scrape.
        user_name: Douban account name (replace the placeholder default).
        password: Douban account password (replace the placeholder default).
        max_pages: Upper bound on the number of pages visited.

    Returns:
        None. Results are written to ``pjl_comment.txt`` and progress is
        printed to stdout.
    """
    comments_list = []  # collected comment texts, each terminated by '\n'
    login_url = 'https://accounts.douban.com/login?source=movie'
    page = 1  # current page number
    count = 0  # number of comments collected so far
    pages_left = max_pages

    driver = webdriver.Firefox()
    try:
        driver.get(login_url)

        email_box = driver.find_element_by_id('email')
        email_box.clear()
        email_box.send_keys(user_name)

        password_box = driver.find_element_by_id('password')
        password_box.clear()
        password_box.send_keys(password)

        # The captcha cannot be solved automatically: the operator reads it
        # in the browser window and types it on the console.
        captcha_field = _read_line('請開啟瀏覽器輸入驗證碼:')
        driver.find_element_by_id('captcha_field').send_keys(captcha_field)
        driver.find_element_by_class_name('btn-submit').click()
        time.sleep(5)  # wait for the post-login redirect

        driver.get(url)
        driver.implicitly_wait(3)

        while pages_left > 0:
            try:
                # Only the comment body is collected from each element.
                for result in driver.find_elements_by_class_name('comment'):
                    comment = result.find_element_by_tag_name('p').text
                    comments_list.append(comment + u'\n')
                    count += 1  # increment first so the message is 1-based
                    print(u"查詢到第%d個評論" % count)
                driver.find_element_by_class_name('next').click()
                print(u'第%d頁查詢完畢!' % page)
                page += 1
                time.sleep(4)  # pause between page loads
                pages_left -= 1
                print(pages_left)
            except Exception as e:
                # Deliberately best-effort: any failure (typically no "next"
                # link on the last page) ends the crawl, and what has been
                # collected so far is still written out below.
                print(e)
                break
    finally:
        # Always release the browser, even when login or scraping fails.
        driver.quit()

    with codecs.open('pjl_comment.txt', 'a', encoding='utf-8') as f:
        f.writelines(comments_list)
    print(u"查詢到第%d頁,第%d個評論!" % (page, count))


def get_all_keywords(file_name):
    """Count every token jieba produces for a text file.

    Reads ``file_name`` (UTF-8) line by line, segments each line with
    ``jieba.cut``, prints the number of distinct tokens and one line per
    token, and writes ``<token>:<count>次`` lines to ``count_word.txt``.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
    """
    # A Counter tallies in a single pass instead of calling list.count()
    # once (or twice) per distinct token.
    word_counts = Counter()
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            word_counts.update(jieba.cut(line))

    print(u"共有%d個關鍵詞" % len(word_counts))

    sort_count = []
    for k, (word, times) in enumerate(word_counts.items(), 1):
        sort_count.append(u"%s:%d次\n" % (word, times))
        print(u"%d---%s:%d次" % (k, word, times))

    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)


def get_top_keywords(file_name):
    """Print the 20 top-ranked TextRank keywords of a text file.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
    """
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        texts = f.read()  # the whole file as one string

    ranked = analyse.textrank(texts, topK=20, withWeight=True, withFlag=True)
    for rank, result in enumerate(ranked, 1):
        # result[0] carries the keyword and its part-of-speech flag,
        # result[1] the weight; each part is followed by three spaces.
        parts = u"".join(u"%s   " % part for part in result[0])
        print(u"%d: %s權重:%s" % (rank, parts, result[1]))


def draw_wordcloud():
    """Render a word cloud of ``pjl_comment.txt`` into ``pjl_cloud.jpg``."""
    with codecs.open('pjl_comment.txt', encoding='utf-8') as f:
        comment_text = f.read()

    # WordCloud splits on whitespace, so join the jieba tokens with spaces.
    cut_text = " ".join(jieba.cut(comment_text))

    d = path.dirname(__file__)  # directory containing this script
    # NOTE: the mask image is looked up relative to the current working
    # directory, whereas the font is looked up next to this script.
    color_mask = imread("alice_color.png")
    cloud = WordCloud(
        font_path=path.join(d, 'simsun.ttc'),
        background_color='white',
        mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )
    word_cloud = cloud.generate(cut_text)
    word_cloud.to_file("pjl_cloud.jpg")


if __name__ == '__main__':
    # Earlier pipeline stages, left disabled as in the original script:
    # url = 'https://movie.douban.com/subject/26630781/comments?start=10581&limit=20&sort=new_score'
    # get_douban_comments(url)
    # file_name = 'pjl_comment.txt'
    # get_top_keywords(file_name)
    draw_wordcloud()