# [Repost] Python code for scraping Douban movie reviews and generating a word cloud


# -*- coding:utf-8 -*-
from selenium import webdriver
import time
import codecs
import jieba
import jieba.analyse as analyse
from wordcloud import WordCloud
from scipy.misc import imread
from os import path


def get_douban_comments(url, user_name='aaa', password='aaa', max_pages=50,
                        output_file='pjl_comment.txt'):
    """Log in to Douban in Firefox, scrape comments and append them to a file.

    The login form is filled in automatically; the captcha has to be read
    from the browser window by the user and typed into the console.

    Args:
        url: Address of the first comment page to scrape.
        user_name: Douban account name (replace the placeholder default).
        password: Douban password (replace the placeholder default).
        max_pages: Upper bound on the number of pages to walk through.
        output_file: UTF-8 text file the comments are appended to, one
            comment per line.

    Any exception raised while scraping a page (typically a missing "next"
    link on the last page) is printed and ends the scraping loop; comments
    collected up to that point are still written to ``output_file``.
    """
    # The script was written for Python 2; fall back to input() on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    login_url = 'https://accounts.douban.com/login?source=movie'
    comments_list = []  # scraped comments, each terminated by a newline
    count = 0           # number of comments collected so far
    page = 0            # number of the last page that was processed

    driver = webdriver.Firefox()
    try:
        driver.get(login_url)

        email_box = driver.find_element_by_id('email')
        email_box.clear()
        email_box.send_keys(user_name)

        password_box = driver.find_element_by_id('password')
        password_box.clear()
        password_box.send_keys(password)

        # The captcha cannot be solved automatically: the user reads it in
        # the browser and types it here.
        captcha_field = read_line('請開啟瀏覽器輸入驗證碼:')
        driver.find_element_by_id('captcha_field').send_keys(captcha_field)
        driver.find_element_by_class_name('btn-submit').click()
        time.sleep(5)  # wait for the redirect that follows the login

        driver.get(url)
        driver.implicitly_wait(3)

        for page in range(1, max_pages + 1):
            try:
                for result in driver.find_elements_by_class_name('comment'):
                    # Other fields that can be read from the same element:
                    # author = result.find_elements_by_tag_name('a')[1].text
                    # vote = result.find_element_by_class_name('comment-vote').find_element_by_tag_name('span').text
                    # time0 = result.find_element_by_class_name('comment-info').find_elements_by_tag_name('span')[1].text
                    comment = result.find_element_by_tag_name('p').text
                    comments_list.append(comment + u'\n')
                    count += 1  # count first so the message is 1-based
                    print(u"查詢到第%d個評論" % count)
                driver.find_element_by_class_name('next').click()
                print(u'第%d頁查詢完畢!' % page)
                time.sleep(4)  # pause between page loads
                print(max_pages - page)  # pages left in the budget
            except Exception as e:
                # Best effort: report the problem and keep what we have.
                print(e)
                break
    finally:
        # Always close the browser, even when login or scraping fails.
        driver.quit()

    with codecs.open(output_file, 'a', encoding='utf-8') as f:
        f.writelines(comments_list)
    print(u"查詢到第%d頁,第%d個評論!" % (page, count))


def get_all_keywords(file_name, output_file='count_word.txt'):
    """Count every jieba token in a text file and write the counts to a file.

    Args:
        file_name: UTF-8 text file to segment with ``jieba.cut``.
        output_file: UTF-8 file that receives one ``word:count次`` line per
            distinct token, most frequent first. It is overwritten.

    The same information is printed to the console with a running index.
    Whitespace-only tokens (such as the line terminator) are skipped, since
    writing them out would break the one-entry-per-line format.
    """
    # Local import: Counter is not among the file's top-level imports.
    from collections import Counter

    word_counts = Counter()
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            word_counts.update(
                word for word in jieba.cut(line) if word.strip())

    print(u"共有%d個關鍵詞" % len(word_counts))

    report_lines = []
    for index, (word, times) in enumerate(word_counts.most_common(), 1):
        report_lines.append(u"%s:%d次\n" % (word, times))
        print(u"%d---%s:%d次" % (index, word, times))

    with codecs.open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(report_lines)


def get_top_keywords(file_name, top_k=20):
    """Print the TextRank keywords of a text file together with their weights.

    Args:
        file_name: UTF-8 text file that is read as one string.
        top_k: Number of keywords requested from ``analyse.textrank``.
    """
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        texts = f.read()

    ranked = analyse.textrank(
        texts, topK=top_k, withWeight=True, withFlag=True)

    for rank, entry in enumerate(ranked, 1):
        # entry[0] holds the keyword and its part-of-speech flag,
        # entry[1] the keyword weight.
        parts = u"   ".join(u"%s" % part for part in entry[0])
        print(u"%d: %s   權重:%s" % (rank, parts, entry[1]))


def draw_wordcloud(comment_file='pjl_comment.txt',
                   mask_file="alice_color.png",
                   output_file="pjl_cloud.jpg"):
    """Render the comments in ``comment_file`` as a word-cloud image.

    Args:
        comment_file: UTF-8 text file with the comments.
        mask_file: Image passed to ``WordCloud`` as the ``mask``.
        output_file: Path of the image file that is written.

    The font ``simsun.ttc`` is looked up in the directory of this script.
    """
    with codecs.open(comment_file, encoding='utf-8') as f:
        comment_text = f.read()

    # Join the jieba tokens with spaces to form the string handed to WordCloud.
    cut_text = " ".join(jieba.cut(comment_text))

    script_dir = path.dirname(__file__)
    # NOTE(review): scipy.misc.imread is no longer available in recent SciPy
    # releases; confirm the installed version still provides it.
    color_mask = imread(mask_file)

    cloud = WordCloud(
        font_path=path.join(script_dir, 'simsun.ttc'),
        background_color='white',
        mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )
    cloud.generate(cut_text).to_file(output_file)


if __name__ == '__main__':
    # Typical workflow (uncomment the steps you need):
    # url = 'https://movie.douban.com/subject/26630781/comments?start=10581&limit=20&sort=new_score'
    # get_douban_comments(url)
    # file_name = 'pjl_comment.txt'
    # get_top_keywords(file_name)
    draw_wordcloud()