1. 程式人生 > >(8)Python爬蟲——爬取豆瓣影評資料

(8)Python爬蟲——爬取豆瓣影評資料

利用 Python 爬取豆瓣最受歡迎的 50 條影評的相關資訊,包括標題、作者、影片名、影片詳情連結、推薦級、迴應數、影評連結、影評、有用數這 9 項內容,然後將爬取的資訊寫入 Excel 表中。具體程式碼如下:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys

# Python 2 only: `reload` is a builtin there, and the interpreter's startup
# (site.py) removes sys.setdefaultencoding; reloading the module makes the
# attribute available again so the next line can call it.
reload(sys)
# Make UTF-8 the process-wide default codec for implicit str <-> unicode
# conversions.
# NOTE(review): presumably here so the Chinese text scraped below can be
# mixed with byte strings without UnicodeDecodeError -- confirm before removing.
sys.setdefaultencoding('utf8')
from bs4 import BeautifulSoup
import re
import urllib2
import xlwt


# 得到頁面全部內容
# Regular expressions for the fields of one review entry on the listing page.
# Compiled once at import time so the scraping loops do not rebuild them.
PATTERN_TITLE = re.compile(r'<a href=".*/review/\d+/">(.+)</a>')  # review title
PATTERN_LINK = re.compile(r'<a href="(.*/review/\d+/)">.+</a>')  # full-review URL
PATTERN_AUTHOR = re.compile(r'<a.+property="v:reviewer">(.+)</a>')  # reviewer
# Two groups: (movie detail URL, movie title).
PATTERN_SUBJECT_LINK = re.compile(
    r'<a class="subject-img" href="(.+subject.+)"> <img.+title="(.+)".+>')
PATTERN_STAR = re.compile(r'<span.+property="v:rating" title="(.+)"></span>')
# The reply label is accepted in Simplified and Traditional spellings.
# NOTE(review): the live page is presumably Simplified ("回应"); the Traditional
# form that was here originally is kept so nothing that matched before is lost.
PATTERN_RESPONSE = re.compile(
    r'<a class="reply" href=".*/review/\d+/#comments">(\d+)(?:回应|迴應|回應)</a>')
PATTERN_USE = re.compile(r'<span id="r-useful_count-\d+">\s*(\d+)\s*</span>')
PATTERN_TAG = re.compile(r'<.+?>')  # any markup tag, used to strip HTML


def askURL(url):
    """Fetch *url* and return the response body.

    On ``urllib2.URLError`` the error's ``code`` and/or ``reason`` (whichever
    exist) are printed and an empty string is returned, so callers always get
    a value they can hand to the HTML parser.
    """
    request = urllib2.Request(url)
    html = ''  # returned as-is when the request fails
    try:
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            response.close()
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def _firstMatch(pattern, text, default=''):
    """Return the first ``findall`` result of *pattern* in *text*, or *default*."""
    found = pattern.findall(text)
    return found[0] if found else default


def parseItem(item):
    """Extract the listing-page fields from one review entry.

    *item* is the markup of a single review entry as a string.  The result is
    a dict with the keys ``title``, ``author``, ``moviename``, ``movielink``,
    ``star``, ``response``, ``reviewlink`` and ``use``.  A field whose pattern
    does not match is returned as an empty string instead of raising.
    """
    movielink, moviename = _firstMatch(PATTERN_SUBJECT_LINK, item, ('', ''))
    return {
        'title': _firstMatch(PATTERN_TITLE, item),
        'author': _firstMatch(PATTERN_AUTHOR, item),
        'moviename': moviename,
        'movielink': movielink,
        'star': _firstMatch(PATTERN_STAR, item),  # a review may carry no rating
        'response': _firstMatch(PATTERN_RESPONSE, item),
        'reviewlink': _firstMatch(PATTERN_LINK, item),
        'use': _firstMatch(PATTERN_USE, item),
    }


def getData(baseurl, pages=5, pagesize=10):
    """Scrape the review listing and return one row per review.

    The listing is requested ``pages`` times at ``baseurl + str(offset)`` with
    offsets ``0, pagesize, 2 * pagesize, ...``.  For every
    ``div.main.review-item`` on a page, the review's own page is fetched as
    well to obtain the full text.

    Each row is a list in the column order used by ``saveData``:
    title, author, movie name, movie link, rating, reply count, review link,
    review text (tags stripped), useful count.
    """
    datalist = []
    for page in range(pages):
        url = baseurl + str(page * pagesize)
        soup = BeautifulSoup(askURL(url), "html.parser")
        for item in soup.find_all('div', class_='main review-item'):
            fields = parseItem(str(item))

            # The full text lives on the review's own page; leave it blank
            # when the link is missing or that page lacks the text container.
            desc = ''
            if fields['reviewlink']:
                content = BeautifulSoup(askURL(fields['reviewlink']),
                                        "html.parser")
                report = content.find('div', id='link-report')
                if report is not None:
                    desc = PATTERN_TAG.sub('', str(report))

            datalist.append([
                fields['title'],
                fields['author'],
                fields['moviename'],
                fields['movielink'],
                fields['star'],
                fields['response'],
                fields['reviewlink'],
                desc,
                fields['use'],
            ])
    return datalist


def saveData(datalist, savepath):
    """Write *datalist* to a workbook at *savepath*.

    Row 0 holds the nine column headers; every entry of *datalist* becomes
    one following row.  All rows present are written, so a short scrape no
    longer raises ``IndexError``.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('豆瓣最受歡迎影評', cell_overwrite_ok=True)
    col = ('標題', '作者', '影片名', '影片詳情連結', '推薦級', '迴應數', '影評連結', '影評',
           '有用數')
    for j, name in enumerate(col):
        sheet.write(0, j, name)  # header row
    for i, data in enumerate(datalist):
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])  # data rows start below the header
    book.save(savepath)


def main():
    """Scrape the most popular reviews and save them to a spreadsheet."""
    baseurl = 'http://movie.douban.com/review/best/?start='
    datalist = getData(baseurl)
    # NOTE(review): xlwt writes the legacy .xls format, so the .xlsx suffix
    # does not match the file contents.  The name is kept as it was.
    savepath = u'豆瓣最受歡迎影評.xlsx'
    saveData(datalist, savepath)
    print('done')


if __name__ == '__main__':
    main()

Excel部分內容如下:這裡寫圖片描述

國家代表的影評正文如下:
這裡寫圖片描述