(8)Python爬蟲——爬取豆瓣影評資料
阿新 • • 發佈:2019-01-27
利用 Python 爬取豆瓣最受歡迎的 50 條影評的相關資訊,包括標題、作者、影片名、影片詳情連結、推薦級、迴應數、影評連結、影評、有用數這 9 項內容,然後將爬取的資訊寫入 Excel 表中。具體程式碼如下:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from bs4 import BeautifulSoup
import re
import urllib2
import xlwt
# Fetch the full content of a page
def askURL(url):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address to request.

    Returns:
        Whatever ``response.read()`` yields for the page, or an empty
        string when the request fails with ``urllib2.URLError`` (the error
        code and/or reason is printed in that case).
    """
    request = urllib2.Request(url)  # build the request
    # Default value so the final return never hits an unbound name when the
    # request fails.
    html = ''
    try:
        response = urllib2.urlopen(request)  # get the response
        try:
            html = response.read()  # read the page content
        finally:
            # Release the connection even if reading fails.
            response.close()
    except urllib2.URLError as e:
        # Print whichever details the error object carries.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Return the first match of a compiled pattern, or a default when none exists
def _first_match(pattern, text, default=''):
    found = pattern.findall(text)
    return found[0] if found else default


# Collect the review records
def getData(baseurl, pages=5, page_size=10):
    """Scrape review records from a paginated review listing.

    For each page, ``baseurl + str(page * page_size)`` is downloaded with
    ``askURL`` and every ``div`` with class ``main review-item`` is parsed.
    The link to the full review found in each item is then downloaded to
    extract the review text.

    Args:
        baseurl: URL prefix to which the start offset is appended.
        pages: Number of listing pages to fetch.
        page_size: Offset step between consecutive pages.

    Returns:
        A list with one 9-element list per review item, in this order:
        title, author, movie name, movie detail link, rating, reply count,
        review link, review text, useful count. A field that cannot be
        found in the markup is stored as an empty string.
    """
    # Review title
    pattern_title = re.compile(r'<a href=".*/review/\d+/">(.+)</a>')
    # Link to the full review
    pattern_link = re.compile(r'<a href="(.*/review/\d+/)">.+</a>')
    # Author
    pattern_author = re.compile(r'<a.+property="v:reviewer">(.+)</a>')
    # Movie detail link and movie name (two groups, in that order)
    pattern_subject_link = re.compile(r'<a class="subject-img" href="(.+subject.+)"> <img.+title="(.+)".+>')
    # Rating
    pattern_star = re.compile(r'<span.+property="v:rating" title="(.+)"></span>')
    # Reply count.
    # NOTE(review): the original pattern only contained the Traditional
    # spelling; douban.com pages are presumably in Simplified Chinese, where
    # that spelling would never match, so all variants are accepted here --
    # confirm against a live page.
    pattern_response = re.compile(r'<a class="reply" href=".*/review/\d+/#comments">(\d+)(?:回应|回應|迴應)</a>')
    # Useful count
    pattern_use = re.compile(r'<span id="r-useful_count-\d+">\s*(\d+)\s*</span>')
    remove = re.compile(r'<.+?>')  # strips tags from the review text

    datalist = []
    for page in range(pages):
        url = baseurl + str(page * page_size)  # URL of this listing page
        html = askURL(url)
        soup = BeautifulSoup(html, "html.parser")
        # One div per review item
        for item in soup.find_all('div', class_='main review-item'):
            item = str(item)  # the patterns run on the serialized markup
            title = _first_match(pattern_title, item)
            reviewlink = _first_match(pattern_link, item)
            author = _first_match(pattern_author, item)
            movielink, moviename = _first_match(
                pattern_subject_link, item, ('', ''))
            star = _first_match(pattern_star, item)  # a rating may be absent
            response = _first_match(pattern_response, item)
            use = _first_match(pattern_use, item)

            # Fetch the full review text; leave it empty when there is no
            # link to follow or the page lacks the expected container.
            desc = ''
            if reviewlink:
                content = BeautifulSoup(askURL(reviewlink), "html.parser")
                bodies = content.find_all('div', id='link-report')
                if bodies:
                    desc = remove.sub('', str(bodies[0]))  # drop the tags

            datalist.append([
                title,       # title
                author,      # author
                moviename,   # movie name
                movielink,   # movie detail link
                star,        # rating
                response,    # reply count
                reviewlink,  # review link
                desc,        # review text
                use,         # useful count
            ])
    return datalist
# Write the collected records to an Excel workbook
def saveData(datalist, savepath):
    """Write ``datalist`` to a single-sheet workbook saved at ``savepath``.

    Row 0 holds the nine column headers; each entry of ``datalist`` is
    written to the following rows, one value per column.

    Args:
        datalist: Sequence of rows, each a sequence of cell values in the
            same order as the column headers. Values beyond the ninth are
            ignored.
        savepath: Destination file name passed to ``book.save``.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('豆瓣最受歡迎影評', cell_overwrite_ok=True)
    col = ('標題', '作者', '影片名', '影片詳情連結', '推薦級', '迴應數', '影評連結', '影評', '有用數')
    for j, header in enumerate(col):
        sheet.write(0, j, header)  # column headers
    # Iterate over what was actually collected instead of assuming exactly
    # 50 rows of 9 values, which raised IndexError on a short result.
    for i, data in enumerate(datalist):
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i + 1, j, value)  # data cells, below the header row
    book.save(savepath)  # write the workbook to disk
def main():
    """Scrape the review listing and save the records to a spreadsheet."""
    baseurl = 'http://movie.douban.com/review/best/?start='
    datalist = getData(baseurl)
    # xlwt writes the legacy .xls format, so the output is named .xls
    # rather than .xlsx to keep the extension consistent with the content.
    savepath = u'豆瓣最受歡迎影評.xls'
    saveData(datalist, savepath)
    print('done')


# Run the scrape only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Excel部分內容如下:
國家代表的影評正文如下: