1. 程式人生 > >爬取豆瓣Top250圖書【Beautiful】

爬取豆瓣Top250圖書【Beautiful】

由於我有一個喜歡看書的室友,最近比較鬧書荒,我想著爬取一下豆瓣評分Top250的圖書,看看他有沒有想看的。我是本著學習的態度加雙贏的結果(並不是為了裝那啥……)

爬取目標
+ 爬取豆瓣評分Top250的圖書
+ 獲取每本圖書的詳細資訊
+ 把爬取結果存入Excel中

0、爬取效果


1、分析URL

爬取的目標 URL 為 https://book.douban.com/top250?start=0。這是一個多頁爬取:URL 的規則為 start=0、25、50……,分別對應第一頁、第二頁……,每頁 25 本圖書。
需要爬取的整體內容

這次採用的是 BeautifulSoup

    def getBooks(self):
        """Parse one result page and append each book's fields to self.book_list.

        Each appended row is:
        [title, author, translator, quote, press, date, price, score, scoreNum, bookUrl]
        """
        pageCode = self.getPage()
        bsObj = BeautifulSoup(pageCode, 'lxml')
        for book in bsObj.findAll("td", {"valign": "top"}):
            # Each book occupies two <td> cells; only the text cell
            # contains a <div class="pl2"> with the title link.
            if book.find('div', {'class': re.compile(r'pl[2]{1}')}) is None:
                continue
            bookUrl = book.a['href'].strip()              # link to the book's detail page
            title = '《' + book.a['title'].strip() + '》'  # book title
            # The detail line looks like "author / [translator /] press / date / price".
            detail = book.find('p', {'class': 'pl'}).get_text().split('/')
            author = detail[0].strip()
            if len(detail) == 5:
                # Translated book: a translator field is present.
                translator = detail[1].strip()
                press = detail[2].strip()
                date = detail[3].strip()
                price = detail[4].strip()
            else:
                # Original-language book: no translator field.
                translator = ''
                press = detail[1].strip()
                date = detail[2].strip()
                price = detail[3].strip()
            score = book.find('span', {'class': 'rating_nums'}).get_text().strip()
            # Rating count comes wrapped in parentheses, e.g. "(12345人评价)".
            scoreNum = book.find('span', {'class': 'pl'}).get_text().strip('(').strip(')').strip()
            # Bug fix: some books have no one-line blurb; find() then returns
            # None and get_text() would raise AttributeError.
            quoteTag = book.find('span', {'class': 'inq'})
            quote = quoteTag.get_text() if quoteTag is not None else ''
            self.book_list.append([title, author, translator, quote,
                                   press, date, price, score, scoreNum, bookUrl])

2、爬取內容存入到EXCEl

import xlwt


def load(self, datalist):
    """Write the scraped rows in *datalist* to 豆瓣圖書Top250.xls.

    Row 0 holds the column headers; each following row is one book
    (10 fields, matching the header tuple below).
    """
    workbook = xlwt.Workbook()  # renamed: 'file' shadowed the builtin
    sheet = workbook.add_sheet('豆瓣圖書Top250', cell_overwrite_ok=True)
    col = (u'圖書名字', u'作者', u'譯者', u'引述', u'出版社',
           u'發行日期', u'價格', u'評分', u'評價標準', u'圖書詳細連結')
    for i, header in enumerate(col):
        sheet.write(0, i, header)  # header row
    # Bug fix: iterate the actual data instead of a hard-coded range(0, 250),
    # so a partial scrape no longer raises IndexError.
    for row, data in enumerate(datalist, start=1):
        for j, value in enumerate(data):
            sheet.write(row, j, value)
    workbook.save('豆瓣圖書Top250.xls')

3、整體程式碼

# coding:utf-8
"""
https://book.douban.com/top250?start=0
爬取豆瓣圖書評分最高的前250本,
第一頁:start=0,第二頁:start=25......
"""
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import URLError
from bs4 import BeautifulSoup
import re
import xlwt


class DoubanBook:
    """Scrape the Douban Top-250 book chart and export it to an Excel file."""

    def __init__(self, pageIndex):
        # Bug fix: the original assigned 0 and silently ignored the argument,
        # so the caller could never start from a different page offset.
        self.pageIndex = pageIndex
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' \
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        self.headers = {'User-Agent': self.user_agent}
        self.book_list = []  # one 9-field row per scraped book

    def getPage(self):
        """Fetch the result page at the current offset; return its HTML, or None on failure."""
        try:
            # Bug fix: the 'start=' query key was missing ("top250?0"), so
            # Douban ignored the offset and page 1 was scraped ten times.
            url = 'https://book.douban.com/top250?start=' + str(self.pageIndex)
            request = Request(url, headers=self.headers)
            response = urlopen(request)
            page = response.read().decode('utf-8')
            return page
        except URLError as e:
            if hasattr(e, 'reason'):
                print("爬取失敗,失敗原因:", e.reason)

    def getBooks(self):
        """Parse one result page and append each book's fields to self.book_list.

        Each row is: [title, author, quote, press, date, price, score, scoreNum, bookUrl]
        (9 fields, matching the 9 columns written by load()).
        """
        pageCode = self.getPage()
        if pageCode is None:
            # The request failed (already reported by getPage); skip this page
            # instead of crashing inside BeautifulSoup(None).
            return
        bsObj = BeautifulSoup(pageCode, 'lxml')
        for book in bsObj.findAll("td", {"valign": "top"}):
            # Each book occupies two <td> cells; only the text cell
            # contains a <div class="pl2"> with the title link.
            if book.find('div', {'class': re.compile(r'pl[2]{1}')}) is None:
                continue
            bookUrl = book.a['href'].strip()
            title = book.a['title'].strip()
            # The detail line looks like "author / [translator /] press / date / price".
            detail = book.find('p', {'class': 'pl'}).get_text().split('/')
            author = detail[0].strip()
            if len(detail) == 5:
                # Translated book: a translator field is present (not exported).
                translator = detail[1].strip()
                press = detail[2].strip()
                date = detail[3].strip()
                price = detail[4].strip()
            else:
                # Original-language book: no translator field.
                translator = ''
                press = detail[1].strip()
                date = detail[2].strip()
                price = detail[3].strip()
            score = book.find('span', {'class': 'rating_nums'}).get_text().strip()
            # Rating count comes wrapped in parentheses, e.g. "(12345人评价)".
            scoreNum = book.find('span', {'class': 'pl'}).get_text().strip('(').strip(')').strip()
            # Bug fix: some books have no one-line blurb; find() then returns
            # None and get_text() would raise AttributeError.
            quoteTag = book.find('span', {'class': 'inq'})
            quote = quoteTag.get_text() if quoteTag is not None else ''
            self.book_list.append([title, author, quote, press, date,
                                   price, score, scoreNum, bookUrl])

    def load(self, datalist):
        """Write the scraped rows in *datalist* to 豆瓣圖書Top250.xls.

        Row 0 holds the column headers; each following row is one book.
        """
        workbook = xlwt.Workbook()  # renamed: 'file' shadowed the builtin
        sheet = workbook.add_sheet('豆瓣圖書Top250', cell_overwrite_ok=True)
        col = (u'圖書名字', u'作者', u'引述', u'出版社', u'發行日期',
               u'價格', u'評分', u'評價標準', u'圖書詳細連結')
        for i, header in enumerate(col):
            sheet.write(0, i, header)  # header row
        # Bug fix: iterate the actual data instead of a hard-coded range(0, 250),
        # so a partial scrape no longer raises IndexError.
        for row, data in enumerate(datalist, start=1):
            for j, value in enumerate(data):
                sheet.write(row, j, value)
        workbook.save('豆瓣圖書Top250.xls')

    def start(self):
        """Scrape all ten pages (offsets 0, 25, ..., 225) and save the Excel file."""
        print('現開始抓取豆瓣圖書Top250的資料:')
        while self.pageIndex <= 225:
            print('現抓取第%d頁' % (self.pageIndex / 25 + 1))
            self.getBooks()
            self.pageIndex += 25
        print("抓取完成")
        self.load(self.book_list)


if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    book = DoubanBook(0)
    book.start()