Python3爬蟲之五:爬取網站資料並寫入excel
阿新 • • 發佈:2019-02-05
本文主要講解如何將網頁上的資料寫入到excel表中,因為我比較喜歡看小說,我們就以筆趣閣的小說資料為例,來說明怎麼把筆趣閣的小說關鍵資訊統計出來,比如:小說名、字數、作者、網址等。
根據之前幾次爬蟲例項對筆趣閣網頁原始碼的分析可知,小說名在唯一的h1標籤中,因此可以通過h1.get_text()得到小說名;作者在property="og:novel:author"的meta標籤中,可以通過html.find_all('meta', property="og:novel:author")獲取到包含該資訊的列表,其他資訊也可用同樣的方法得到。
這裡要用到BeautifulSoup庫、負責讀取excel的xlrd庫、負責寫入excel的xlwt庫,以及負責複製excel的xlutils庫。
程式碼:
#coding:utf-8
import os
import sys
import re
from bs4 import BeautifulSoup
from urllib import request
import xlrd
# from xlwt import *
import xlwt
from xlutils.copy import copy
#from datetime import datetime
# Base address of the book pages; main() appends the numeric book id to it.
# (A single-book address such as 'http://www.biqiuge.com/book/37708/' used to
# be assigned first, but it was immediately overwritten and never used.)
url = 'http://www.biqiuge.com/book/'
def getHtmlTree(url):
    """Download *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes with
        the ``html.parser`` backend.

    Raises:
        Whatever ``request.urlopen`` or ``read()`` raises when the page
        cannot be fetched; nothing is caught here.
    """
    # The context manager closes the HTTP response even if read() raises,
    # so a long crawl does not leave one open connection per page.
    with request.urlopen(url) as webPage:
        htmlCode = webPage.read()
    return BeautifulSoup(htmlCode, 'html.parser')
# xlsName = r'2.xls'
# Check whether a book page exists and pull its key fields.
def adjustExist(url):
    """Scrape the author, word-count text and title from one book page.

    The page is fetched with ``getHtmlTree``. The title is the text of the
    ``<h1>`` tag, the author is the ``content`` attribute of the first
    ``<meta property="og:novel:author">`` tag, and the word-count text is
    the substring running from the first '共' to the next '字' inside the
    ``<p>`` tags of ``<div id="info">``.

    Args:
        url: Address of the book page.

    Returns:
        A tuple ``(author, txtSize, title)``. ``txtSize`` is ``''`` when
        the '共' ... '字' markers are not both present. If anything goes
        wrong while fetching or parsing, the placeholder tuple
        ``('fbl', '0 bytes', 'Unknow')`` is returned instead.
    """
    try:
        htmlTree = getHtmlTree(url)
        title = htmlTree.h1.get_text()
        author = htmlTree.find_all('meta', property="og:novel:author")[0]['content']

        info_text = str(htmlTree.find('div', id='info').find_all('p'))
        start = info_text.find('共')
        # Look for the closing '字' only after '共'; an earlier '字' in the
        # text would otherwise yield an empty slice.
        end = info_text.find('字', start) if start != -1 else -1
        txtSize = '' if -1 in (start, end) else info_text[start:end + 1]

        # The site serves an error page with this exact title for unknown ids.
        if u'出現錯誤!-筆趣閣' == title:
            print(url + ' 不存在!')
        else:
            print(url)
    except Exception:
        # Best effort: a page that cannot be fetched or parsed is recorded
        # with placeholder values. Only Exception is caught (and the return
        # is no longer inside ``finally``) so that Ctrl-C / SystemExit can
        # still stop the crawl.
        return ('fbl', '0 bytes', 'Unknow')
    return (author, txtSize, title)
def main():
    """Crawl a range of book ids and store one row per book in the .xls file.

    For every id ``j`` in ``range(start_url, end_url)`` the existing
    workbook is reopened. If row ``j`` already has a non-empty first cell,
    the id is skipped. Otherwise the page ``url + str(j)`` (``url`` is the
    module-level base address) is scraped with ``adjustExist`` and the row
    ``[j, title, size, author, page address]`` is written to row ``j`` of
    the sheet and saved immediately, so an interrupted run can be resumed.

    The workbook file must already exist; it is opened with xlrd, copied
    with ``xlutils.copy.copy`` and saved back under the same name.
    """
    reWriteFlag = False
    start_url = 6000
    end_url = 30000
    if start_url > end_url:
        start_url, end_url = end_url, start_url

    # Header row, written only when reWriteFlag is set. This list used to
    # exist only in a commented-out line, so enabling the flag raised
    # NameError.
    init = [u'序號', u'小說名', u'字數', u'作者', u'路徑']
    fileName = u'筆趣閣.xls'
    sheetName = u'筆趣閣小說'

    if reWriteFlag:
        workbook = xlrd.open_workbook(fileName, formatting_info=True)
        newBook = copy(workbook)
        data_sheet = newBook.get_sheet(sheetName)
        for col, header in enumerate(init):
            data_sheet.write(0, col, header)
        newBook.save(fileName)

    for j in range(start_url, end_url):
        # NOTE(review): the file is re-read and re-copied for every id, which
        # is slow for large ranges; kept as-is because each row is saved to
        # disk before the next one is fetched.
        workbook = xlrd.open_workbook(fileName, formatting_info=True)
        table = workbook.sheets()[0]
        try:
            cell_value = table.cell(j, 0).value
        except IndexError:
            # Row j is outside the sheet's stored cells, so it has not been
            # written yet.
            print('NLL')
        else:
            if cell_value != '':
                # Already recorded in an earlier run; skip this id.
                print(cell_value)
                continue

        url_tmp = url + str(j)
        (author, size, title) = adjustExist(url_tmp)
        row = [j, title, size, author, url_tmp]

        newBook = copy(workbook)
        data_sheet = newBook.get_sheet(sheetName)
        for col, value in enumerate(row):
            data_sheet.write(j, col, value)
        newBook.save(fileName)
# Start the crawl only when this file is run as a script, so that importing
# the module does not trigger thousands of HTTP requests.
if __name__ == "__main__":
    main()
效果圖展示:
再通過excel的資料分列功能,可以將字數提取出來作為關鍵資料:
有需要這份資料的請去我的資源下載,資源名:筆趣閣小說資料彙總.xls