1. 程式人生 > >【python爬蟲】讀寫、追加到excel檔案中

【python爬蟲】讀寫、追加到excel檔案中

爬取糗事百科熱門

安裝讀寫 excel 的依賴:pip install xlwt xlrd
安裝追加 excel 檔案內容的依賴:pip install xlutils
安裝網頁抓取與解析的依賴:pip install requests lxml

import csv
import requests
from lxml import etree
import time
import xlwt
import os
from xlutils.copy import copy
import xlrd

# Rows scraped so far, one list of cell values per post.
# get_info() appends to it and write_data() reads from it.
data_infos_list = []

# Browser-like User-Agent sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}

# NOTE: an earlier version of this script appended the same columns
# (author, sex, rank, content, great, comment, time) to qiubaibook.csv with
# csv.writer; the output is now an .xls workbook instead.
# Path of the Excel workbook that is created or appended to.
filename = 'C:\\Users\\Administrator\\Desktop\\qiubaibook.xls'
def _first_text(values, default=''):
    """Return the first xpath result stripped of whitespace, or *default* if empty."""
    return values[0].strip() if values else default


def _detect_sex(class_values):
    """Map the class attribute of the rank badge to '男', '女' or '未知'."""
    if class_values:
        if 'manIcon' in class_values[0]:
            return '男'
        if 'womenIcon' in class_values[0]:
            return '女'
    # No badge at all, or a badge carrying neither icon class.
    return '未知'


def get_info(url):
    """Scrape one listing page and append one row per post to ``data_infos_list``.

    Each appended row is
    ``[author, sex, rank, content, great, comment, timestamp]`` where every
    item is a string.  A field whose node is missing from the page is stored
    as an empty string (``'-1'`` for rank, ``'未知'`` for sex) instead of
    aborting the crawl with an IndexError.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(res.text)
    htmls = selector.xpath('//div[contains(@class,"article block untagged mb15")]')

    # Absolute xpaths the relative ones below were derived from:
    #   //*[@id="qiushi_tag_120024357"]/a[1]/div/span        content
    #   //*[@id="qiushi_tag_120024357"]/div[2]/span[1]/i     "funny" votes
    #   //*[@id="c-120024357"]/i                             comment count
    #   //*[@id="qiushi_tag_120024357"]/div[1]/a[2]/h2       author
    #   //*[@id="qiushi_tag_120024357"]/div[1]/div           rank
    #   womenIcon / manIcon class on the rank badge          sex
    for html in htmls:
        author = html.xpath('div[1]/a[2]/h2/text()')
        if not author:
            # Alternative markup where the author heading sits in a <span>
            # (presumably anonymous posts -- TODO confirm).
            author = html.xpath('div[1]/span[2]/h2/text()')

        rank = html.xpath('div[1]/div/text()')
        sex = _detect_sex(html.xpath('div[1]/div/@class'))
        contents = html.xpath('a[1]/div/span/text()')

        # The stats bar is div[2] in most posts and div[3] in others,
        # e.g. //*[@id="qiushi_tag_112746244"]/div[3]/span[1]/i
        great = html.xpath('div[2]/span[1]/i/text()')
        if not great:
            great = html.xpath('div[3]/span[1]/i/text()')
        comment = html.xpath('div[2]/span[2]/a/i/text()')
        if not comment:
            comment = html.xpath('div[3]/span[2]/a/i/text()')

        data_infos = [
            _first_text(author),
            sex,
            # '-1' marks a post without a rank badge.
            _first_text(rank, '-1'),
            _first_text(contents),
            _first_text(great),
            _first_text(comment),
            time.strftime('%Y-%m-%d %H:%M:%S'),
        ]
        data_infos_list.append(data_infos)


def write_data(sheet, row, rows=None):
    """Write scraped rows into *sheet*, starting at row index *row*.

    Args:
        sheet: Worksheet-like object exposing ``write(row, col, value)``.
        row: Zero-based index of the first row to write.
        rows: Iterable of rows (each an iterable of cell values).  Defaults
            to the module-level ``data_infos_list``.
    """
    if rows is None:
        rows = data_infos_list
    for row_index, data_infos in enumerate(rows, start=row):
        for col_index, data in enumerate(data_infos):
            sheet.write(row_index, col_index, data)


def main():
    """Crawl listing pages 1-13, then create or extend the workbook at ``filename``."""
    urls = ['https://www.qiushibaike.com/8hr/page/{}/'.format(num) for num in range(1, 14)]
    for url in urls:
        print(url)
        get_info(url)
        # Pause between requests to go easy on the server.
        time.sleep(2)

    # Append when the workbook already exists, otherwise create it.
    if os.path.exists(filename):
        # formatting_info=True preserves existing fonts, colours and styles.
        rb = xlrd.open_workbook(filename, formatting_info=True)
        # Number of rows already present in the first sheet.
        rn = rb.sheets()[0].nrows
        # xlrd workbooks are read-only; copy into a writable xlwt workbook.
        wb = copy(rb)
        sheet = wb.get_sheet(0)
        write_data(sheet, rn)
        # save() overwrites the file in place.  The old file is deliberately
        # not removed first, so a failed save cannot destroy existing data.
        wb.save(filename)
    else:
        header = ['author', 'sex', 'rank', 'content', 'great', 'comment', 'time']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('糗百')
        # Header row first, data from row 1 onwards.
        for col_index, title in enumerate(header):
            sheet.write(0, col_index, title)
        write_data(sheet, 1)
        book.save(filename)


if __name__ == '__main__':
    main()