python爬蟲並將資料儲存到MySQL或Excel中
阿新 • • 發佈:2018-12-11
爬蟲爬取的是豆瓣top250圖書,以儲存到MySQL為例,流程如下:
1.先建表,可以用命令列,也可以用資料庫視覺化軟體,建立好需要用到的欄位
2.寫好爬蟲,並在爬蟲中連線資料庫,把爬下來的資料按對應的欄位填入資料庫中
# -*- coding: utf-8 -*-
# Captain_N
"""Scrape the Douban Top 250 book list and store each book in MySQL."""
import random
import time

import pymysql
import requests
from lxml import etree

# Database connection and cursor, shared by get_info() and the main block.
conn = pymysql.connect(host='localhost', user='root', password='1234',
                       db='DouBan', port=3306, charset='utf8')
cursor = conn.cursor()

# Request headers: send a browser-like User-Agent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5478.400 QQBrowser/10.1.1550.400'
}

# Parameterized insert; the driver escapes the values.
INSERT_SQL = (
    "insert into doubanmovie(name,author,publisher,date,price,rate,comments) "
    "values(%s,%s,%s,%s,%s,%s,%s)"
)


def _first(values, default=''):
    """Return the first XPath result stripped of whitespace, or *default*."""
    return values[0].strip() if values else default


def _split_book_info(book_info):
    """Split an 'author / ... / publisher / date / price' string.

    The author is the first field; publisher, date and price are the last
    three fields (there may be extra fields in between). When fewer fields
    are present, the trailing ones are right-aligned and the missing
    leading ones are returned as empty strings instead of raising
    IndexError.

    Returns:
        A tuple ``(author, publisher, date, price)`` of stripped strings.
    """
    parts = [part.strip() for part in book_info.split('/')]
    author = parts[0]
    tail = parts[-3:] if len(parts) >= 4 else parts[1:]
    publisher, date, price = [''] * (3 - len(tail)) + tail
    return author, publisher, date, price


def get_info(url):
    """Fetch one list page and insert every book found on it.

    Rows are written through the module-level ``cursor``; committing is
    left to the caller. Prints ``failed`` and returns without inserting
    anything when the request raises or the status code is not 200.

    Args:
        url: Address of one page of the Top 250 list.
    """
    try:
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('failed', url, exc)
        return
    if res.status_code != 200:
        print('failed', url, res.status_code)
        return

    selector = etree.HTML(res.text)
    for info in selector.xpath('//tr[@class="item"]'):
        # Extract the details of one book.
        name = _first(info.xpath('td/div/a/@title'))
        author, publisher, date, price = _split_book_info(
            _first(info.xpath('td/p/text()')))
        rate = _first(
            info.xpath('td/div[@class="star clearfix"]/span[2]/text()'))
        # Not every book has a one-line comment; fall back to a placeholder.
        comment = _first(info.xpath('td/p/span/text()'), default='空')
        # Store the single comment string, not the repr of the result list.
        cursor.execute(
            INSERT_SQL,
            (name, author, publisher, date, price, rate, comment))


if __name__ == '__main__':
    # Build the addresses of the ten list pages (25 books per page).
    urls = ['https://book.douban.com/top250?start={}'.format(i * 25)
            for i in range(0, 10)]
    try:
        for url in urls:
            get_info(url)
            # Commit page by page so a later failure keeps earlier rows.
            conn.commit()
            # Random pause between page requests.
            time.sleep(random.random() * 2)
    finally:
        cursor.close()
        conn.close()
3.儲存到Excel中的流程與之相仿(實際寫入的是可用Excel開啟的CSV檔)
# -*- coding: utf-8 -*-
# Captain_N
"""Scrape the Douban Top 250 book list and save each book to a CSV file."""
import csv
import time

import requests
from lxml import etree

# Request headers: send a browser-like User-Agent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5478.400 QQBrowser/10.1.1550.400'
}

# Create the output file. The raw string keeps the Windows backslashes
# literal, and 'utf-8-sig' writes a BOM so Excel detects the encoding and
# shows the Chinese text correctly.
fp = open(r'E:\爬蟲\projects\DouBan250.csv', 'wt', newline='',
          encoding='utf-8-sig')
writer = csv.writer(fp)
# Write the header row.
writer.writerow(('name', 'url', 'author', 'publisher', 'date', 'price',
                 'rate', 'comment'))


def _first(values, default=''):
    """Return the first XPath result stripped of whitespace, or *default*."""
    return values[0].strip() if values else default


def _split_book_info(book_info):
    """Split an 'author / ... / publisher / date / price' string.

    The author is the first field; publisher, date and price are the last
    three fields (there may be extra fields in between). When fewer fields
    are present, the trailing ones are right-aligned and the missing
    leading ones are returned as empty strings instead of raising
    IndexError.

    Returns:
        A tuple ``(author, publisher, date, price)`` of stripped strings.
    """
    parts = [part.strip() for part in book_info.split('/')]
    author = parts[0]
    tail = parts[-3:] if len(parts) >= 4 else parts[1:]
    publisher, date, price = [''] * (3 - len(tail)) + tail
    return author, publisher, date, price


def get_info(url):
    """Fetch one list page and write every book found on it to the CSV.

    Rows are written through the module-level ``writer``. Prints
    ``failed`` and returns without writing anything when the request
    raises or the status code is not 200.

    Args:
        url: Address of one page of the Top 250 list.
    """
    try:
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('failed', url, exc)
        return
    if res.status_code != 200:
        print('failed', url, res.status_code)
        return

    selector = etree.HTML(res.text)
    for info in selector.xpath('//tr[@class="item"]'):
        # Extract the details of one book.
        name = _first(info.xpath('td/div/a/@title'))
        # Named book_url so it does not shadow the page address parameter.
        book_url = _first(info.xpath('td/div/a/@href'))
        author, publisher, date, price = _split_book_info(
            _first(info.xpath('td/p/text()')))
        rate = _first(
            info.xpath('td/div[@class="star clearfix"]/span[2]/text()'))
        # Not every book has a one-line comment; fall back to a placeholder.
        comment = _first(info.xpath('td/p/span/text()'), default='空')
        # Write the values in the same order as the header row.
        writer.writerow((name, book_url, author, publisher, date, price,
                         rate, comment))


if __name__ == '__main__':
    # Build the addresses of the ten list pages (25 books per page).
    urls = ['https://book.douban.com/top250?start={}'.format(i * 25)
            for i in range(0, 10)]
    # The context manager closes the CSV file even if a page raises.
    with fp:
        for url in urls:
            get_info(url)
            time.sleep(1)