Python爬蟲實踐:獲取石家莊空氣質量歷史資料(13年至今)
阿新 • 發佈:2018-11-27
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import numpy
import csv
import time


def getdatawithtablehead(url):
    """Fetch one month of AQI data from *url*, including the header row.

    Parameters
    ----------
    url : str
        Address of a monthly AQI page on tianqihoubao.com.

    Returns
    -------
    numpy.ndarray
        2-D array of strings; row 0 is the table header, the remaining
        rows are one record per day of the month.
    """
    html = urlopen(url)
    # Pages are served in a Chinese legacy encoding; decode as GB18030.
    bsobj = BeautifulSoup(html, "lxml", from_encoding="gb18030")
    tablelist = bsobj.findAll("tr")  # every row of the data table

    Dataset = []
    # The first <tr> contains the column headers.
    tablehead = tablelist[0].get_text().strip("\n").split("\n\n")
    Dataset.append(tablehead)

    dataset = []
    for datalist in tablelist[1:]:
        # Strip whitespace padding, then split cells on the remaining newlines.
        data = (datalist.get_text()
                .replace(" ", "")
                .replace("\r\n", "")
                .strip("\n")
                .split("\n"))
        dataset.append(data)  # one day's record
    Dataset = numpy.row_stack((Dataset, dataset))
    return Dataset


def getdata(url):
    """Fetch one month of AQI data from *url*, WITHOUT the header row.

    Parameters
    ----------
    url : str
        Address of a monthly AQI page on tianqihoubao.com.

    Returns
    -------
    list[list[str]]
        One inner list per day of the month.
    """
    html = urlopen(url)
    bsobj = BeautifulSoup(html, "lxml", from_encoding="gb18030")
    tablelist = bsobj.findAll("tr")
    dataset = []
    for datalist in tablelist[1:]:  # skip the header row
        data = (datalist.get_text()
                .replace(" ", "")
                .replace("\r\n", "")
                .strip("\n")
                .split("\n"))
        dataset.append(data)
    return dataset


def main():
    """Scrape all monthly AQI pages for Shijiazhuang and save them as CSV."""
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for wall-clock benchmarking.
    start = time.perf_counter()

    # Shijiazhuang AQI / PM2.5 index page, which links to every monthly page.
    starturl = "http://www.tianqihoubao.com/aqi/shijiazhuang.html"
    html = urlopen(starturl)
    bsobj = BeautifulSoup(html, "lxml")

    # Collect the links to all monthly data pages, then reverse so the
    # list runs in chronological (oldest-first) order.
    Sites = []
    for link in bsobj.findAll(href=re.compile("^(/aqi/shijiazhuang-)")):
        site = "http://www.tianqihoubao.com" + link.attrs['href']
        Sites.append(site)
    Sites.reverse()
    print(Sites)

    # First month carries the header; remaining months are appended below it.
    Dataset = getdatawithtablehead(Sites[0])
    for url in Sites[1:]:
        dataset = getdata(url)
        Dataset = numpy.row_stack((Dataset, dataset))

    # newline="" is required by the csv module to avoid blank lines on
    # Windows; utf-8 keeps the Chinese column headers intact.
    with open("shijiazhuang.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        for i in range(numpy.shape(Dataset)[0]):
            writer.writerow(Dataset[i, :])  # write one record per line

    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))


if __name__ == "__main__":
    main()
原文地址:
https://blog.csdn.net/qq_36185831/article/details/79123144
https://blog.csdn.net/u013337691/article/details/51894453#commentsedit