python爬取杭州市幼兒園資訊
阿新 • • 發佈:2018-12-11
一、爬取前準備
1、IDE使用pycharm
2、安裝相關的庫:requests、xlsxwriter、beautifulsoup4(re為Python標準庫,無需另外安裝)
如圖看到,網頁由頂部的區域,中間的學校列表和底部的分頁等幾個重要的部分組成。檢視網頁原始碼,可以看到上述的三個部分都可以在頁面中找到,不需要填寫瀏覽器資訊和cookie驗證等.
二、爬取資訊
1、引入相關庫
import requests
import re
import xlsxwriter
from bs4 import BeautifulSoup
2、獲取請求
def get_soup(url, param, timeout=10):
    """Fetch ``url`` with the query parameters ``param`` and parse the HTML.

    Args:
        url: Address of the page to request.
        param: Mapping of query-string parameters handed to ``requests.get``
            (may be ``None`` for a URL that needs no parameters).
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block forever on a stalled connection.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``html.parser`` backend.
    """
    response = requests.get(url, params=param, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
param為附加在url後面的查詢引數(query string)。在該網頁中,不同地區、不同學校的資訊都是通過改變這些引數取得的,url的前半部分保持不變
3、獲取地區列表,儲存在陣列中
分析地區的結構可知,所有的地區都儲存在role屬性為presentation的<li> 的<a>中
傳入的引數s為包含網頁資訊的BeautifulSoup物件,遍歷find_all查詢的結果集,除“全部”外的地區名稱都儲存在陣列res_areas中
def get_area(s):
    """Collect the area names listed on the page, excluding '全部'.

    Args:
        s: Parsed page; every ``<li role="presentation">`` is inspected.

    Returns:
        A list with the ``.string`` of the first ``<a>`` inside each
        matching ``<li>``, in document order. Entries without an ``<a>``,
        without a single text child, or whose text is '全部' are skipped.
    """
    res_areas = []
    for item in s.find_all(name='li', attrs={"role": "presentation"}):
        anchor = item.find('a')
        if anchor is None:
            # An <li> without a link carries no area name.
            continue
        label = anchor.string
        # ``.string`` is None when the link has no single text child; a
        # None area would later be sent as a request parameter.
        if label is not None and label != '全部':
            res_areas.append(label)
    return res_areas
4、獲取某地區學校的分頁總數
def get_page_num(s):
    """Read the page count from the pager of a school-list page.

    Args:
        s: Parsed page; the first ``<div>`` whose class matches ``page``
            is treated as the pager.

    Returns:
        The text of the first element following the pager's ``<strong>``
        tag, converted to ``int``. Returns 0 when the pager, the
        ``<strong>`` tag or the following element is missing.

    Raises:
        ValueError: If the text found is not an integer.
    """
    pager = s.find(name="div", attrs={"class": re.compile(r'page')})
    if pager is None:
        # No pager at all: indexing find_all(...)[0] used to raise here.
        return 0
    current = pager.find("strong")
    if current is None:
        return 0
    following = current.find_next_siblings()
    if not following:
        return 0
    return int(following[0].get_text())
分頁的子頁和總數用<strong>包括,被外層div包含,很容易找到。這裡做的判斷為了避免出現某地區沒有學校的現象,否則程式會報錯。
5、寫主函式main,匯出xlsx
三、總結
1、實現了快速提取所有杭州市幼兒園資訊,節省了人力物力
2、頁面結構簡單,提取相對容易
附上原始碼
import requests
import re
import xlsxwriter
from bs4 import BeautifulSoup
def get_soup(url, param, timeout=10):
    """Fetch ``url`` with the query parameters ``param`` and parse the HTML.

    Args:
        url: Address of the page to request.
        param: Mapping of query-string parameters handed to ``requests.get``
            (may be ``None`` for a URL that needs no parameters).
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block forever on a stalled connection.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``html.parser`` backend.
    """
    response = requests.get(url, params=param, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
def get_page_num(s):
    """Read the page count from the pager of a school-list page.

    Args:
        s: Parsed page; the first ``<div>`` whose class matches ``page``
            is treated as the pager.

    Returns:
        The text of the first element following the pager's ``<strong>``
        tag, converted to ``int``. Returns 0 when the pager, the
        ``<strong>`` tag or the following element is missing.

    Raises:
        ValueError: If the text found is not an integer.
    """
    pager = s.find(name="div", attrs={"class": re.compile(r'page')})
    if pager is None:
        # No pager at all: indexing find_all(...)[0] used to raise here.
        return 0
    current = pager.find("strong")
    if current is None:
        return 0
    following = current.find_next_siblings()
    if not following:
        return 0
    return int(following[0].get_text())
def get_param(grade, area, page):
    """Build the query parameters for one school-list request.

    Args:
        grade: Value for the ``grade_type`` query parameter.
        area: Value for the ``area_type`` query parameter.
        page: Value for the ``page`` query parameter.

    Returns:
        A dict with the keys ``grade_type``, ``area_type`` and ``page``.
    """
    # ``grade`` used to be ignored in favour of a hard-coded '1'; callers
    # that pass '1' get exactly the same dict as before.
    return {'grade_type': grade, 'area_type': area, 'page': page}
def get_area(s):
    """Collect the area names listed on the page, excluding '全部'.

    Args:
        s: Parsed page; every ``<li role="presentation">`` is inspected.

    Returns:
        A list with the ``.string`` of the first ``<a>`` inside each
        matching ``<li>``, in document order. Entries without an ``<a>``,
        without a single text child, or whose text is '全部' are skipped.
    """
    res_areas = []
    for item in s.find_all(name='li', attrs={"role": "presentation"}):
        anchor = item.find('a')
        if anchor is None:
            # An <li> without a link carries no area name.
            continue
        label = anchor.string
        # ``.string`` is None when the link has no single text child; a
        # None area would later be sent as a request parameter.
        if label is not None and label != '全部':
            res_areas.append(label)
    return res_areas
# Column titles of the exported sheet, in output order.
_HEADERS = ('學校名稱', '學校地址', '學校網址', '學校電話', '學校微信',
            '學校微博', '班級數目', '學校型別', '學校層次', '地區')

# Positions, within the <h6> values collected from a detail page, of the
# fields written to columns B..I (address, website, phone, WeChat, Weibo,
# class count, type, level).
_FIELD_INDEXES = (1, 4, 5, 6, 7, 11, 9, 10)

_SITE_ROOT = 'http://hzjiaoyufb.hangzhou.com.cn/'


def _collect_school_links(url, index_soup):
    """Return ``(detail_url, area)`` pairs for every school of every area."""
    links = []
    for res_area in get_area(index_soup):
        first_page = get_soup(url, get_param('1', res_area, '1'))
        # Pages are requested as 1..N. The previous loop used range(N) with
        # ``num - 1`` and therefore asked for pages -1..N-2, never reaching
        # the last two pages.
        # NOTE(review): assumes page numbering starts at 1, matching the '1'
        # used for the first request -- confirm against the site.
        for page in range(1, get_page_num(first_page) + 1):
            # Page 1 is already downloaded; do not fetch it a second time.
            if page == 1:
                page_soup = first_page
            else:
                page_soup = get_soup(url, get_param('1', res_area, page))
            for school in page_soup.find_all('div', class_="pInfo"):
                anchor = school.find('a')
                if anchor is None:
                    # Entry without a link: nothing to follow.
                    continue
                links.append((_SITE_ROOT + anchor.attrs['href'], res_area))
    return links


def _parse_school(soup):
    """Return ``[name, address, ..., level]`` read from a detail page."""
    heading = soup.find('h2')
    name = heading.text if heading is not None else ''
    values = []
    for panel in soup.find_all(name='div', attrs='panel-body'):
        field = panel.find('h6')
        if field is not None:
            values.append(field.text.strip())
    # Pages with fewer <h6> fields than expected get empty cells instead of
    # aborting the whole export with an IndexError.
    return [name] + [values[i] if i < len(values) else ''
                     for i in _FIELD_INDEXES]


def main():
    """Scrape the school list and export it to ``school.xlsx``.

    Walks every area and every result page of the list at
    ``school_list.php``, follows each school's detail link, and writes one
    row per school (name, address, website, phone, WeChat, Weibo, class
    count, type, level, area) below a bold header row.

    The workbook is closed even when scraping fails, so rows already
    written are saved and the file handle is released.
    """
    url = "http://hzjiaoyufb.hangzhou.com.cn/school_list.php"
    soup = get_soup(url, {'grade_type': '1'})

    print('初始化xlsx...')
    workbook = xlsxwriter.Workbook('school.xlsx')
    try:
        worksheet = workbook.add_worksheet()
        bold = workbook.add_format({'bold': True})
        for col, title in enumerate(_HEADERS):
            worksheet.write(0, col, title, bold)

        print('獲取所有區域...')
        links = _collect_school_links(url, soup)

        print('獲取所有學校資料...')
        rows = [_parse_school(get_soup(link, None)) + [res_area]
                for link, res_area in links]

        print('寫入elsx檔案...')
        for row_idx, row in enumerate(rows, start=1):
            for col, value in enumerate(row):
                worksheet.write(row_idx, col, value)
    finally:
        workbook.close()
# Start the scrape only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()