1. 程式人生 > 使用python爬取8684.cn公交資訊

使用python爬取8684.cn公交資訊

  • 使用庫
  • 如果缺少下列第三方庫(requests、beautifulsoup4,以及作為解析器使用的 lxml),請先自行下載安裝
import requests
import time
from bs4 import BeautifulSoup
import json
  • 原始碼
import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# User-Agent string of a desktop Chrome browser, split across lines for
# readability; adjacent literals are joined into one string.
_USER_AGENT = (
	'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
	'AppleWebKit/537.36 (KHTML, like Gecko) '
	'Chrome/70.0.3538.67 Safari/537.36'
)

# HTTP headers passed to every requests.get() call in this script.
headers = {'User-Agent': _USER_AGENT}

def parse_first_page(url):
	"""Fetch the city home page and return the URLs of its index pages.

	Collects the anchors under ``.bus_kt_r1`` and ``.bus_kt_r2`` (per the
	original comments: routes starting with a digit, then routes starting
	with a letter) and resolves each ``href`` against *url*.

	Args:
		url: URL of the city home page; also used as the base for links.

	Returns:
		A list of URLs, ``.bus_kt_r1`` links first, then ``.bus_kt_r2`` links.

	Raises:
		requests.exceptions.RequestException: on network failure or timeout.
		KeyError: if a matched anchor has no ``href`` attribute.
	"""
	# A timeout keeps an unresponsive server from hanging the whole crawl.
	r = requests.get(url, headers=headers, timeout=10)
	soup = BeautifulSoup(r.text, 'lxml')
	a_list = soup.select('.bus_kt_r1 > a') + soup.select('.bus_kt_r2 > a')
	# urljoin resolves root-relative, relative and absolute hrefs correctly,
	# which plain string concatenation does not.
	return [urljoin(url, oa['href']) for oa in a_list]

def parse_second_page(url, href):
	"""Fetch one index page and return the URLs of the bus routes it lists.

	Args:
		url: Base URL of the site, used to resolve the route links.
		href: URL of the index page to download.

	Returns:
		A list of route detail-page URLs, in page order, taken from the
		anchors directly under ``#con_site_1``.

	Raises:
		requests.exceptions.RequestException: on network failure or timeout.
		KeyError: if a matched anchor has no ``href`` attribute.
	"""
	# A timeout keeps an unresponsive server from hanging the whole crawl.
	r = requests.get(url=href, headers=headers, timeout=10)
	soup = BeautifulSoup(r.text, 'lxml')
	# urljoin resolves root-relative, relative and absolute hrefs correctly,
	# which plain string concatenation does not.
	return [urljoin(url, oa['href']) for oa in soup.select('#con_site_1 > a')]

def _strip_label(text, label):
	"""Return *text* without its leading *label*, or unchanged if absent."""
	# str.lstrip(label) would treat label as a set of characters and could
	# also eat the first characters of the value itself.
	if text.startswith(label):
		return text[len(label):]
	return text


def parse_third_page(href, fp):
	"""Scrape one bus-route detail page and append it to *fp* as a JSON line.

	Args:
		href: URL of the route detail page.
		fp: Writable text file object; receives one JSON object followed by
			a newline.

	Raises:
		requests.exceptions.RequestException: on network failure or timeout.
		IndexError: if an expected element is missing from the page.
		AttributeError: if an element's ``.string`` is None.
		ValueError: if the upstream stop count is not an integer.
	"""
	# A timeout keeps an unresponsive server from hanging the whole crawl.
	r = requests.get(href, headers=headers, timeout=10)
	soup = BeautifulSoup(r.text, 'lxml')
	# Route name
	route_name = soup.select('.bus_i_t1 > h1')[0].string
	print('正在爬取---%s---...' %route_name)
	# The info paragraphs are queried once and indexed by position below.
	info_p_list = soup.select('.bus_i_content > p')
	# Operating hours
	run_time = _strip_label(info_p_list[0].string, '執行時間:')
	# Fare information
	price_info = _strip_label(info_p_list[1].string, '票價資訊:')
	# Operating company
	company = soup.select('.bus_i_content > p > a')[0].string
	# Last update time
	update_time = _strip_label(info_p_list[-1].string, '最後更新:')
	# Upstream stop count: trim the '共' / '站' characters around the digits.
	up_total = soup.select('.bus_line_top > span')[0].string.strip('共站').strip()
	number = int(up_total)
	# The first `number` stop anchors are treated as upstream, the rest as
	# downstream.
	stop_a_list = soup.select('.bus_site_layer > div > a')
	up_name_list = [oa.string for oa in stop_a_list[:number]]
	down_name_list = [oa.string for oa in stop_a_list[number:]]
	down_total = len(down_name_list)

	# NOTE(review): the upstream count is stored as a string and the
	# downstream count as an int; kept as-is to preserve the output format.
	item = {
		'線路名稱': route_name,
		'執行時間': run_time,
		'票價資訊': price_info,
		'公交公司': company,
		'更新時間': update_time,
		'上行個數': up_total,
		'上行站牌': up_name_list,
		'下行個數': down_total,
		'下行站牌': down_name_list,
	}
	string = json.dumps(item, ensure_ascii=False)
	fp.write(string + '\n')
	print('結束爬取---%s---' %route_name)
	# No delay between requests; add time.sleep(1) here to throttle the crawl.

def main():
	"""Crawl the Beijing bus listings and write them to ``北京.txt``.

	Walks the home page, then each index page, then each route detail page,
	writing one JSON object per line to the output file.
	"""
	url = 'http://beijing.8684.cn/'
	number_char_list = parse_first_page(url)
	# The context manager closes the output file even if a page raises.
	with open('北京.txt', 'w', encoding='utf8') as fp:
		# Second-level pages: one per index link.
		for href in number_char_list:
			bus_href_list = parse_second_page(url, href)
			# Visit every route detail page and record its details.
			for href_detail in bus_href_list:
				parse_third_page(href_detail, fp)

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()