第一個爬蟲程式,基於requests和BeautifulSoup

第一個爬蟲程式,基於requests和BeautifulSoup

斷斷續續學了1年多python,最近總算感覺自己入門了,記錄下這幾天用requests和BeautifulSoup寫的爬蟲。
python的環境是anaconda+pycharm。
直接上程式碼

"""
    作者:西瓜不是我的
    日期:2017年12月26日
    功能:爬取全國每個城市各個監測點的AQI
    版本:v11.0
"""
import requests
from bs4 import BeautifulSoup
import csv

def get_city_area_aqi(url):
    """Fetch the AQI readings of every monitoring station for one city.

    Parameters
    ----------
    url : str
        The city's page on pm25.in, e.g. 'http://www.pm25.in/beijing'.

    Returns
    -------
    list[tuple[str, str]]
        A flat list of (column-header, cell-value) pairs; each station
        row contributes one group of len(headers) pairs.
    """
    r = requests.get(url, timeout=20)
    soup = BeautifulSoup(r.text, 'lxml')

    # <thead> holds the table's column headers (station name, AQI, pollutants, ...)
    thead_list = soup.find_all('thead')            # page has a single <thead>
    header_cells = thead_list[0].find_all('th')
    # <tbody> holds one <tr> per monitoring station
    tbody_list = soup.find_all('tbody')            # page has a single <tbody>
    station_rows = tbody_list[0].find_all('tr')

    # Column header names, in table order.
    area_name_list = [th.text for th in header_cells]

    final_list = []
    for row in station_rows:
        # Each <tr>'s text splits on newlines into one value per column.
        values = row.text.strip().split('\n')
        for j in range(len(area_name_list)):
            final_list.append((area_name_list[j], values[j]))
    return final_list


def write_to_csv(final_list, city_name):
    """Append one city's station readings to 'aqi.csv'.

    Parameters
    ----------
    final_list : list[tuple[str, str]]
        Flat (header, value) pairs from get_city_area_aqi(); every 11
        consecutive values form one data row (matching the 11 data
        columns written by main() after the leading 'city' column).
    city_name : str
        The city's display name, written as the first cell of each row.
    """
    with open('aqi.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        row = [city_name]
        for i, pair in enumerate(final_list):
            row.append(pair[1])
            # Every 11 values complete one station row; start the next
            # row with the city name again.
            if (i + 1) % 11 == 0:
                writer.writerow(row)
                row = [city_name]


def get_all_city_name(url):
    """Scrape the pm25.in front page for every city's name and URL slug.

    Parameters
    ----------
    url : str
        The site root, 'http://www.pm25.in'.

    Returns
    -------
    list[tuple[str, str]]
        (chinese_name, pinyin_slug) tuples, one per city link.
    """
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    # The second div.bottom on the page contains the full city link list.
    city_div = soup.find_all('div', {'class': 'bottom'})[1]
    city_links = city_div.find_all('a')
    city_name = []
    for link in city_links:
        name_text = link.text
        name_pinyin = link['href'][1:]  # drop the leading '/' of the href
        city_name.append((name_text, name_pinyin))
    return city_name


def main():
    """Crawl per-station AQI for the first cities on pm25.in into aqi.csv."""
    url = 'http://www.pm25.in'
    all_city_list = get_all_city_name(url)

    # Write the CSV header row (overwrites any previous aqi.csv).
    write_row = ['city','監測點','AQI','空氣質量','首要汙染物','PM2.5','PM10','CO','NO2','O3-1','O3-8','SO2']
    with open('aqi.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(write_row)

    # For each city: build its URL from the pinyin slug, scrape its
    # station table, and append the rows to the CSV.
    for i, city in enumerate(all_city_list):
        city_name = city[0]
        city_pinyin = city[1]
        url = 'http://www.pm25.in/' + city_pinyin
        city_list = get_city_area_aqi(url)
        write_to_csv(city_list, city_name)
        # Stop early. NOTE(review): the original comment said "only 10
        # cities", but breaking at i == 10 actually crawls 11 (i = 0..10);
        # behavior kept as-is.
        if i == 10:
            break


if __name__ == '__main__':
    main()

執行結果

第一次用markdown,就寫這麼多。