Python爬取百度地圖API-查詢全國地址資訊-MySQL資料庫
阿新 • 發佈:2019-02-03
1.採用轉檔案轉MySQL資料庫形式儲存內容
2.爬取速度不宜過快,太快容易失敗
程式碼如下:
先爬取並將其全部儲存至檔案中
"""Scrape the Baidu Maps Place API for the nationwide distribution of "公園"
(parks) and append per-city counts to cities.txt as tab-separated lines.

Note: throttle requests — querying too fast makes the API reject calls.
"""
import json
import random
import time

import requests

# Municipalities / SARs that the nationwide ('全國') query reports directly.
six_cities_list = ['北京市', '上海市', '重慶市', '天津市',
                   '香港特別行政區', '澳門特別行政區']
# Provinces whose per-city counts must each be queried individually.
province_list = ['河北省', '山西省', '遼寧省', '吉林省', '黑龍江省', '江蘇省',
                 '浙江省', '安徽省', '福建省', '江西省', '山東省', '河南省',
                 '湖北省', '湖南省', '廣東省', '海南省', '四川省', '貴州省',
                 '雲南省', '陝西省', '甘肅省', '青海省', '臺灣省',
                 '內蒙古自治區', '廣西壯族自治區', '西藏自治區',
                 '寧夏回族自治區', '新疆維吾爾自治區']


def getjson(loc, page_num=0, _retries=3):
    """Query the Place API for parks in *loc* and return the decoded JSON.

    page_num selects the result page (20 results per page).  On a network
    error, retries up to *_retries* times with a short random back-off,
    then returns None.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6)'
                      ' Gecko Chrome/63.0.3239.132'
    }
    # Let requests build the query string from `params`.  The original code
    # additionally formatted the URL by hand and the '&region=' fragment had
    # been mangled into '®ion=' (HTML-entity mojibake), duplicating and
    # corrupting the query string.
    pa = {
        'query': '公園',
        'region': loc,
        'output': 'json',
        'scope': '2',
        'page_size': 20,
        'page_num': page_num,
        'ak': '自己的API',  # replace with your own Baidu Maps API key
    }
    try:
        r = requests.get('http://api.map.baidu.com/place/v2/search',
                         params=pa, headers=headers)
        return json.loads(r.text)
    except Exception as e:
        print('over-requests! Error:', e)
        if _retries > 0:
            time.sleep(random.random())
            # BUG FIX: the original called getjson(loc) but discarded the
            # result, so every failed request ultimately returned None.
            return getjson(loc, page_num, _retries - 1)
        return None


def six_city():
    """Append park counts for the six municipalities/SARs to cities.txt."""
    decodejson = getjson('全國')
    if not decodejson:
        return
    for eachprovince in decodejson.get('results', []):
        try:
            city = eachprovince['name']
            num = eachprovince['num']
        except Exception as e:
            print('over_cities! Error:', e)
            continue
        if city in six_cities_list:
            # The with-block closes the file; the original also called
            # f.close() redundantly inside it.
            with open('cities.txt', 'a+', encoding='UTF-8') as f:
                f.write('\t'.join([city, str(num)]) + '\n')


def else_city():
    """Append per-city park counts for every province to cities.txt."""
    for eachprovince in province_list:
        decodejson = getjson(eachprovince)
        try:
            for eachcity in decodejson['results']:
                try:
                    city = eachcity['name']
                    num = eachcity['num']
                except Exception:
                    continue  # skip malformed entries, as the original did
                with open('cities.txt', 'a+', encoding='UTF-8') as f:
                    f.write('\t'.join([city, str(num)]) + '個\n')
        except Exception as e:
            print('over-eachprovince! Error:', e)
        finally:
            # Throttle: requests issued too quickly tend to fail.
            time.sleep(random.random())


if __name__ == '__main__':
    print('正在爬取全國各地"公園"分佈數目並存入cities.txt.')
    six_city()
    else_city()
利用全國公園裡各大城市的地址獲取想要的結果
"""Read city names from cities.txt, query the Baidu Maps Place API for "CoCo"
(tag 美食) shops in each city, and insert every result into the MySQL table
baidu_map.city.
"""
import json
import random
import time

import pymysql
import requests

city_list = list()

# MySQL connection settings.
# NOTE(review): credentials are hardcoded in source — move to env/config.
config = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': '113754',
    'db': 'baidu_map',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor,
}
conn = pymysql.connect(**config)
cur = conn.cursor()


def Word():
    """Load city names (first tab-separated field of each non-blank line of
    cities.txt) into the module-level city_list."""
    with open('cities.txt', 'r', encoding='UTF-8') as txt_file:
        for eachLine in txt_file:
            if eachLine != '' and eachLine != '\n':
                # The with-block closes the file; no explicit close needed.
                city_list.append(eachLine.split('\t')[0])


def getjson(loc, page_num=0, _retries=3):
    """Query the Place API for "CoCo" (tag 美食) in *loc*; return decoded JSON.

    On a network error, retries up to *_retries* times with a short random
    back-off, then returns None.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6)'
                      ' Gecko Chrome/63.0.3239.132'
    }
    pa = {
        'query': 'CoCo',
        'tag': '美食',
        'region': loc,
        'output': 'json',
        'scope': '2',
        'page_size': 20,
        'page_num': page_num,
        # SECURITY: API key hardcoded in published source — rotate this key
        # and load it from an environment variable instead.
        'ak': 'yX91zbGwxNxaGWMwo3LPx3MWovVCScHj',
    }
    try:
        # Let requests encode `params`; the original also formatted the URL
        # by hand, with '&region=' mangled into '®ion=' (mojibake).
        r = requests.get('http://api.map.baidu.com/place/v2/search',
                         params=pa, headers=headers)
        return json.loads(r.text)
    except Exception as e:
        print('over-requests! Error:', e)
        if _retries > 0:
            time.sleep(random.random())
            # BUG FIX: the original discarded the retry's return value, so a
            # failed request always yielded None to the caller.
            return getjson(loc, page_num, _retries - 1)
        return None


def Insert_mysql():
    """Page through every city's results and insert one row per place into
    baidu_map.city (city, park, location_lat, location_lng, address,
    street_id, uid)."""
    sql = ('INSERT INTO baidu_map.city '
           '(city, park, location_lat, location_lng, address, street_id, uid) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    for eachcity in city_list:
        page_num = 0
        while True:
            decodejson = getjson(eachcity, page_num)
            time.sleep(random.random())  # throttle between API calls
            print(eachcity, page_num)
            try:
                results = decodejson['results'] if decodejson else None
                if not results:
                    # BUG FIX: in the original, an empty 'results' list left
                    # not_last_page True and page_num unchanged, re-requesting
                    # the same page forever.  An empty page means we are done.
                    break
                for eachone in results:
                    location = eachone.get('location') or {}
                    cur.execute(sql, (eachcity,
                                      eachone.get('name'),
                                      location.get('lat'),
                                      location.get('lng'),
                                      eachone.get('address'),
                                      eachone.get('street_id'),
                                      eachone.get('uid')))
                # Commit once per page instead of once per row.
                conn.commit()
                page_num += 1
            except Exception as e:
                print('Error:', e)
                break


if __name__ == '__main__':
    Word()
    Insert_mysql()
    cur.close()
    conn.close()
執行截圖如下: