
Crawling the Baidu Map API with Python: querying nationwide address data and storing it in MySQL

1. The results are first written to a text file and then loaded into a MySQL database.

2. Do not crawl too fast; requests sent too quickly tend to fail.
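Both scripts below pause for a random fraction of a second between requests. A minimal sketch of that idea as a reusable helper (the function name, delay range, and retry count are my own, not part of the original code):

import random
import time

import requests

def polite_get(url, params=None, min_delay=0.5, max_delay=1.5, retries=3):
    # Hypothetical helper: sleep a random interval before every request and
    # retry a few times instead of hammering the API after a failure.
    for attempt in range(retries):
        time.sleep(random.uniform(min_delay, max_delay))
        try:
            r = requests.get(url, params=params, timeout=10)
            r.raise_for_status()
            return r.json()
        except requests.RequestException as e:
            print('request failed (attempt {}): {}'.format(attempt + 1, e))
    return None  # give up after the retries are exhausted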

The code is listed below.

First, crawl the per-city counts and save them all to a file:

import json, time
import random
import requests

six_cities_list = ['北京市', '上海市', '重慶市', '天津市', '香港特別行政區', '澳門特別行政區']

province_list = ['河北省', '山西省', '遼寧省', '吉林省', '黑龍江省', '江蘇省', '浙江省', '安徽省', '福建省', '江西省',
                 '山東省', '河南省','湖北省', '湖南省', '廣東省', '海南省', '四川省', '貴州省', '雲南省', '陝西省', '甘肅省',
                 '青海省', '臺灣省', '內蒙古自治區', '廣西壯族自治區', '西藏自治區', '寧夏回族自治區', '新疆維吾爾自治區']

def getjson(loc, page_num=0):
    # Query the Baidu Place API for '公園' (parks) in the region `loc` and return the decoded JSON.
    headers = {
    'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6)\
     Gecko Chrome/63.0.3239.132'
    }
    pa = {
        'query': '公園',
        # 'tag': '',
        'region': loc,
        'scope': '2',
        'output': 'json',
        'page_size': 20,
        'page_num': page_num,
        'ak': 'YOUR_BAIDU_MAP_AK'  # replace with your own API key (ak)
    }
    try:
        r = requests.get('http://api.map.baidu.com/place/v2/search',
                         params=pa, headers=headers)
        decodejson = json.loads(r.text)
        return decodejson
    except Exception as e:
        print('over-requests! Error:', e)
        time.sleep(1)                    # brief pause, then retry the same page
        return getjson(loc, page_num)


def six_city():
    # The nationwide query returns one result per region; keep only the six
    # municipalities / SARs listed in six_cities_list.
    decodejson = getjson('全國')  # '全國' = nationwide
    for eachprovince in decodejson['results']:
        try:
            city = eachprovince['name']
            num = eachprovince['num']
            if city in six_cities_list:
                output = '\t'.join([city, str(num)]) + '\n'
                with open('cities.txt', 'a+', encoding='UTF-8') as f:
                    f.write(output)  # the with block closes the file automatically
        except Exception as e:
            print('over_cities! Error:', e)

def else_city():
    # Query every province and record the park count of each of its cities.
    for eachprovince in province_list:
        decodejson = getjson(eachprovince)
        try:
            for eachcity in decodejson['results']:
                try:
                    city = eachcity['name']
                    num = eachcity['num']
                    output = '\t'.join([city, str(num)]) + '\n'
                    with open('cities.txt', 'a+', encoding='UTF-8') as f:
                        f.write(output)
                except Exception as e:
                    continue
        except Exception as e:
            print('over-eachprovince! Error:', e)
        finally:
            time.sleep(random.random())

if __name__ == '__main__':
    print('Crawling the nationwide distribution of "公園" (parks) and writing it to cities.txt ...')
    six_city()
    else_city()
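Once the first script has run, a quick way to check what it produced (just a sketch; it prints the tab-separated city/count pairs that the second script will read):

# Print the collected city/count pairs; the second script expects
# one "city<TAB>count" pair per line in cities.txt.
with open('cities.txt', encoding='UTF-8') as f:
    for line in f:
        line = line.strip()
        if line:
            city, num = line.split('\t')
            print(city, num)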

Next, use the city names collected from the nationwide park query to fetch the places you actually want (here: 'CoCo' shops tagged '美食') and write the results into MySQL.
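The INSERT statement in the script targets a table baidu_map.city, whose definition is never shown in the original post. A minimal sketch of a schema that matches the column names used in the code (the column types and lengths are assumptions):

import pymysql

# Assumed schema: column names follow the INSERT statement below, types are guesses.
CREATE_TABLE_SQL = '''
CREATE TABLE IF NOT EXISTS baidu_map.city (
    id INT AUTO_INCREMENT PRIMARY KEY,
    city VARCHAR(64),
    park VARCHAR(255),
    location_lat DOUBLE,
    location_lng DOUBLE,
    address VARCHAR(255),
    street_id VARCHAR(64),
    uid VARCHAR(64)
) DEFAULT CHARSET=utf8mb4
'''

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='your-password', charset='utf8mb4')
with conn.cursor() as cur:
    cur.execute('CREATE DATABASE IF NOT EXISTS baidu_map DEFAULT CHARSET utf8mb4')
    cur.execute(CREATE_TABLE_SQL)
conn.commit()
conn.close()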

import json, time
import random
import pymysql
import requests
city_list = list()

config = {
          'host':'localhost',
          'port':3306,
          'user':'root',
          'password':'113754',
          'db':'baidu_map',
          'charset':'utf8mb4',
          'cursorclass':pymysql.cursors.DictCursor,
}

conn = pymysql.connect(**config)
cur = conn.cursor()

def Word():
    # Read the city names collected by the first script into city_list.
    with open('cities.txt', 'r', encoding='UTF-8') as txt_file:
        for eachLine in txt_file:
            if eachLine != '' and eachLine != '\n':
                fields = eachLine.split('\t')
                city = fields[0]
                city_list.append(city)

def getjson(loc, page_num=0):
    # Query the Baidu Place API for 'CoCo' shops (tag '美食') in the region `loc`.
    headers = {
    'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6)\
     Gecko Chrome/63.0.3239.132'
    }
    pa = {
        'query': 'CoCo',
        'tag': '美食',
        'region': loc,
        'scope': '2',
        'output': 'json',
        'page_size': 20,
        'page_num': page_num,
        'ak': 'yX91zbGwxNxaGWMwo3LPx3MWovVCScHj'
    }
    try:
        r = requests.get('http://api.map.baidu.com/place/v2/search',
                         params=pa, headers=headers)
        decodejson = json.loads(r.text)
        return decodejson
    except Exception as e:
        print('over-requests! Error:', e)
        time.sleep(1)                    # brief pause, then retry the same page
        return getjson(loc, page_num)

def Insert_mysql():
    # Page through the API results for every city and insert each place into MySQL.
    for eachcity in city_list:
        not_last_page = True
        page_num = 0
        while not_last_page:
            decodejson = getjson(eachcity, page_num)
            time.sleep(random.random())
            print(eachcity, page_num)
            try:
                if decodejson['results']:
                    for eachone in decodejson['results']:
                        try:
                            park = eachone['name']
                        except:
                            park = None
                        try:
                            location_lat = eachone['location']['lat']
                        except:
                            location_lat = None
                        try:
                            location_lng = eachone['location']['lng']
                        except:
                            location_lng = None
                        try:
                            address = eachone['address']
                        except:
                            address = None
                        try:
                            street_id = eachone['street_id']
                        except:
                            street_id = None
                        try:
                            uid = eachone['uid']
                        except:
                            uid = None
                        sql = '''INSERT INTO baidu_map.city
                        (city, park, location_lat, location_lng, address, street_id, uid)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)'''
                        cur.execute(sql, (eachcity, park, location_lat, location_lng, address, street_id, uid))
                        conn.commit()
                    page_num += 1
                else:
                    # An empty 'results' list means the last page has been reached.
                    not_last_page = False
            except Exception as e:
                print('Error:', e)
                not_last_page = False


if __name__ == '__main__':
    Word()
    Insert_mysql()
    cur.close()
    conn.close()
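After the script finishes, a quick sanity check of what landed in the database (a sketch; it assumes the same connection settings as the config above):

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='your-password', db='baidu_map',
                       charset='utf8mb4')
with conn.cursor() as cur:
    # Count how many places were stored for each city.
    cur.execute('SELECT city, COUNT(*) FROM baidu_map.city '
                'GROUP BY city ORDER BY COUNT(*) DESC LIMIT 10')
    for city, count in cur.fetchall():
        print(city, count)
conn.close()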

A screenshot of a run: