1. 程式人生 > >利用Python呼叫百度地圖介面爬取小區資訊

利用Python呼叫百度地圖介面爬取小區資訊

前幾天有一個需求:想要查到每個一二線城市裡所有小區的經緯度。為此寫了下面的爬取程式和啟動程式。

community_info_do.py 的程式碼如下

# -*- coding: utf-8 -*-
"""
Launcher: crawl basic residential-community information for each city
through the Baidu Map web APIs.

APIs used:
    Place API (Web service API)
        http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-placeapi
    Geocoding API (Web service API)
        http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-geocoding
Database table: house.community_info
Columns: 'name', 'address', 'city', 'district', 'scope', 'crawler_time', 'location'
"""
try:
    from crawler_module import community_info
except ImportError:
    # Fall back to a flat layout where community_info.py sits next to this script.
    import community_info
import time

# One entry per city: (city name, bounding box).  In every box below the
# values are [larger latitude, smaller longitude, smaller latitude,
# larger longitude].
CITY_BOUNDS = [
    ('杭州', [30.4582460000, 119.8922790000, 30.1479400000, 120.6408190000]),
    ('北京', [40.3151980000, 115.7020240000, 39.5770810000, 117.0335290000]),
    ('上海', [31.4260020000, 121.1090570000, 30.8731770000, 121.9392350000]),
    ('廣州', [23.1852140000, 113.1890830000, 22.6491570000, 113.5918110000]),
    ('深圳', [22.7734420000, 113.7838330000, 22.3906220000, 114.3794460000]),
    ('南京', [32.4371590000, 118.3747230000, 31.8992230000, 119.2049010000]),
    ('濟南', [36.7355790000, 116.8419980000, 36.5799340000, 117.3203280000]),
    ('重慶', [29.6732250000, 106.3021180000, 29.4102850000, 106.7804480000]),
    ('青島', [36.6390240000, 119.9957000000, 35.9071840000, 120.8017310000]),
    ('大連', [39.0805160000, 121.0735680000, 38.8619430000, 121.7157490000]),
    ('寧波', [30.0171180000, 121.4077110000, 29.6853290000, 121.8170510000]),
    ('廈門', [24.6268840000, 117.9478020000, 24.4154270000, 118.2450340000]),
    ('成都', [30.8966130000, 103.7507720000, 30.5240300000, 104.2647470000]),
    ('武漢', [30.7207000000, 114.1312360000, 30.4212040000, 114.5239030000]),
    ('哈爾濱', [45.8530720000, 126.4512390000, 45.5355840000, 127.0278790000]),
    ('瀋陽', [42.0065330000, 123.1972930000, 41.6146800000, 123.7486370000]),
    ('西安', [34.4725210000, 108.1609000000, 34.0342580000, 109.4774560000]),
    ('長春', [44.0637620000, 125.0409160000, 43.7128250000, 125.6054830000]),
    ('長沙', [28.3946260000, 112.6841060000, 28.0371120000, 113.2113030000]),
    ('福州', [26.1688100000, 119.1421770000, 25.9304270000, 119.5426060000]),
    ('南昌', [28.7944680000, 115.7332320000, 28.5063950000, 116.0528850000]),
    ('合肥', [31.9998630000, 117.0551390000, 31.6921480000, 117.5087470000]),
    ('鄭州', [34.9179840000, 113.4948780000, 34.6062510000, 113.9657340000]),
    ('石家莊', [38.1679890000, 114.2795970000, 37.9552500000, 114.6958360000]),
    ('蘇州', [31.4552580000, 120.4292830000, 31.1046660000, 120.9030140000]),
    ('佛山', [23.1270500000, 112.9962730000, 22.9274910000, 113.2636090000]),
    ('東莞', [23.1087920000, 113.6438950000, 22.9219870000, 114.1797160000]),
    ('無錫', [31.8931660000, 120.0925090000, 31.3480540000, 120.9939770000]),
    ('煙臺', [37.8594990000, 120.2461680000, 37.3334120000, 121.8605310000]),
    ('太原', [38.0184780000, 112.4213080000, 37.7039610000, 112.6725460000]),
]

if __name__ == '__main__':
    # Positional MySQL settings consumed by community_info:
    # (host, user, passwd, charset).  The "0" values are placeholders.
    into_db = ("0", "0", "0", "utf8")
    for city, bounds in CITY_BOUNDS:
        crawler = community_info.community_info(bounds, city, into_db)
        print('現在進行的城市是' + city)
        crawler.do()

community_info.py的程式碼如下

# -*- coding: utf-8 -*-
import MySQLdb
import time
import urllib2
import json
class community_info(object):
    """Crawl residential-community POIs of one city from the Baidu Map web APIs.

    The city's bounding box is cut into 0.01-degree cells.  Every cell is sent
    to the Place search API, each hit is reverse-geocoded to obtain its
    district and business area, and communities whose name is not yet stored
    for the city are inserted into ``house.community_info``.

    Args:
        location: bounding box as four numbers; indices 0 and 2 are the upper
            and lower bound of the first axis, indices 3 and 1 the upper and
            lower bound of the second axis (the launcher passes
            ``[max_lat, min_lng, min_lat, max_lng]``).
        city: city name stored with every row and used to look up existing rows.
        into_db: ``(host, user, passwd, charset)`` passed to ``MySQLdb.connect``.
    """

    # Place search for one cell.  Placeholders: lower-left corner followed by
    # upper-right corner.  The trailing ``ak`` value is a placeholder that
    # must be replaced with a real API key.
    PLACE_URL = (r'http://api.map.baidu.com/place/v2/search?query=小區'
                 r'&bounds={},{},{},{}&page_size=20&output=json&ak=你的ak')

    # Reverse geocoding of one point; the reply is wrapped in a JSONP callback.
    # NOTE(review): an API key is hard-coded here - consider moving it to config.
    GEOCODER_URL = (r'http://api.map.baidu.com/geocoder/v2/?callback=renderReverse'
                    r'&location={}&output=json&pois=1'
                    r'&ak=c9nNGFV74RjSG70xIXdVLVxWPizCqXdw&callba')

    # Seconds before a stalled HTTP request is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self, location, city, into_db):
        self.location = location
        self.city = city
        # Column names of house.community_info, in insert order.
        self.ziduan = ['name', 'address', 'city', 'district', 'scope', 'crawler_time', 'location']
        # Per-column quote characters; no longer needed now that inserts are
        # parameterized, kept so external readers of the attribute still work.
        self.seq = ['"', '"', '"', '"', '"', '', '"']
        self.into_db = into_db

    def do(self):
        """Crawl every grid cell of the city and store the new communities.

        A cell whose request fails or whose reply cannot be parsed is reported
        and skipped, so one bad response does not discard the whole crawl.
        """
        known_names = self._load_known_names()
        records = []
        for cell in self._grid_cells():
            try:
                reply = json.loads(self._read_url(self._place_url(cell)))
                for poi in reply.get('results') or []:
                    records.append(self._build_record(poi))
            except (IOError, ValueError) as err:
                # IOError covers urllib2/socket failures in Python 2;
                # ValueError covers malformed JSON / JSONP.
                print('skip cell {}: {}'.format(cell, err))
        print(len(records))
        self._store(records, known_names)

    def _grid_cells(self):
        """Return the lower-left corners of all 0.01-degree cells in the box."""
        first_span = self.location[0] - self.location[2]
        second_span = self.location[3] - self.location[1]
        firsts = [i / 100.0 + self.location[2] for i in range(int(first_span * 100))]
        seconds = [j / 100.0 + self.location[1] for j in range(int(second_span * 100))]
        return [[first, second] for first in firsts for second in seconds]

    def _place_url(self, cell):
        """Build the Place search URL covering the cell that starts at ``cell``."""
        return self.PLACE_URL.format(cell[0], cell[1], cell[0] + 0.01, cell[1] + 0.01)

    def _read_url(self, url):
        """Fetch ``url`` and return the raw body, always closing the response."""
        response = urllib2.urlopen(url, timeout=self.REQUEST_TIMEOUT)
        try:
            return response.read()
        finally:
            response.close()

    @staticmethod
    def _strip_jsonp(text):
        """Return the JSON payload between the outermost parentheses of a JSONP reply.

        Raises:
            ValueError: if ``text`` contains no parenthesised payload.
        """
        return text[text.index('(') + 1:text.rindex(')')]

    def _build_record(self, poi):
        """Turn one Place search hit into a row dict keyed by ``self.ziduan``."""
        record = {
            'city': self.city,
            'name': poi['name'].encode('utf-8', 'ignore'),
            'address': poi.get('address', u'').encode('utf-8', 'ignore'),
        }
        try:
            lng_lat = str(poi['location']['lng']) + ',' + str(poi['location']['lat'])
        except KeyError:
            lng_lat = '0.0,0.0'
        record['location'] = lng_lat

        # The row stores "lng,lat"; the geocoder request is sent the reversed order.
        lat_lng = ','.join(lng_lat.split(',')[::-1])
        raw = self._read_url(self.GEOCODER_URL.format(lat_lng))
        geo = json.loads(self._strip_jsonp(raw))
        result = geo.get('result') or {}
        address_part = result.get('addressComponent') or {}

        scope = result.get('business', u'').split(',')[0].encode('utf-8', 'ignore')
        district = address_part.get('district', u'').encode('utf-8', 'ignore')
        record['scope'] = scope or '其他'
        record['district'] = district or '其他'
        record['crawler_time'] = int(time.time())
        return record

    def _connect(self):
        """Open a MySQL connection from the ``into_db`` settings."""
        return MySQLdb.connect(host=self.into_db[0], user=self.into_db[1],
                               passwd=self.into_db[2], charset=self.into_db[3])

    def _load_known_names(self):
        """Return the set of community names already stored for this city."""
        cnxn = self._connect()
        try:
            cursor = cnxn.cursor()
            cursor.execute("select name from house.community_info where city = %s",
                           (self.city,))
            return set(row[0] for row in cursor.fetchall())
        finally:
            cnxn.close()

    def _store(self, records, known_names):
        """Insert every record whose name is not in ``known_names``.

        ``known_names`` is updated as rows are inserted so that a community
        returned by several cells is stored only once.
        """
        sql = "insert into house.community_info ({}) values ({})".format(
            ",".join(self.ziduan), ",".join(["%s"] * len(self.ziduan)))
        # A fresh connection is opened here because the crawl can outlive the
        # one that was used to read the existing names.
        cnxn = self._connect()
        try:
            cursor = cnxn.cursor()
            for record in records:
                name = record['name'].decode('utf-8')
                if name in known_names:
                    continue
                cursor.execute(sql, tuple(record[field] for field in self.ziduan))
                known_names.add(name)
            cnxn.commit()
        finally:
            cnxn.close()

如有好的建議或者提醒請在下方留言,我一定會虛心採納