利用Python呼叫百度地圖介面爬取小區資訊
阿新 • • 發佈:2019-01-10
前幾天有一個需求:需要取得每個一、二線城市裡所有小區的經緯度。為此寫了下面的爬取程式和啟動程式。
community_info_do.py 的程式碼如下
# -*- coding: utf-8 -*-
"""
功能:呼叫百度地圖介面爬取各城市小區基本資訊
呼叫介面:Place APIWeb服務API url:http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-placeapi
Geocoding APIWeb服務API url:http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-geocoding
資料庫:house.community_info
欄位:'name', 'address', 'city', 'district', 'scope', 'crawler_time', 'location'
"""
try:from crawler_module import community_info
except:import community_info
import time
# Cities to crawl, each paired with its bounding box.
# Box layout, as consumed by community_info.do():
#   [north latitude, west longitude, south latitude, east longitude]
CITY_BOUNDS = [
    ('杭州', [30.4582460000, 119.8922790000, 30.1479400000, 120.6408190000]),
    ('北京', [40.3151980000, 115.7020240000, 39.5770810000, 117.0335290000]),
    ('上海', [31.4260020000, 121.1090570000, 30.8731770000, 121.9392350000]),
    ('廣州', [23.1852140000, 113.1890830000, 22.6491570000, 113.5918110000]),
    ('深圳', [22.7734420000, 113.7838330000, 22.3906220000, 114.3794460000]),
    ('南京', [32.4371590000, 118.3747230000, 31.8992230000, 119.2049010000]),
    ('濟南', [36.7355790000, 116.8419980000, 36.5799340000, 117.3203280000]),
    ('重慶', [29.6732250000, 106.3021180000, 29.4102850000, 106.7804480000]),
    ('青島', [36.6390240000, 119.9957000000, 35.9071840000, 120.8017310000]),
    ('大連', [39.0805160000, 121.0735680000, 38.8619430000, 121.7157490000]),
    ('寧波', [30.0171180000, 121.4077110000, 29.6853290000, 121.8170510000]),
    ('廈門', [24.6268840000, 117.9478020000, 24.4154270000, 118.2450340000]),
    ('成都', [30.8966130000, 103.7507720000, 30.5240300000, 104.2647470000]),
    ('武漢', [30.7207000000, 114.1312360000, 30.4212040000, 114.5239030000]),
    ('哈爾濱', [45.8530720000, 126.4512390000, 45.5355840000, 127.0278790000]),
    ('瀋陽', [42.0065330000, 123.1972930000, 41.6146800000, 123.7486370000]),
    ('西安', [34.4725210000, 108.1609000000, 34.0342580000, 109.4774560000]),
    ('長春', [44.0637620000, 125.0409160000, 43.7128250000, 125.6054830000]),
    ('長沙', [28.3946260000, 112.6841060000, 28.0371120000, 113.2113030000]),
    ('福州', [26.1688100000, 119.1421770000, 25.9304270000, 119.5426060000]),
    ('南昌', [28.7944680000, 115.7332320000, 28.5063950000, 116.0528850000]),
    ('合肥', [31.9998630000, 117.0551390000, 31.6921480000, 117.5087470000]),
    ('鄭州', [34.9179840000, 113.4948780000, 34.6062510000, 113.9657340000]),
    ('石家莊', [38.1679890000, 114.2795970000, 37.9552500000, 114.6958360000]),
    ('蘇州', [31.4552580000, 120.4292830000, 31.1046660000, 120.9030140000]),
    ('佛山', [23.1270500000, 112.9962730000, 22.9274910000, 113.2636090000]),
    ('東莞', [23.1087920000, 113.6438950000, 22.9219870000, 114.1797160000]),
    ('無錫', [31.8931660000, 120.0925090000, 31.3480540000, 120.9939770000]),
    ('煙臺', [37.8594990000, 120.2461680000, 37.3334120000, 121.8605310000]),
    ('太原', [38.0184780000, 112.4213080000, 37.7039610000, 112.6725460000]),
]

# MySQL connection settings passed to the crawler: (host, user, password, charset).
# The first three values are placeholders and must be replaced before running.
INTO_DB = ("0", "0", "0", "utf8")


def main():
    """Crawl every city in CITY_BOUNDS, one after another.

    For each city a ``community_info.community_info`` crawler is built from
    the city's bounding box, name and the DB settings, the city name is
    printed as a progress marker, and the crawler's ``do()`` is run.
    """
    for city, bounds in CITY_BOUNDS:
        crawler = community_info.community_info(bounds, city, INTO_DB)
        # Single parenthesised argument: prints identically under Python 2's
        # print statement and Python 3's print function.
        print('現在進行的城市是' + city)
        crawler.do()


if __name__ == '__main__':
    main()
community_info.py的程式碼如下
# -*- coding: utf-8 -*-
import MySQLdb
import time
import urllib2
import json
class community_info(object):
    """Crawl residential communities of one city from Baidu Map and store them.

    The city's bounding box is cut into 0.01-degree cells. Each cell is
    queried through the Baidu Place search API, every hit is enriched with
    its district and business area through the reverse-geocoding API, and
    communities whose name is not yet stored for the city are inserted into
    ``house.community_info``.

    This module targets Python 2: it relies on the file-level ``urllib2`` and
    ``MySQLdb`` imports.
    """

    # Place search endpoint: bounds are "south lat, west lng, north lat, east lng".
    PLACE_URL = (r'http://api.map.baidu.com/place/v2/search?query=小區'
                 r'&bounds={},{},{},{}&page_size=20&output=json&ak={}')
    # Reverse-geocoding endpoint; the callback makes the reply JSONP-wrapped.
    GEOCODER_URL = (r'http://api.map.baidu.com/geocoder/v2/?callback=renderReverse'
                    r'&location={}&output=json&pois=1&ak={}')

    # Keys used when no ``ak`` is passed to the constructor.
    DEFAULT_PLACE_AK = '你的ak'
    # NOTE(review): hard-coded credential kept only for backward
    # compatibility; pass ``ak=`` explicitly and rotate this key.
    DEFAULT_GEOCODER_AK = 'c9nNGFV74RjSG70xIXdVLVxWPizCqXdw'

    # Fallback stored when the geocoder gives no district / business area.
    UNKNOWN = u'其他'

    def __init__(self, location, city, into_db, ak=None):
        """Remember what to crawl and where to store it.

        Args:
            location: bounding box as
                [north latitude, west longitude, south latitude, east longitude].
            city: city name written to the ``city`` column and used to look up
                already-stored communities.
            into_db: MySQL settings as (host, user, password, charset).
            ak: optional Baidu API key used for both endpoints; when omitted
                the built-in defaults are used.
        """
        self.location = location
        self.city = city
        # Column names of house.community_info, in insert order.
        self.ziduan = ['name', 'address', 'city', 'district', 'scope', 'crawler_time', 'location']
        # Per-column quote characters; no longer needed now that inserts are
        # parameterized, kept so existing readers of the attribute still work.
        self.seq = ['"', '"', '"', '"', '"', '', '"']
        self.into_db = into_db
        self.place_ak = self.DEFAULT_PLACE_AK if ak is None else ak
        self.geocoder_ak = self.DEFAULT_GEOCODER_AK if ak is None else ak

    def do(self):
        """Crawl the whole bounding box, then insert the new communities.

        Prints the number of crawled records. The database is only contacted
        after crawling and only when something was found, so no connection is
        held open (and possibly timed out) during the long crawl.
        """
        records = self._crawl()
        print(len(records))
        if not records:
            return
        self._store(records)

    def _grid_cells(self):
        """Return the (lat, lng) south-west corners of all 0.01-degree cells."""
        north, west, south, east = self.location[:4]
        lat_steps = int((north - south) * 100)
        lng_steps = int((east - west) * 100)
        lats = [step / 100.0 + south for step in range(lat_steps)]
        lngs = [step / 100.0 + west for step in range(lng_steps)]
        return [(lat, lng) for lat in lats for lng in lngs]

    @staticmethod
    def _fetch(url):
        """Download *url* and return the raw body, always closing the response."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    @staticmethod
    def _strip_jsonp(text):
        """Return the JSON payload of *text*, removing a ``name(...)`` wrapper if present."""
        body = text.strip()
        if body.startswith('{'):
            return body
        start = body.find('(')
        end = body.rfind(')')
        if start == -1 or end <= start:
            return body
        return body[start + 1:end]

    def _search_cell(self, lat, lng):
        """Return the Place API results for the cell whose south-west corner is (lat, lng)."""
        url = self.PLACE_URL.format(lat, lng, lat + 0.01, lng + 0.01, self.place_ak)
        raw = self._fetch(url)
        print(raw)
        # Error replies carry no 'results' key; treat them as an empty cell.
        return json.loads(raw).get('results') or []

    def _reverse_geocode(self, lat_lng):
        """Return (scope, district) for a "lat,lng" string, defaulting each to UNKNOWN."""
        raw = self._fetch(self.GEOCODER_URL.format(lat_lng, self.geocoder_ak))
        result = json.loads(self._strip_jsonp(raw)).get('result') or {}
        # 'business' is a comma-separated list; only the first entry is kept.
        scope = (result.get('business') or u'').split(',')[0] or self.UNKNOWN
        district = (result.get('addressComponent') or {}).get('district') or self.UNKNOWN
        return scope, district

    def _build_record(self, poi):
        """Turn one Place API result into a row dict keyed by the column names."""
        try:
            point = poi['location']
            lng_lat = str(point['lng']) + ',' + str(point['lat'])
        except KeyError:
            lng_lat = None

        if lng_lat is None:
            # No coordinates: nothing meaningful to reverse-geocode.
            lng_lat = '0.0,0.0'
            scope, district = self.UNKNOWN, self.UNKNOWN
        else:
            # The stored value is "lng,lat" but the geocoder expects "lat,lng".
            scope, district = self._reverse_geocode(','.join(lng_lat.split(',')[::-1]))

        return {
            'name': poi['name'],
            'address': poi.get('address') or u'',
            'city': self.city,
            'district': district,
            'scope': scope,
            'crawler_time': int(time.time()),
            'location': lng_lat,
        }

    def _crawl(self):
        """Query every grid cell and return the list of row dicts found."""
        crawled = []
        for lat, lng in self._grid_cells():
            for poi in self._search_cell(lat, lng):
                if not poi.get('name'):
                    continue  # a community without a name cannot be deduplicated
                crawled.append(self._build_record(poi))
        return crawled

    def _store(self, records):
        """Insert the records whose name is not yet stored for this city."""
        host, user, passwd, charset = self.into_db[:4]
        cnxn = MySQLdb.connect(host=host, user=user, passwd=passwd, charset=charset)
        try:
            cursor = cnxn.cursor()
            # Values are passed as parameters so quotes in names or addresses
            # can neither break nor inject into the statement.
            cursor.execute("select name from house.community_info where city = %s", (self.city,))
            known_names = set(row[0] for row in cursor.fetchall())
            insert_sql = "insert into house.community_info ({}) values ({})".format(
                ",".join(self.ziduan), ",".join(["%s"] * len(self.ziduan)))
            for record in records:
                if record['name'] in known_names:
                    continue
                cursor.execute(insert_sql, [record[column] for column in self.ziduan])
                cnxn.commit()
                # Neighbouring cells can return the same community; remember it
                # so it is inserted only once per run.
                known_names.add(record['name'])
        finally:
            cnxn.close()
如有好的建議或提醒,請在下方留言,我一定會虛心採納。