
Python: Fetch All of China's Provinces, Cities, Counties, Towns, and Villages
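The script below crawls the 2016 edition of the National Bureau of Statistics division-code pages level by level: provinces, prefecture-level cities, counties, towns, and villages. Each record is keyed by its statistical division code, and a parent's code is simply a prefix of its children's codes; that is what the slicing (id[0:4], id[0:6], id[0:9], id[0:12]) inside the functions relies on. A quick illustration of the nesting (the sample code value here is made up purely for demonstration):

# Hypothetical 12-digit village-level code, used only to show how the
# prefixes map onto the administrative levels handled by the script below.
code = '110101001001'
print(code[0:2])    # province / municipality  (2 digits)
print(code[0:4])    # prefecture-level city    (4 digits)
print(code[0:6])    # county / district        (6 digits)
print(code[0:9])    # town / street            (9 digits)
print(code[0:12])   # village / committee      (12 digits)

The full script: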

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=He

"""
Build the full list of Chinese administrative divisions
from National Bureau of Statistics data
"""
import sys
import os
import re
from urllib import request
from bs4 import BeautifulSoup
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/'
header = {
    'Cookie': 'AD_RS_COOKIE=20080917',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36'
}


class GetHttp:
    """Small urllib wrapper that swallows request/decode errors."""

    def __init__(self, url, headers=None, charset='utf8'):
        if headers is None:
            headers = {}
        self._response = ''
        try:
            print(url)
            self._response = request.urlopen(request.Request(url=url, headers=headers))
        except Exception as e:
            print(e)
        self._c = charset

    @property
    def text(self):
        # Decoded page body, or '' if the request or the decoding failed
        try:
            return self._response.read().decode(self._c)
        except Exception as e:
            print(e)
            return ''


def provincetr(u, he, lists):
    # Provinces and municipalities (top level, 2-digit codes)
    t = GetHttp(u, he, 'gbk').text
    if t:
        soup = BeautifulSoup(t, 'html.parser')
        for i in soup.find_all(attrs={'class': 'provincetr'}):
            for a in i.find_all('a'):
                id = re.sub(r"\D", "", a.get('href'))
                lists[id] = {'id': id, 'name': a.text, 'pid': '0', 'pid1': '0',
                             'pid2': '0', 'pid3': '0', 'pid4': '0', 'code': id}
                # time.sleep(1 / 10)
    return lists


def citytr(u, he, lists):
    # Prefecture-level cities under each province (4-digit codes)
    l = lists.copy()
    for i in l:
        t = GetHttp(u + i + '.html', he, 'gbk').text
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'citytr'}):
            id = str(v.find_all('td')[0].text)
            if id[0:4] not in lists.keys():
                lists[id[0:4]] = {'id': id[0:4], 'name': str(v.find_all('td')[1].text),
                                  'pid': '0', 'pid1': i, 'pid2': '0', 'pid3': '0',
                                  'pid4': '0', 'code': id}
    return lists


def countytr(u, he, lists):
    # Counties and districts under each city (6-digit codes)
    l = lists.copy()
    for i in l:
        t = GetHttp(u + i[0:2] + '/' + i + '.html', he, 'gbk').text
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'countytr'}):
            id = str(v.find_all('td')[0].text)
            if id[0:6] not in lists.keys():
                lists[id[0:6]] = {'id': id[0:6], 'name': str(v.find_all('td')[1].text),
                                  'pid': '0', 'pid1': l[i]['pid1'], 'pid2': i,
                                  'pid3': '0', 'pid4': '0', 'code': id}
    return lists


def towntr(u, he, lists):
    # Towns and streets under each county (9-digit codes)
    l = lists.copy()
    for i in l:
        t = GetHttp(u + i[0:2] + '/' + i[2:4] + '/' + i + '.html', he, 'gbk').text
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'towntr'}):
            id = str(v.find_all('td')[0].text)
            if id[0:9] not in lists.keys():
                lists[id[0:9]] = {'id': id[0:9], 'name': str(v.find_all('td')[1].text),
                                  'pid': '0', 'pid1': l[i]['pid1'], 'pid2': l[i]['pid2'],
                                  'pid3': i, 'pid4': '0', 'code': id}
    return lists


def villagetr(u, he, lists):
    # Villages and committees under each town (12-digit codes)
    l = lists.copy()
    for i in l:
        t = GetHttp(u + i[0:2] + '/' + i[2:4] + '/' + i[4:6] + '/' + i + '.html', he, 'gbk').text
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'villagetr'}):
            id = str(v.find_all('td')[0].text)
            if id[0:12] not in lists.keys():
                lists[id[0:12]] = {'id': id[0:12], 'name': str(v.find_all('td')[1].text),
                                   'pid': '0', 'pid1': l[i]['pid1'], 'pid2': l[i]['pid2'],
                                   'pid3': l[i]['pid3'], 'pid4': i, 'code': id}
    return lists


p = provincetr(u=url, he=header, lists={})
print('省')   # provinces done
c = citytr(u=url, he=header, lists=p)
print('市')   # cities done
o = countytr(u=url, he=header, lists=c)
print('縣')   # counties done
t = towntr(u=url, he=header, lists=o)
print('鎮')   # towns done
v = villagetr(u=url, he=header, lists=t)
print('村')   # villages done
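
The crawl issues one HTTP request per page, so a full run takes a long time. It is therefore worth persisting the merged result instead of re-crawling. A minimal sketch, assuming the script above has completed and v holds the final dictionary; the filename regions.json is an arbitrary choice:

# Save the collected divisions to disk so the crawl does not have to be
# repeated (sketch only; regions.json is an arbitrary filename).
import json

with open('regions.json', 'w', encoding='utf8') as f:
    json.dump(v, f, ensure_ascii=False, indent=2)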