程式人生 > Python爬蟲系列之郵編區號爬取

Python爬蟲系列之郵編區號爬取

Python爬蟲之<—>全國郵編區號爬取

  • 僅供交流探討
  • 歡迎提出改進

程式碼部分

import re
import requests
import time
import MySQLdb
'''
	@author:王磊
	@time  :2018/11/8 21:15:05
'''

# Keep the connection bound to a module-level name instead of creating it
# inline. Some MySQLdb/mysqlclient releases have cursors hold only a weak
# reference to their connection, so an unnamed connection can be
# garbage-collected and leave the cursor unusable; the handle is also what
# is needed to call commit()/close() explicitly.
# NOTE(review): credentials are hard-coded here -- consider loading them from
# configuration or the environment.
connection = MySQLdb.connect(user='root', password='root', database='python', charset='utf8')
cursor = connection.cursor()


# Prefix that turns the site-relative province links into absolute URLs.
_SITE_ROOT = "http://www.ip138.com"

# Link cells on the index page; group 1 is the href.
_PROVINCE_LINK_RE = re.compile(r'<td><a href="(.*?)" target="_blank">.*?</a></td>')

# Row whose first cell is a bold link: (name, zip code, area code).
_CITY_ROW_RE = re.compile(
    r'<tr bgcolor="#ffffff"><td><a href=".*?"><b>(.*?)</a></b></td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>'
)

# Any row with a plain first cell followed by two link cells.
_ANY_ROW_RE = re.compile(r'<td>(.*?)</td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')

# Same as above, but the first character of the first cell is restricted.
# NOTE(review): `[^<a|^&nbsp;]` is a negated character class (it excludes the
# single characters < a | ^ & n b s p ;), not "does not start with <a or
# &nbsp;". It is left byte-identical so the set of matched rows is unchanged.
_AREA_ROW_RE = re.compile(r'<td>([^<a|^&nbsp;].*?)</td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')


def getHTML(url, timeout=10):
    """Download *url* and return the response body decoded to text.

    The body is decoded with the charset ``requests`` detects from the
    content (``apparent_encoding``); undecodable bytes are dropped.

    :param url: address of the page to fetch.
    :param timeout: seconds to wait for the server, so that a stalled
        connection cannot block the crawl indefinitely.
    :return: the decoded HTML.
    :raises requests.RequestException: on network failure or timeout.
    """
    res = requests.get(url, timeout=timeout)
    # Detection can yield None (e.g. for an empty body); decode() would then
    # raise TypeError, so fall back to UTF-8.
    encoding = res.apparent_encoding or 'utf-8'
    return res.content.decode(encoding, 'ignore')


def getPrivince(html):
    """Extract the absolute province page URLs from the index page HTML.

    Links whose href already contains ``http`` are discarded; the remaining
    site-relative hrefs are prefixed with the site root.

    :param html: HTML text of the index page.
    :return: list of absolute URLs, without the last two matched links.
    """
    # Build a new list rather than popping while iterating: removing items
    # during iteration skips the element that follows each removed one.
    urls = [
        _SITE_ROOT + href
        for href in _PROVINCE_LINK_RE.findall(html)
        if 'http' not in href
    ]
    # The last two matches are dropped, as in the original code.
    # NOTE(review): presumably these are not province pages -- confirm
    # against the live index page.
    return urls[0:-2]


def getCity(html):
    """Extract ``(name, zip code, area code)`` tuples from a province page.

    :param html: HTML text of one province page.
    :return: list of 3-tuples of strings. When bold-link (city) rows exist,
        they come first, followed by the rows matched by the area pattern;
        otherwise every plain row is returned (pages that list districts
        only).
    """
    citys = _CITY_ROW_RE.findall(html)
    if not citys:
        # No city rows: the page lists districts only.
        return _ANY_ROW_RE.findall(html)
    # Page has city rows; append the district rows, if any.
    return citys + _AREA_ROW_RE.findall(html)


def _saveProvince(provinceUrl, f):
    """Fetch one province page and store every row in *f* and the database."""
    try:
        provinceHtml = getHTML(provinceUrl)
    except requests.RequestException as e:
        # Best effort: one unreachable province must not abort the crawl.
        print('skip province %s: %s' % (provinceUrl, e))
        return
    for city in getCity(provinceHtml):
        try:
            f.write("地區:%s ,郵編:%s ,區號:%s \r\n" % city)
            # Let the driver quote the scraped values instead of
            # interpolating them into the SQL text.
            cursor.execute("insert city_zip_code values(%s, %s, %s)", city)
        except Exception as e:
            # Keep going on a bad row, but report it instead of hiding it.
            print('skip row %r: %s' % (city, e))
    try:
        # Only the cursor is available at module level, so the commit is
        # issued as a statement; without it the inserts are not guaranteed
        # to be persisted on transactional tables.
        cursor.execute('commit')
    except Exception as e:
        print('commit failed for %s: %s' % (provinceUrl, e))


def run(outputPath='c:/Users/asus/Desktop/pc/text/zipCode.txt'):
    """Crawl every province page and record each region's zip and area code.

    Each row is appended to *outputPath* and inserted into the
    ``city_zip_code`` table through the module-level ``cursor``.

    :param outputPath: text file the rows are appended to.
    :raises requests.RequestException: if the index page cannot be fetched.
    """
    urlIndex = 'http://www.ip138.com/post/'
    provinceUrls = getPrivince(getHTML(urlIndex))
    # utf-8 so the Chinese text is written the same on every platform;
    # newline='' so the explicit "\r\n" is not translated to "\r\r\n" on
    # Windows.
    with open(outputPath, 'a', encoding='utf-8', newline='') as f:
        for provinceUrl in provinceUrls:
            _saveProvince(provinceUrl, f)
            # Throttle requests to the site.
            time.sleep(2)


if __name__ == '__main__':
    run()

☞點選這裡與我探討☚

♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪
♪♪後續會更新系列基於Python的爬蟲小例子,歡迎關注。♪♪
♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪