python之爬蟲的入門05------實戰:爬取貝殼網(用re匹配需要的資料)
阿新 • • 發佈:2018-11-13
# Scraper for rental listings on hz.zu.ke.com, using only urllib and re.
# Page 1: https://hz.zu.ke.com/zufang/pg1
# Page 2: https://hz.zu.ke.com/zufang/pg2
import urllib.request
import random
import re

# Candidate proxy addresses ('host:port'); one is picked at random per call.
_PROXY_POOL = ['117.191.11.109:8542', '186.46.192.110:8177', '39.137.2.214:8882']

_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}

# All patterns tolerate arbitrary whitespace (including newlines) between
# tags, so they do not depend on the page's exact indentation.
_IMG_RE = re.compile(r'data-src="(.*?\.jpg)"')

_NAME_RE = re.compile(
    r'<p class="content__list--item--title twoline">\s*'
    r'<a target="_blank" href=".*?">\s*(.*?)\s*</a>\s*</p>'
)

_DES_OPEN = r'<p class="content__list--item--des">\s*'
_SEP = r'\s*<i>/</i>\s*'
# The last field has no closing tag of its own: stop at the first
# whitespace, the next tag, or the end of the input.
_LAST_FIELD = r'(.*?)(?=\s|<|$)'
_DETAILS_RE = re.compile('|'.join([
    # Variant 1: two linked names joined by '-', then three '/' fields.
    _DES_OPEN
    + r'<a target="_blank" href=".*?">(.*?)</a>-<a href=".*?" target="_blank">(.*?)</a>'
    + _SEP + r'(.*?)' + _SEP + r'(.*?)' + _SEP + _LAST_FIELD,
    # Variant 2: a "room__left" span, then three '/' fields.
    _DES_OPEN
    + r'<span class="room__left">(.*?)</span>'
    + _SEP + r'(.*?)' + _SEP + r'(.*?)' + _SEP + _LAST_FIELD,
    # Variant 3: three '/' separated fields only.
    _DES_OPEN + r'(.*?)' + _SEP + r'(.*?)' + _SEP + _LAST_FIELD,
]))

_TIME_RE = re.compile(r'<p class="content__list--item--time oneline">(.*?)</p>')

_PRICE_RE = re.compile(
    r'<span class="content__list--item-price"><em>(.*?)</em>(.*?)</span>'
)

_TAG_BLOCK_RE = re.compile(
    r'<p class="content__list--item--bottom oneline">(.*?)</p>', re.DOTALL
)
_TAG_RE = re.compile(r'<i class="content__item__tag--.*?">(.*?)</i>')


def user_ip():
    """Install a global urllib opener that uses a randomly chosen proxy."""
    # NOTE(review): the proxy is registered for the 'http' scheme only, while
    # the scraped URLs are 'https'. Confirm whether requests are actually
    # proxied; adding an 'https' entry would change network behaviour.
    proxy_support = urllib.request.ProxyHandler({'http': random.choice(_PROXY_POOL)})
    opener = urllib.request.build_opener(proxy_support)
    # Replace the default opener used by urllib.request.urlopen.
    urllib.request.install_opener(opener)


def create_request(url, headers):
    """Build a urllib Request for *url* carrying the given *headers*."""
    return urllib.request.Request(url=url, headers=headers)


def get_response(req):
    """Send *req* and return the response object from urlopen."""
    return urllib.request.urlopen(req)


def get_html(response):
    """Read the whole *response* body and decode it as UTF-8 text."""
    return response.read().decode('utf-8')


def get_home_img(html):
    """Return every ``data-src`` URL in *html* that ends in ``.jpg``."""
    return _IMG_RE.findall(html)


def get_home_name(html):
    """Return the listing titles, stripped of surrounding whitespace."""
    return _NAME_RE.findall(html)


def chu_kongji(list4):
    """Drop empty strings from each inner sequence.

    ``re.findall`` on a pattern with alternatives yields one tuple per match
    with ``''`` for every group of the alternatives that did not take part;
    this keeps only the groups that matched something.

    Returns a list of lists, one per element of *list4*.
    """
    return [[item for item in row if item != ''] for row in list4]


def get_home_details(html):
    """Return the description fields of each listing as a list of strings.

    Three page layouts are recognised (see ``_DETAILS_RE``); depending on the
    layout a listing yields five, four or three fields.
    """
    return chu_kongji(_DETAILS_RE.findall(html))


def get_home_time(html):
    """Return the publication-time text of each listing.

    Matches e.g. ``<p class="content__list--item--time oneline">...</p>``.
    """
    return _TIME_RE.findall(html)


def get_home_price(html):
    """Return ``(amount, unit_text)`` tuples, one per listing price.

    Matches e.g.
    ``<span class="content__list--item-price"><em>4960</em> ...</span>``.
    """
    return _PRICE_RE.findall(html)


def get_home_biaoqian(html):
    """Return the tag labels of each listing as a list of lists.

    Every ``content__list--item--bottom`` paragraph produces exactly one
    entry, whatever the number of tags inside it (including none), so the
    result stays index-aligned with the other per-listing lists.
    """
    return chu_kongji(
        [_TAG_RE.findall(block) for block in _TAG_BLOCK_RE.findall(html)]
    )


def get_home(home_img, home_name, home_details, home_time, home_price, home_biaoqian):
    """Combine the per-field lists into one dict per listing.

    The lists are paired by position. If they differ in length the result is
    truncated to the shortest one instead of raising ``IndexError``.
    """
    keys = (
        'home_img',
        'home_name',
        'home_details',
        'home_time',
        'home_price',
        'home_biaoqian',
    )
    rows = zip(home_img, home_name, home_details, home_time, home_price, home_biaoqian)
    return [dict(zip(keys, row)) for row in rows]


def save_wenjian(wenjian_name, data):
    """Append each record of *data* to *wenjian_name*, one ``str()`` per line.

    The file is opened once and written as UTF-8 so non-ASCII text is stored
    consistently regardless of the platform's default encoding. Nothing is
    created or touched when *data* is empty.
    """
    lines = [str(record) + '\n' for record in data]
    if not lines:
        return
    with open(wenjian_name, 'a', encoding='utf-8') as f:
        f.writelines(lines)


def main():
    """Prompt for a page range, scrape each page and append the results."""
    start = int(input('請輸入起始頁:'))
    end = int(input('請輸入結束頁:'))
    for page in range(start, end + 1):
        url = 'https://hz.zu.ke.com/zufang/pg%s' % page
        # Pick a (possibly different) proxy for every page.
        user_ip()
        req = create_request(url=url, headers=_HEADERS)
        # Close the HTTP response as soon as the body has been read.
        with get_response(req=req) as response:
            html = get_html(response=response)
        home = get_home(
            get_home_img(html),
            get_home_name(html),
            get_home_details(html),
            get_home_time(html),
            get_home_price(html),
            get_home_biaoqian(html),
        )
        save_wenjian('./home_info.json', home)


if __name__ == '__main__':
    main()