# 程式人生 > python之爬蟲的入門05------實戰:爬取貝殼網(用re匹配需要的資料)
#
# python之爬蟲的入門05------實戰:爬取貝殼網(用re匹配需要的資料)

# 第二頁:https://hz.zu.ke.com/zufang/pg2
# 第一頁:https://hz.zu.ke.com/zufang/pg1


import urllib.request
import random
import re

def user_ip():
    '''Install a global urllib opener that uses a randomly chosen proxy.

    One 'host:port' entry is picked from a hard-coded list and registered
    for the ``http`` scheme. The resulting opener replaces urllib's default
    one, so later ``urllib.request.urlopen`` calls go through it.

    NOTE(review): only the 'http' scheme is mapped here, so https:// URLs
    presumably bypass the proxy -- confirm whether that is intended.
    '''
    # Candidate proxies, each written as 'host:port'.
    proxy_pool = ['117.191.11.109:8542','186.46.192.110:8177', '39.137.2.214:8882']
    chosen_proxy = random.choice(proxy_pool)

    # ProxyHandler expects a {'scheme': 'host:port'} mapping.
    handler = urllib.request.ProxyHandler({'http': chosen_proxy})

    # Build a custom opener around the handler and make it the default.
    urllib.request.install_opener(urllib.request.build_opener(handler))

def create_request(url,headers):
    '''Build a ``urllib.request.Request`` for ``url`` carrying ``headers``.

    url: address to request.
    headers: mapping of HTTP header names to values.
    Returns the un-sent Request object.
    '''
    return urllib.request.Request(url=url, headers=headers)

def get_response(req):
    '''Open ``req`` with ``urllib.request.urlopen`` and return the response.'''
    return urllib.request.urlopen(req)

def get_html(response):
    '''Read the whole body of ``response`` and return it decoded as UTF-8.

    response: object exposing ``read()`` and ``close()`` (for example the
        value returned by ``urllib.request.urlopen``).
    Returns the decoded page text.
    Raises UnicodeDecodeError if the body is not valid UTF-8.

    The response is closed as soon as its body has been read, even when
    reading fails, so the underlying connection is not left open.
    '''
    try:
        raw_body = response.read()
    finally:
        # The body is consumed only once, so release the connection here.
        response.close()
    return raw_body.decode('utf-8')

def get_home_img(html):
    '''Return the ``.jpg`` image addresses found in ``data-src`` attributes.

    The pattern captures everything between ``data-src="`` and the next
    ``.jpg"``; the extension is then appended again to rebuild the address.
    '''
    stems = re.findall(r'''data-src\=\"(.*?)\.jpg\"''', html)
    return [stem + '.jpg' for stem in stems]

def get_home_name(html):
    '''Return the listing titles found in ``html``.

    A title is the text inside the link of a
    ``content__list--item--title twoline`` paragraph.
    '''
    # The line breaks and runs of spaces below are literal parts of the
    # pattern: it only matches markup laid out with exactly this whitespace.
    title_pattern = re.compile(r'''<p class="content__list--item--title twoline">
                  <a target="_blank" href=".*">
                    (.*?)                  </a>
                </p>''')
    return title_pattern.findall(html)

def chu_kongji(list4):
    '''Return a copy of ``list4`` with empty strings removed from each row.

    list4: iterable of rows (for example the tuples produced by
        ``re.findall`` for a pattern with several groups).
    Returns a list with one list per input row, holding that row's
    entries in order, minus every entry equal to ``''``.
    '''
    return [[cell for cell in row if cell != ''] for row in list4]

def get_home_details(html):
    '''Return the detail fields of each listing found in ``html``.

    Three layouts of the ``content__list--item--des`` paragraph are
    recognised and combined into one alternation. ``re.findall`` returns one
    tuple per match with ``''`` for the groups of the alternatives that did
    not match; ``chu_kongji`` strips those, so each result row holds only
    the fields that were actually captured.

    The line breaks and runs of spaces inside the patterns are literal: the
    markup must be laid out with exactly this whitespace to match.
    '''
    # Layout 1: two links joined by '-', then three '/'-separated fields.
    linked_layout = r'''<p class="content__list--item--des">
                  <a target="_blank" href=".*">(.*?)</a>-<a href=".*" target="_blank">(.*?)</a>
                  <i>/</i>
                  (.*?)
                  <i>/</i>(.*?)                  <i>/</i>
                    (.*?)                  '''
    # Layout 2: a "room__left" span, then three '/'-separated fields.
    room_left_layout = r'''<p class="content__list--item--des">
                                      <span class="room__left">(.*?)</span>
                    <i>/</i>
                                    (.*?)
                  <i>/</i>(.*?)                  <i>/</i>
                    (.*?)                '''
    # Layout 3: three '/'-separated fields only.
    plain_layout = r'''<p class="content__list--item--des">
                                    (.*?)
                  <i>/</i>(.*?)                  <i>/</i>
                    (.*?)                '''

    combined = '|'.join((linked_layout, room_left_layout, plain_layout))
    return chu_kongji(re.findall(combined, html))

def get_home_time(html):
    '''Return the text of every ``content__list--item--time oneline`` paragraph.

    Example of the markup being matched:
    <p class="content__list--item--time oneline">1天前釋出</p>
    '''
    time_pattern = re.compile(r'''<p class="content__list--item--time oneline">(.*?)</p>''')
    return [match.group(1) for match in time_pattern.finditer(html)]

def get_home_price(html):
    '''Return ``(amount, unit)`` tuples for every price span in ``html``.

    ``amount`` is the text inside ``<em>``; ``unit`` is whatever follows it
    up to the closing ``</span>``. Example of the markup being matched:
    <span class="content__list--item-price"><em>4960</em> 元/月</span>
    '''
    price_pattern = re.compile(r'''<span class="content__list--item-price"><em>(.*?)</em>(.*?)</span>''')
    return price_pattern.findall(html)

def get_home_biaoqian(html):
    '''Return the tag labels of each listing found in ``html``.

    Every ``<p class="content__list--item--bottom oneline">`` paragraph
    yields one list holding the text of its ``content__item__tag--*``
    ``<i>`` elements, in document order. Empty labels are dropped.

    Extraction is done in two steps (paragraph first, then the tags inside
    it), so any number of tags and any whitespace between them is accepted.
    A paragraph without tags yields an empty list, which keeps the result
    aligned one-to-one with the listings on the page.
    '''
    # Step 1: isolate each tag paragraph; DOTALL lets it span several lines.
    paragraphs = re.findall(
        r'<p class="content__list--item--bottom oneline">(.*?)</p>',
        html,
        re.S,
    )

    # Step 2: pull the label out of every tag element in a paragraph.
    # No DOTALL here, so a label never runs across a line break.
    tag_pattern = re.compile(r'<i class="content__item__tag--[^>]*">(.*?)</i>')
    return [
        [label for label in tag_pattern.findall(paragraph) if label != '']
        for paragraph in paragraphs
    ]

def get_home(home_img,home_name,home_details,home_time,home_price,home_biaoqian):
    '''Combine the per-field lists into one dict per listing.

    The six arguments are parallel sequences: position ``k`` of each one
    describes the same listing. Each returned dict has the keys
    'home_img', 'home_name', 'home_details', 'home_time', 'home_price'
    and 'home_biaoqian'.

    Records are built only while every sequence still has an entry, so
    sequences of unequal length produce as many records as the shortest
    one instead of raising IndexError.
    '''
    return [
        {
            'home_img': img,
            'home_name': name,
            'home_details': details,
            'home_time': posted,
            'home_price': price,
            'home_biaoqian': tags,
        }
        for img, name, details, posted, price, tags in zip(
            home_img, home_name, home_details, home_time, home_price, home_biaoqian
        )
    ]

def save_wenjian(wenjian_name,data):
    '''Append every item of ``data`` to the file ``wenjian_name``.

    wenjian_name: path of the output file; it is opened in append mode.
    data: iterable of records; each is written as ``str(record)`` followed
        by a newline.

    The file is opened once for the whole batch and always written as
    UTF-8, so non-ASCII text does not depend on the platform's default
    encoding. When ``data`` is empty the file is not opened or created.
    '''
    lines = [str(record) + '\n' for record in data]
    if not lines:
        # Nothing to write: leave the file untouched.
        return
    with open(wenjian_name, 'a', encoding='utf-8') as f:
        f.writelines(lines)
if __name__ == '__main__':
    # Ask for the inclusive range of result pages to crawl.
    start = int(input('請輸入起始頁:'))
    end = int(input('請輸入結束頁:'))

    # Browser-like User-Agent sent with every request.
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36', }

    for i in range(start,end+1):
        url = 'https://hz.zu.ke.com/zufang/pg%s'%i

        # Install a proxy opener (picked at random) before each page.
        user_ip()

        # Build the request, fetch the page and decode it to text.
        req = create_request(url=url,headers=headers)
        html = get_html(response=get_response(req=req))

        # Extract each field list from the page and merge them into one
        # record per listing. The arguments are evaluated left to right, so
        # the extractors run in this order.
        home = get_home(
            get_home_img(html),
            get_home_name(html),
            get_home_details(html),
            get_home_time(html),
            get_home_price(html),
            get_home_biaoqian(html),
        )

        # Append this page's records to the output file.
        save_wenjian('./home_info.json',home)