1. 程式人生 > >python爬取自如房間資訊(一)

python爬取自如房間資訊(一)

使用python和selenium+Chrome Headless爬取自如房間資訊,並將結果儲存在MongoDB中。其中最麻煩的應該是每間房的價格,因為自如是用一張圖片和offset來顯示價格,所以不能直接獲得。但我們可以通過將圖片轉為文字,再通過偏移量將數字組合為價格。

在這裡我們使用的是Chrome Headless而不是PhantomJS,主要是因為前者不需要設定視窗大小(size),同時也更加穩定,而PhantomJS總是會獲取到跟原本元素標籤不一樣的值。該例子主要爬取的是深圳市南山區,地鐵2號線,型別為友家合租的房間資訊。

在爬取資訊時,如果能用逆向解析就最好用逆向解析,因為相比於用動態解析,前者更加的穩定。我們點進自如的網站,會發現每個連結不同的地方在地鐵站的名字,所以在爬取時在固定連結後加上地鐵站名,就可以訪問不同地鐵站附近的房源。

其次是翻頁,我們也能發現同樣的規律。除了首頁,其他頁數都是跟在同樣的連結後面。我們可以利用這個規律來進行翻頁操作。

 

下面是具體的程式碼。價格爬取的部分(也就是getNumbers檔案)可以在「python爬取自如房間資訊(二)」中看到。

# -*- coding:utf-8 -*-
import time
import re
import getNumbers
import pymongo
from config import *
from urllib.request import urlretrieve
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Module-level MongoDB handles shared by the functions below.
# MONGO_URL / MONGO_DB are presumably supplied by `from config import *`.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Configure Chrome to run headless before the driver is created.
option = webdriver.ChromeOptions()
# NOTE(review): the value 2 is presumably meant to block image loading; the
# commonly documented preference key is
# 'profile.managed_default_content_settings.images' -- confirm that this
# spelling actually takes effect.
prefs = {'profile.managed.default_content_setting.images':2}
option.add_argument("headless")
option.add_experimental_option('prefs', prefs)
# NOTE(review): the chromedriver path is specific to one machine and must be
# adjusted (or made configurable) before running elsewhere.
browser = webdriver.Chrome('/Users/zhangxin/Downloads/chromedriver', chrome_options=option)

# Shared explicit wait bound to the browser above, with a timeout of 10.
wait = WebDriverWait(browser, 10)

def get_picture(stationname, image_url, page):
    """Download an image into ``./data/`` and return its file name.

    :param stationname: station name, used as the file-name prefix
    :param image_url: URL of the image to download
    :param page: page number, appended to the station name
    :return: the bare file name (without the ``./data/`` prefix)
    """
    import os  # local import so the block does not rely on a file-level one

    picture_name = stationname + str(page) + 'ziroom.png'
    # urlretrieve does not create missing directories, so make sure the
    # target directory exists before downloading.
    os.makedirs('./data', exist_ok=True)
    urlretrieve(url=image_url, filename='./data/'+picture_name)
    return picture_name

def get_file_content(file_path):
    """Return the raw bytes of a file stored under ``./data/``.

    :param file_path: file name relative to the ``./data/`` directory
    :return: the complete file content as ``bytes``
    """
    full_path = "./data/" + file_path
    with open(full_path, 'rb') as image_file:
        payload = image_file.read()
    return payload


def get_price(stationname, page = 1):
    """Fetch and decode the price image referenced by the current page.

    The page source is searched for a ``var ROOM_PRICE = ...;`` assignment.
    Its value is parsed as a literal with ``image`` and ``offset`` entries.
    The image is downloaded through ``get_picture`` and handed to
    ``getNumbers.getNum``.

    :param stationname: station name, used to name the downloaded image
    :param page: page number, used to name the downloaded image
    :return: ``(numbers, imageOffset)`` -- the result of ``getNumbers.getNum``
        and the ``offset`` entry of ``ROOM_PRICE``
    :raises ValueError: if the page contains no ``ROOM_PRICE`` assignment or
        its value is not a plain literal
    """
    import ast  # local import so the block does not rely on a file-level one

    html = browser.page_source
    match = re.search(r'var ROOM_PRICE = (.*?);', html)
    if match is None:
        raise ValueError('ROOM_PRICE was not found in the page source')
    # The text comes from a remote page, so it must never be executed:
    # literal_eval accepts only literals (dicts, lists, strings, numbers).
    priceImage = ast.literal_eval(match.group(1))
    imageUrl = 'http:' + priceImage['image']
    imageOffset = priceImage['offset']
    imageFile = get_picture(stationname, imageUrl, page)
    print(imageFile)
    numbers = getNumbers.getNum("./data/"+imageFile)
    print(imageOffset)
    print(numbers)
    return numbers, imageOffset


def get_rooms_info(newHandle, price):
    """Scrape one room's detail page and store the record in MongoDB.

    :param newHandle: window handle of the room's detail page
    :param price: price string already decoded by the caller
    :return: rooms: the information of the room (also printed and passed to
        ``save_to_mongo``)
    """
    browser.switch_to.window(newHandle)
    # Pause BEFORE reading the DOM; sleeping after page_source has been
    # captured cannot give the page any extra time to render.
    time.sleep(2)
    html = browser.page_source
    # The html carries an xmlns attribute; strip it before parsing.
    html = html.replace('xmlns="http://www.w3.org/1999/xhtml"','')
    print('getting the room information')

    doc = pq(html)
    detail = doc('.room_detail_right .detail_room')
    # Remove the <b></b> elements: they interfere with reading .text from the
    # children, though not with reading text directly.
    detail('b').remove()
    new_detail = detail('ul').children()
    imagesUrl = [
        item.attr('src')
        for item in doc('.lof-navigator-outer ul li .lidiv img').items()
    ]
    rooms = {
            'roomname': doc('.room_name h2').text(),
            'images': imagesUrl,
            'price': price,
            # Raw strings keep \d and \s from being invalid string escapes.
            'size': re.search(r'(\d+.*?)', new_detail[0].text).group(1),
            'direction': new_detail[1].text,
            'structure': re.sub(r'\s','',new_detail[2].text),
            'floor': new_detail[3].text,
            'traffic': doc('.room_detail_right .detail_room .last').text().replace('\n', ' ')
        }
    print(rooms)
    save_to_mongo(rooms)
    return rooms

    ## 獲取總的房間
def get_rooms(stationname, page = 1):
    """Visit every room linked from the listing page currently loaded.

    Waits for the ``houseList`` element, decodes the page's price image via
    ``get_price``, then clicks each room link and scrapes the window that
    opens with ``get_rooms_info``.

    :param stationname: station name, forwarded to ``get_price``
    :param page: page number, forwarded to ``get_price``
    """
    wait.until(
        EC.presence_of_element_located((By.ID, "houseList"))
    )
    links = browser.find_elements_by_css_selector("#houseList .clearfix .txt h3 a")
    numbers, offsets = get_price(stationname, page)
    # Handle of the listing window; focus is returned here after every room.
    handle = browser.current_window_handle
    for count, link in enumerate(links):
        print(link.get_attribute('href'))
        # Build the room's price: each offset entry selects one digit.
        price = "".join(str(numbers[9 - digit]) for digit in offsets[count])
        # Open the room's detail window.
        link.click()
        time.sleep(2)
        try:
            for newHandle in browser.window_handles:
                if newHandle == handle:
                    continue
                # Focus the new window first, so the close() below can only
                # ever close that window and never the listing window.
                browser.switch_to.window(newHandle)
                try:
                    get_rooms_info(newHandle, price)
                finally:
                    # Close the detail window even when scraping failed, so
                    # windows do not pile up across rooms.
                    browser.close()
        finally:
            # Always return to the listing window.
            browser.switch_to.window(handle)


### 翻頁操作
def next_page(stationname, stationUrl, page = 1, max_retries = 3):
    """Load one result page of a station and scrape its rooms.

    :param stationname: station name, forwarded to ``get_rooms``
    :param stationUrl: URL of the station's first listing page
    :param page: page number, appended as the ``?p=`` query parameter
    :param max_retries: how many times to attempt the page before giving up
        (at least one attempt is always made)
    :raises Exception: the error of the final attempt, once every attempt
        has failed
    """
    page_url = stationUrl + '?p=' + str(page)
    attempts = max(1, max_retries)
    # A bounded loop replaces unbounded recursion, which would never stop on
    # a persistent failure until the interpreter's recursion limit was hit.
    for attempt in range(1, attempts + 1):
        try:
            browser.get(page_url)
            print(page_url)
            print("page:",page)
            get_rooms(stationname, page)
            return
        except Exception as exc:
            print('page', page, 'attempt', attempt, 'failed:', exc)
            if attempt == attempts:
                raise

def get_page(i):
    """Open the first listing page of a station and scrape it.

    :param i: station name; it is appended to ``STATION`` to build the URL
    :return: ``(pages, stationUrl)`` -- the page-count text captured from the
        pagination markup (a string) and the station's first-page URL
    :raises ValueError: if the pagination markup does not match
    """
    # Build the URL of the selected station.
    stationUrl = STATION + i + ".html"
    print(stationUrl)
    browser.get(stationUrl)
    # Wait for the pagination element before reading the total page count.
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#page'))
    )
    html = browser.page_source
    # Raw string keeps \w from being an invalid string escape.
    match = re.search(r'class="next".*?<span>\w(.*?)\w</span>', html)
    if match is None:
        raise ValueError('total page count not found for ' + stationUrl)
    pages = match.group(1)
    print('total pages:', pages)
    get_rooms(i)
    return pages, stationUrl

def save_to_mongo(result):
    """Insert one record into the ``MONGO_TABLE`` collection.

    Failures are reported on stdout rather than raised, so one bad record
    does not stop the crawl.

    :param result: the document (dict) to store
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated and no
        # longer available in PyMongo 4.
        outcome = db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Include the error so a failed save can be diagnosed.
        print('Fail', exc)
    else:
        if outcome.inserted_id:
            print('Save  Success!!!')


if __name__ == "__main__":
    try:
        for station in STATIONLIST:
            # get_page scrapes page 1 and reports the total page count.
            pages, stationUrl = get_page(station)
            for page in range(2, int(pages) + 1):
                # Argument order must match
                # next_page(stationname, stationUrl, page).
                next_page(station, stationUrl, page)
    finally:
        # Release the Chrome process and the MongoDB connection even when
        # the crawl aborts with an error.
        browser.quit()
        client.close()

配置檔案程式碼:

# MongoDB connection settings: host, database name and collection name.
MONGO_URL = 'localhost'
MONGO_DB = 'ziroom'
MONGO_TABLE = 'rooms'

# NOTE(review): not referenced by the scraper code shown alongside this
# config; possibly left over from an earlier setup -- confirm before removing.
SERVICE_ARGS = ['--disk-cache=true']

# URL prefix of a station listing; the scraper appends a station name and
# ".html" to it.
STATION = "http://sz.ziroom.com/z/nl/z2-s2號線%28蛇口線%29-t"

# Station names to crawl, in crawl order.
STATIONLIST = [
    "海上世界",
    "水灣",
    "東角頭",
    "灣廈",
    "海月",
    "登良",
    "後海",
    "科苑",
]