python爬取自如房間資訊(一)
阿新 • • 發佈:2018-11-09
使用python和selenium+Chrome Headless爬取自如房間資訊,並將結果儲存在MongoDB中。其中最麻煩的應該是每間房的價格,因為自如是用一張圖片和offset來顯示價格,所以不能直接獲得。但我們可以通過將圖片轉為文字,再通過偏移量將數字組合為價格。
在這裡我們使用的是Chrome Headless而不是PhantomJS, 主要是因為前者不需要設定size大小,同時也更加穩定。因為PhantomJS總是會獲取到跟原本元素標籤不一樣的值。該例子主要爬取的是深圳市南山區,地鐵2號線,型別為友家合租房間資訊。
在爬取資訊時,如果能用逆向解析就最好用逆向解析,因為相比於動態解析,前者更加穩定。我們點進自如的網站,會發現每個連結不同的地方在於地鐵站的名字,所以在爬取時,只要在固定連結後加上地鐵站名,就可以訪問不同地鐵站附近的房源。
其次是翻頁,我們也能發現同樣的規律。除了首頁,其他頁數都是以「?p=頁碼」的形式跟在同樣的連結後面。我們可以利用這個規律來進行翻頁操作。
下面是具體的程式碼。價格爬取的部分(也就是getNumbers這個檔案)可以在「python爬取自如房間資訊(二)」中看到:
# -*- coding:utf-8 -*-
"""Crawl Ziroom room listings with headless Chrome and store them in MongoDB.

For every station in ``STATIONLIST`` the script opens the station's listing
page, decodes the room prices (the site publishes them as one digit image
plus per-room offsets), opens each room's detail page in turn, and saves the
extracted fields to the ``MONGO_TABLE`` collection.
"""
import os
import re
import time
from urllib.request import urlretrieve

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import getNumbers
from config import *  # noqa: F401,F403 -- MONGO_URL, MONGO_DB, MONGO_TABLE, STATION, STATIONLIST

# Directory where downloaded price images are stored.
DATA_DIR = './data/'

# Compiled once at module level; both are applied to full page sources.
ROOM_PRICE_RE = re.compile(r'var ROOM_PRICE = (.*?);')
TOTAL_PAGES_RE = re.compile(r'class="next".*?<span>\w(.*?)\w</span>')

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

option = webdriver.ChromeOptions()
# NOTE(review): presumably meant to disable image loading (2 = block);
# confirm the preference key is spelled the way Chrome expects.
prefs = {'profile.managed.default_content_setting.images': 2}
option.add_argument("headless")
option.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome('/Users/zhangxin/Downloads/chromedriver', chrome_options=option)
wait = WebDriverWait(browser, 10)


def get_picture(stationname, image_url, page):
    """Download the price image and return its file name inside ``DATA_DIR``.

    :param stationname: station name, used as the file-name prefix
    :param image_url: absolute URL of the price image
    :param page: listing page number, used in the file name
    :return: the bare file name (without the ``DATA_DIR`` prefix)
    """
    picture_name = stationname + str(page) + 'ziroom.png'
    # urlretrieve does not create missing directories.
    os.makedirs(DATA_DIR, exist_ok=True)
    urlretrieve(url=image_url, filename=DATA_DIR + picture_name)
    return picture_name


def get_file_content(file_path):
    """Return the raw bytes of a previously downloaded image in ``DATA_DIR``."""
    with open(DATA_DIR + file_path, 'rb') as fp:
        return fp.read()


def get_price(stationname, page=1):
    """Decode the price data embedded in the current listing page.

    :param stationname: station name, forwarded to :func:`get_picture`
    :param page: listing page number, forwarded to :func:`get_picture`
    :return: ``(numbers, offsets)`` -- the value returned by
        ``getNumbers.getNum`` for the price image, and the ``offset`` entry
        of the page's ``ROOM_PRICE`` object
    :raises ValueError: if the page source has no ``ROOM_PRICE`` assignment
    """
    html = browser.page_source
    match = ROOM_PRICE_RE.search(html)
    if match is None:
        raise ValueError('ROOM_PRICE not found in page source')
    # SECURITY NOTE(review): eval() executes text scraped from a remote page.
    # Consider a literal-only parser (ast.literal_eval / json.loads) once the
    # exact format of ROOM_PRICE has been confirmed.
    price_image = eval(match.group(1))
    image_url = 'http:' + price_image['image']
    image_offset = price_image['offset']
    image_file = get_picture(stationname, image_url, page)
    print(image_file)
    numbers = getNumbers.getNum(DATA_DIR + image_file)
    print(image_offset)
    print(numbers)
    return numbers, image_offset


def get_rooms_info(newHandle, price):
    """Scrape one room's detail page and save it to MongoDB.

    :param newHandle: window handle of the room's detail page
    :param price: the room's price as a string of digits
    """
    browser.switch_to.window(newHandle)
    html = browser.page_source
    # Strip the xmlns attribute from the markup before parsing it.
    html = html.replace('xmlns="http://www.w3.org/1999/xhtml"', '')
    print('getting the room information')
    # NOTE(review): this pause runs after page_source was captured, so it only
    # throttles the crawl; it does not give the page more time to render.
    time.sleep(2)
    doc = pq(html)
    detail = doc('.room_detail_right .detail_room')
    # Drop <b> elements: they interfere with reading .text on the children.
    detail('b').remove()
    new_detail = detail('ul').children()
    images_url = [
        item.attr('src')
        for item in doc('.lof-navigator-outer ul li .lidiv img').items()
    ]
    rooms = {
        'roomname': doc('.room_name h2').text(),
        'images': images_url,
        'price': price,
        'size': re.search(r'(\d+.*?)', new_detail[0].text).group(1),
        'direction': new_detail[1].text,
        'structure': re.sub(r'\s', '', new_detail[2].text),
        'floor': new_detail[3].text,
        'traffic': doc('.room_detail_right .detail_room .last').text().replace('\n', ' '),
    }
    print(rooms)
    save_to_mongo(rooms)


def get_rooms(stationname, page=1):
    """Visit every room linked from the current listing page.

    :param stationname: station name, forwarded to :func:`get_price`
    :param page: listing page number, forwarded to :func:`get_price`
    """
    wait.until(EC.presence_of_element_located((By.ID, "houseList")))
    # find_elements(By...) exists in both old and current Selenium releases,
    # unlike the removed find_elements_by_css_selector helper.
    links = browser.find_elements(By.CSS_SELECTOR, "#houseList .clearfix .txt h3 a")
    numbers, offsets = get_price(stationname, page)
    for index, link in enumerate(links):
        print(link.get_attribute('href'))
        # Remember the listing window so we can return to it.
        main_handle = browser.current_window_handle
        # Each offset value selects one digit from the decoded price image.
        price = "".join(str(numbers[9 - digit]) for digit in offsets[index])
        # Open the room's detail page.
        link.click()
        time.sleep(2)
        try:
            for new_handle in browser.window_handles:
                if new_handle == main_handle:
                    continue
                browser.switch_to.window(new_handle)
                try:
                    get_rooms_info(new_handle, price)
                finally:
                    # Close the detail window even if scraping it failed.
                    browser.close()
        finally:
            # Always return to the listing window.
            browser.switch_to.window(main_handle)


def next_page(stationname, stationUrl, page=1, retries=3):
    """Load listing page ``page`` of a station and scrape its rooms.

    :param stationname: station name, forwarded to :func:`get_rooms`
    :param stationUrl: URL of the station's first listing page
    :param page: page number to load
    :param retries: maximum number of attempts before giving up; replaces the
        previous unbounded recursive retry
    """
    page_url = stationUrl + '?p=' + str(page)
    for attempt in range(1, retries + 1):
        try:
            browser.get(page_url)
            print(page_url)
            print("page:", page)
            get_rooms(stationname, page)
            return
        except Exception as exc:
            print('page', page, 'attempt', attempt, 'failed:', exc)
    print('giving up on', page_url)


def get_page(i):
    """Open a station's first listing page, scrape it, and report the page count.

    :param i: station name appended to ``STATION`` to build the URL
    :return: ``(pages, stationUrl)`` -- the total page count as a string and
        the station's listing URL
    :raises ValueError: if the page count cannot be found in the page source
    """
    stationUrl = STATION + i + ".html"
    print(stationUrl)
    browser.get(stationUrl)
    # Wait for the pager so the total page count can be read.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#page')))
    html = browser.page_source
    match = TOTAL_PAGES_RE.search(html)
    if match is None:
        raise ValueError('total page count not found for ' + stationUrl)
    pages = match.group(1)
    print('total pages:', pages)
    get_rooms(i)
    return pages, stationUrl


def save_to_mongo(result):
    """Insert one room document into the ``MONGO_TABLE`` collection.

    Failures are reported on stdout and not raised, so one bad record does
    not stop the crawl.
    """
    try:
        # insert_one replaces the deprecated Collection.insert.
        db[MONGO_TABLE].insert_one(result)
        print('Save Success!!!')
    except Exception as exc:
        print('Fail', exc)


if __name__ == "__main__":
    try:
        for station in STATIONLIST:
            pages, stationUrl = get_page(station)
            for page in range(2, int(pages) + 1):
                # Argument order must match next_page(stationname, stationUrl, page).
                next_page(station, stationUrl, page)
    finally:
        # Shut down the headless Chrome process.
        browser.quit()
配置檔案程式碼:
# MongoDB connection target; the crawler script passes it to pymongo.MongoClient.
MONGO_URL = 'localhost'
# Name of the database the crawler selects on the client.
MONGO_DB = 'ziroom'
# Name of the collection that scraped room documents are inserted into.
MONGO_TABLE = 'rooms'
# NOTE(review): not referenced by the crawler script shown with this config;
# looks like a leftover PhantomJS service argument -- confirm before removing.
SERVICE_ARGS = ['--disk-cache=true']
# URL prefix of a station's listing page; the crawler appends a station name
# from STATIONLIST followed by ".html".
STATION = "http://sz.ziroom.com/z/nl/z2-s2號線%28蛇口線%29-t"
# Station names to crawl; the crawler's entry point iterates them in order.
STATIONLIST = ["海上世界", "水灣", "東角頭","灣廈","海月", "登良", "後海", "科苑"]