1. 程式人生 > >python爬蟲:使用selenium + ChromeDriver爬取途家網

python爬蟲:使用selenium + ChromeDriver爬取途家網

說明

本站(途家網https://www.tujia.com)通過常規抓頁面的方法不能獲取資料,可以使用selenium + ChromeDriver來獲取頁面資料。

0 指令碼執行順序與說明

0.1 先執行craw_url.py,獲得所有房子詳情頁的url
0.2 執行slice_url.py,把所有的url等份,便於後續作多執行緒爬取
0.3 執行craw.py,獲取每個房子的具體資料

1 注意

1.1 本站的資料為動態載入,用到了selenium + ChromeDriver來獲取頁面資料
1.2 專案中附有chromedriver.exe,需要安裝谷歌瀏覽器(如果執行不了,可能是瀏覽器和chromedriver.exe版本不對應,對應的瀏覽器版本為69)
1.3 注意driver模擬操作後,需要等待1-2s後才能獲取到資料
1.4 本站有反爬,每一次頁面操作設定睡眠6s即可
1.5 chrome_options.add_argument("headless") 設定為不開啟瀏覽器介面

2 爬取內容

2.1 途家網https://www.tujia.com/unitlist?cityId=10
2.2 爬取欄位及說明見截圖

截圖

在這裡插入圖片描述
在這裡插入圖片描述在這裡插入圖片描述在這裡插入圖片描述

程式碼

1 craw_url.py (獲得所有房子詳情頁的url)

#! /usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import os


# 啟動driver
def init_driver(url):
    """Start a headless Chrome session and load *url*.

    Args:
        url: Address of the page to open.

    Returns:
        The ``webdriver.Chrome`` instance after ``get(url)`` has been called.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("headless")  # do not open a browser window
    driver_path = "./bin/chromedriver.exe"
    # ``options=`` matches the keyword used by craw.py in this project.
    driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)
    driver.get(url)
    return driver


def del_file(file_path):
    """Delete *file_path* if it exists; otherwise do nothing."""
    if os.path.exists(file_path):
        os.remove(file_path)


def get_url(drive):
    """Page through the listing and append every detail-page URL to disk.

    URLs are appended, one per line, to ``./data/url/url.txt``. The driver is
    always quit before this function returns, even when scraping fails.

    Args:
        drive: A WebDriver that has already loaded the listing page.
    """
    # The original body used the name ``driver`` although the parameter is
    # ``drive``; it only worked through the module-level global. Bind it
    # locally so the function uses the argument it was given.
    driver = drive
    try:
        # Total number of pages, read from the last pager item.
        total_str = driver.find_elements_by_class_name('pageItem')[-1].get_attribute('page-data')
        total = int(total_str)

        # NOTE(review): "next page" is clicked before anything is collected,
        # so the results of the initially loaded page appear to be skipped.
        # Confirm against the live site before changing the order.
        for _ in range(total):
            driver.find_elements_by_class_name('pageItem')[-2].click()
            time.sleep(6)  # anti-scraping: pause after every page operation

            # Number of listings on the current page.
            item_num = len(driver.find_elements_by_class_name('searchresult-cont'))

            # Collect the detail URL of every listing on this page.
            for i in range(item_num):
                xpath = '//*[@id="unitList-container"]/div/div[' + str(i+1) + ']/div[2]/div[1]/h3/a'
                url = driver.find_element_by_xpath(xpath).get_attribute('href')
                print(str(i) + '\t' + url)
                # Persist the URL locally.
                with open('./data/url/url.txt', 'a', encoding='utf-8') as f:
                    f.write(url + '\n')
    finally:
        close_driver(driver)


def close_driver(driver):
    """Call ``quit()`` on *driver*."""
    driver.quit()


if __name__ == '__main__':
    root_url = 'https://www.tujia.com/unitlist?startDate=2018-12-10&endDate=2018-12-11&cityId=10&ssr=off'
    driver = init_driver(root_url)
    del_file('./data/url/url.txt')
    get_url(driver)

2 slice_url.py(把所有的url等份,便於後續作多執行緒爬取)

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import math


# url比較多,一次性爬取可能會出現問題,分多步爬取
def main(slice_num):
    """Split ``./data/url/url.txt`` into *slice_num* consecutive chunk files.

    Chunk ``i`` (1-based) is written to ``./data/url/url_<i>.txt``. Every chunk
    holds ``ceil(total / slice_num)`` lines except the trailing ones, which
    may be shorter or empty when the URLs run out.

    Args:
        slice_num: Number of chunk files to produce.
    """
    # Read every URL; the file is written as UTF-8 by the crawler, so read it
    # with the same encoding instead of the platform default.
    with open('./data/url/url.txt', 'r', encoding='utf-8') as f:
        urls = f.readlines()

    step = math.ceil(len(urls) / slice_num)

    # Slicing clamps at the end of the list, so no exception handling is
    # needed for the last (short or empty) chunks, and genuine write errors
    # are no longer swallowed.
    for i in range(slice_num):
        with open('./data/url/url_' + str(i+1) + '.txt', 'w', encoding='utf-8') as f:
            f.writelines(urls[step * i:step * (i + 1)])

if __name__ == '__main__':
    # Split the URL list into 30 chunk files
    main(30)

3 craw.py(獲取每個房子的具體資料)

#! /usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import os
import time
import threading


# 啟動driver
def init_driver(url, index):
    """Start a Chrome session for worker *index* and try to load *url*.

    Increments the ``'Thread_<index>'`` entry of the module-level ``threads``
    dict and prints the new value, then launches Chrome and requests *url*.
    A failed page load is reported but does not raise, so the caller always
    gets a driver back that it can quit.

    Args:
        url: Address of the page to open.
        index: Worker number used to build the ``threads`` key.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    global threads
    thread_key = 'Thread_' + str(index)
    threads[thread_key] += 1
    print(thread_key + '\t' + str(threads[thread_key]))

    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument("headless")  # uncomment to hide the browser window

    driver_path = "./bin/chromedriver.exe"
    driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)

    try:
        driver.get(url)
    except Exception as exc:
        # Best effort: keep going with whatever loaded, but say so. Only the
        # exception type is printed so the message stays plain ASCII.
        print(thread_key + '\tfailed to load page: ' + type(exc).__name__)

    return driver


def close_driver(driver):
    """Call ``quit()`` on the given WebDriver instance."""
    driver.quit()


# 如果檔案存在,則刪除
def del_file(file_path):
    """Remove *file_path* when it exists; do nothing when it does not."""
    if not os.path.exists(file_path):
        return
    os.remove(file_path)


# 讀取本地的url
def read_url(file_path):
    """Return every line of the URL file at *file_path*.

    The file is decoded as UTF-8, the encoding the slice files are written
    with, rather than the platform default. Lines keep their trailing
    newline, exactly as ``readlines()`` yields them.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()


# 獲取頁面資料
def _gbk_safe(text):
    """Round-trip *text* through GBK, discarding characters GBK cannot encode."""
    return text.encode('gbk', 'ignore').decode('gbk')


def _text_or_empty(driver, xpath):
    """Return the text of the first element matching *xpath*, or '' on failure."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except Exception:
        return ''


def _click_facility_show_more(driver, max_attempts=20):
    """Click the facilities "show more" link, scrolling further after each miss.

    Returns True once the click succeeds, False after *max_attempts* misses.
    """
    scroll_top = 800
    for _ in range(max_attempts):
        try:
            # Scroll to the current offset; the original always scrolled to
            # 800 even though it incremented its counter after a failure.
            driver.execute_script("var q=document.documentElement.scrollTop=" + str(scroll_top))
            driver.find_element_by_xpath('//*[@id="facilityshowmore"]/a').click()
            clicked = True
        except Exception:
            clicked = False
            scroll_top += 100
        # Pause after every attempt, successful or not, as the original did.
        time.sleep(1)
        if clicked:
            return True
    return False


def get_data(driver, file_path, index):
    """Scrape one listing detail page and append the record to *file_path*.

    Collects the listing name, price, floor area, tag lists, house rules,
    facilities, landlord details and rating summary from the page currently
    loaded in *driver*, appends them to *file_path* (UTF-8), then walks the
    comment pages through ``get_data_comment``. Apart from the name, a field
    that cannot be located is written as an empty string. Any other failure
    prints ``error``. The driver is always quit before returning.

    Args:
        driver: WebDriver with the detail page already loaded.
        file_path: Output file the record is appended to.
        index: Worker number; accepted for interface compatibility and not
            used by the scraping logic.
    """
    try:
        # Listing name: not optional, a miss aborts the whole record.
        name = driver.find_element_by_xpath('//div[@class="house-name"]').text
        price = _text_or_empty(driver, '//a[@class="present-price"]')

        # Floor area.
        area = ''
        try:
            area_xpath = '//*[@id="houseInfo"]/div/div/div[1]/div[3]/ul/li[2]'
            house_type_element = driver.find_element_by_xpath(area_xpath)
            # Hover over the item before reading its nested div
            # (presumably the value is only rendered on hover; confirm).
            ActionChains(driver).move_to_element(house_type_element).perform()
            area = driver.find_element_by_xpath(area_xpath + '/div').text
        except Exception:
            pass

        # Room tags, payment tags, advantage tags.
        room_tag = _text_or_empty(driver, '//ul[@class="room-tag"]').replace('\n', ' ')
        pay_tag = _text_or_empty(driver, '//ul[@class="pay-tag"]').replace('\n', ' ')
        advan_tag = _text_or_empty(driver, '//div[@class="hotel-advan-tag"]').replace('\n', ' ')

        # House rules: join every rule, then strip those marked class="not".
        house_rules = ''
        try:
            rules_xpath = '//*[@id="unitcheckinneedtoknow"]/div[2]/div[2]/div[5]/ol/li'
            house_rules_all = driver.find_elements_by_xpath(rules_xpath)
            house_rules_dis = driver.find_elements_by_xpath(rules_xpath + '[@class="not"]')
            for item in house_rules_all:
                house_rules += item.text + ' '
            for item in house_rules_dis:
                if item.text:
                    house_rules = house_rules.replace(item.text + ' ', '')
        except Exception:
            pass

        # Facilities and services: expand the list first. The attempt is
        # bounded so a page without the link cannot hang the worker.
        facility_service = ''
        _click_facility_show_more(driver)

        # Category headings paired with their content lists.
        try:
            category_item = driver.find_elements_by_xpath('//*[@id="listWrap"]/h5')
            content_item = driver.find_elements_by_xpath('//*[@id="listWrap"]/ul')
            for pos, category_ in enumerate(category_item):
                category = category_.text
                content = content_item[pos].text.replace('\n', ' ')
                if category:
                    facility_service += category + '(' + content + ')  '
        except Exception:
            pass

        # Strip the facilities marked class="i-not".
        try:
            facility_dis = driver.find_elements_by_xpath('//*[@id="listWrap"]//li[@class="i-not"]')
            for item in facility_dis:
                if item.text:
                    facility_service = facility_service.replace(item.text + ' ', '')
        except Exception:
            pass

        # Landlord: type, verification, number of other listings.
        landlord_type = _text_or_empty(driver, '//*[@id="landlordInfo"]/div/div[2]/div/h2/span')
        landlord_authentication = _text_or_empty(driver, '//*[@id="landlordInfo"]/div/div[2]/div/div[2]')
        landlord_other_house_num = _text_or_empty(driver, '//div[@class="landlord-other-house"]/h2/span')

        # Ratings: overall score, per-item scores, comment count, count of
        # comments with photos. Read as one group: the first miss leaves the
        # remaining values empty.
        overall_score = ''
        single_score = ''
        comment_sum = ''
        comment_photo_sum = ''
        try:
            overall_score = driver.find_element_by_xpath('//*[@id="overallScore"]').text
            single_score = driver.find_element_by_xpath('//*[@id="comment-summary"]/div[2]/div[1]/div[2]').text.replace('分', '')
            comment_sum = driver.find_element_by_xpath('//*[@id="comment_filter"]/li[1]/span').text.replace('(', '').replace(')', '')
            comment_photo_sum = driver.find_element_by_xpath('//*[@id="comment_filter"]/li[2]/span').text.replace('(', '').replace(')', '')
        except Exception:
            pass

        # Progress output; text is passed through GBK with errors ignored
        # before printing.
        print('\t----店名----\t' + _gbk_safe(name))
        print('\t--建築面積--\t' + _gbk_safe(area))

        # Append the record to the local file.
        record = [
            ('\t----店名----\t', name),
            ('\t----價格----\t', price),
            ('\t--建築面積--\t', area),
            ('\t----房屋----\t', room_tag),
            ('\t----支付----\t', pay_tag),
            ('\t----優勢----\t', advan_tag),
            ('\t--設施服務--\t', facility_service),
            ('\t--房屋守則--\t', house_rules),
            ('\t--房東型別--\t', landlord_type),
            ('\t--房東認證--\t', landlord_authentication),
            ('\t--其他房數--\t', landlord_other_house_num),
            ('\t--綜合評分--\t', overall_score),
            ('\t--單項評分--\t', single_score),
            ('\t---評論數---\t', comment_sum),
            ('\t--照評論數--\t', comment_photo_sum),
        ]
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write('--------------------------------------------------------------\n')
            for label, value in record:
                f.write(label + _gbk_safe(value) + '\n')

        # Comments on the page currently shown.
        get_data_comment(driver, file_path)

        # Total number of comment pages, read from the last pager item.
        comment_page_num = 1
        try:
            pager_items = driver.find_elements_by_xpath('//*[@id="comment_list"]/li[1]/div[2]/ul/li')
            comment_page_num = int(pager_items[-1].get_attribute('page-data'))
        except Exception:
            pass

        # Click through the following comment pages.
        if comment_page_num > 1:
            for _ in range(comment_page_num):
                # Stop once the last comment on the current page is dated
                # before September 2017.
                try:
                    last_item = driver.find_element_by_xpath('//*[@id="comment_list"]/li[1]/div[1]/ul/li[last()]/div[2]/div[1]/div/span[2]').text
                    date = last_item.replace('-', '')[:6]
                    if int(date) < 201709:
                        break
                except Exception:
                    pass

                # Scroll to the bottom so the pager can be clicked.
                driver.execute_script("var q=document.documentElement.scrollTop=10000")
                time.sleep(2)
                try:
                    driver.find_elements_by_xpath('//*[@id="comment_list"]/li[1]/div[2]/ul/li')[-2].click()
                except Exception:
                    break
                time.sleep(4)

                # Comments on the newly shown page.
                get_data_comment(driver, file_path)
    except Exception:
        print('error')
    finally:
        close_driver(driver)


# 獲取評論模組資料
def get_data_comment(driver, file_path):
    try:
        # 當前頁評論數
        comment_curr_page = driver.find_elements_by_xpath('//*[@id="comment_list"]/li[1]/div[1]/ul/li')
        comment_curr_page_num = len(comment_curr_page)
        for index in range(comment_curr_page_num):
            xpath_head = '//*[@id="comment_list"]/li[1]/div[1]/ul/li[' + str(index + 1) + ']'
            # 評論人
            comment_person = driver.find_element_by_xpath(xpath_head + '/div[2]/div[1]/div/span[1]').text
            # 評論時間
            comment_time = driver.find_element_by_xpath(xpath_head + '/div[2]/div[1]/div/span[2]').text.replace('點評', '')
            # 評論內容
            comment_content = driver.find_element_by_xpath(xpath_head + '/div[2]/div[2]').text

            # 是否回覆
            comment_replay = ''
            try:
                comment_replay = driver.find_element_by_xpath(xpath_head + '/div[2]/div[4]/div[1]/div[2]/p').text.replace(
                    ':', '')