python爬蟲:使用selenium + ChromeDriver爬取途家網
阿新 • • 發佈:2019-02-15
說明
本站(途家網https://www.tujia.com)通過常規抓頁面的方法不能獲取資料,可以使用selenium + ChromeDriver來獲取頁面資料。
0 指令碼執行順序與說明
0.1 先執行craw_url.py,獲得所有房子詳情頁的url
0.2 執行slice_url.py,把所有的url等份,便於後續作多執行緒爬取
0.3 執行craw.py,獲取每個房子的具體資料
1 注意
1.1 本站的資料為動態載入,用到了selenium + ChromeDriver來獲取頁面資料
1.2 專案中附有chromedriver.exe,需要安裝谷歌瀏覽器(如果執行不了,可能是瀏覽器和chromedriver.exe版本不對應,對應的瀏覽器版本為69)
1.3 注意driver模擬操作後,需要等待1-2s後才能獲取到資料
1.4 本站有反爬,每一次頁面操作設定睡眠6s即可
1.5 chrome_options.add_argument("headless") 設定為不開啟瀏覽器介面(注意使用直引號,彎引號會造成語法錯誤)
2 爬取內容
2.1 途家網https://www.tujia.com/unitlist?cityId=10
2.2 爬取欄位及說明見截圖
截圖
程式碼
1 craw_url.py (獲得所有房子詳情頁的url)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import os
# Start the browser driver.
def init_driver(url):
    """Create a headless Chrome driver and load *url*.

    Args:
        url: The page to open once the browser has started.

    Returns:
        A selenium ``webdriver.Chrome`` instance positioned on *url*.
    """
    chrome_options = webdriver.ChromeOptions()
    # The site renders its data with JavaScript, so a real (headless)
    # browser is required; "headless" keeps the window from opening.
    chrome_options.add_argument("headless")
    driver_path = "./bin/chromedriver.exe"
    # ``options=`` replaces the deprecated ``chrome_options=`` keyword and
    # matches the call style already used in craw.py.
    driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)
    driver.get(url)
    return driver
# Delete the file if it already exists.
def del_file(file_path):
    """Remove *file_path*; silently succeed when it does not exist."""
    if not os.path.exists(file_path):
        return
    os.remove(file_path)
# Collect the detail-page URL of every listing.
def get_url(driver):
    """Walk every result page and append each house detail-page URL to
    ./data/url/url.txt, one URL per line.

    Args:
        driver: A selenium driver already positioned on the unit-list page.

    Bug fixed: the parameter was named ``drive`` while the body used the
    module-level ``driver`` global; the parameter is now actually used.
    NOTE(review): the original post lost its indentation; the loop below
    scrapes the current page *before* clicking "next" so page 1 is not
    skipped — confirm against the live pager behavior.
    """
    # Total page count is exposed as the 'page-data' attribute of the
    # last pager item.
    total_str = driver.find_elements_by_class_name('pageItem')[-1].get_attribute('page-data')
    total = int(total_str)
    click_num = 0
    while click_num < total:
        # Number of listing items on the current page.
        item = driver.find_elements_by_class_name('searchresult-cont')
        item_num = len(item)
        # Grab the detail-page link of every item on this page.
        for i in range(item_num):
            xpath = '//*[@id="unitList-container"]/div/div[' + str(i + 1) + ']/div[2]/div[1]/h3/a'
            url = driver.find_element_by_xpath(xpath).get_attribute('href')
            print(str(i) + '\t' + url)
            # Append the URL to the local file.
            with open('./data/url/url.txt', 'a', encoding='utf-8') as f:
                f.write(url + '\n')
        # Advance to the next page; the site throttles, so sleep 6 s after
        # every pager click (see note 1.4 at the top of the post).
        driver.find_elements_by_class_name('pageItem')[-2].click()
        click_num += 1
        time.sleep(6)
    close_driver(driver)
def close_driver(driver):
    """Shut the browser down and release the chromedriver process."""
    driver.quit()
if __name__ == '__main__':
    # Fixed one-night date range on the city-10 unit list.
    root_url = 'https://www.tujia.com/unitlist?startDate=2018-12-10&endDate=2018-12-11&cityId=10&ssr=off'
    # Drop any URL file left over from a previous run before collecting anew.
    del_file('./data/url/url.txt')
    driver = init_driver(root_url)
    get_url(driver)
2 slice_url.py(把所有的url等份,便於後續作多執行緒爬取)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import math
# There are many URLs; crawling them all at once is fragile, so split the
# list into several files and crawl them in steps.
def main(slice_num, src='./data/url/url.txt', out_dir='./data/url'):
    """Split the collected URL list into *slice_num* roughly equal files.

    Slice *i* (1-based) is written to ``<out_dir>/url_<i>.txt`` so later
    crawling can be spread over multiple threads.

    Args:
        slice_num: Number of slice files to produce.
        src: Path of the combined URL file, one URL per line.
        out_dir: Directory that receives the slice files.
    """
    # Read with the same encoding craw_url.py wrote the file with.
    with open(src, 'r', encoding='utf-8') as f:
        urls = f.readlines()
    step = math.ceil(len(urls) / slice_num)
    for i in range(slice_num):
        out_path = out_dir + '/url_' + str(i + 1) + '.txt'
        with open(out_path, 'w', encoding='utf-8') as f:
            # Slicing never runs past the end of the list, unlike the old
            # index-then-bare-except loop it replaces; trailing slices are
            # simply empty, matching the original output.
            f.writelines(urls[step * i:step * (i + 1)])
if __name__ == '__main__':
    # Split the URL list into 30 slices, one per crawler thread.
    SLICE_COUNT = 30
    main(SLICE_COUNT)
3 craw.py(獲取每個房子的具體資料)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import os
import time
import threading
# Start the browser driver for one worker thread.
def init_driver(url, index):
    """Start a Chrome driver for worker *index* and load *url*.

    Bumps the per-thread progress counter in the module-level ``threads``
    dict (presumably initialized elsewhere in the file — not visible here)
    and prints it so each worker's progress shows on the console.

    Args:
        url: Detail-page URL to open.
        index: Worker number; used as the ``threads`` key suffix.

    Returns:
        A ``webdriver.Chrome`` instance. If ``driver.get`` failed, the
        error is logged and the driver is returned anyway; the caller's
        scraping code tolerates missing elements.
    """
    global threads
    key = 'Thread_' + str(index)
    threads[key] += 1
    print(key + '\t' + str(threads[key]))
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument("headless")  # uncomment to hide the browser window
    driver_path = "./bin/chromedriver.exe"
    driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)
    try:
        driver.get(url)
    except Exception as e:
        # Best-effort navigation: a slow or failed page load must not kill
        # the worker thread, but the failure is no longer silent.
        print('driver.get failed for ' + url + ': ' + repr(e))
    return driver
def close_driver(driver):
    """Quit the given webdriver, closing all of its browser windows."""
    driver.quit()
# Delete the file if it already exists.
def del_file(file_path):
    """Delete *file_path* when present so a fresh run starts clean."""
    exists = os.path.exists(file_path)
    if exists:
        os.remove(file_path)
# Read the locally saved URLs.
def read_url(file_path):
    """Return the URL lines from *file_path* (trailing newlines kept).

    The file was written as UTF-8 by craw_url.py, so it is read back with
    an explicit UTF-8 encoding instead of the platform default, which is
    GBK on Chinese-locale Windows and could mis-decode the file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()
# Scrape every field from one house detail page and append it to *file_path*.
# NOTE(review): the original indentation was lost when this code was pasted
# into the blog post; the nesting below must be reconstructed before it can
# run. Only the comments have been edited here — code is unchanged.
def get_data(driver, file_path, index):
try:
# Shop name, price, room tags, payment tags, advantage tags
name = driver.find_element_by_xpath('//div[@class="house-name"]').text
price = ''
try:
price = driver.find_element_by_xpath('//a[@class="present-price"]').text
except:
pass
# Floor area (shown in a tooltip, so hover first)
area = ''
try:
house_type_element = driver.find_element_by_xpath('//*[@id="houseInfo"]/div/div/div[1]/div[3]/ul/li[2]')
ActionChains(driver).move_to_element(house_type_element).perform()
area = driver.find_element_by_xpath('//*[@id="houseInfo"]/div/div/div[1]/div[3]/ul/li[2]/div').text
except:
pass
room_tag = ''
try:
room_tag = driver.find_element_by_xpath('//ul[@class="room-tag"]').text.replace('\n', ' ')
except:
pass
pay_tag = ''
try:
pay_tag = driver.find_element_by_xpath('//ul[@class="pay-tag"]').text.replace('\n', ' ')
except:
pass
advan_tag = ''
try:
advan_tag = driver.find_element_by_xpath('//div[@class="hotel-advan-tag"]').text.replace('\n', ' ')
except:
pass
# House rules: collect all rules, then strip the ones marked disabled
# (class "not") by text replacement.
house_rules = ''
try:
house_rules_all = driver.find_elements_by_xpath('//*[@id="unitcheckinneedtoknow"]/div[2]/div[2]/div[5]/ol/li')
house_rules_dis = driver.find_elements_by_xpath('//*[@id="unitcheckinneedtoknow"]/div[2]/div[2]/div[5]/ol/li[@class="not"]')
house_rules = ''
for item in house_rules_all:
house_rules += item.text + ' '
for item in house_rules_dis:
if item.text:
house_rules = house_rules.replace(item.text + ' ', '')
# print(house_rules.encode('gbk', 'ignore').decode('gbk'))
except:
pass
# Facilities & services
facility_service = ''
# try:
# Click "show more"; keep retrying until the link becomes clickable.
scrollTop = 800
success = False
while not success:
try:
js = "var q=document.documentElement.scrollTop=800"
driver.execute_script(js)
driver.find_element_by_xpath('//*[@id="facilityshowmore"]/a').click()
success = True
except:
scrollTop += 100
time.sleep(1)
# Category headings and their content lists.
try:
category_item = driver.find_elements_by_xpath('//*[@id="listWrap"]/h5')
# print(category_item)
content_item = driver.find_elements_by_xpath('//*[@id="listWrap"]/ul')
# print(content_item)
# NOTE(review): this loop variable shadows the function's *index* parameter.
for index, category_ in enumerate(category_item):
category = category_.text
content = content_item[index].text.replace('\n', ' ')
if category:
facility_service += category + '('
facility_service += content + ') '
except:
pass
# Remove the facilities marked unavailable (class "i-not").
try:
facility_dis = driver.find_elements_by_xpath('//*[@id="listWrap"]//li[@class="i-not"]')
for item in facility_dis:
# print(item)
if item.text:
facility_service = facility_service.replace(item.text + ' ', '')
# print(item.text.encode('gbk', 'ignore').decode('gbk'),end=' ')
# print(facility_service.encode('gbk', 'ignore').decode('gbk'))
except:
pass
# Landlord information
# Landlord type
landlord_type = ''
try:
landlord_type = driver.find_element_by_xpath('//*[@id="landlordInfo"]/div/div[2]/div/h2/span').text
except:
pass
# Landlord verification
landlord_authentication = ''
try:
landlord_authentication = driver.find_element_by_xpath('//*[@id="landlordInfo"]/div/div[2]/div/div[2]').text
except:
pass
# Number of the landlord's other listings
landlord_other_house_num = ''
try:
landlord_other_house_num = driver.find_element_by_xpath('//div[@class="landlord-other-house"]/h2/span').text
except:
pass
# print(landlord_type)
# print(landlord_authentication)
# print(landlord_other_house_num)
# # Reviews
# # Overall score, per-item score, comment count, comments-with-photos count
overall_score = ''
single_score = ''
comment_sum = ''
comment_photo_sum = ''
try:
overall_score = driver.find_element_by_xpath('//*[@id="overallScore"]').text
single_score = driver.find_element_by_xpath('//*[@id="comment-summary"]/div[2]/div[1]/div[2]').text.replace('分', '')
comment_sum = driver.find_element_by_xpath('//*[@id="comment_filter"]/li[1]/span').text.replace('(', '').replace(')', '')
comment_photo_sum = driver.find_element_by_xpath('//*[@id="comment_filter"]/li[2]/span').text.replace('(', '').replace(')', '')
except:
pass
# print('Thread_' + str(index) + '\t' + str(threads['Thread_' + str(index)]), end='\t')
# print('\tThread_' + str(index))
# # Encode as GBK first with 'ignore' to drop unencodable chars, then decode
# # (works around a GBK console on Windows).
print('\t----店名----\t' + name.encode('gbk', 'ignore').decode('gbk'))
# print('\t----價格----\t' + price.encode('gbk', 'ignore').decode('gbk'))
print('\t--建築面積--\t' + area.encode('gbk', 'ignore').decode('gbk'))
# print('\t----房屋----\t' + room_tag.encode('gbk', 'ignore').decode('gbk'))
# print('\t----支付----\t' + pay_tag.encode('gbk', 'ignore').decode('gbk'))
# print('\t----優勢----\t' + advan_tag.encode('gbk', 'ignore').decode('gbk'))
# print('\t--設施服務--\t' + facility_service.encode('gbk', 'ignore').decode('gbk'))
# print('\t--房屋守則--\t' + house_rules.encode('gbk', 'ignore').decode('gbk'))
# print('\t--房東型別--\t' + landlord_type.encode('gbk', 'ignore').decode('gbk'))
# print('\t--房東認證--\t' + landlord_authentication.encode('gbk', 'ignore').decode('gbk'))
# print('\t--其他房數--\t' + landlord_other_house_num.encode('gbk', 'ignore').decode('gbk'))
# print('\t--綜合評分--\t' + overall_score.encode('gbk', 'ignore').decode('gbk'))
# print('\t--單項評分--\t' + single_score.encode('gbk', 'ignore').decode('gbk'))
# print('\t---評論數---\t' + comment_sum.encode('gbk', 'ignore').decode('gbk'))
# print('\t--照評論數--\t' + comment_photo_sum.encode('gbk', 'ignore').decode('gbk'))
# Write the record to the local file.
with open(file_path, 'a', encoding='utf-8') as f:
f.write('--------------------------------------------------------------\n')
f.write('\t----店名----\t' + name.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t----價格----\t' + price.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t--建築面積--\t' + area.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t----房屋----\t' + room_tag.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t----支付----\t' + pay_tag.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t----優勢----\t' + advan_tag.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t--設施服務--\t' + facility_service.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t--房屋守則--\t' + house_rules.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t--房東型別--\t' + landlord_type.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t--房東認證--\t' + landlord_authentication.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t--其他房數--\t' + landlord_other_house_num.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t--綜合評分--\t' + overall_score.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t--單項評分--\t' + single_score.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t---評論數---\t' + comment_sum.encode('gbk', 'ignore').decode('gbk') + '\n')
f.write('\t--照評論數--\t' + comment_photo_sum.encode('gbk', 'ignore').decode('gbk') + '\n')
# Scrape the comments on the current page.
get_data_comment(driver, file_path)
# Comment content
# Total number of comment pages
comment_page_num = 1
try:
comment_page_num_str = driver.find_elements_by_xpath('//*[@id="comment_list"]/li[1]/div[2]/ul/li')[-1].get_attribute('page-data')
comment_page_num = int(comment_page_num_str)
except:
pass
# Click through to the next comment page.
if comment_page_num > 1:
click_num = 0
while click_num < comment_page_num:
# Timestamp of the last comment on the current page.
try:
last_item = driver.find_element_by_xpath('//*[@id="comment_list"]/li[1]/div[1]/ul/li[last()]/div[2]/div[1]/div/span[2]').text
date = last_item.replace('-', '')[:6]
# Stop once comments are older than September 2017.
if int(date) < 201709:
break
except:
pass
# print(date.encode('gbk', 'ignore').decode('gbk'))
# Scroll to the bottom so the pager is in view.
js = "var q=document.documentElement.scrollTop=10000"
driver.execute_script(js)
time.sleep(2)
try:
driver.find_elements_by_xpath('//*[@id="comment_list"]/li[1]/div[2]/ul/li')[-2].click()
except:
break
# NOTE(review): stray no-op string literal left over from development.
'//*[@id="comment_list"]/li[1]/div[2]/ul/li[7]'
click_num += 1
time.sleep(4)
# Scrape the comments on the (new) current page.
get_data_comment(driver, file_path)
close_driver(driver)
except:
print('error')
close_driver(driver)
# 獲取評論模組資料
def get_data_comment(driver, file_path):
try:
# 當前頁評論數
comment_curr_page = driver.find_elements_by_xpath('//*[@id="comment_list"]/li[1]/div[1]/ul/li')
comment_curr_page_num = len(comment_curr_page)
for index in range(comment_curr_page_num):
xpath_head = '//*[@id="comment_list"]/li[1]/div[1]/ul/li[' + str(index + 1) + ']'
# 評論人
comment_person = driver.find_element_by_xpath(xpath_head + '/div[2]/div[1]/div/span[1]').text
# 評論時間
comment_time = driver.find_element_by_xpath(xpath_head + '/div[2]/div[1]/div/span[2]').text.replace('點評', '')
# 評論內容
comment_content = driver.find_element_by_xpath(xpath_head + '/div[2]/div[2]').text
# 是否回覆
comment_replay = ''
try:
comment_replay = driver.find_element_by_xpath(xpath_head + '/div[2]/div[4]/div[1]/div[2]/p').text.replace(
':', '')