1. 程式人生 > >PhantomJS(轉)

PhantomJS(轉)

load mage start random head .com bs4 隱式 解決

# coding=utf-8
  
import random,headers,xmlParse  
from bs4 import BeautifulSoup  
from selenium import webdriver  
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities  
from selenium.webdriver.common.proxy import ProxyType  
  
# Path to the PhantomJS executable that Selenium launches.
phantomjs_driver = "C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe"
# Proxy entries read from proxy_ip.xml. They are only consumed by the optional
# proxy setup that is left commented out inside dynamic_load().
ips = xmlParse.get_ip_port_from_xml("proxy_ip.xml")


def dynamic_load(url):
    """Load *url* in PhantomJS and return the prettified page source.

    The page is opened with a randomly chosen user agent and with image
    loading disabled. The rendered source is parsed by BeautifulSoup with the
    "xml" parser, prettified, printed to stdout and returned.

    Args:
        url: Address of the page to load.

    Returns:
        The prettified markup produced by ``BeautifulSoup(...).prettify()``.

    Raises:
        Whatever Selenium raises while starting the session or loading the
        page (for example when the 20 second page-load timeout expires).
        The driver is quit before the exception propagates.
    """
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    # Pick a random entry from headers.my_headers as the user agent so the
    # crawler is disguised as a regular browser.
    desired_capabilities["phantomjs.page.settings.userAgent"] = (
        random.choice(headers.my_headers)
    )
    # Do not load images; crawling pages is much faster without them.
    desired_capabilities["phantomjs.page.settings.loadImages"] = False

    # Optional proxy support (disabled). Adding the proxy to the capabilities
    # and starting a new session is, per the original author's understanding,
    # like clearing the browser cache and revisiting the URL through a proxy.
    # proxy = webdriver.Proxy()
    # proxy.proxy_type = ProxyType.MANUAL
    # proxy.http_proxy = random.choice(ips)
    # proxy.add_to_capabilities(desired_capabilities)

    # Open PhantomJS, then start a session carrying the settings above.
    # Alternative that passes the settings directly to the constructor:
    # driver = webdriver.PhantomJS(executable_path=phantomjs_driver,
    #                              desired_capabilities=desired_capabilities)
    driver = webdriver.PhantomJS(executable_path=phantomjs_driver)
    try:
        driver.start_session(desired_capabilities)
        # Implicit wait of 5 seconds; tune as needed.
        driver.implicitly_wait(5)
        # driver.get() has no timeout argument (unlike requests.get()), and it
        # has been seen to hang forever without raising. A 20 second page-load
        # timeout keeps the program from getting stuck.
        driver.set_page_load_timeout(20)
        # 20 second timeout for scripts as well.
        driver.set_script_timeout(20)
        driver.get(url)
        # Example of following a link found on the page (disabled):
        # next_page = driver.find_element_by_id(idd)  # .get_attribute('href')
        # driver.get(next_page)
        html = BeautifulSoup(driver.page_source, "xml").prettify()
    finally:
        # Always shut the browser down, even when loading fails, so repeated
        # calls do not leave PhantomJS processes behind.
        driver.quit()
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(html)
    return html


if __name__ == "__main__":
    url = "http://www.chnmuseum.cn/tabid/218/Default.aspx?DynastySortID=5"
    dynamic_load(url)

PhantomJS(轉)