Selenium&&PhantomJS獲取網站中的JS返回的資料
阿新 • • 發佈:2019-02-01
一、安裝Selenium模組
pip install selenium
Selenium 是一套完整的Web應用程式測試系統,包含了測試的錄製、編寫及執行和測試的並行處理。
二、安裝PhantomJS(官網下載:http://phantomjs.org/)
下載後將其放在 Python 安裝目錄,與 python.exe 位於同一個資料夾下。Selenium 與 PhantomJS 搭配使用,可以模擬瀏覽器,取得由 JavaScript 動態產生的資料。
三、執行環境
(1) win7
(2) python 2.7
(3) pycharm
四、獲取百度搜索結果
from selenium import webdriver

# Open the Baidu home page in the headless PhantomJS browser, print its title
# and the text of the root element, and save the rendered HTML to 2.html.
browser = webdriver.PhantomJS()
try:
    browser.get('https://www.baidu.com')
    # Implicit wait used by the element lookup below (10 seconds).
    browser.implicitly_wait(10)
    # '/*' selects the document's root element.
    data = browser.find_element_by_xpath('/*')
    print(browser.title)
    print(data.text)
    # The page source is encoded to UTF-8 before being written, as in the
    # Python 2.7 environment this article targets.
    with open('2.html', 'w') as fp:
        fp.write(browser.page_source.encode('utf8'))
finally:
    # Always shut the browser down, even when a step above raises; otherwise
    # the PhantomJS process is left running.
    browser.quit()
五、獲取ip代理
(1)mylog.py
import logging
import getpass
import os
import sys


class MyLog(object):
    """Small logging wrapper that writes to a log file and to the console.

    The underlying logger is named after the current user
    (``getpass.getuser()``).  The log file sits next to the running script:
    the path in ``sys.argv[0]`` with its extension replaced by ``.log``.

    ``logging.getLogger(name)`` returns the same logger for the same name, so
    handlers are attached only once per logger name.  Creating several
    ``MyLog`` objects therefore does not duplicate every record.
    """

    # logger name -> (formatter, file handler, stream handler) already attached
    _handlers = {}

    def __init__(self):
        """Set up (or reuse) the per-user logger and its two handlers."""
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # Log file name: strip whatever extension the script has (not a fixed
        # three characters) and append ".log".
        self.logFile = os.path.splitext(sys.argv[0])[0] + '.log'

        cached = MyLog._handlers.get(self.user)
        if cached is None:
            formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
            # Records go both to the log file and to the console stream.
            file_handler = logging.FileHandler(self.logFile, encoding='utf8')
            stream_handler = logging.StreamHandler()
            for handler in (file_handler, stream_handler):
                handler.setFormatter(formatter)
                handler.setLevel(logging.DEBUG)
                self.logger.addHandler(handler)
            cached = (formatter, file_handler, stream_handler)
            MyLog._handlers[self.user] = cached
        # Keep the attributes earlier versions exposed.  On reuse they refer
        # to the handlers created by the first instance, so the file handler
        # keeps writing to the file chosen at that time.
        self.formatter, self.logHand, self.logHandSt = cached

    # The five log levels map onto the five methods below.
    def debug(self, msg):
        """Log *msg* at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log *msg* at INFO level."""
        self.logger.info(msg)

    def warn(self, msg):
        """Log *msg* at WARNING level."""
        # Logger.warn is a deprecated alias; Logger.warning is the supported
        # spelling.
        self.logger.warning(msg)

    def error(self, msg):
        """Log *msg* at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log *msg* at CRITICAL level."""
        self.logger.critical(msg)

# (2)ip.py
from selenium import webdriver
from mylog import MyLog as mylog


# Item attributes in table-column order: column N of a row fills _FIELDS[N - 1].
_FIELDS = ('ip', 'port', 'anonymous', 'type', 'support', 'local', 'speed')


class Item(object):
    """Record for one row of the proxy table; filled in by GetProxy."""
    ip = None         # proxy IP
    port = None       # proxy port
    anonymous = None  # whether the proxy is anonymous
    type = None       # proxy type
    support = None    # supported protocols
    local = None      # physical location
    speed = None      # proxy speed


class GetProxy(object):
    """Scrape the proxy list pages and save every row to ``proxy.txt``.

    All work happens in the constructor: build the page URLs, scrape them
    with PhantomJS, then write the rows to a tab-separated file.
    """

    def __init__(self):
        self.startUrl = 'http://www.kuaidaili.com/proxylist/'
        self.log = mylog()
        self.urls = self.getUrls()
        self.proxyList = self.getProxyList(self.urls)
        self.fileName = 'proxy.txt'
        self.saveFile(self.fileName, self.proxyList)

    def getUrls(self):
        """Return the list page URLs: ``startUrl`` followed by 1 through 10."""
        urls = []
        for i in range(1, 11):
            url = self.startUrl + str(i)
            urls.append(url)
            self.log.info('get url %s to urls' %url)
        return urls

    def getProxyList(self, urls):
        """Load each URL in PhantomJS and return one ``Item`` per table row.

        Cell texts are stored UTF-8 encoded.  The browser is always closed,
        even when loading or parsing a page raises.
        """
        browser = webdriver.PhantomJS()
        proxyList = []
        try:
            # Implicit wait for element lookups; the value does not change
            # between pages, so it is set once.
            browser.implicitly_wait(5)
            for url in urls:
                browser.get(url)
                for element in browser.find_elements_by_xpath('//tbody/tr'):
                    # A new Item per row: reusing one object would make every
                    # list entry alias the last row scraped.
                    item = Item()
                    for column, field in enumerate(_FIELDS, 1):
                        cell = element.find_element_by_xpath('./td[%d]' % column)
                        setattr(item, field, cell.text.encode('utf8'))
                    proxyList.append(item)
                    self.log.info('add proxy %s:%s to list' %(item.ip, item.port))
        finally:
            browser.quit()
        return proxyList

    def saveFile(self, fileName, proxyList):
        """Write *proxyList* to *fileName*, one tab-separated row per line."""
        self.log.info('add all proxy to %s' %fileName)
        with open(fileName, 'w') as fp:
            for item in proxyList:
                fp.write('\t'.join(getattr(item, field) for field in _FIELDS) + '\n')


if __name__ == '__main__':
    GP = GetProxy()
六、獲取漫畫截圖
from selenium import webdriver
from mylog import MyLog as mylog
import os
import time
class GetCartoon(object):
    """Save every page of one comic chapter as a PNG screenshot.

    All work happens in the constructor: open the chapter in PhantomJS,
    screenshot each page into a directory named after the comic title, then
    close the browser.
    """

    def __init__(self):
        # Alternative chapter: u'http://www.1kkk.com/ch1-406302/'
        self.startUrl = u'http://www.1kkk.com/ch1-397573/'
        self.log = mylog()
        self.browser = self.getBrowser()
        try:
            self.saveCartoon(self.browser)
        finally:
            # Close PhantomJS even when saving fails part-way through.
            self.browser.quit()

    def getBrowser(self):
        """Start PhantomJS, try to open ``startUrl`` and return the browser.

        A failure to load the page is logged but not raised, so the caller
        always receives a browser object.
        """
        browser = webdriver.PhantomJS()
        try:
            browser.get(self.startUrl)
        except Exception:
            # Log through the instance's logger; calling error() on the
            # imported class itself would raise instead of logging.
            self.log.error('open the %s failed' %self.startUrl)
        browser.implicitly_wait(20)
        return browser

    def saveCartoon(self, browser):
        """Screenshot every page of the chapter loaded in *browser*.

        Files are written as ``<title>/<page>.png``, where the title is the
        part of the page title before the first ``_``.  The page count is
        read from the page itself.
        """
        cartoonTitle = browser.title.split('_')[0]
        self.createDir(cartoonTitle)
        sumPage = int(browser.find_element_by_xpath('//font[@class="zf40"]/span[2]').text)
        for page in range(1, sumPage + 1):
            imgName = str(page) + '.png'
            # Write into the title directory by path rather than changing the
            # process-wide working directory.
            if browser.get_screenshot_as_file(os.path.join(cartoonTitle, imgName)):
                self.log.info('save img %s' %imgName)
            else:
                self.log.error('save img %s failed' %imgName)
            if page < sumPage:
                # Advance only while pages remain; clicking "next" after the
                # last page would just cost another wait.
                browser.find_element_by_id('next').click()
                # Fixed pause before the next screenshot.
                time.sleep(5)
        self.log.info('save img sccess')

    def createDir(self, dirName):
        """Create *dirName*, logging the outcome; failures are not raised."""
        if os.path.exists(dirName):
            self.log.error('create directory %s failed, hava a same name file or directory' %dirName)
            return
        try:
            os.makedirs(dirName)
        except (OSError, UnicodeError):
            self.log.error('create directory %s failed' %dirName)
        else:
            self.log.info('create directory %s success' %dirName)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    GC = GetCartoon()