程式人生 > Selenium&&PhantomJS獲取網站中的JS返回的資料

Selenium&&PhantomJS獲取網站中的JS返回的資料

一、安裝Selenium模組

pip install selenium
       Selenium 是一套完整的Web應用程式測試系統,包含了測試的錄製、編寫及執行和測試的並行處理。

二、安裝PhantomJS(官網下載:http://phantomjs.org/)

        下載後將PhantomJS的執行檔放在python安裝目錄,與python.exe位於同一個資料夾下。Selenium和PhantomJS配合,可以模擬瀏覽器獲取由JavaScript動態產生的資料。

三、執行環境

     (1) win7 

    (2) python 2.7

    (3) pycharm

四、獲取百度搜索結果

from selenium import webdriver

# Fetch the Baidu home page with a headless PhantomJS session, print its
# title and visible text, and save the rendered HTML to 2.html.
browser = webdriver.PhantomJS()
try:
    # Let element lookups wait up to 10 seconds before giving up.
    browser.implicitly_wait(10)
    browser.get('https://www.baidu.com')
    # '/*' selects the document's root element, so .text is the whole page text.
    data = browser.find_element_by_xpath('/*')
    # Single-argument print(...) behaves the same as the print statement.
    print(browser.title)
    print(data.text)
    # The page source is encoded to UTF-8 bytes explicitly, so the file is
    # opened in binary mode and the bytes are written unmodified.
    with open('2.html', 'wb') as fp:
        fp.write(browser.page_source.encode('utf8'))
finally:
    # Always shut PhantomJS down, even when loading or parsing fails;
    # otherwise the background process is left running.
    browser.quit()

五、獲取ip代理

(1)mylog.py

import logging
import getpass
import sys


#### Definition of the MyLog class
class MyLog(object):
	"""Small logging facade that sends every record to a log file and the console.

	The underlying logger is named after the current user (getpass.getuser()).
	The log file sits next to the running script: the path in sys.argv[0]
	with its extension replaced by ".log".
	"""

	def __init__(self):
		"""Set up the logger, its formatter, and the file/console handlers."""
		# Imported here because this module's import block does not include os.
		import os

		self.user = getpass.getuser()
		self.logger = logging.getLogger(self.user)
		self.logger.setLevel(logging.DEBUG)

		# Log file name. splitext strips only a real extension, so a script
		# path without ".py" is no longer truncated by three characters.
		self.logFile = os.path.splitext(sys.argv[0])[0] + '.log'
		self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')

		# logging.getLogger returns the same logger object for the same name,
		# so a second MyLog() must reuse the handlers already attached;
		# adding them again would emit every record twice.
		target = os.path.abspath(self.logFile)
		self.logHand = None
		self.logHandSt = None
		for handler in self.logger.handlers:
			if isinstance(handler, logging.FileHandler):
				# FileHandler subclasses StreamHandler, so test it first.
				if handler.baseFilename == target:
					self.logHand = handler
			elif isinstance(handler, logging.StreamHandler):
				self.logHandSt = handler

		# Records go both to the log file and to the console.
		if self.logHand is None:
			self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
			self.logHand.setFormatter(self.formatter)
			self.logHand.setLevel(logging.DEBUG)
			self.logger.addHandler(self.logHand)

		if self.logHandSt is None:
			self.logHandSt = logging.StreamHandler()
			self.logHandSt.setFormatter(self.formatter)
			self.logHandSt.setLevel(logging.DEBUG)
			self.logger.addHandler(self.logHandSt)

	# The five log levels map onto the five methods below.
	def debug(self,msg):
		"""Log msg at DEBUG level."""
		self.logger.debug(msg)

	def info(self,msg):
		"""Log msg at INFO level."""
		self.logger.info(msg)

	def warn(self,msg):
		"""Log msg at WARNING level."""
		# Logger.warn is a deprecated alias; Logger.warning is the supported name.
		self.logger.warning(msg)

	def error(self,msg):
		"""Log msg at ERROR level."""
		self.logger.error(msg)

	def critical(self,msg):
		"""Log msg at CRITICAL level."""
		self.logger.critical(msg)
(2)ip.py
from selenium import webdriver
from mylog import MyLog as mylog


class Item(object):
	"""Plain record describing one proxy entry; every field defaults to None.

	Fields: ip (proxy address), port (proxy port), anonymous (whether the
	proxy is anonymous), type (proxy type), support (supported protocols),
	local (physical location), speed (proxy speed).
	"""
	ip = port = anonymous = None
	type = support = local = speed = None

class GetProxy(object):
	"""Scrape the kuaidaili proxy listing with PhantomJS and save it to a file.

	Instantiating the class runs the whole job: build the page URLs, scrape
	every table row into an Item, and write the rows to proxy.txt.
	"""

	# Item attribute names in table-column order (td[1] .. td[7]); the saved
	# file uses the same column order.
	_fields = ('ip', 'port', 'anonymous', 'type', 'support', 'local', 'speed')

	def __init__(self):
		self.startUrl = 'http://www.kuaidaili.com/proxylist/'
		self.log = mylog()
		self.urls = self.getUrls()
		self.proxyList = self.getProxyList(self.urls)
		self.fileName = 'proxy.txt'
		self.saveFile(self.fileName, self.proxyList)

	def getUrls(self):
		"""Return the listing URLs for pages 1 through 10, logging each one."""
		urls = []
		for i in range(1, 11):
			url = self.startUrl + str(i)
			urls.append(url)
			self.log.info('get url %s to urls' %url)
		return urls

	def getProxyList(self, urls):
		"""Load each URL in PhantomJS and return one Item per table row.

		Cell texts are stored UTF-8 encoded, matching what saveFile writes.
		"""
		browser = webdriver.PhantomJS()
		proxyList = []
		try:
			# The implicit wait is a session-wide setting, so set it once.
			browser.implicitly_wait(5)
			for url in urls:
				browser.get(url)
				for element in browser.find_elements_by_xpath('//tbody/tr'):
					# A fresh Item per row: reusing a single instance would make
					# every list entry alias the same (last-scraped) row.
					item = Item()
					for column, field in enumerate(self._fields, 1):
						cell = element.find_element_by_xpath('./td[%d]' % column)
						setattr(item, field, cell.text.encode('utf8'))
					proxyList.append(item)
					self.log.info('add proxy %s:%s to list' %(item.ip, item.port))
		finally:
			# Shut PhantomJS down even if a page fails to load or parse.
			browser.quit()
		return proxyList

	def saveFile(self, fileName, proxyList):
		"""Write one tab-separated line per item to fileName, overwriting it."""
		self.log.info('add all proxy to %s' %fileName)
		with open(fileName, 'w') as fp:
			for item in proxyList:
				fp.write('\t'.join([getattr(item, field) for field in self._fields]) + '\n')

# Script entry point: constructing GetProxy runs the whole job, because its
# __init__ builds the URLs, scrapes the proxies and saves them to proxy.txt.
if __name__ == '__main__':
	GP = GetProxy()

六、獲取漫畫截圖
from selenium import webdriver
from mylog import MyLog as mylog
import os
import time

class GetCartoon(object):
    """Save a screenshot of every page of one comic chapter using PhantomJS.

    Instantiating the class runs the whole job: open the chapter URL, create
    a directory named after the page title, and store one PNG per page in it.
    """

    def __init__(self):
        # Alternative chapter URL: u'http://www.1kkk.com/ch1-406302/'
        self.startUrl = u'http://www.1kkk.com/ch1-397573/'
        self.log = mylog()
        self.browser = self.getBrowser()
        try:
            self.saveCartoon(self.browser)
        finally:
            # Shut PhantomJS down even when saving fails part-way through.
            self.browser.quit()

    def getBrowser(self):
        """Start PhantomJS, try to open startUrl, and return the driver.

        A failed page load is logged rather than raised, so the driver is
        returned either way.
        """
        browser = webdriver.PhantomJS()
        try:
            browser.get(self.startUrl)
        except Exception:
            # Log through the instance's logger; calling error() on the
            # mylog class itself has no instance to log with.
            self.log.error('open the %s failed' %self.startUrl)
        browser.implicitly_wait(20)
        return browser

    def saveCartoon(self, browser):
        """Screenshot each page of the chapter into a directory named by title.

        The directory name is the part of the page title before the first
        underscore. Note that this changes the process working directory.
        """
        cartoonTitle = browser.title.split('_')[0]
        self.createDir(cartoonTitle)
        os.chdir(cartoonTitle)
        # Total page count as shown on the page.
        sumPage = int(browser.find_element_by_xpath('//font[@class="zf40"]/span[2]').text)
        for page in range(1, sumPage + 1):
            imgName = str(page) + '.png'
            browser.get_screenshot_as_file(imgName)
            self.log.info('save img %s' %imgName)
            # Only advance while pages remain; clicking "next" after the
            # final screenshot is pointless and may fail.
            if page < sumPage:
                browser.find_element_by_id('next').click()
                # Fixed pause before the next screenshot, presumably to let
                # the next page finish rendering.
                time.sleep(5)
        self.log.info('save img sccess')

    def createDir(self, dirName):
        """Create dirName, logging the outcome; never raises on failure."""
        if os.path.exists(dirName):
            self.log.error('create directory %s failed, hava a same name file or directory' %dirName)
            return
        try:
            os.makedirs(dirName)
        except OSError:
            self.log.error('create directory %s failed' %dirName)
        else:
            self.log.info('create directory %s success' %dirName)
            
# Script entry point: constructing GetCartoon runs the whole job, because its
# __init__ opens the chapter, saves the screenshots and quits the browser.
if __name__ == '__main__':
    GC = GetCartoon()