1. 程式人生 > >Java爬蟲進階-phantomJS+selenium2抓取網站圖片和小說

Java爬蟲進階-phantomJS+selenium2抓取網站圖片和小說

閒來無事,應小夥伴要求,最近寫了一個專門爬取小說和美女圖片的爬蟲工具類,有不足之處歡迎小夥伴們指出。

準備工作:

            新建maven工程,匯入pom依賴如下:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>zhy.crawler</groupId>
  <artifactId>zhy_crawler</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
	<properties>
		<jonguoLib.version>0.0.1</jonguoLib.version>
		<HighEncoder.version>0.0.1</HighEncoder.version>
		<RedisTool.version>0.0.2</RedisTool.version>
		<comLog.version>1.1.1</comLog.version>
		<slf4j.version>1.6.1</slf4j.version>
		<jackson.version>1.9.13</jackson.version>
		<spring.version>4.0.2.RELEASE</spring.version>
	</properties>
	
	<dependencies>
	    <dependency>
			<groupId>net.sf.json-lib</groupId>
			<artifactId>json-lib</artifactId>
			<version>2.4</version>
			<classifier>jdk15</classifier><!-- 指定jdk版本 -->
		</dependency>
		<dependency>
			<groupId>com.jonguo</groupId>
			<artifactId>JonguoLib</artifactId>
			<version>${jonguoLib.version}</version>
		</dependency>
		<dependency>
			<groupId>com.jonguo</groupId>
			<artifactId>HighEncoder</artifactId>
			<version>${HighEncoder.version}</version>
		</dependency>
		<dependency>
			<groupId>com.jonguo</groupId>
			<artifactId>RedisTool</artifactId>
			<version>${RedisTool.version}</version>
		</dependency>
		<dependency>
			<groupId>redis.clients</groupId>
			<artifactId>jedis</artifactId>
			<version>2.9.0</version>
		</dependency>
	 	<dependency>
	 		<groupId>commons-logging</groupId>
	 		<artifactId>commons-logging</artifactId>
	 		<version>${comLog.version}</version>
	 	</dependency>
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-api</artifactId>
			<version>${slf4j.version}</version>
		</dependency>
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-log4j12</artifactId>
			<version>${slf4j.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-core-asl</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-core-lgpl</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-jaxrs</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-mapper-asl</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-mapper-lgpl</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-smile</artifactId>
			<version>${jackson.version}</version>
		</dependency>
		<dependency>
			<groupId>org.codehaus.jackson</groupId>
			<artifactId>jackson-xc</artifactId>
			<version>${jackson.version}</version>
		</dependency>		
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-aop</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-beans</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-context-support</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-context</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-core</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-expression</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-web</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-webmvc</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-webmvc-portlet</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.mortbay.jetty</groupId>
			<artifactId>servlet-api-2.5</artifactId>
			<version>6.1.14</version>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.aspectj</groupId>
			<artifactId>aspectjweaver</artifactId>
			<version>1.6.8</version>
		</dependency>
		<dependency>
			<groupId>org.freemarker</groupId>
			<artifactId>freemarker</artifactId>
			<version>2.3.16</version>
		</dependency>
		<dependency>
			<groupId>joda-time</groupId>
			<artifactId>joda-time</artifactId>
			<version>2.7</version>
		</dependency>
		<dependency>
			<groupId>commons-lang</groupId>
			<artifactId>commons-lang</artifactId>
			<version>2.6</version>
		</dependency>
		<dependency>
			<groupId>org.apache.activemq</groupId>
			<artifactId>activemq-all</artifactId>
			<version>5.14.1</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-jms</artifactId>
			<version>${spring.version}</version>
		</dependency>
		
		<!-- https://mvnrepository.com/artifact/com.codeborne/phantomjsdriver -->
		<dependency>
		    <groupId>com.codeborne</groupId>
		    <artifactId>phantomjsdriver</artifactId>
		    <version>1.2.1</version>
		</dependency>
		
	</dependencies>
	

	<build>
		<finalName>${project.artifactId}-${project.version}</finalName>
		<sourceDirectory>${basedir}/src/main/java</sourceDirectory>
		<outputDirectory>${basedir}/src/main/webapp/WEB-INF/classes</outputDirectory>
		<plugins>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-compiler-plugin</artifactId>
				<configuration>
					<source>1.8</source>
					<target>1.8</target>
					<encoding>UTF-8</encoding>
				</configuration>
			</plugin>
			<plugin>
				<artifactId>maven-resources-plugin</artifactId>
				<executions>
					<execution>
						<id>copy-resources</id>
						<phase>process-resources</phase>
						<goals>
							<goal>copy-resources</goal>
						</goals>
						<configuration>
							<overwrite>true</overwrite>
							<sourceDirectory>${basedir}/src/main/java</sourceDirectory>
							<outputDirectory>${basedir}/src/main/webapp/WEB-INF/classes</outputDirectory>
							<resources>
								<resource>
									<directory>src/main/resources/env/${env}</directory>
									<targetPath>${basedir}/src/main/webapp/WEB-INF/classes/conf</targetPath>
								</resource>
							</resources>
						</configuration>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>
</project>

工具類原始碼如下(小說):

package com.zhy.crawler.base;

import java.io.File;
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;

/**
 * Novel crawling tool: drives a headless PhantomJS browser through Selenium,
 * collects the chapter links from a book's index page, visits every chapter and
 * appends its text to a single local file.
 *
 * @author John_Hawkings
 * @version V1.0 (2018-06-01)
 */
public class NovelCrawler {

	/** Location of the PhantomJS executable handed to the driver. */
	private static final String PHANTOMJS_PATH = "D:\\Devlop\\PhantomJS\\phantomjs-2.1.1-windows\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe";

	/** Index page listing all chapters of the book. */
	private static final String INDEX_URL = "http://www.biqule.com/book_57885/";

	/** File that receives the text of all chapters, in index order. */
	private static final String OUTPUT_FILE = "D:\\Novel\\極道天魔.txt";

	/**
	 * Crawls the book at {@link #INDEX_URL} and appends every chapter to
	 * {@link #OUTPUT_FILE}. Failures are printed to stderr; the browser is always shut down.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		// Capabilities required by the headless browser.
		DesiredCapabilities dcaps = new DesiredCapabilities();
		// Accept SSL certificates.
		dcaps.setCapability("acceptSslCerts", true);
		// Screenshot support.
		dcaps.setCapability("takesScreenshot", true);
		// CSS selector support.
		dcaps.setCapability("cssSelectorsEnabled", true);
		// JavaScript support.
		dcaps.setJavascriptEnabled(true);
		// Path of the PhantomJS binary.
		dcaps.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, PHANTOMJS_PATH);

		// Create the headless browser.
		PhantomJSDriver driver = new PhantomJSDriver(dcaps);
		try {
			driver.get(INDEX_URL);
			Thread.sleep(1000L);

			// Gather all hrefs up front: element handles from the index page
			// cannot be reused once the driver has navigated elsewhere.
			List<String> linkLst = collectChapterLinks(driver);
			if (linkLst.isEmpty()) {
				System.err.println("No chapter links found at " + INDEX_URL);
				return;
			}

			File file = new File(OUTPUT_FILE);
			File parent = file.getParentFile();
			if (parent != null) {
				parent.mkdirs();
				if (!parent.isDirectory()) {
					throw new IllegalStateException("Cannot create output directory: " + parent);
				}
			}

			// Set once; the timeout stays in effect for the lifetime of the driver.
			driver.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS);

			// Append mode keeps the original "add to existing file" behaviour;
			// try-with-resources closes the stream even when a chapter fails.
			try (FileOutputStream output = new FileOutputStream(file, true)) {
				for (String link : linkLst) {
					driver.get(link);
					String text = driver.findElementById("content").getText();
					// Explicit charset: the no-arg getBytes() depends on the platform default.
					output.write(text.getBytes(java.nio.charset.StandardCharsets.UTF_8));
				}
			}
		} catch (InterruptedException e) {
			// Restore the interrupt flag so callers/threads above can observe it.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// quit() closes every window and ends the session; a preceding
			// close() is redundant and can make quit() fail on a dead session.
			driver.quit();
		}
	}

	/**
	 * Reads the chapter links from the currently loaded index page.
	 *
	 * @param driver browser positioned on the index page
	 * @return href of the first anchor inside each {@code dd} of the
	 *         {@code article-list} element, in page order; entries without an
	 *         anchor or without an href are skipped
	 */
	private static List<String> collectChapterLinks(PhantomJSDriver driver) {
		WebElement listing = driver.findElementByClassName("article-list");
		List<String> links = new ArrayList<>();
		for (WebElement entry : listing.findElements(By.tagName("dd"))) {
			// A single findElements call replaces the exists-then-find double lookup.
			List<WebElement> anchors = entry.findElements(By.tagName("a"));
			if (anchors.isEmpty()) {
				continue;
			}
			String href = anchors.get(0).getAttribute("href");
			if (href != null && !href.isEmpty()) {
				links.add(href);
			}
		}
		return links;
	}

	/**
	 * Tells whether {@code element} has a descendant matching {@code selector}.
	 *
	 * @param element  element to search under
	 * @param selector locator of the descendant
	 * @return {@code true} if a match is found, {@code false} if the lookup
	 *         raises {@link NoSuchElementException}
	 */
	public static boolean doesWebElementExist(WebElement element, By selector) {
		try {
			element.findElement(selector);
			return true;
		} catch (NoSuchElementException e) {
			return false;
		}
	}

}

工具類原始碼如下(圖片):

package com.zhy.crawler.base;

import java.awt.image.BufferedImage;
import java.io.File;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import javax.imageio.ImageIO;

import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;

/**
 * Image crawling tool: drives a headless PhantomJS browser through Selenium,
 * collects the image links from every page of one gallery and downloads them
 * to a local directory as JPEG files.
 *
 * @author John_Hawkings
 * @version V1.0 (2018-06-01)
 */
public class GirlsCrawler {

	/** Location of the PhantomJS executable handed to the driver. */
	private static final String PHANTOMJS_PATH = "D:\\Devlop\\PhantomJS\\phantomjs-2.1.1-windows\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe";

	/** Gallery URL without the page suffix; page 1 is {@code .html}, page n is {@code -n.html}. */
	private static final String GALLERY_URL_PREFIX = "http://www.94img.com/photos/QingDouKe-16286";

	/** XPath of the element that contains the pagination links. */
	private static final String PAGER_XPATH = "//*[@id=\"bodywrap\"]/table/tbody/tr/td/div/div[1]/div/div[10]";

	/** Directory that receives the downloaded images. */
	private static final String OUTPUT_DIR = "D:\\Novel\\Girls\\2\\";

	/** Connect/read timeout for image downloads, so a stalled server cannot hang the run. */
	private static final int HTTP_TIMEOUT_MS = 10000;

	/**
	 * Crawls all pages of the gallery and downloads every image found.
	 * Failures are printed to stderr; the browser is always shut down.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		// Capabilities required by the headless browser.
		DesiredCapabilities dcaps = new DesiredCapabilities();
		// Accept SSL certificates.
		dcaps.setCapability("acceptSslCerts", true);
		// Screenshot support.
		dcaps.setCapability("takesScreenshot", true);
		// CSS selector support.
		dcaps.setCapability("cssSelectorsEnabled", true);
		// JavaScript support.
		dcaps.setJavascriptEnabled(true);
		// Path of the PhantomJS binary.
		dcaps.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, PHANTOMJS_PATH);

		// Create the headless browser.
		PhantomJSDriver driver = new PhantomJSDriver(dcaps);
		try {
			// First gallery page.
			driver.get(GALLERY_URL_PREFIX + ".html");
			Thread.sleep(1000L);
			List<String> linkLst = new ArrayList<String>();
			collectImageLinks(driver, linkLst);

			// Page count must be read while the first page is still loaded.
			int pageCount = readPageCount(driver);

			// Set once; the timeout stays in effect for the lifetime of the driver.
			driver.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS);
			for (int page = 2; page <= pageCount; page++) {
				driver.get(GALLERY_URL_PREFIX + "-" + page + ".html");
				collectImageLinks(driver, linkLst);
			}

			downloadImages(linkLst);
		} catch (InterruptedException e) {
			// Restore the interrupt flag so callers/threads above can observe it.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// quit() closes every window and ends the session; a preceding
			// close() is redundant and can make quit() fail on a dead session.
			driver.quit();
		}
	}

	/**
	 * Appends the {@code src} of the first {@code img} inside every
	 * {@code gallary_item} element of the current page to {@code sink}.
	 * Items without an image or without a {@code src} are skipped.
	 *
	 * @param driver browser positioned on a gallery page
	 * @param sink   list receiving the image URLs, in page order
	 */
	private static void collectImageLinks(PhantomJSDriver driver, List<String> sink) {
		for (WebElement item : driver.findElementsByClassName("gallary_item")) {
			List<WebElement> images = item.findElements(By.tagName("img"));
			if (images.isEmpty()) {
				continue;
			}
			String src = images.get(0).getAttribute("src");
			if (src != null && !src.isEmpty()) {
				sink.add(src);
			}
		}
	}

	/**
	 * Reads the number of gallery pages from the pager on the current page.
	 * The text of the second-to-last pager link is taken as the last page number.
	 *
	 * @param driver browser positioned on the first gallery page
	 * @return the page count, or 1 when the pager is missing, has fewer than
	 *         two links, or its text is not a number (single-page gallery)
	 */
	private static int readPageCount(PhantomJSDriver driver) {
		try {
			WebElement pager = driver.findElementByXPath(PAGER_XPATH);
			List<WebElement> pageElements = pager.findElements(By.tagName("a"));
			if (pageElements.size() < 2) {
				return 1;
			}
			String pageSize = pageElements.get(pageElements.size() - 2).getText();
			return Integer.parseInt(pageSize.trim());
		} catch (NoSuchElementException e) {
			return 1;
		} catch (NumberFormatException e) {
			return 1;
		}
	}

	/**
	 * Downloads every link to {@code OUTPUT_DIR} as {@code <index>.jpeg}
	 * (1-based). A failing image is reported and skipped; it does not stop
	 * the remaining downloads.
	 *
	 * @param linkLst image URLs to download
	 */
	private static void downloadImages(List<String> linkLst) {
		File dir = new File(OUTPUT_DIR);
		dir.mkdirs();
		if (!dir.isDirectory()) {
			throw new IllegalStateException("Cannot create output directory: " + dir);
		}
		for (int k = 0; k < linkLst.size(); k++) {
			String link = linkLst.get(k);
			try {
				if (downloadImage(link, new File(dir, (k + 1) + ".jpeg"))) {
					// Report success only after the file has actually been written.
					System.out.println("第"+(k+1)+"張圖片下載成功");
				} else {
					System.err.println("第"+(k+1)+"張圖片下載失敗: " + link);
				}
			} catch (java.io.IOException e) {
				System.err.println("第"+(k+1)+"張圖片下載失敗: " + link);
				e.printStackTrace();
			}
		}
	}

	/**
	 * Fetches one image over HTTP and writes it to {@code target} as JPEG.
	 *
	 * @param link   URL of the image
	 * @param target file to write
	 * @return {@code true} if the image was written; {@code false} if the URL
	 *         is not HTTP(S), the response status is not 200, the payload is
	 *         not a decodable image, or no JPEG writer accepted it
	 * @throws java.io.IOException on malformed URLs and network or file errors
	 */
	private static boolean downloadImage(String link, File target) throws java.io.IOException {
		java.net.URLConnection raw = new URL(link).openConnection();
		if (!(raw instanceof HttpURLConnection)) {
			return false;
		}
		HttpURLConnection connection = (HttpURLConnection) raw;
		connection.setConnectTimeout(HTTP_TIMEOUT_MS);
		connection.setReadTimeout(HTTP_TIMEOUT_MS);
		try {
			if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
				return false;
			}
			BufferedImage image;
			// ImageIO.read does not close the stream it is given.
			try (java.io.InputStream in = connection.getInputStream()) {
				image = ImageIO.read(in);
			}
			if (image == null) {
				// No registered reader could decode the payload.
				return false;
			}
			// write() returns false when no suitable writer is found.
			return ImageIO.write(image, "jpeg", target);
		} finally {
			connection.disconnect();
		}
	}

	/**
	 * Tells whether {@code element} has a descendant matching {@code selector}.
	 *
	 * @param element  element to search under
	 * @param selector locator of the descendant
	 * @return {@code true} if a match is found, {@code false} if the lookup
	 *         raises {@link NoSuchElementException}
	 */
	public static boolean doesWebElementExist(WebElement element, By selector) {
		try {
			element.findElement(selector);
			return true;
		} catch (NoSuchElementException e) {
			return false;
		}
	}

}
思路:

      獲取小說或者圖片的所有連結,加入到一個集合裡面,然後遍歷集合,依次訪問頁面獲取資料。有人會問:為什麼不在獲取主頁面資料時一次性寫完爬蟲?因為這裡面會存在一個小問題,就是在多個頁面跳轉的時候,先前取得的連結會失效。這個問題可以解決,但是有點繁瑣,這裡不多說;我們以最通俗易懂的方式來寫,方便記憶與學習。

總結:

phantomJS+selenium真的很好用,特別是它支援的截圖功能在特定的需求下完全就是個神器,這裡不多說,後面有更多的爬蟲例項演示。歡迎小夥伴留言互相交流