1. 程式人生 > htmlunit爬取js非同步載入後的頁面

htmlunit爬取js非同步載入後的頁面

直接上程式碼:

一、 index.html
頁面載入完成後，以 POST 呼叫後臺介面，並將回傳的資料（轉為 JSON 字串）填入 id 為 content 的 div 中。

<html>
<head>
    <script type="text/javascript" src="./jquery.min.js"></script>
</head>
<body>
<h2>Hello World!</h2>
<div id="content"></div>
<script type="text/javascript">
// Runs once the DOM is ready: POSTs to the backend and shows the reply,
// serialized as a JSON string, as the text of the #content element.
$(function () {
    function renderResponse(payload) {
        $("#content").text(JSON.stringify(payload));
    }

    $.post("/evh/test/testList", {}, renderResponse);
});
</script>
</body>
</html>

二、TestController.java
/test/testList介面從後臺資料庫獲取資料。

package com.everhomes.proxy.controller;

import javax.annotation.Resource;

import org.slf
4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.web.bind.annotation.ExceptionHandler; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; import com.everhomes.proxy.mapper.TestMapper; @RestController @RequestMapping("/test"
) public class TestController { private static final Logger logger = LoggerFactory.getLogger(TestController.class); @Resource private TestMapper testMapper; @RequestMapping("testList") public Object testList(){ return testMapper.testList(); }; @ExceptionHandler(Exception.class) public Object exception(Exception e){ logger.error("error: ", e); return "error: " + e.toString(); } }

三、Crawler.java

package com.everhomes.generate;

import java.io.IOException;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Command-line example that loads a page with HtmlUnit, gives its background
 * JavaScript time to run, and saves the resulting DOM as XML.
 */
public class Crawler {

    /**
     * Fetches {@code http://localhost:8080/evh/index.html} with JavaScript enabled,
     * waits for background scripts, and writes the page's XML to {@code cc.html}.
     *
     * @param args unused
     * @throws IOException if the page cannot be loaded
     * @throws InterruptedException declared for compatibility with the original signature
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // try-with-resources closes the client even when loading or saving fails;
        // the original only closed it on the success path.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setRedirectEnabled(true);
            // Script errors on the page should not abort the crawl.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setTimeout(50000);

            HtmlPage rootPage = webClient.getPage("http://localhost:8080/evh/index.html");
            // Block until pending background JavaScript (such as the page's AJAX
            // call) has finished, or until the given timeout elapses.
            webClient.waitForBackgroundJavaScript(10000);

            // NOTE(review): FileUtils and DIRECTORY are not declared in this snippet;
            // presumably a project-local helper and an output-directory constant — confirm.
            FileUtils.createFile(DIRECTORY + "cc.html", rootPage.asXml());
        }
    }
}

四、pom.xml
新增相關依賴。


    <dependency>
        <groupId>commons-lang</groupId>
        <artifactId>commons-lang</artifactId>
        <version>2.6</version>
    </dependency>
    <!-- Keep htmlunit-core-js on the same release as htmlunit: an explicit
         older version here overrides the one htmlunit resolves transitively,
         leaving the JavaScript engine out of step with the library. -->
    <dependency>
        <groupId>net.sourceforge.htmlunit</groupId>
        <artifactId>htmlunit-core-js</artifactId>
        <version>2.25</version>
    </dependency>
    <dependency>
        <groupId>net.sourceforge.htmlunit</groupId>
        <artifactId>htmlunit</artifactId>
        <version>2.25</version>
    </dependency>