1. 程式人生 > >JAVA實現網頁抓取(htmlunit)

JAVA實現網頁抓取(htmlunit)

準備條件

加入依賴jar包

<!-- HtmlUnit: headless browser library that provides the WebClient used by the sample code. -->
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.15</version>
</dependency>

程式碼示例

private WebClient initWc() throws IOException {
    WebClient wc = new WebClient
(BrowserVersion.CHROME); wc.getOptions().setJavaScriptEnabled(false); wc.getOptions().setCssEnabled(false); wc.getOptions().setTimeout(8000); wc.setJavaScriptTimeout(8000); wc.setAjaxController(new NicelyResynchronizingAjaxController()); wc.waitForBackgroundJavaScript(8000); // Cache cache=new Cache();
// wc.setCache(cache); wc.getOptions().setThrowExceptionOnScriptError(false); // wc.getOptions().setThrowExceptionOnFailingStatusCode(false); return wc; } public void loadData() { WebClient wc = null; if ( wc == null ) { try { wc = initWc(); } catch
(IOException e) { e.printStackTrace(); } } try { //圖片中文字解析時使用 IIORegistry registry = IIORegistry.getDefaultInstance(); registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageWriterSpi()); registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageReaderSpi()); StringBuffer errPage =new StringBuffer(); for(int i =1 ; i<=97;i++){ loadPage(i,errPage,wc); riskCompanyDao.flush(); } log.info("errPage:"+errPage); // loadPage(27,errPage,wc); } catch (Exception e) { log.warn("loadData error! ", e); } finally { wc.closeAllWindows(); } } private void loadPage(int pageNo,StringBuffer errPage, WebClient wc){ HtmlPage page; try { String refer="http://www.baidu.com/"; URL link=new URL("http://www.kstba.org/minglu-79-"+pageNo+".html"); WebRequest request=new WebRequest(link); request.setCharset("UTF-8"); request.setAdditionalHeader("Referer", refer);//設定請求報文頭裡的refer欄位 ////設定請求報文頭裡的User-Agent欄位 request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36"); request.setAdditionalHeader("Connection", "keep-alive"); request.setAdditionalHeader("Cookie", "ad_play_index=47; CNZZDATA1000215585=2014872656-1449554771-%7C1449572770"); page = wc.getPage(request); HtmlPage pageResult = page; HtmlTable tableResult = (HtmlTable) pageResult.getElementsByTagName("table").get(0); HtmlTableBody body = (HtmlTableBody) tableResult.getChildNodes().get(1); int indexRow = 0; for ( DomNode node2 : body.getChildNodes() ) { if (node2 instanceof HtmlTableRow ) { HtmlTableRow row = (HtmlTableRow) node2; List<HtmlTableCell> cells = row.getCells(); HtmlTableCell cell0=cells.get(0); String companyName = cell0.getElementsByTagName("a").get(0).getTextContent(); String industryName = cell0.getElementsByTagName("div").get(0).getTextContent(); industryName = industryName.split(":")[1]; String addr = cell0.getElementsByTagName("div").get(1).getTextContent(); if 
(addr.split(":").length>1){ addr = addr.split(":")[1]; }else{ addr=null; } String mobile =null; if (cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").size()>0){ HtmlImage img =(HtmlImage)cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").get(0); String imgStr =img.getAttribute("src"); imgStr =imgStr.substring(0,imgStr.indexOf("&font=")).replace("fontsize=12", "fontsize=22"); mobile = ImageRead.getImgStr(imgStr); log.info("mobile:"+mobile); } } indexRow++; } } catch (Exception e) { errPage.append(pageNo).append(","); log.warn("page error :"+pageNo,e); } }

注意事項

  1. 普通的httpConnection容易被攔截,需設定請求報文頭,模擬瀏覽器請求
  2. WebClient在請求發起前初始化一次即可
  3. 不同瀏覽器版本返回的html程式碼有一定差異,需單獨除錯