1. 程式人生 > >java呼叫百度搜索+Jsoup實現網路資源收集

java呼叫百度搜索+Jsoup實現網路資源收集

Jsoup核心jar包:Jsoup核心jar包下載地址
java程式碼:
抽象搜尋資源的實體:Webpage

package com.sinosoft.lhresource.search.common;

/**
 * Plain data holder describing one search hit: its title, link, short
 * summary and (optionally) the extracted body text.
 *
 * <p>All properties start out as {@code null} and are populated through the
 * setters.
 */
public class Webpage {

    /** Title of the page. */
    private String title;

    /** Link (URL) of the page. */
    private String url;

    /** Short summary / abstract of the page. */
    private String summary;

    /** Body text of the page. */
    private String content;

    /**
     * @return the page title, or {@code null} if it was never set
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * @param title the page title to store
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the page link, or {@code null} if it was never set
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * @param url the page link to store
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * @return the page summary, or {@code null} if it was never set
     */
    public String getSummary() {
        return this.summary;
    }

    /**
     * @param summary the page summary to store
     */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /**
     * @return the page body text, or {@code null} if it was never set
     */
    public String getContent() {
        return this.content;
    }

    /**
     * @param content the page body text to store
     */
    public void setContent(String content) {
        this.content = content;
    }
}

通過資源連結獲取資源內容:TextExtract.java、Tools.java


package com.sinosoft.lhresource.search.common;

import
java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class TextExtract { private static final Logger LOG = LoggerFactory.getLogger(TextExtract.class); private static List<String> lines; private final static int blocksWidth; private static int threshold; private static String html; private static boolean flag; private static int start; private static int end; private static StringBuilder text; private static ArrayList<Integer> indexDistribution; static { lines = new ArrayList<>(); indexDistribution = new ArrayList<>(); text = new StringBuilder(); blocksWidth = 3; flag = false; /* 當待抽取的網頁正文中遇到成塊的新聞標題未剔除時,只要增大此閾值即可。*/ /* 閾值增大,準確率提升,召回率下降;值變小,噪聲會大,但可以保證抽到只有一句話的正文 */ threshold = 86; } public static void setthreshold(int value) { threshold = value; } /** * 抽取網頁正文,不判斷該網頁是否是目錄型。即已知傳入的肯定是可以抽取正文的主題類網頁。 * * @param _html 網頁HTML字串 * * @return 網頁正文string */ public static String parse(String _html) { return parse(_html, false); } /** * 判斷傳入HTML,若是主題類網頁,則抽取正文;否則輸出<b>"unkown"</b>。 * * @param _html 網頁HTML字串 * @param _flag true進行主題類判斷, 省略此引數則預設為false * * @return 網頁正文string */ public static String parse(String _html, boolean _flag) { flag = _flag; html = _html; preProcess(); LOG.debug(html); return getText(); } private static void preProcess() { html = html.replaceAll("(?is)<!DOCTYPE.*?>", ""); html = html.replaceAll("(?is)<!--.*?-->", ""); // remove html comment html = html.replaceAll("(?is)<script.*?>.*?</script>", ""); // remove javascript html = html.replaceAll("(?is)<style.*?>.*?</style>", ""); // remove css html = html.replaceAll("&.{2,5};|&#.{2,5};", " "); // remove special char html = html.replaceAll("(?is)<.*?>", ""); //<!--[if !IE]>|xGv00|9900d21eb16fa4350a3001b3974a9415<![endif]--> } private static String getText() { lines = Arrays.asList(html.split("\n")); indexDistribution.clear(); for (int i = 0; i < lines.size() - blocksWidth; i++) { int wordsNum = 0; for (int j = i; j < i + blocksWidth; 
j++) { lines.set(j, lines.get(j).replaceAll("\\s+", "")); wordsNum += lines.get(j).length(); } indexDistribution.add(wordsNum); LOG.debug(wordsNum + ""); } start = -1; end = -1; boolean boolstart = false, boolend = false; text.setLength(0); for (int i = 0; i < indexDistribution.size() - 1; i++) { if (indexDistribution.get(i) > threshold && !boolstart) { if (indexDistribution.get(i + 1).intValue() != 0 || indexDistribution.get(i + 2).intValue() != 0 || indexDistribution.get(i + 3).intValue() != 0) { boolstart = true; start = i; continue; } } if (boolstart) { if (indexDistribution.get(i).intValue() == 0 || indexDistribution.get(i + 1).intValue() == 0) { end = i; boolend = true; } } StringBuilder tmp = new StringBuilder(); if (boolend) { LOG.debug(start + 1 + "\t\t" + end + 1); for (int ii = start; ii <= end; ii++) { if (lines.get(ii).length() < 5) { continue; } tmp.append(lines.get(ii)).append("\n"); } String str = tmp.toString(); LOG.debug(str); if (str.contains("Copyright") || str.contains("版權所有")) { continue; } text.append(str); boolstart = boolend = false; } } return text.toString(); } } package com.sinosoft.lhresource.search.common; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.net.URL; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class Tools { private static final Logger LOG = LoggerFactory.getLogger(Tools.class); public static String getHTMLContent(String url) { return getHTMLContent(url, "utf-8"); } public static String getHTMLContent(String url, String encoding) { try { BufferedReader reader = new BufferedReader(new InputStreamReader(new URL(url).openStream(),encoding)); StringBuilder html = new StringBuilder(); String line = reader.readLine(); while (line != null) { html.append(line).append("\n"); line = reader.readLine(); } String 
content = TextExtract.parse(html.toString()); return content; } catch (Exception e) { LOG.debug("解析URL失敗:" + url, e); } return null; } public static void copyFile(InputStream in, File outFile){ OutputStream out = null; try { byte[] data=readAll(in); out = new FileOutputStream(outFile); out.write(data, 0, data.length); out.close(); } catch (IOException ex) { LOG.error("檔案操作失敗",ex); } finally { try { if(in!=null){ in.close(); } } catch (IOException ex) { LOG.error("檔案操作失敗",ex); } try { if(out!=null){ out.close(); } } catch (IOException ex) { LOG.error("檔案操作失敗",ex); } } } public static byte[] readAll(InputStream in) { ByteArrayOutputStream out = new ByteArrayOutputStream(); try { byte[] buffer = new byte[1024]; for (int n; (n = in.read(buffer)) > 0;) { out.write(buffer, 0, n); } } catch (IOException e) { LOG.error("讀取失敗", e); } return out.toByteArray(); } }

自定義檢索介面:Searcher.java

package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Contract for a keyword-based web search that yields {@link Webpage} results.
 */
public interface Searcher {

    /**
     * Searches for the given keyword.
     *
     * @param keyword the search keyword
     * @return the web pages found
     */
    List<Webpage> search(String keyword);

    /**
     * Searches for the given keyword and returns one page of results.
     *
     * @param keyword the search keyword
     * @param page    the result page to fetch
     * @return the web pages found on that page
     */
    List<Webpage> search(String keyword, int page);
}

自定義處理百度檢索介面:BaiduSearcher.java,以及其抽象實現:AbstractBaiduSearcher.java


package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Extends {@link Searcher} with the Baidu vertical searches: news, Tieba,
 * Zhidao and Wenku. Every vertical comes in a plain and a paged variant.
 */
public interface BaiduSearcher extends Searcher {

    /**
     * News search.
     *
     * @param keyword the search keyword
     * @return the web pages found
     */
    List<Webpage> searchNews(String keyword);

    /**
     * News search (paged).
     *
     * @param keyword the search keyword
     * @param page    the result page to fetch
     * @return the web pages found on that page
     */
    List<Webpage> searchNews(String keyword, int page);

    /**
     * Tieba search.
     *
     * @param keyword the search keyword
     * @return the web pages found
     */
    List<Webpage> searchTieba(String keyword);

    /**
     * Tieba search (paged).
     *
     * @param keyword the search keyword
     * @param page    the result page to fetch
     * @return the web pages found on that page
     */
    List<Webpage> searchTieba(String keyword, int page);

    /**
     * Zhidao search.
     *
     * @param keyword the search keyword
     * @return the web pages found
     */
    List<Webpage> searchZhidao(String keyword);

    /**
     * Zhidao search (paged).
     *
     * @param keyword the search keyword
     * @param page    the result page to fetch
     * @return the web pages found on that page
     */
    List<Webpage> searchZhidao(String keyword, int page);

    /**
     * Wenku search.
     *
     * @param keyword the search keyword
     * @return the web pages found
     */
    List<Webpage> searchWenku(String keyword);

    /**
     * Wenku search (paged).
     *
     * @param keyword the search keyword
     * @param page    the result page to fetch
     * @return the web pages found on that page
     */
    List<Webpage> searchWenku(String keyword, int page);
}


package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Skeleton implementation of {@link BaiduSearcher}.
 *
 * <p>Each single-argument vertical search delegates to its paged counterpart
 * with page 1. The paged variants are not implemented here and throw
 * {@link UnsupportedOperationException}; subclasses override the ones they
 * support. The general {@code search} methods are left abstract.
 */
public abstract class AbstractBaiduSearcher implements BaiduSearcher {

    /**
     * News search; delegates to {@link #searchNews(String, int)} with page 1.
     *
     * @param keyword the search keyword
     * @return the web pages found
     */
    @Override
    public List<Webpage> searchNews(String keyword) {
        return searchNews(keyword, 1);
    }

    /**
     * News search (paged). Not implemented in this base class.
     *
     * @param keyword the search keyword
     * @param page    the result page to fetch
     * @return never returns normally in this base class
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchNews(String keyword, int page) {
        // UnsupportedOperationException is a RuntimeException, so existing
        // callers that catch RuntimeException keep working.
        throw new UnsupportedOperationException("未實現");
    }

    /**
     * Tieba search; delegates to {@link #searchTieba(String, int)} with page 1.
     *
     * @param keyword the search keyword
     * @return the web pages found
     */
    @Override
    public List<Webpage> searchTieba(String keyword) {
        return searchTieba(keyword, 1);
    }

    /**
     * Tieba search (paged). Not implemented in this base class.
     *
     * @param keyword the search keyword
     * @param page    the result page to fetch
     * @return never returns normally in this base class
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchTieba(String keyword, int page) {
        throw new UnsupportedOperationException("未實現");
    }

    /**
     * Zhidao search; delegates to {@link #searchZhidao(String, int)} with page 1.
     *
     * @param keyword the search keyword
     * @return the web pages found
     */
    @Override
    public List<Webpage> searchZhidao(String keyword) {
        return searchZhidao(keyword, 1);
    }

    /**
     * Zhidao search (paged). Not implemented in this base class.
     *
     * @param keyword the search keyword
     * @param page    the result page to fetch
     * @return never returns normally in this base class
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchZhidao(String keyword, int page) {
        throw new UnsupportedOperationException("未實現");
    }

    /**
     * Wenku search; delegates to {@link #searchWenku(String, int)} with page 1.
     *
     * @param keyword the search keyword
     * @return the web pages found
     */
    @Override
    public List<Webpage> searchWenku(String keyword) {
        return searchWenku(keyword, 1);
    }

    /**
     * Wenku search (paged). Not implemented in this base class.
     *
     * @param keyword the search keyword
     * @param page    the result page to fetch
     * @return never returns normally in this base class
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchWenku(String keyword, int page) {
        throw new UnsupportedOperationException("未實現");
    }
}

百度搜索+Jsoup實現資源收集:JSoupBaiduSearcher.java


package com.sinosoft.lhresource.search.common;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link BaiduSearcher} implementation that fetches a Baidu result page with
 * Jsoup and scrapes the title, link and summary of each hit using CSS
 * selectors.
 */
public class JSoupBaiduSearcher extends AbstractBaiduSearcher {

    private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);

    /** Number of results on one Baidu result page. */
    private static final int PAGE_SIZE = 10;

    /** Matches every character that is not an ASCII digit. */
    private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

    /**
     * Searches Baidu and returns the first page of results.
     *
     * @param keyword the search keyword
     * @return see {@link #search(String, int)}
     */
    @Override
    public List<Webpage> search(String keyword) {
        return search(keyword, 1);
    }

    /**
     * Searches Baidu and scrapes one page of results.
     *
     * <p>Only title, URL and summary are filled in on the returned
     * {@link Webpage} objects; the page body is not downloaded, so
     * {@code getContent()} stays {@code null}.
     *
     * @param keyword the search keyword; it is URL-encoded before being sent
     * @param page    1-based page number
     * @return the scraped results; an empty (or partial) list if the request
     *         failed with an I/O error; {@code null} if Baidu reported no results
     */
    @Override
    public List<Webpage> search(String keyword, int page) {
        List<Webpage> webpages = new ArrayList<>();
        try {
            // Baidu's "pn" parameter is not a page number but the offset of the first
            // result: page 1 -> pn=0, page 2 -> pn=10, ... i.e. (page - 1) * PAGE_SIZE.
            // The keyword must be URL-encoded, otherwise non-ASCII text, spaces, '&' or '#'
            // produce a broken query string.
            String url = "http://www.baidu.com/s?pn=" + (page - 1) * PAGE_SIZE
                    + "&wd=" + URLEncoder.encode(keyword, "UTF-8");

            Document document = Jsoup.connect(url).get();

            // Total number of hits reported by Baidu.
            int total = getBaiduSearchResultCount(document);
            if (total < 1) {
                // Kept as null (not an empty list) for backward compatibility with callers
                // that test for null to detect "no results".
                return null;
            }
            // Fewer hits than one full page: only look at that many entries.
            int len = Math.min(total, PAGE_SIZE);

            for (int i = 0; i < len; i++) {
                // Result containers carry their overall 1-based position as the element id.
                int resultId = i + 1 + (page - 1) * PAGE_SIZE;
                String titleCssQuery = "html body div div div div#content_left div#" + resultId + ".result.c-container h3.t a";
                String summaryCssQuery = "html body div div div div#content_left div#" + resultId + ".result.c-container div.c-abstract";
                LOG.debug("titleCssQuery:" + titleCssQuery);
                LOG.debug("summaryCssQuery:" + summaryCssQuery);

                Element titleElement = document.select(titleCssQuery).first();
                String href = "";
                String titleText = "";
                if (titleElement != null) {
                    titleText = titleElement.text();
                    href = titleElement.attr("href");
                } else {
                    // Fallback for a Baidu Baike entry, which uses a different container.
                    titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a";
                    summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p";
                    LOG.debug("處理百度百科 titleCssQuery:" + titleCssQuery);
                    LOG.debug("處理百度百科 summaryCssQuery:" + summaryCssQuery);
                    titleElement = document.select(titleCssQuery).first();
                    if (titleElement != null) {
                        titleText = titleElement.text();
                        href = titleElement.attr("href");
                    }
                }
                LOG.debug(titleText);

                Element summaryElement = document.select(summaryCssQuery).first();
                if (summaryElement == null) {
                    // Fallback for a Baidu Zhidao entry: the summary sits in a <font> element.
                    summaryCssQuery = summaryCssQuery.replace("div.c-abstract", "font");
                    LOG.debug("處理百度知道 summaryCssQuery:" + summaryCssQuery);
                    summaryElement = document.select(summaryCssQuery).first();
                }
                String summaryText = "";
                if (summaryElement != null) {
                    summaryText = summaryElement.text();
                }
                LOG.debug(summaryText);

                boolean hasTitle = titleText != null && !"".equals(titleText.trim());
                boolean hasSummary = summaryText != null && !"".equals(summaryText.trim());
                if (hasTitle && hasSummary) {
                    Webpage webpage = new Webpage();
                    webpage.setTitle(titleText);
                    webpage.setUrl(href);
                    webpage.setSummary(summaryText);
                    webpages.add(webpage);
                } else {
                    LOG.error("獲取搜尋結果列表項出錯:" + titleText + " - " + summaryText);
                }
            }
        } catch (IOException ex) {
            LOG.error("搜尋出錯", ex);
        }
        return webpages;
    }

    /**
     * Reads the number of hits from the result page by taking the text of the
     * count element (e.g. "百度為您找到相關結果約13,200個") and keeping only its digits.
     *
     * @param document the parsed result page
     * @return the number of hits; 0 if the count element is missing or contains
     *         no digits; {@link Integer#MAX_VALUE} if the number does not fit in an int
     */
    private int getBaiduSearchResultCount(Document document) {
        String cssQuery = "html body div div div div.nums";
        LOG.debug("total cssQuery: " + cssQuery);
        Element totalElement = document.select(cssQuery).first();
        if (totalElement == null) {
            // Page layout changed or no result header was rendered: treat as "no results"
            // instead of failing with a NullPointerException.
            LOG.info("搜尋結果數:" + 0);
            return 0;
        }
        String totalText = totalElement.text();
        LOG.info("搜尋結果文字:" + totalText);

        Matcher matcher = NON_DIGIT.matcher(totalText);
        String digits = matcher.replaceAll("");
        int total;
        if (digits.isEmpty()) {
            total = 0;
        } else {
            try {
                total = Integer.parseInt(digits);
            } catch (NumberFormatException ex) {
                // digits is non-empty and all ASCII digits, so this can only be overflow.
                total = Integer.MAX_VALUE;
            }
        }
        LOG.info("搜尋結果數:" + total);
        return total;
    }

    /**
     * Demo entry point: fetches the second result page for a sample keyword
     * and logs every hit.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        int page = 2;
        Searcher searcher = new JSoupBaiduSearcher();
        List<Webpage> webpages = searcher.search("六扇門", page);
        if (webpages != null) {
            // Number the hits from 1 and report the page that was actually requested.
            int i = 1;
            LOG.info("搜尋結果 當前第 " + page + " 頁,頁面大小為:" + webpages.size() + " 共有結果數:" + webpages.size());
            for (Webpage webpage : webpages) {
                LOG.info("搜尋結果 " + (i++) + " :");
                LOG.info("標題:" + webpage.getTitle());
                LOG.info("URL:" + webpage.getUrl());
                LOG.info("摘要:" + webpage.getSummary());
                LOG.info("正文:" + webpage.getContent());
                LOG.info("");
            }
        } else {
            LOG.error("沒有搜尋到結果");
        }
    }
}