
Implementing a Web Crawler with HttpClient: Baidu News as an Example

      In past work I have built simple web crawlers, but never written them up systematically. This post gives a systematic walkthrough of how to implement a web crawler with Java's HttpClient.

      As for the theory behind web crawlers, their design ideas and crawl strategies, the Baidu Baike entry on "網路爬蟲" (web crawler) already covers all that in detail, so it is not repeated here; the focus below is on implementation.

HTTP requests:
      Before writing any code, let's look at how to inspect HTTP request information in the browser; this is the first step in analyzing a site's resources. Right-click on a page and choose "Inspect element" (if you can't find it, F12 works as well); the description below uses Chrome.


    Clicking "Inspect element" opens the developer tools panel.


    The Network tab is the one crawler work should focus on; opening it lists every HTTP request made by the current page.

    Clicking an individual entry shows the details of that HTTP request.


    When a program impersonates a browser, the fields under Request Headers deserve the most attention; sites that require login also depend on these headers. The Response holds what the server returned; this post only deals with text and does not cover images, audio, or video.
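    For example, with Commons HttpClient 3.x (the library used throughout this post), copying a browser's User-Agent and Referer onto a request looks roughly like the sketch below; the header values are the ones reused later in this post, and the URL is only illustrative:

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;

public class DisguisedRequestDemo {
	public static void main(String[] args) throws Exception {
		HttpClient client = new HttpClient();
		GetMethod get = new GetMethod("http://news.baidu.com");
		//copy headers observed in the browser's Network panel
		get.setRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
		get.setRequestHeader("Referer", "http://www.baidu.com");
		int status = client.executeMethod(get);
		System.out.println("HTTP status: " + status);
		get.releaseConnection();
	}
}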

    The Response is where the content the crawler wants actually lives. If it is hard to read in the developer tools, paste the request URL into the address bar and use right-click --> View page source instead. By analyzing the strings in the page source, you can work out a consistent rule and extract the text you need.
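    As a concrete illustration of deriving such a rule, the minimal sketch below (not part of the original code; the HTML literal is made up) pulls every <a href="..."> target out of a page-source string with java.util.regex:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HrefExtractorDemo {
	public static void main(String[] args) {
		//hypothetical page source; in practice this is the HTTP response body
		String pageSource = "<ul><li><a href=\"http://example.com/a.html\">A</a></li>"
				+ "<li><a href=\"/b.html\">B</a></li></ul>";
		//group 1 captures the href value, mirroring the rules used later in this post
		Pattern pattern = Pattern.compile("<a href=\"(.*?)\"", Pattern.CASE_INSENSITIVE);
		Matcher matcher = pattern.matcher(pageSource);
		while (matcher.find()) {
			System.out.println(matcher.group(1));
		}
	}
}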

Code implementation:

    The CrawlBase class, the base class that issues HTTP requests:

 /**  
 * @Description: base class for fetching page content
 */ 
package com.lulei.crawl;  

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map.Entry;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;

import com.lulei.util.CharsetUtil;


public abstract class CrawlBase {
	private static Logger log = Logger.getLogger(CrawlBase.class);
	
	//page source of the fetched URL
	private String pageSourceCode = "";
	//response headers
	private Header[] responseHeaders = null;
	//connection timeout in milliseconds
	private static int connectTimeout = 3500;
	//socket read timeout in milliseconds
	private static int readTimeout = 3500;
	//default maximum number of attempts
	private static int maxConnectTimes = 3;
	//default charset used to read the raw page bytes
	private static String charsetName = "iso-8859-1";
	private static HttpClient httpClient = new HttpClient();
	
	static {
		httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(connectTimeout);
		httpClient.getHttpConnectionManager().getParams().setSoTimeout(readTimeout);
	}
	
	/**
	 * @param urlStr
	 * @param charsetName
	 * @param method
	 * @param params
	 * @return
	 * @throws HttpException
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: fetch the page using the given HTTP method (get or post)
	 */
	public boolean readPage(String urlStr, String charsetName, String method, HashMap<String, String> params) throws HttpException, IOException {
		if ("post".equals(method) || "POST".equals(method)) {
			return readPageByPost(urlStr, charsetName, params);
		} else {
			return readPageByGet(urlStr, charsetName, params);	
		}
	}
	
	/**
	 * @param urlStr
	 * @param charsetName
	 * @param params
	 * @return whether the request succeeded
	 * @throws HttpException
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: fetch the page with GET
	 */
	public boolean readPageByGet(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
		GetMethod getMethod = createGetMethod(urlStr, params);
		return readPage(getMethod, charsetName, urlStr);
	}
	
	/**
	 * @param urlStr
	 * @param charsetName
	 * @param params
	 * @return whether the request succeeded
	 * @throws HttpException
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: fetch the page with POST
	 */
	public boolean readPageByPost(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException{
		PostMethod postMethod = createPostMethod(urlStr, params);
		return readPage(postMethod, charsetName, urlStr);
	}
	
	/**
	 * @param method
	 * @param defaultCharset
	 * @param urlStr
	 * @return whether the request succeeded
	 * @throws HttpException
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: read the page body and the response headers
	 */
	private boolean readPage(HttpMethod method, String defaultCharset, String urlStr) throws HttpException, IOException{
		int n = maxConnectTimes;
		while (n > 0) {
			try {
				int statusCode = httpClient.executeMethod(method);
				if (statusCode != HttpStatus.SC_OK){
					log.error("can not connect " + urlStr + "\t" + (maxConnectTimes - n + 1) + "\t" + statusCode);
					n--;
				} else {
					//read the response headers
					responseHeaders = method.getResponseHeaders();
					//read the page source using the default charset (iso-8859-1 round-trips raw bytes safely)
					InputStream inputStream = method.getResponseBodyAsStream();
					BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
					StringBuffer stringBuffer = new StringBuffer();
					String lineString = null;
					while ((lineString = bufferedReader.readLine()) != null){
						stringBuffer.append(lineString);
						stringBuffer.append("\n");
					}
					pageSourceCode = stringBuffer.toString();
					//detect the actual charset and re-decode the source if it differs from the default
					InputStream in = new ByteArrayInputStream(pageSourceCode.getBytes(charsetName));
					String charset = CharsetUtil.getStreamCharset(in, defaultCharset);
					//this special case was added for an IP-geolocation lookup site
					if ("Big5".equals(charset)) {
						charset = "gbk";
					}
					if (!charsetName.equalsIgnoreCase(charset)) {
						pageSourceCode = new String(pageSourceCode.getBytes(charsetName), charset);
					}
					return true;
				}
			} catch (Exception e) {
				e.printStackTrace();
				System.out.println(urlStr + " -- can't connect  " + (maxConnectTimes - n + 1));
				n--;
			}
		}
		return false;
	}
	
	/**
	 * @param urlStr
	 * @param params
	 * @return GetMethod
	 * @Author: lulei  
	 * @Description: build a GetMethod; the params map is applied as request headers (used to disguise the crawler)
	 */
	private GetMethod createGetMethod(String urlStr, HashMap<String, String> params){
		GetMethod getMethod = new GetMethod(urlStr);
		if (params == null){
			return getMethod;
		}
		for (Entry<String, String> entry : params.entrySet()) {
			getMethod.setRequestHeader(entry.getKey(), entry.getValue());
		}
		return getMethod;
	}
	
	/**
	 * @param urlStr
	 * @param params
	 * @return PostMethod
	 * @Author: lulei  
	 * @Description: build a PostMethod; the params map is applied as form parameters
	 */
	private PostMethod createPostMethod(String urlStr, HashMap<String, String> params){
		PostMethod postMethod = new PostMethod(urlStr);
		if (params == null){
			return postMethod;
		}
		for (Entry<String, String> entry : params.entrySet()) {
			postMethod.setParameter(entry.getKey(), entry.getValue());
		}
		return postMethod;
	}
	
	/**
	 * @param urlStr
	 * @param charsetName
	 * @return whether the request succeeded
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: fetch the page directly without setting any extra headers
	 */
	public boolean readPageByGet(String urlStr, String charsetName) throws IOException{
		return this.readPageByGet(urlStr, charsetName, null);
	}
	
	/**
	 * @return String
	 * @Author: lulei  
	 * @Description: get the page source
	 */
	public String getPageSourceCode(){
		return pageSourceCode;
	}
	
	/**
	 * @return Header[]
	 * @Author: lulei  
	 * @Description: get the response headers
	 */
	public Header[] getHeader(){
		return responseHeaders;
	}
	
	/**
	 * @param timeout
	 * @Author: lulei  
	 * @Description: set the connection timeout
	 */
	public void setConnectTimeout(int timeout){
		httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);
	}
	
	/**
	 * @param timeout
	 * @Author: lulei  
	 * @Description: set the read timeout
	 */
	public void setReadTimeout(int timeout){
		httpClient.getHttpConnectionManager().getParams().setSoTimeout(timeout);
	}
	
	/**
	 * @param maxConnectTimes
	 * @Author: lulei  
	 * @Description: set the maximum number of attempts, used when requests fail
	 */
	public static void setMaxConnectTimes(int maxConnectTimes) {
		CrawlBase.maxConnectTimes = maxConnectTimes;
	}

	/**
	 * @param connectTimeout
	 * @param readTimeout
	 * @Author: lulei  
	 * @Description: set both the connection timeout and the read timeout
	 */
	public void setTimeout(int connectTimeout, int readTimeout){
		setConnectTimeout(connectTimeout);
		setReadTimeout(readTimeout);
	}

	/**
	 * @param charsetName
	 * @Author: lulei  
	 * @Description: set the default charset
	 */
	public static void setCharsetName(String charsetName) {
		CrawlBase.charsetName = charsetName;
	}
}
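    CrawlBase declares no abstract methods, so a quick smoke test only needs an empty anonymous subclass; a minimal sketch (not part of the original post, and the URL is only illustrative):

import com.lulei.crawl.CrawlBase;

public class CrawlBaseDemo {
	public static void main(String[] args) throws Exception {
		//CrawlBase is abstract, so instantiate it through an empty anonymous subclass
		CrawlBase crawler = new CrawlBase() {};
		if (crawler.readPageByGet("http://news.baidu.com", "utf-8")) {
			System.out.println(crawler.getPageSourceCode());
		}
	}
}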
    The CrawlListPageBase class is a subclass of CrawlBase; it is the base class for extracting link URLs from a list page:
 /**  
  * @Description: base class for extracting link addresses from a page
 */ 
package com.lulei.crawl;  

import java.io.IOException;
import java.util.HashMap;
import java.util.List;

import com.lulei.util.DoRegex;


public abstract class CrawlListPageBase extends CrawlBase {
	private String pageurl;
	
	/**
	* @param urlStr
	* @param charsetName
	* @throws IOException
	 */
	public CrawlListPageBase(String urlStr, String charsetName) throws IOException{
		readPageByGet(urlStr, charsetName);
		pageurl = urlStr;
	}
	
	/**
	* @param urlStr
	* @param charsetName
	* @param method
	* @param params
	* @throws IOException
	 */
	public CrawlListPageBase(String urlStr, String charsetName, String method, HashMap<String, String> params) throws IOException{
		readPage(urlStr, charsetName, method, params);	
		pageurl = urlStr;
	}
	
	/**
	 * @return List<String>
	 * @Author: lulei  
	 * @Description: return the link addresses of interest on the page
	 */
	public List<String> getPageUrls(){
		return DoRegex.getArrayList(getPageSourceCode(), getUrlRegexString(), pageurl, getUrlRegexStringNum());
	}
	
	/**
	 * @return String
	 * @Author: lulei  
	 * @Description: the regular expression that matches the desired link addresses on the page
	 */
	public abstract String getUrlRegexString();
	
	/**
	 * @return int
	 * @Author: lulei  
	 * @Description: the index of the capture group in the regex that holds the link
	 */
	public abstract int getUrlRegexStringNum();	
}

    The DoRegex class, a utility that wraps regex-based string matching and extraction:
 /**  
 * @Description: regular-expression utilities
 */ 
package com.lulei.util;  

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
  
public class DoRegex {
	
	private static String rootUrlRegex = "(http://.*?/)";
	private static String currentUrlRegex = "(http://.*/)";
	private static String ChRegex = "([\u4e00-\u9fa5]+)";

	/**
	 * @param dealStr
	 * @param regexStr
	 * @param splitStr
	 * @param n
	 * @return String
	 * @Author: lulei  
	 * @Description: concatenate all regex matches, separating records with splitStr
	 */
	public static String getString(String dealStr, String regexStr, String splitStr, int n){
		String reStr = "";
		if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
			return reStr;
		}
		splitStr = (splitStr == null) ? "" : splitStr;
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		StringBuffer stringBuffer = new StringBuffer();
		while (matcher.find()) {
			stringBuffer.append(matcher.group(n).trim());
			stringBuffer.append(splitStr);
		}
		reStr = stringBuffer.toString();
		if (splitStr != "" && reStr.endsWith(splitStr)){
			reStr = reStr.substring(0, reStr.length() - splitStr.length());
		}
		return reStr;
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param n
	 * @return String
	 * @Author: lulei  
	 * @Description: concatenate all regex matches into a single string
	 */
	public static String getString(String dealStr, String regexStr, int n){
		return getString(dealStr, regexStr, null, n);
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param n
	 * @return String
	 * @Author: lulei  
	 * @Description: return the first regex match
	 */
	public static String getFirstString(String dealStr, String regexStr, int n){
		if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
			return "";
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		if (matcher.find()) {
			return matcher.group(n).trim();
		}
		return "";
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param n
	 * @return ArrayList<String>
	 * @Author: lulei  
	 * @Description: collect all regex matches into a list
	 */
	public static List<String> getList(String dealStr, String regexStr, int n){
		List<String> reArrayList = new ArrayList<String>();
		if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
			return reArrayList;
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			reArrayList.add(matcher.group(n).trim());
		}
		return reArrayList;
	}
	
	/**
	 * @param url
	 * @param currentUrl
	 * @return String
	 * @Author: lulei  
	 * @Description: resolve a possibly relative URL against the current page URL
	 */
	private static String getHttpUrl(String url, String currentUrl){
		try {
			url = encodeUrlCh(url);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		if (url.indexOf("http") == 0){
			return url;
		}
		if  (url.indexOf("/") == 0){
			return getFirstString(currentUrl, rootUrlRegex, 1) + url.substring(1);
		}
		return getFirstString(currentUrl, currentUrlRegex, 1) + url;
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param currentUrl
	 * @param n
	 * @return ArrayList<String>
	 * @Author: lulei  
	 * @Description: extract regex matches and resolve them to absolute link addresses
	 */
	public static List<String> getArrayList(String dealStr, String regexStr, String currentUrl, int n){
		List<String> reArrayList = new ArrayList<String>();
		if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
			return reArrayList;
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			reArrayList.add(getHttpUrl(matcher.group(n).trim(), currentUrl));
		}
		return reArrayList;
	}
	
	/**
	 * @param url
	 * @return
	 * @throws UnsupportedEncodingException
	 * @Author: lulei  
	 * @Description: URL-encode the Chinese characters in a link address
	 */
	public static String encodeUrlCh (String url) throws UnsupportedEncodingException {
		while (true) {
			String s = getFirstString(url, ChRegex, 1);
			if ("".equals(s)){
				return url;
			}
			url = url.replace(s, URLEncoder.encode(s, "utf-8"));
		}
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param array capture-group indices
	 * @return
	 * @Author:lulei  
	 * @Description: return every match as a String[] of the requested groups
	 */
	public static List<String[]> getListArray(String dealStr, String regexStr, int[] array) {
		List<String[]> reArrayList = new ArrayList<String[]>();
		if (dealStr == null || regexStr == null || array == null) {
			return reArrayList;
		}
		for (int i = 0; i < array.length; i++) {
			if (array[i] < 1) {
				return reArrayList;
			}
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			String[] ss = new String[array.length]; 
			for (int i = 0; i < array.length; i++) {
				ss[i] = matcher.group(array[i]).trim();
			}
			reArrayList.add(ss);
		}
		return reArrayList;
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param array
	 * @return
	 * @Author:lulei  
	 * @Description: return every match with the requested groups joined into one string
	 */
	public static List<String> getStringArray(String dealStr, String regexStr, int[] array) {
		List<String> reStringList = new ArrayList<String>();
		if (dealStr == null || regexStr == null || array == null) {
			return reStringList;
		}
		for (int i = 0; i < array.length; i++) {
			if (array[i] < 1) {
				return reStringList;
			}
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			StringBuffer sb = new StringBuffer();
			for (int i = 0; i < array.length; i++) {
				sb.append(matcher.group(array[i]).trim());
			}
			reStringList.add(sb.toString());
		}
		return reStringList;
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param array capture-group indices
	 * @return
	 * @Author:lulei  
	 * @Description: return the first match as a String[] of the requested groups
	 */
	public static String[] getFirstArray(String dealStr, String regexStr, int[] array) {
		if (dealStr == null || regexStr == null || array == null) {
			return null;
		}
		for (int i = 0; i < array.length; i++) {
			if (array[i] < 1) {
				return null;
			}
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		if (matcher.find()) {
			String[] ss = new String[array.length]; 
			for (int i = 0; i < array.length; i++) {
				ss[i] = matcher.group(array[i]).trim();
			}
			return ss;
		}
		return null;
	}
}
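    A quick usage sketch of DoRegex (not from the original post; the HTML literal is made up), showing getFirstString and getArrayList resolving relative links against a page URL:

import java.util.List;

import com.lulei.util.DoRegex;

public class DoRegexDemo {
	public static void main(String[] args) {
		String html = "<title>Demo</title>"
				+ "<a href=\"/news/1.html\">one</a>"
				+ "<a href=\"http://example.com/2.html\">two</a>";
		//group 1 of each pattern is the value we want
		System.out.println(DoRegex.getFirstString(html, "<title>(.*?)</title>", 1));
		//relative hrefs are resolved against the current page URL
		List<String> urls = DoRegex.getArrayList(html, "<a href=\"(.*?)\"", "http://example.com/news/index.html", 1);
		for (String url : urls) {
			System.out.println(url);
		}
	}
}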

    The CharsetUtil class, which detects character encodings:
 /**  
 * @Description: charset detection utility
 */ 
package com.lulei.util;  

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;
  
public class CharsetUtil {
	private static final CodepageDetectorProxy detector;
	
	static {//initialize the detector chain
		detector = CodepageDetectorProxy.getInstance();
		detector.add(new ParsingDetector(false));
		detector.add(ASCIIDetector.getInstance());
		detector.add(UnicodeDetector.getInstance());
		detector.add(JChardetFacade.getInstance());
	}

	/**
	 * @param url
	 * @param defaultCharset
	 * @Author:lulei  
	 * @return the detected charset of the resource, or defaultCharset
	 */
	public static String getStreamCharset (URL url, String defaultCharset) {
		if (url == null) {
			return defaultCharset;
		}
		try {
			//detect the encoding with the third-party cpdetector library
			Charset charset = detector.detectCodepage(url);
			if (charset != null) {
				return charset.name();
			}
		} catch (Exception e1) {
			e1.printStackTrace();
		}
		return defaultCharset;
	}
	
	/**
	 * @param inputStream
	 * @param defaultCharset
	 * @return
	 * @Author:lulei  
	 * @Description: detect the charset of an input stream
	 */
	public static String getStreamCharset (InputStream inputStream, String defaultCharset) {
		if (inputStream == null) {
			return defaultCharset;
		}
		int count = 200;
		try {
			count = inputStream.available();
		} catch (IOException e) {
			e.printStackTrace();
		}
		try {
			//detect the encoding with the third-party cpdetector library
			Charset charset = detector.detectCodepage(inputStream, count);
			if (charset != null) {
				return charset.name();
			}
		} catch (Exception e1) {
			e1.printStackTrace();
		}
		return defaultCharset;
	}
}
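    A small sketch (inputs assumed, not from the original post) of using CharsetUtil to sniff the charset of a byte stream before decoding it:

import java.io.ByteArrayInputStream;
import java.io.InputStream;

import com.lulei.util.CharsetUtil;

public class CharsetUtilDemo {
	public static void main(String[] args) throws Exception {
		//bytes of a UTF-8 snippet; in the crawler these come from the response body
		byte[] raw = "编码检测示例".getBytes("utf-8");
		InputStream in = new ByteArrayInputStream(raw);
		//fall back to iso-8859-1 when detection fails
		String charset = CharsetUtil.getStreamCharset(in, "iso-8859-1");
		System.out.println("detected: " + charset);
		System.out.println(new String(raw, charset));
	}
}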
    Together, the four classes above form the basic skeleton for crawling text resources from the web. The example below walks through using them to fetch real content.

The Baidu News case:

    1) Find a Baidu News update-list page, for example http://news.baidu.com/n?cmd=4&class=civilnews&pn=1&from=tab, and open it in the browser.

    The article URLs on the page follow a consistent pattern in the source.

    Based on this analysis of the source, the BaiduNewList class crawls the Baidu News list page; the code is as follows:

 /**  
 * @Description: Baidu News rolling list page; extracts the links on the current page
 */ 
package com.lulei.crawl.news;  

import java.io.IOException;
import java.util.HashMap;

import com.lulei.crawl.CrawlListPageBase;
  
public class BaiduNewList extends CrawlListPageBase{
	private static HashMap<String, String> params;
	
	/**
	 * add request headers to disguise the request as a browser
	 */
	static {
		params = new HashMap<String, String>();
		params.put("Referer", "http://www.baidu.com");
		params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
	}

	public BaiduNewList(String urlStr) throws IOException {
		super(urlStr, "utf-8", "get", params);  
	}

	@Override
	public String getUrlRegexString() {
		//regex for article links on the news list page
		return "• <a href=\"(.*?)\"";
	}

	@Override
	public int getUrlRegexStringNum() {
		//index of the capture group that holds the link
		return 1;
	}

	/**  
	 * @param args
	 * @throws IOException 
	 * @Author:lulei  
	 * @Description: test case
	 */
	public static void main(String[] args) throws IOException {
		BaiduNewList baidu = new BaiduNewList("http://news.baidu.com/n?cmd=4&class=sportnews&pn=1&from=tab");
		for (String s : baidu.getPageUrls()) {
			System.out.println(s);
		}
	}
}

    2) The URLs collected in step one point to the pages holding the actual news content. Because the items on a Baidu News list page come from many different sites, there is no single universal page structure; on most news sites, however, the body text sits inside p tags, so the article content is extracted from those tags.


    The News class is implemented as follows:

 /**  
 * @Description: article content from news sites
 */ 
package com.lulei.crawl.news;  

import java.io.IOException;
import java.util.HashMap;

import org.apache.commons.httpclient.HttpException;

import com.lulei.crawl.CrawlBase;
import com.lulei.util.DoRegex;
  
public class News extends CrawlBase{
	private String url;
	private String content;
	private String title;
	private String type;
	
	private static String contentRegex = "<p.*?>(.*?)</p>";
	private static String titleRegex = "<title>(.*?)</title>";
	private static int maxLength = 300;
	
	private static HashMap<String, String> params;
	/**
	 * add request headers to disguise the request as a browser
	 */
	static {
		params = new HashMap<String, String>();
		params.put("Referer", "http://www.baidu.com");
		params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
	}
	
	/**
	 * @Author:lulei  
	 * @Description: treat the content inside p tags as the body text; if it exceeds the configured maximum length, keep only the leading part
	 */
	private void setContent() {
		String content = DoRegex.getString(getPageSourceCode(), contentRegex, 1);
		content = content.replaceAll("\n", "")
									  .replaceAll("<script.*?/script>", "")
									  .replaceAll("<style.*?/style>", "")
									  .replaceAll("<.*?>", "");
		this.content = content.length() > maxLength ? content.substring(0, maxLength) : content;
	}
	
	/**
	 * @Author:lulei  
	 * @Description: treat the content of the title tag as the article title
	 */
	private void setTitle() {
		this.title = DoRegex.getString(getPageSourceCode(), titleRegex, 1);
	}
	
	public News(String url) throws HttpException, IOException {
		this.url = url;
		readPageByGet(url, "utf-8", params);
		setContent();
		setTitle();
	}

	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	public String getContent() {
		return content;
	}

	public String getTitle() {
		return title;
	}

	public String getType() {
		return type;
	}

	public void setType(String type) {
		this.type = type;
	}

	public static void setMaxLength(int maxLength) {
		News.maxLength = maxLength;
	}

	/**
	 * @param args
	 * @throws HttpException
	 * @throws IOException
	 * @Author:lulei  
	 * @Description: test case
	 */
	public static void main(String[] args) throws HttpException, IOException {
		News news = new News("http://we.sportscn.com/viewnews-1634777.html");
		System.out.println(news.getContent());
		System.out.println(news.getTitle());
	}

}

    3) Write the crawl entry point. For simplicity, only two levels of pages are analyzed here, so the URLs of the news update-list pages are hard-coded in the program (see the static block below).

    The main function only needs to execute the run function once, or periodically; the full code is as follows:

 /**  
 *@Description: crawl entry point
 */ 
package com.lulei.knn.data;  

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.lulei.crawl.news.BaiduNewList;
import com.lulei.crawl.news.News;
import com.lulei.knn.index.KnnIndex;
import com.lulei.knn.index.KnnSearch;
import com.lulei.util.ParseMD5;
  
public class CrawlNews {
	private static List<Info> infos;
	private static KnnIndex knnIndex = new KnnIndex();
	private static KnnSearch knnSearch = new KnnSearch();
	private static HashMap<String, Integer> result;
	
	static {
		infos = new ArrayList<Info>();
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=1&from=tab", "體育類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=2&from=tab", "體育類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=3&from=tab", "體育類"));
		
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=1&sub=0", "軍事類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=2&sub=0", "軍事類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=3&sub=0", "軍事類"));
		
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=1&sub=0", "財經類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=2&sub=0", "財經類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=3&sub=0", "財經類"));
		
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=internet&pn=1&from=tab", "網際網路"));
		
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=1&sub=0", "房產類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=2&sub=0", "房產類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=3&sub=0", "房產類"));
		
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=1&sub=0", "遊戲類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=2&sub=0", "遊戲類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=3&sub=0", "遊戲類"));
	}
	
	/**
	 *@Description: a list-page URL together with its news category
	 *@Author:lulei  
	 */
	static class Info{
		String url;
		String type;
		Info(String url, String type) {
			this.url = url;
			this.type = type;
		}
	}
	
	/**
	 * @param info
	 * @Author:lulei  
	 * @Description: crawl all news items under one list page
	 */
	private void crawl(Info info) {
		if (info == null) {
			return;
		}
		try {
			BaiduNewList baiduNewList = new BaiduNewList(info.url);
			List<String> urls = baiduNewList.getPageUrls();
			for (String url : urls) {
				News news = new News(url);
				NewsBean newBean = new NewsBean();
				newBean.setId(ParseMD5.parseStrToMd5L32(url));
				newBean.setType(info.type);
				newBean.setUrl(url);
				newBean.setTitle(news.getTitle());
				newBean.setContent(news.getContent());
				//save to the index
				knnIndex.add(newBean);
				//knn classification check
				if (news.getContent() == null || "".equals(news.getContent())) {
					result.put("E", 1+result.get("E"));
					continue;
				}
				if (info.type.equals(knnSearch.getType(news.getContent()))) {
					result.put("R", 1+result.get("R"));
				} else {
					result.put("W", 1+result.get("W"));
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * @Author:lulei  
	 * @Description: entry point
	 */
	public void run() {
		result = new HashMap<String, Integer>(); 
		result.put("R", 0);
		result.put("W", 0);
		result.put("E", 0);
		for (Info info : infos) {
			System.out.println(info.url + "------start");
			crawl(info);
			System.out.println(info.url + "------end");
		}
		try {
			knnIndex.commit();
			System.out.println("R = " + result.get("R"));
			System.out.println("W = " + result.get("W"));
			System.out.println("E = " + result.get("E"));
			System.out.println("精確度:" + (result.get("R") * 1.0 / (result.get("R") + result.get("W"))));
			System.out.println("-------------finished---------------");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	public static void main(String[] args) {
		new CrawlNews().run();
	}
}
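    The main above runs the collection job once. For periodic collection, one option (a sketch, not part of the original code; the one-hour interval is arbitrary) is a ScheduledExecutorService:

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import com.lulei.knn.data.CrawlNews;

public class CrawlScheduler {
	public static void main(String[] args) {
		ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
		//run a collection pass immediately, then once every hour
		scheduler.scheduleAtFixedRate(new Runnable() {
			public void run() {
				new CrawlNews().run();
			}
		}, 0, 1, TimeUnit.HOURS);
	}
}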
With that, a complete collection program is done.