
Implementing a Web Crawler with HttpClient, Using Baidu News as an Example


Please credit the original source when reposting: http://blog.csdn.net/xiaojimanman/article/details/40891791


For a web crawler based on HttpClient 4.5, see: http://blog.csdn.net/xiaojimanman/article/details/53178307

In past work I implemented a simple web crawler but never wrote it up systematically. This post explains, step by step, how to implement a web crawler with Java's HttpClient.

For the theory behind web crawlers, the overall design ideas, and crawling strategies, the Baidu Baike entry on "web crawler" already covers them in detail, so they are not repeated here; this post focuses on the implementation.


HTTP requests:


Before any code, let's look at how to inspect HTTP request information in the browser; this is the first step in analyzing a site's resources. Right-click in the page and choose "Inspect element" (pressing F12 works as well).


Clicking "Inspect element" opens the developer tools panel.


The Network tab is the one crawler authors should focus on: it lists every HTTP request made by the current page.

Clicking an entry shows the details of that HTTP request.


When a program impersonates a browser, the fields under Request Headers are the ones to pay attention to; sites that require login also depend on them. The Response section is the content returned by the server. This post only handles text content; images, audio, and video are not covered.

The Response is where the content the crawler wants lives. If it is hard to read there, open the request URL in the browser and use right-click → View Page Source to look at it.

By analyzing the strings in the page source, you can work out a consistent pattern and use it to extract the text you need.
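To make these two steps concrete, here is a minimal sketch (not part of the original code) that sends a browser-like User-Agent header with commons-httpclient 3.x and pulls the page title out with a regex. The class name, URL, and header value are only placeholders.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

public class FetchTitleDemo {
	public static void main(String[] args) throws Exception {
		HttpClient client = new HttpClient();
		GetMethod get = new GetMethod("http://news.baidu.com/");
		// Request Headers observed in the browser can be copied here so the request looks like a browser
		get.setRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64)");
		try {
			if (client.executeMethod(get) == HttpStatus.SC_OK) {
				String html = get.getResponseBodyAsString();
				// a "unified rule" in the post's sense: a regex whose group 1 captures the title text
				Matcher m = Pattern.compile("<title>(.*?)</title>",
						Pattern.CASE_INSENSITIVE | Pattern.DOTALL).matcher(html);
				if (m.find()) {
					System.out.println(m.group(1).trim());
				}
			}
		} finally {
			get.releaseConnection();
		}
	}
}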


Code implementation:

CrawlBase, the base class that issues HTTP requests:

 /**  
 * @Description: Base class for fetching page content
 */ 
package com.lulei.crawl;  

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;

import com.lulei.util.CharsetUtil;


public abstract class CrawlBase {
	private static Logger log = Logger.getLogger(CrawlBase.class);
	
	//page source of the fetched link
	private String pageSourceCode = "";
	//response headers
	private Header[] responseHeaders = null;
	//connection timeout (ms)
	private static int connectTimeout = 3500;
	//socket read timeout (ms)
	private static int readTimeout = 3500;
	//default maximum number of attempts
	private static int maxConnectTimes = 3;
	//default page charset
	private static String charsetName = "iso-8859-1";
	private static HttpClient httpClient = new HttpClient();
	
	static {
		httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(connectTimeout);
		httpClient.getHttpConnectionManager().getParams().setSoTimeout(readTimeout);
	}
	
	/**
	 * @param urlStr
	 * @param charsetName
	 * @param method
	 * @param params
	 * @return
	 * @throws HttpException
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: Access the page using the given HTTP method
	 */
	public boolean readPage(String urlStr, String charsetName, String method, HashMap<String, String> params) throws HttpException, IOException {
		if ("post".equals(method) || "POST".equals(method)) {
			return readPageByPost(urlStr, charsetName, params);
		} else {
			return readPageByGet(urlStr, charsetName, params);	
		}
	}
	
	/**
	 * @param urlStr
	 * @param charsetName
	 * @param params
	 * @return whether the access succeeded
	 * @throws HttpException
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: Access the page with GET
	 */
	public boolean readPageByGet(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
		GetMethod getMethod = createGetMethod(urlStr, params);
		return readPage(getMethod, charsetName, urlStr);
	}
	
	/**
	 * @param urlStr
	 * @param charsetName
	 * @param params
	 * @return whether the access succeeded
	 * @throws HttpException
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: Access the page with POST
	 */
	public boolean readPageByPost(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException{
		PostMethod postMethod = createPostMethod(urlStr, params);
		return readPage(postMethod, charsetName, urlStr);
	}
	
	/**
	 * @param method
	 * @param defaultCharset
	 * @param urlStr
	 * @return whether the access succeeded
	 * @throws HttpException
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: Read the page content and response headers
	 */
	private boolean readPage(HttpMethod method, String defaultCharset, String urlStr) throws HttpException, IOException{
		int n = maxConnectTimes;
		while (n > 0) {
			try {
				int statusCode = httpClient.executeMethod(method);
				if (statusCode != HttpStatus.SC_OK){
					log.error("can not connect " + urlStr + "\t" + (maxConnectTimes - n + 1) + "\t" + statusCode);
					n--;
				} else {
					//read the response headers
					responseHeaders = method.getResponseHeaders();
					//read the page source
					InputStream inputStream = method.getResponseBodyAsStream();
					BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
					StringBuffer stringBuffer = new StringBuffer();
					String lineString = null;
					while ((lineString = bufferedReader.readLine()) != null){
						stringBuffer.append(lineString);
						stringBuffer.append("\n");
					}
					pageSourceCode = stringBuffer.toString();
					InputStream in =new  ByteArrayInputStream(pageSourceCode.getBytes(charsetName));
					String charset = CharsetUtil.getStreamCharset(in, defaultCharset);
					//the following check was added specifically for an IP-location lookup site
					if ("Big5".equals(charset)) {
						charset = "gbk";
					}
					if (!charsetName.toLowerCase().equals(charset.toLowerCase())) {
						pageSourceCode = new String(pageSourceCode.getBytes(charsetName), charset);
					}
					return true;
				}
			} catch (Exception e) {
				e.printStackTrace();
				System.out.println(urlStr + " -- can't connect  " + (maxConnectTimes - n + 1));
				n--;
			}
		}
		return false;
	}
	
	/**
	 * @param urlStr
	 * @param params
	 * @return GetMethod
	 * @Author: lulei  
	 * @Description: Build a GetMethod and apply params as request headers
	 */
	@SuppressWarnings("rawtypes")
	private GetMethod createGetMethod(String urlStr, HashMap<String, String> params){
		GetMethod getMethod = new GetMethod(urlStr);
		if (params == null){
			return getMethod;
		}
		Iterator iter = params.entrySet().iterator();
		while (iter.hasNext()) {
			Map.Entry entry = (Map.Entry) iter.next();
			String key = (String) entry.getKey();
			String val = (String) entry.getValue();
			getMethod.setRequestHeader(key, val);
		}
		return getMethod;
	}
	
	/**
	 * @param urlStr
	 * @param params
	 * @return PostMethod
	 * @Author: lulei  
	 * @Description: Build a PostMethod and apply params as form parameters
	 */
	private PostMethod createPostMethod(String urlStr, HashMap<String, String> params){
		PostMethod postMethod = new PostMethod(urlStr);
		if (params == null){
			return postMethod;
		}
		Iterator<Entry<String, String>> iter = params.entrySet().iterator();
		while (iter.hasNext()) {
			Map.Entry<String, String> entry =  iter.next();
			String key = (String) entry.getKey();
			String val = (String) entry.getValue();
			postMethod.setParameter(key, val);
		}
		return postMethod;
	}
	
	/**
	 * @param urlStr
	 * @param charsetName
	 * @return whether the access succeeded
	 * @throws IOException
	 * @Author: lulei  
	 * @Description: Access the page directly without setting any headers
	 */
	public boolean readPageByGet(String urlStr, String charsetName) throws IOException{
		return this.readPageByGet(urlStr, charsetName, null);
	}
	
	/**
	 * @return String
	 * @Author: lulei  
	 * @Description: Get the page source
	 */
	public String getPageSourceCode(){
		return pageSourceCode;
	}
	
	/**
	 * @return Header[]
	 * @Author: lulei  
	 * @Description: Get the response headers
	 */
	public Header[] getHeader(){
		return responseHeaders;
	}
	
	/**
	 * @param timeout
	 * @Author: lulei  
	 * @Description: Set the connection timeout
	 */
	public void setConnectTimeout(int timeout){
		httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);
	}
	
	/**
	 * @param timeout
	 * @Author: lulei  
	 * @Description: Set the read timeout
	 */
	public void setReadTimeout(int timeout){
		httpClient.getHttpConnectionManager().getParams().setSoTimeout(timeout);
	}
	
	/**
	 * @param maxConnectTimes
	 * @Author: lulei  
	 * @Description: Set the maximum number of attempts, used when the connection fails
	 */
	public static void setMaxConnectTimes(int maxConnectTimes) {
		CrawlBase.maxConnectTimes = maxConnectTimes;
	}

	/**
	 * @param connectTimeout
	 * @param readTimeout
	 * @Author: lulei  
	 * @Description: Set the connection and read timeouts
	 */
	public void setTimeout(int connectTimeout, int readTimeout){
		setConnectTimeout(connectTimeout);
		setReadTimeout(readTimeout);
	}

	/**
	 * @param charsetName
	 * @Author: lulei  
	 * @Description: Set the default charset
	 */
	public static void setCharsetName(String charsetName) {
		CrawlBase.charsetName = charsetName;
	}
}
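As a quick illustration of how CrawlBase is used (a hypothetical sketch, not in the original post), a trivial subclass can fetch a page with a browser-like User-Agent and print its source; the class name and URL are only placeholders.

package com.lulei.crawl;

import java.io.IOException;
import java.util.HashMap;

public class SimpleCrawl extends CrawlBase {
	public static void main(String[] args) throws IOException {
		SimpleCrawl crawl = new SimpleCrawl();
		HashMap<String, String> headers = new HashMap<String, String>();
		// for GET requests the "params" map is applied as request headers (see createGetMethod)
		headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64)");
		if (crawl.readPageByGet("http://news.baidu.com/", "utf-8", headers)) {
			System.out.println(crawl.getPageSourceCode());
		}
	}
}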


CrawlListPageBase is a subclass of CrawlBase; it is the base class for extracting link URLs from a page:

 /**  
 * @Description: Base class for extracting link addresses from a page
 */ 
package com.lulei.crawl;  

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.lulei.util.DoRegex;


public abstract class CrawlListPageBase extends CrawlBase {
	private String pageurl;
	
	/**
	* @param urlStr
	* @param charsetName
	* @throws IOException
	 */
	public CrawlListPageBase(String urlStr, String charsetName) throws IOException{
		readPageByGet(urlStr, charsetName);
		pageurl = urlStr;
	}
	
	/**
	* @param urlStr
	* @param charsetName
	* @param method
	* @param params
	* @throws IOException
	 */
	public CrawlListPageBase(String urlStr, String charsetName, String method, HashMap<String, String> params) throws IOException{
		readPage(urlStr, charsetName, method, params);	
		pageurl = urlStr;
	}
	
	/**
	 * @return List<String>
	 * @Author: lulei  
	 * @Description: Return the wanted link addresses on the page
	 */
	public List<String> getPageUrls(){
		List<String> pageUrls = new ArrayList<String>();
		pageUrls = DoRegex.getArrayList(getPageSourceCode(), getUrlRegexString(), pageurl, getUrlRegexStringNum());
		return pageUrls;
	}
	
	/**
	 * @return String
	 * @Author: lulei  
	 * @Description: The regex that matches the wanted link addresses on the page
	 */
	public abstract String getUrlRegexString();
	
	/**
	 * @return int
	 * @Author: lulei  
	 * @Description: The capture-group index of the address within the regex
	 */
	public abstract int getUrlRegexStringNum();	
}

DoRegex, a utility class wrapping regex-based string matching and extraction:
 /**  
 * @Description: Regex utility class
 */ 
package com.lulei.util;  

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
  
public class DoRegex {
	
	private static String rootUrlRegex = "(http://.*?/)";
	private static String currentUrlRegex = "(http://.*/)";
	private static String ChRegex = "([\u4e00-\u9fa5]+)";

	/**
	 * @param dealStr
	 * @param regexStr
	 * @param splitStr
	 * @param n
	 * @return String
	 * @Author: lulei  
	 * @Description: Join the regex matches, separating records with splitStr
	 */
	public static String getString(String dealStr, String regexStr, String splitStr, int n){
		String reStr = "";
		if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
			return reStr;
		}
		splitStr = (splitStr == null) ? "" : splitStr;
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		StringBuffer stringBuffer = new StringBuffer();
		while (matcher.find()) {
			stringBuffer.append(matcher.group(n).trim());
			stringBuffer.append(splitStr);
		}
		reStr = stringBuffer.toString();
		if (splitStr != "" && reStr.endsWith(splitStr)){
			reStr = reStr.substring(0, reStr.length() - splitStr.length());
		}
		return reStr;
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param n
	 * @return String
	 * @Author: lulei  
	 * @Description: Join all regex matches into one string
	 */
	public static String getString(String dealStr, String regexStr, int n){
		return getString(dealStr, regexStr, null, n);
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param n
	 * @return String
	 * @Author: lulei  
	 * @Description: Return the first regex match
	 */
	public static String getFirstString(String dealStr, String regexStr, int n){
		if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
			return "";
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			return matcher.group(n).trim();
		}
		return "";
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param n
	 * @return ArrayList<String>
	 * @Author: lulei  
	 * @Description: Collect all regex matches into a list
	 */
	public static List<String> getList(String dealStr, String regexStr, int n){
		List<String> reArrayList = new ArrayList<String>();
		if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
			return reArrayList;
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			reArrayList.add(matcher.group(n).trim());
		}
		return reArrayList;
	}
	
	/**
	 * @param url
	 * @param currentUrl
	 * @return String
	 * @Author: lulei  
	 * @Description: Build the absolute URL of a page link
	 */
	private static String getHttpUrl(String url, String currentUrl){
		try {
			url = encodeUrlCh(url);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		if (url.indexOf("http") == 0){
			return url;
		}
		if (url.indexOf("/") == 0){
			return getFirstString(currentUrl, rootUrlRegex, 1) + url.substring(1);
		}
		return getFirstString(currentUrl, currentUrlRegex, 1) + url;
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param currentUrl
	 * @param n
	 * @return ArrayList<String>
	 * @Author: lulei  
	 * @Description: Get the absolute link addresses matched by the regex
	 */
	public static List<String> getArrayList(String dealStr, String regexStr, String currentUrl, int n){
		List<String> reArrayList = new ArrayList<String>();
		if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
			return reArrayList;
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			reArrayList.add(getHttpUrl(matcher.group(n).trim(), currentUrl));
		}
		return reArrayList;
	}
	
	/**
	 * @param url
	 * @return
	 * @throws UnsupportedEncodingException
	 * @Author: lulei  
	 * @Description: URL-encode the Chinese characters in a link address
	 */
	public static String encodeUrlCh (String url) throws UnsupportedEncodingException {
		while (true) {
			String s = getFirstString(url, ChRegex, 1);
			if ("".equals(s)){
				return url;
			}
			url = url.replaceAll(s, URLEncoder.encode(s, "utf-8"));
		}
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param array capture-group index array
	 * @return
	 * @Author:lulei
	 * @Description: Get all matches, one String[] per match
	 */
	public static List<String[]> getListArray(String dealStr, String regexStr, int[] array) {
		List<String[]> reArrayList = new ArrayList<String[]>();
		if (dealStr == null || regexStr == null || array == null) {
			return reArrayList;
		}
		for (int i = 0; i < array.length; i++) {
			if (array[i] < 1) {
				return reArrayList;
			}
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			String[] ss = new String[array.length];
			for (int i = 0; i < array.length; i++) {
				ss[i] = matcher.group(array[i]).trim();
			}
			reArrayList.add(ss);
		}
		return reArrayList;
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param array
	 * @return
	 * @Author:lulei
	 * @Description: Get all matches, the selected groups of each match joined into one string
	 */
	public static List<String> getStringArray(String dealStr, String regexStr, int[] array) {
		List<String> reStringList = new ArrayList<String>();
		if (dealStr == null || regexStr == null || array == null) {
			return reStringList;
		}
		for (int i = 0; i < array.length; i++) {
			if (array[i] < 1) {
				return reStringList;
			}
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			StringBuffer sb = new StringBuffer();
			for (int i = 0; i < array.length; i++) {
				sb.append(matcher.group(array[i]).trim());
			}
			reStringList.add(sb.toString());
		}
		return reStringList;
	}
	
	/**
	 * @param dealStr
	 * @param regexStr
	 * @param array capture-group index array
	 * @return
	 * @Author:lulei
	 * @Description: Get the first match as a String[]
	 */
	public static String[] getFirstArray(String dealStr, String regexStr, int[] array) {
		if (dealStr == null || regexStr == null || array == null) {
			return null;
		}
		for (int i = 0; i < array.length; i++) {
			if (array[i] < 1) {
				return null;
			}
		}
		Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
		Matcher matcher = pattern.matcher(dealStr);
		while (matcher.find()) {
			String[] ss = new String[array.length];
			for (int i = 0; i < array.length; i++) {
				ss[i] = matcher.group(array[i]).trim();
			}
			return ss;
		}
		return null;
	}
}
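A small hypothetical usage sketch of DoRegex (not from the original post; the HTML fragment and URLs are made up): getArrayList resolves matched hrefs into absolute links against the current page URL, and getString joins matched groups with a separator.

package com.lulei.util;

public class DoRegexDemo {
	public static void main(String[] args) {
		// made-up fragment of a list page
		String html = "<li><a href=\"/article/1.html\">First</a></li>"
				+ "<li><a href=\"http://example.com/article/2.html\">Second</a></li>";
		// group 1 of the regex is the href value; relative addresses are resolved against the current page URL
		for (String url : DoRegex.getArrayList(html, "<a href=\"(.*?)\"", "http://example.com/list/index.html", 1)) {
			System.out.println(url);
		}
		// extract the anchor texts and join them with a comma
		System.out.println(DoRegex.getString(html, "<a[^>]*>(.*?)</a>", ",", 1));
	}
}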


CharsetUtil, the character-encoding detection class:

 /**  
 * @Description: Charset detection utility
 */ 
package com.lulei.util;  

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;
  
public class CharsetUtil {
	private static final CodepageDetectorProxy detector;
	
	static {// initialize the charset detector
		detector = CodepageDetectorProxy.getInstance();
		detector.add(new ParsingDetector(false));
		detector.add(ASCIIDetector.getInstance());
		detector.add(UnicodeDetector.getInstance());
		detector.add(JChardetFacade.getInstance());
	}

	/**
	 * @param url
	 * @param defaultCharset
	 * @Author:lulei  
	 * @return the detected charset of the resource at the URL, or defaultCharset if detection fails
	 */
	public static String getStreamCharset (URL url, String defaultCharset) {
		if (url == null) {
			return defaultCharset;
		}
		try {
			//use the third-party cpdetector library to detect the encoding
			Charset charset = detector.detectCodepage(url);
			if (charset != null) {
				return charset.name();
			}
		} catch (Exception e1) {
			// TODO Auto-generated catch block
			e1.printStackTrace();
		}
		return defaultCharset;
	}
	
	/**
	 * @param inputStream
	 * @param defaultCharset
	 * @return
	 * @Author:lulei  
	 * @Description: Detect the charset of an input stream
	 */
	public static String getStreamCharset (InputStream inputStream, String defaultCharset) {
		if (inputStream == null) {
			return defaultCharset;
		}
		int count = 200;
		try {
			count = inputStream.available();
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		try {
			//use the third-party cpdetector library to detect the encoding
			Charset charset = detector.detectCodepage(inputStream, count);
			if (charset != null) {
				return charset.name();
			}
		} catch (Exception e1) {
			// TODO Auto-generated catch block
			e1.printStackTrace();
		}
		return defaultCharset;
	}
}
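A quick hypothetical usage sketch of CharsetUtil (not from the original post; the file path is only a placeholder): detect a local file's encoding and fall back to utf-8 if detection fails.

package com.lulei.util;

import java.io.FileInputStream;
import java.io.InputStream;

public class CharsetUtilDemo {
	public static void main(String[] args) throws Exception {
		// the path below is only a placeholder
		try (InputStream in = new FileInputStream("/tmp/page.html")) {
			String charset = CharsetUtil.getStreamCharset(in, "utf-8");
			System.out.println("detected charset: " + charset);
		}
	}
}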
The four classes above make up the basic framework for fetching text resources from the web. The following concrete example shows how to use them.


Baidu News example:

1) Find a Baidu News update list page, such as http://news.baidu.com/n?cmd=4&class=civilnews&pn=1&from=tab.



The article URLs appear as links in the list page's source.



Based on an analysis of the page source, the BaiduNewList class below crawls the Baidu News list page:

 /**  
 * @Description: Baidu News rolling list page; extracts the links on the current page
 */ 
package com.lulei.crawl.news;  

import java.io.IOException;
import java.util.HashMap;

import com.lulei.crawl.CrawlListPageBase;
  
public class BaiduNewList extends CrawlListPageBase{
	private static HashMap<String, String> params;
	
	/**
	 * Add request headers so the request looks like it comes from a browser
	 */
	static {
		params = new HashMap<String, String>();
		params.put("Referer", "http://www.baidu.com");
		params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
	}

	public BaiduNewList(String urlStr) throws IOException {
		super(urlStr, "utf-8", "get", params);  
	}

	@Override
	public String getUrlRegexString() {
		// TODO Auto-generated method stub  
		//regex for the article link addresses in the news list page
		return "<a href=\"(.*?)\"";
	}

	@Override
	public int getUrlRegexStringNum() {
		// TODO Auto-generated method stub
		//capture-group index of the link address in the regex
		return 1;
	}

	/**
	 * @param args
	 * @throws IOException
	 * @Author:lulei
	 * @Description: test case
	 */
	public static void main(String[] args) throws IOException {
		// TODO Auto-generated method stub
		BaiduNewList baidu = new BaiduNewList("http://news.baidu.com/n?cmd=4&class=sportnews&pn=1&from=tab");
		for (String s : baidu.getPageUrls()) {
			System.out.println(s);
		}
	}
}


2) The URLs obtained in step 1 point to the content pages of the news articles. Because the articles on the Baidu News list page come from many different sites, it is hard to find a single structure that fits them all; most news sites, however, put the article body inside <p> tags, so the content is extracted that way.



The News class is implemented as follows:

 /**  
 * @Description: News content from a news site
 */ 
package com.lulei.crawl.news;  

import java.io.IOException;
import java.util.HashMap;

import org.apache.commons.httpclient.HttpException;

import com.lulei.crawl.CrawlBase;
import com.lulei.util.DoRegex;
  
public class News extends CrawlBase{
	private String url;
	private String content;
	private String title;
	private String type;
	
	private static String contentRegex = "<p.*?>(.*?)</p>";
	private static String titleRegex = "<title>(.*?)</title>";
	private static int maxLength = 300;
	
	private static HashMap<String, String> params;
	/**
	 * Add request headers so the request looks like it comes from a browser
	 */
	static {
		params = new HashMap<String, String>();
		params.put("Referer", "http://www.baidu.com");
		params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
	}
	
	/**
	 * @Author:lulei  
	 * @Description: Treat the content inside <p> tags as the article body; if it exceeds the configured maximum length, keep only the leading part
	 */
	private void setContent() {
		String content = DoRegex.getString(getPageSourceCode(), contentRegex, 1);
		content = content.replaceAll("\n", "")
									  .replaceAll("<script.*?/script>", "")
									  .replaceAll("<style.*?/style>", "")
									  .replaceAll("<.*?>", "");
		this.content = content.length() > maxLength ? content.substring(0, maxLength) : content;
	}
	
	/**
	 * @Author:lulei  
	 * @Description: Treat the content of the <title> tag as the title
	 */
	private void setTitle() {
		this.title = DoRegex.getString(getPageSourceCode(), titleRegex, 1);
	}
	
	public News(String url) throws HttpException, IOException {
		this.url = url;
		readPageByGet(url, "utf-8", params);
		setContent();
		setTitle();
	}

	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	public String getContent() {
		return content;
	}

	public String getTitle() {
		return title;
	}

	public String getType() {
		return type;
	}

	public void setType(String type) {
		this.type = type;
	}

	public static void setMaxLength(int maxLength) {
		News.maxLength = maxLength;
	}

	/**
	 * @param args
	 * @throws HttpException
	 * @throws IOException
	 * @Author:lulei  
	 * @Description: test case
	 */
	public static void main(String[] args) throws HttpException, IOException {
		// TODO Auto-generated method stub  
		News news = new News("http://we.sportscn.com/viewnews-1634777.html");
		System.out.println(news.getContent());
		System.out.println(news.getTitle());
	}

}

3) Write the crawl entry point. To keep things simple, only two levels are analyzed, so the URLs of the news update list pages are hard-coded in the program.



Running the crawl task once prints progress for each list page followed by the classification statistics.



In the main function you only need to call run() once, or on a schedule. The full code is as follows:

 /**  
 * @Description: Crawl entry point
 */ 
package com.lulei.knn.data;  

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.lulei.crawl.news.BaiduNewList;
import com.lulei.crawl.news.News;
import com.lulei.knn.index.KnnIndex;
import com.lulei.knn.index.KnnSearch;
import com.lulei.util.ParseMD5;
  
public class CrawlNews {
	private static List<Info> infos;
	private static KnnIndex knnIndex = new KnnIndex();
	private static KnnSearch knnSearch = new KnnSearch();
	private static HashMap<String, Integer> result;
	
	static {
		infos = new ArrayList<Info>();
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=1&from=tab", "體育類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=2&from=tab", "體育類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=3&from=tab", "體育類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=1&sub=0", "軍事類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=2&sub=0", "軍事類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=3&sub=0", "軍事類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=1&sub=0", "財經類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=2&sub=0", "財經類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=3&sub=0", "財經類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=internet&pn=1&from=tab", "互聯網"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=1&sub=0", "房產類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=2&sub=0", "房產類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=3&sub=0", "房產類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=1&sub=0", "遊戲類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=2&sub=0", "遊戲類"));
		infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=3&sub=0", "遊戲類"));
	}

	/**
	 * @Description: URL and category of a list page to crawl
	 * @Author:lulei
	 */
	static class Info{
		String url;
		String type;
		Info(String url, String type) {
			this.url = url;
			this.type = type;
		}
	}

	/**
	 * @param info
	 * @Author:lulei
	 * @Description: Crawl the news articles under one list page
	 */
	private void crawl(Info info) {
		if (info == null) {
			return;
		}
		try {
			BaiduNewList baiduNewList = new BaiduNewList(info.url);
			List<String> urls = baiduNewList.getPageUrls();
			for (String url : urls) {
				News news = new News(url);
				NewsBean newBean = new NewsBean();
				newBean.setId(ParseMD5.parseStrToMd5L32(url));
				newBean.setType(info.type);
				newBean.setUrl(url);
				newBean.setTitle(news.getTitle());
				newBean.setContent(news.getContent());
				//save into the index file
				knnIndex.add(newBean);
				//knn verification
				if (news.getContent() == null || "".equals(news.getContent())) {
					result.put("E", 1 + result.get("E"));
					continue;
				}
				if (info.type.equals(knnSearch.getType(news.getContent()))) {
					result.put("R", 1 + result.get("R"));
				} else {
					result.put("W", 1 + result.get("W"));
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * @Author:lulei
	 * @Description: entry point
	 */
	public void run() {
		result = new HashMap<String, Integer>();
		result.put("R", 0);
		result.put("W", 0);
		result.put("E", 0);
		for (Info info : infos) {
			System.out.println(info.url + "------start");
			crawl(info);
			System.out.println(info.url + "------end");
		}
		try {
			knnIndex.commit();
			System.out.println("R = " + result.get("R"));
			System.out.println("W = " + result.get("W"));
			System.out.println("E = " + result.get("E"));
			System.out.println("accuracy: " + (result.get("R") * 1.0 / (result.get("R") + result.get("W"))));
			System.out.println("-------------finished---------------");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public static void main(String[] args) {
		new CrawlNews().run();
	}
}
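The post notes that run() can be executed once or periodically from main. One possible way to run it on a schedule (an assumption, not part of the original code; the class name and the 6-hour interval are arbitrary) is a ScheduledExecutorService:

package com.lulei.knn.data;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class CrawlScheduler {
	public static void main(String[] args) {
		ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
		// run a full crawl immediately, then again after each 6-hour delay (interval is arbitrary)
		scheduler.scheduleWithFixedDelay(new Runnable() {
			@Override
			public void run() {
				new CrawlNews().run();
			}
		}, 0, 6, TimeUnit.HOURS);
	}
}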


With that, a complete crawling program is finished.
