
Building a Simple Crawler with HttpClient and HtmlParser

import java.util.Set;

/**
 * Main entry class for the crawler.
 * @author Qing
 *
 */
public class Crawler {
	/**
	 * Initialize the URL queue with the seed URLs.
	 * @param seeds the seed URLs to start crawling from
	 */
	 private void initCrawlerWithSeeds(String[] seeds){
		 for(int i = 0; i < seeds.length; i++){
			 LinkDB.addUnvisitedUrl(seeds[i]);
		 }
	 }
	 public void crawling(String[] seeds){
		 // Only follow links that stay on http://club.xdnice.com/
		 LinkFilter filter = new LinkFilter(){
			 public boolean accept(String url){
				 return url.startsWith("http://club.xdnice.com/");
			 }
		 };
		 // Initialize the URL queue with the seeds
		 initCrawlerWithSeeds(seeds);
		 // Crawl while unvisited URLs remain and at most 10 pages have been visited
		 while(!LinkDB.unVisitedUrlIsEmpty() && LinkDB.getVisitedUrlNum() <= 10){
			 // Dequeue the next URL
			 String visitUrl = LinkDB.unVisitedUrlDeQueue();
			 if(visitUrl == null){
				 continue;
			 }
			 // Download the page
			 FileDownLoader fdloader = new FileDownLoader();
			 fdloader.downloadFile(visitUrl);
			 // Mark the URL as visited
			 LinkDB.addVisitedUrl(visitUrl);
			 // Extract links from the page and queue the ones that pass the filter
			 Set<String> links = HtmlParserTool.extractLinks(visitUrl, filter);
			 for(String link: links){
				 LinkDB.addUnvisitedUrl(link);
			 }
			 
		 }
	 }
}
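
The Crawler class above has no entry point of its own; a minimal launcher could look like this (CrawlerMain is a hypothetical name; the seed URL matches the host the LinkFilter accepts):

public class CrawlerMain {
	public static void main(String[] args) {
		// Seed the crawl with the forum front page; the filter in Crawler.crawling
		// keeps the crawl on http://club.xdnice.com/.
		new Crawler().crawling(new String[] { "http://club.xdnice.com/" });
	}
}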

FileDownLoader is the object responsible for page downloads: it uses HttpClient to fetch a page's bytes and save them to a local file.

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Page downloader.
 * @author Qing
 *
 */
public class FileDownLoader {
	/**
	 * Derive a local file name from the URL and the page's content type,
	 * replacing characters that are not allowed in file names.
	 * @param url the page URL
	 * @param contentType the value of the Content-Type response header
	 * @return a file name that is safe on the local file system
	 */
	public String getFileNameByUrl(String url, String contentType){
		url = url.substring(7); // strip the leading "http://"
		if(contentType.indexOf("html") != -1){ // html page: use the cleaned URL as-is
			return url.replaceAll("[\\?/:*|<>\"]", "_");
		}
		else{ // other types: append an extension taken from the content type
			return url.replaceAll("[\\?/:*|<>\"]", "_") + "." + contentType.substring(contentType.lastIndexOf("/") + 1);
		}
	}
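	// For example (illustrative values, not from the original article):
	//   getFileNameByUrl("http://club.xdnice.com/index.jsp?boardid=1", "text/html; charset=gb2312")
	//     returns "club.xdnice.com_index.jsp_boardid=1"
	//   getFileNameByUrl("http://club.xdnice.com/images/logo", "image/gif")
	//     returns "club.xdnice.com_images_logo.gif"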
	/**
	 * Save the page content to a local file.
	 * @param data the page content
	 * @param filePath the path of the file to write
	 */
	public void saveToLocal(String data, String filePath){
		try{
			// Write plain UTF-8 text. DataOutputStream.writeUTF would prepend a
			// 2-byte length and use modified UTF-8, which is wrong for saving pages.
			Writer out = new OutputStreamWriter(new FileOutputStream(filePath), "UTF-8");
			out.write(data);
			out.flush();
			out.close();
		}catch(Exception e){
			e.printStackTrace();
		}
	}
	/**
	 * Download the page at the given URL.
	 * @param url the page URL
	 * @return the local path the page was saved to, or null on failure
	 */
	public String downloadFile(String url){
		String filePath = null;
		CloseableHttpClient httpclient = HttpClients.createDefault();
		try{
			// Execute the GET request
			HttpGet httpget = new HttpGet(url);
			CloseableHttpResponse response = httpclient.execute(httpget);
			try{
				// Read the response body and save it under temp/
				HttpEntity entity = response.getEntity();
				filePath = "temp/" + getFileNameByUrl(url, response.getFirstHeader("Content-Type").getValue());
				saveToLocal(EntityUtils.toString(entity), filePath);
			}finally{
				response.close();
			}
		}catch(Exception e){
			e.printStackTrace();
		}
		return filePath;
	}
}
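
Used on its own, the downloader can be tried as below (DownloadDemo is a hypothetical name; note that the temp/ directory must already exist, since saveToLocal does not create it):

public class DownloadDemo {
	public static void main(String[] args) {
		FileDownLoader loader = new FileDownLoader();
		// downloadFile returns the local path the page was saved to, or null on failure.
		String path = loader.downloadFile("http://club.xdnice.com/");
		System.out.println("saved to: " + path);
	}
}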
HtmlParserTool uses HtmlParser to parse a page and extract the links that satisfy a filter condition.

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

/**
 * Parses a page and extracts its links.
 * @author Qing
 *
 */
public class HtmlParserTool {
	/**
	 * Extract the links on a page; filter decides which links are kept.
	 * @param url the page URL
	 * @param filter the link filter
	 * @return the set of links that pass the filter
	 */
	public static Set<String> extractLinks(String url, LinkFilter filter){
		Set<String> links = new HashSet<String>();
		try{
			Parser parser = new Parser(url);
			parser.setEncoding("gb2312"); // the target site is GB2312-encoded
			// Filter that matches <frame> tags
			NodeFilter frameFilter = new NodeFilter(){
				@Override
				public boolean accept(Node node) {
					// A frame tag's text starts with "frame src="
					return node.getText().startsWith("frame src=");
				}
			};
			// OrFilter that matches either <a> tags or the <frame> filter above
			OrFilter linkfilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
			NodeList list = parser.extractAllNodesThatMatch(linkfilter);
			for(int i = 0; i < list.size(); i++){
				Node tag = list.elementAt(i);
				if(tag instanceof LinkTag){ // <a> tag
					LinkTag link = (LinkTag) tag;
					String linkUrl = link.getLink();
					if(filter.accept(linkUrl)){
						links.add(linkUrl);
					}
				}
				else{ // <frame> tag: pull the URL out of the src attribute by hand
					String frame = tag.getText();
					int start = frame.indexOf("src=");
					frame = frame.substring(start);
					int end = frame.indexOf(" ");
					if(end == -1){
						end = frame.indexOf(">");
					}
					// Cut off the leading src=" and the trailing quote
					frame = frame.substring(5, end - 1);
					if(filter.accept(frame)){
						links.add(frame);
					}
				}
			}
		}catch(Exception e){
			e.printStackTrace();
		}
		return links;
	}
}
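
The extractor can also be exercised directly (ExtractDemo is a hypothetical name; the sketch assumes the site is reachable):

import java.util.Set;

public class ExtractDemo {
	public static void main(String[] args) {
		// Collect every link on the front page that stays on the same host.
		Set<String> links = HtmlParserTool.extractLinks("http://club.xdnice.com/",
				new LinkFilter() {
					public boolean accept(String url) {
						return url.startsWith("http://club.xdnice.com/");
					}
				});
		for (String link : links) {
			System.out.println(link);
		}
	}
}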
LinkDB keeps track of the set of links that have been visited and the queue of links that have not been visited yet.

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

public class LinkDB {
	private static Set<String> visitedUrl = new HashSet<String>();
	private static Queue<String> unVisitedUrl = new ArrayDeque<String>();
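
	// The article omits LinkDB's methods; the sketch below fills in the operations
	// the crawler calls, assuming a single-threaded crawl (no locking needed).

	/** Queue a URL unless it is empty, already visited, or already queued. */
	public static void addUnvisitedUrl(String url) {
		if (url != null && !url.trim().equals("")
				&& !visitedUrl.contains(url) && !unVisitedUrl.contains(url)) {
			unVisitedUrl.add(url);
		}
	}

	/** Record a URL as visited. */
	public static void addVisitedUrl(String url) {
		visitedUrl.add(url);
	}

	/** Take the next URL off the queue, or null when the queue is empty. */
	public static String unVisitedUrlDeQueue() {
		return unVisitedUrl.poll();
	}

	public static boolean unVisitedUrlIsEmpty() {
		return unVisitedUrl.isEmpty();
	}

	public static int getVisitedUrlNum() {
		return visitedUrl.size();
	}
}
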
LinkFilter is a filter interface exposing an accept(String url) method; it exists because NodeFilter can only offer accept(Node node).
public interface LinkFilter {
	public boolean accept(String url);

}
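
Since LinkFilter declares a single abstract method, on Java 8 or later the anonymous class in Crawler.crawling could be written as a lambda instead:

LinkFilter filter = url -> url.startsWith("http://club.xdnice.com/");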