程式人生 > 利用記憶體資料庫和布隆過濾器寫的網路爬蟲

利用記憶體資料庫和布隆過濾器寫的網路爬蟲

記憶體資料庫用來儲存待訪問url,布隆過濾器用來記錄已訪問的url。先前我們待訪問url是存放在記憶體中,已訪問的url是利用HashSet實現的。

布隆過濾器

package hashfilter;

import java.util.BitSet;

import bdb.CrawlUrl;

/**
 * Simple Bloom filter recording which URLs have already been visited.
 * May report false positives, never false negatives.
 */
public class SimpleBloomFilter {
	// Bit-array capacity: 2 << 24 == 1 << 25 bits (about 4 MB).
	private static final int DEFAULT_SIZE=2<<24;
	// One independent hash function per seed; more seeds lower the false-positive rate.
	private static final int seeds[]={7,11,13,31,37,61};
	private BitSet bits=new BitSet(DEFAULT_SIZE);
	private SimpleHash func[]=new SimpleHash[seeds.length];

	/** Builds one SimpleHash per seed, each mapping into [0, DEFAULT_SIZE). */
	public SimpleBloomFilter()
	{
		for(int i=0;i<func.length;i++)
		{
			func[i]=new SimpleHash(DEFAULT_SIZE,seeds[i]);
		}
	}

	/** Records the URL's original form; a null oriUrl is silently ignored. */
	public void add(CrawlUrl crawlUrl)
	{
		add(crawlUrl.getOriUrl());
	}

	private void add(String value)
	{
		if(value!=null)
		{
			// set one bit per hash function
			for(SimpleHash f:func)
			{
				bits.set(f.hash(value), true);
			}
		}
	}

	/** True when every hash bit for the URL is set, i.e. it was (probably) added before. */
	public boolean contains(CrawlUrl crawlUrl)
	{
		return contains(crawlUrl.getOriUrl());
	}

	private boolean contains(String value)
	{
		if(value==null)
		{
			return false;
		}
		for(SimpleHash f:func)
		{
			// fixed: bail out on the first unset bit instead of iterating all hashes
			if(!bits.get(f.hash(value)))
			{
				return false;
			}
		}
		return true;
	}
}
package hashfilter;

/**
 * One hash function for the Bloom filter: a polynomial rolling hash of the
 * input string, masked into [0, cap). cap is expected to be a power of two
 * so that {@code (cap - 1) & h} behaves like a modulo.
 */
public class SimpleHash {
	private int cap;
	private int seed;

	public SimpleHash(int cap,int seed)
	{
		this.cap=cap;
		this.seed=seed;
	}

	/** Returns the hash of {@code value}, always within [0, cap). */
	public int hash(String value)
	{
		int h=0;
		for(char c:value.toCharArray())
		{
			h=h*seed+c;
		}
		// mask into the bit-array capacity (power-of-two modulo)
		return (cap-1)&h;
	}

}

記憶體資料庫
package bdb;

import java.io.Serializable;
import java.util.Date;

import com.sleepycat.je.utilint.Timestamp;



/**
 * Serializable record of one crawled URL and its page metadata; this is the
 * value type stored in the Berkeley DB backed frontier.
 */
public class CrawlUrl implements Serializable{
	private static final long serialVersionUID=7931672194843948629L;
	public CrawlUrl(){
		
	}
	private String oriUrl;				// original URL value; the host part is a domain name
	
	private String url;					// URL value whose host part is an IP, to avoid duplicate hosts
	private int urlNo;					// URL number
	private int statusCode;				// HTTP status code returned when fetching this URL
	private int hitNum;					// number of times this URL is referenced by other articles
	private String charSet;				// character encoding of the article at this URL
	private String abstractText;		// article abstract
	private String author;				// author
	private int weight;					// article weight (includes guide-word information)
	private String description;			// article description
	private int fileSize;				// article size
	private Timestamp lastUpdateTime;	// last modification time
	private Date timeToLive;			// expiry time
	private String title;				// article title
	private String type;				// article type
	private String[] urlRefrences;		// referenced links (NOTE(review): field name is a typo for "urlReferences")
	private int layer;					// crawl depth: the seed is layer 0, its links layer 1, ...

	public int getLayer()
	{
		return layer;
	}
	public void setLayer(int layer)
	{
		this.layer=layer;
	}
	public String getUrl()
	{
		return url;
	}
	public void setUrl(String url)
	{
		this.url=url;
	}
	public int getUrlNo()
	{
		return urlNo;
	}
	public void setUrlNo(int urlNo) 
	{
		this.urlNo = urlNo;
	}
	public int getStatusCode() 
	{
		return statusCode;
	}
	public void setStatusCode(int statusCode) 
	{
		this.statusCode = statusCode;
	}
	public int getHitNum() 
	{
		return hitNum;
	}
	public void setHitNum(int hitNum) 
	{
		this.hitNum = hitNum;
	}
	public String getCharSet() 
	{
		return charSet;
	}
	public void setCharSet(String charSet) 
	{
		this.charSet = charSet;
	}
	public String getAbstractText() 
	{
		return abstractText;
	}
	public void setAbstractText(String abstractText) 
	{
		this.abstractText = abstractText;
	}
	public String getAuthor() 
	{
		return author;
	}
	public void setAuthor(String author) 
	{
		this.author = author;
	}
	public int getWeight() 
	{
		return weight;
	}
	public void setWeight(int weight)
	{
		this.weight = weight;
	}
	public String getDescription() 
	{
		return description;
	}
	public void setDescription(String description) 
	{
		this.description = description;
	}
	public int getFileSize() 
	{
		return fileSize;
	}
	public void setFileSize(int fileSize) 
	{
		this.fileSize = fileSize;
	}
	public Timestamp getLastUpdateTime() 
	{
		return lastUpdateTime;
	}
	public void setLastUpdateTime(Timestamp lastUpdateTime)
	{
		this.lastUpdateTime = lastUpdateTime;
	}
	public Date getTimeToLive() 
	{
		return timeToLive;
	}
	public void setTimeToLive(Date timeToLive) 
	{
		this.timeToLive = timeToLive;
	}
	public String getTitle() 
	{
		return title;
	}
	public void setTitle(String title) 
	{
		this.title = title;
	}
	public String getType() 
	{
		return type;
	}
	public void setType(String type) 
	{
		this.type = type;
	}
	public String[] getUrlRefrences() 
	{
		return urlRefrences;
	}
	public void setUrlRefrences(String[] urlRefrences) 
	{
		this.urlRefrences = urlRefrences;
	}
	public final String getOriUrl() 
	{
		return oriUrl;
	}
	public void setOriUrl(String oriUrl) 
	{
		this.oriUrl = oriUrl;
	}
}

package bdb;

/** Queue of URLs waiting to be crawled. */
public interface Frontier {
	// Removes and returns the next URL to crawl (BDBFrontier returns null when empty).
	public CrawlUrl getNext() throws Exception;
	// Adds a URL to the frontier; returns true when it was accepted.
	public boolean putUrl(CrawlUrl url) throws Exception;

}

package bdb;

import java.io.File;





import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

/**
 * Base class that owns the Berkeley DB environment plus the two databases the
 * frontier needs: a class catalog for serial bindings and a "URL" database
 * holding the pending URLs. Subclasses implement the key/value operations.
 */
public abstract class AbstractFrontier {
	private Environment env;
	private static final String CLASS_CATALOG="java_class_catalog";
	protected StoredClassCatalog javaCatalog;
	protected Database catalogdatabase;
	protected Database database;
	
	public AbstractFrontier(String homeDirectory)
	{
		System.out.println("Opening environment in: "+homeDirectory);
		// transactional environment, created on first use
		EnvironmentConfig envConfig=new EnvironmentConfig();
		envConfig.setTransactional(true);
		envConfig.setAllowCreate(true);
		env=new Environment(new File(homeDirectory),envConfig);
		
		DatabaseConfig dbConfig=new DatabaseConfig();
		dbConfig.setAllowCreate(true);
		dbConfig.setTransactional(true);
		catalogdatabase=env.openDatabase(null, CLASS_CATALOG, dbConfig);
		// A single StoredClassCatalog object is normally used along with a set of
		// databases that store serialized objects; it holds the class metadata
		// the serial bindings need.
		javaCatalog=new StoredClassCatalog(catalogdatabase);
		DatabaseConfig dbConfig0=new DatabaseConfig();
		dbConfig0.setAllowCreate(true);
		dbConfig0.setTransactional(true);
		// the database that actually stores the pending URL keys/values
		database=env.openDatabase(null,"URL", dbConfig0);
	}
	// Close in reverse order of creation: data database, catalog, then environment.
	public void close()
	{
		database.close();
		javaCatalog.close();
		env.close();
	}
	protected abstract void put(Object key,Object value);
	protected abstract Object get(Object key);
	protected abstract Object delete(Object key);

}

package bdb;

import java.util.Map.Entry;
import java.util.Set;

import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.collections.StoredMap;

/**
 * Berkeley DB backed frontier: persists pending CrawlUrls in a StoredMap
 * keyed by their original URL string, so the work queue survives restarts.
 */
public class BDBFrontier extends AbstractFrontier implements Frontier{
	private StoredMap pendingUrisDB=null;

	public BDBFrontier(String homeDirectory) {
		super(homeDirectory);
		// A DatabaseEntry can be obtained in two ways: from its constructor
		// (taking the object's bytes), or via EntryBinding.objectToEntry().
		EntryBinding keyBinding=new SerialBinding(javaCatalog, String.class);
		EntryBinding valueBinding=new SerialBinding(javaCatalog,CrawlUrl.class);
		// Creates a map entity view of a Database (last arg: writable map)
		pendingUrisDB=new StoredMap(database,keyBinding,valueBinding,true);
		
	}

	// Removes and returns an arbitrary pending URL, or null when the map is empty.
	@Override
	public CrawlUrl getNext() throws Exception {
		CrawlUrl result=null;
		if(!pendingUrisDB.isEmpty())
		{
			// take the first entry the iterator yields, then delete it from the map
			Entry<String,CrawlUrl> 
			entry=(Entry<String,CrawlUrl>)pendingUrisDB.entrySet().iterator().next();
			result=entry.getValue();
			delete(entry.getKey());
		}
		return result;
	}

	// Stores the URL keyed by its original form; always reports success.
	@Override
	public boolean putUrl(CrawlUrl url) throws Exception {
		put(url.getOriUrl(),url);
		return true;
	}

	@Override
	protected void put(Object key, Object value) {
		pendingUrisDB.put(key, value);
		
	}

	@Override
	protected Object get(Object key) {
		return pendingUrisDB.get(key);
	}

	@Override
	protected Object delete(Object key) {
		return pendingUrisDB.remove(key);
	}
	
	// Derives the stored key from a URL; could use MD5 or another digest.
	// NOTE(review): name is a typo for "calculateUrl"; currently unused —
	// putUrl() keys entries by the raw oriUrl instead.
	private String calulateUrl(String url)
	{
		return url;
	}
	
	// True when the URL is already queued (keyed by its original form).
	public boolean contains(CrawlUrl url)
	{
		return pendingUrisDB.containsKey(url.getOriUrl());
	}
	
	public boolean isEmpty()
	{
		return pendingUrisDB.isEmpty();
	}
	
	// Manual test driver (kept for reference).
//	public static void main(String[] args)
//	{
//		BDBFrontier bDBFrontier=new BDBFrontier("D:\\bdb");
//		CrawlUrl url=new CrawlUrl();
//		url.setOriUrl("http://www.baidu.com");
//		try {
//			bDBFrontier.putUrl(url);
//			System.out.println(bDBFrontier.getNext().getOriUrl());
//			bDBFrontier.close();
//		} catch (Exception e) {
//			e.printStackTrace();
//		}
//	}

}

封裝待訪問url和已訪問url
import bdb.BDBFrontier;
import bdb.CrawlUrl;
import hashfilter.SimpleBloomFilter;


/**
 * Facade over the two URL stores: a Berkeley DB backed queue of URLs still to
 * visit, and a Bloom filter of URLs already visited.
 */
public class NewLinkQueue {
	// Bloom filter recording URLs that have already been visited.
	private static SimpleBloomFilter visitedUrl=new SimpleBloomFilter();
	// Persistent queue of URLs waiting to be visited.
	private static BDBFrontier unvistedUrl=new BDBFrontier("D:\\bdb");

	/**
	 * Removes and returns the next unvisited URL, or null when the queue is empty.
	 */
	public static Object unvisitedUrlDeQueue() throws Exception
	{
		// fixed: getNext() returns null on an empty queue, which previously
		// caused a NullPointerException on the getOriUrl() call
		CrawlUrl next=unvistedUrl.getNext();
		return next==null ? null : next.getOriUrl();
	}

	/**
	 * Queues a URL for crawling unless it is blank, already queued,
	 * or already visited.
	 */
	public static void addUnvisitedUrl(String url)
	{
		// fixed: validate the input before building a CrawlUrl and probing the stores
		if(url==null||url.trim().equals(""))
		{
			return;
		}
		CrawlUrl crawlUrl=new CrawlUrl();
		crawlUrl.setOriUrl(url);
		if(!unvistedUrl.contains(crawlUrl)&&!visitedUrl.contains(crawlUrl))
		{
			try {
				unvistedUrl.putUrl(crawlUrl);
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}

	public static boolean unvisitedUrlIsEmpty()
	{
		return unvistedUrl.isEmpty();
	}

	/** Marks a URL as visited by adding it to the Bloom filter. */
	public static void addVisitedUrl(String url)
	{
		CrawlUrl crawlUrl=new CrawlUrl();
		crawlUrl.setOriUrl(url);
		visitedUrl.add(crawlUrl);
	}

}
//下載網頁
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.BasicHttpClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;


/**
 * Downloads a page over HTTP and saves it under E:\temp\, deriving the local
 * file name from the URL and the response Content-Type.
 */
public class DownLoadFile {
	
	private String filePath;                 // local path of the most recently saved page (null until a download succeeds)
	private CloseableHttpClient httpclient;  // currently unused; kept for future connection reuse
	
	DownLoadFile()
	{
		filePath=null;
	}
	
	/**
	 * Builds a safe local file name from a URL and its Content-Type,
	 * replacing characters illegal in file names with '_'.
	 *
	 * @param url         the page URL (expected to start with "http://")
	 * @param contentType the response Content-Type header value
	 * @return file name with a .html extension for HTML pages, otherwise the
	 *         Content-Type subtype (e.g. "image/png" -> ".png")
	 */
	public String getFileNameByUrl(String url,String contentType)
	{
		// strip the leading "http://" — NOTE(review): assumes an http scheme;
		// an https URL would be truncated incorrectly. Confirm against callers.
		url=url.substring(7);
		if(contentType.indexOf("html")!=-1)
		{
			return url.replaceAll("[\\?/:|<>\"]","_")+".html";
		}
		return url.replaceAll("[\\?/:|<>\"]","_")+"."
				+contentType.substring(contentType.lastIndexOf("/")+1);
	}
	
	/**
	 * Downloads the page at the given URL and saves it to disk.
	 *
	 * @param url the page URL to fetch
	 * @return the local path the page was saved to; unchanged (initially null)
	 *         when the request fails
	 */
	public String downloadFile(String url)
	{
		System.out.println("link:"+url);

		// 10-second connect and socket timeouts so a dead host cannot stall the crawl
		HttpParams params = new BasicHttpParams();
		HttpConnectionParams.setConnectionTimeout(params, 10000);
		HttpConnectionParams.setSoTimeout(params, 10000);
		HttpClient httpClient = new DefaultHttpClient(params);

		InputStream responseBody = null;
		FileOutputStream outputStream = null;
		try {
			HttpGet httpGet=new HttpGet(url);
			HttpResponse response=httpClient.execute(httpGet);
			System.out.println("得到http響應");
			if(response.getStatusLine().getStatusCode()==HttpStatus.SC_OK)
			{
				String a=response.getFirstHeader("Content-Type").getValue();
				System.out.println("Content-Type內容: "+a);

				responseBody=response.getEntity().getContent();
				filePath="E:\\temp\\"
						+getFileNameByUrl(url,response.getFirstHeader("Content-Type").getValue());
				System.out.println("檔案路徑: "+filePath);
				outputStream=new FileOutputStream(new File(filePath));

				// copy the body to disk in 1 KB chunks, writing only the bytes actually read
				int length;
				byte b[]=new byte[1024];
				while((length=responseBody.read(b))!=-1)
				{
					outputStream.write(b,0,length);
				}
			}
			else
			{
				System.err.print("Method Failed:"+response.getStatusLine().getStatusCode());
			}
		} catch (ClientProtocolException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch(Exception e){
			e.printStackTrace();
		} finally {
			// fixed: both streams were previously leaked when the copy threw mid-transfer
			if(outputStream!=null)
			{
				try { outputStream.close(); } catch (IOException ignored) { /* already failing; nothing useful to do */ }
			}
			if(responseBody!=null)
			{
				try { responseBody.close(); } catch (IOException ignored) { /* already failing; nothing useful to do */ }
			}
		}

		return filePath;
	}
	
	/**
	 * Saves an input stream to the given local file path, closing both streams.
	 * Fixed: the original wrote the entire 1 KB buffer on every iteration,
	 * appending stale bytes whenever the final read was short.
	 *
	 * @throws IOException if reading or writing fails
	 */
	private void saveToLocal(InputStream responseBody,String filePath) throws IOException
	{
		FileOutputStream outputStream=new FileOutputStream(new File(filePath));
		try {
			int length;
			byte b[]=new byte[1024];
			while((length=responseBody.read(b))!=-1)
			{
				outputStream.write(b,0,length);
			}
		} finally {
			outputStream.close();
			responseBody.close();
		}
	}
}
提取連結
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;


/**
 * Extracts outgoing links from a saved HTML file using htmlparser.
 */
public class HtmlParserTool {
	/**
	 * Reads the file at filePath and collects the href of every &lt;a&gt; tag
	 * plus the src attribute of other matched tags (e.g. &lt;frame&gt;).
	 *
	 * @param filePath local path of a previously downloaded page
	 * @return the set of extracted links; empty on any parse or I/O error
	 */
	public static Set<String> extractLinks(String filePath)
	{
		Set<String> links=new HashSet<String>();
		NodeList nodeList;
		String line="";
		StringBuffer sb=new StringBuffer();
		NodeFilter linkFilter=new NodeClassFilter(LinkTag.class);
		OrFilter lastFilter=new OrFilter();
		lastFilter.setPredicates(new NodeFilter[]{linkFilter});
		BufferedReader br=null;
		try {
			// slurp the whole file into memory, then hand it to the parser
			br=new BufferedReader(new FileReader(filePath));
			while((line=br.readLine())!=null)
			{
				sb.append(line);
			}
			Parser parser=Parser.createParser(sb.toString(), "utf-8");
			nodeList=parser.parse(lastFilter);
			Node nodes[]=nodeList.toNodeArray();
			String link=null;
			for(int i=0;i<nodes.length;i++)
			{
				if(nodes[i] instanceof LinkTag)		// <a> tag
				{
					LinkTag linkNode=(LinkTag)(nodes[i]);
					link=linkNode.getLink();
					links.add(link);
				}
				else	// e.g. a <frame> tag
				{
					// pull the src attribute value, as in <frame src="test.html"/>
					// NOTE(review): this assumes src is the last attribute and the
					// value follows `src="` exactly; an earlier space in the tag
					// text would truncate the URL — confirm against real pages
					String frame=nodes[i].getText();
					int start=frame.indexOf("src");
					int end=frame.indexOf(" ");
					if(end==-1)
					{
						end=frame.indexOf(">");
					}
					String frameUrl=frame.substring(start+5, end-1);
					links.add(frameUrl);
				}
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// fixed: the reader was previously never closed (one leaked file handle per page)
			if(br!=null)
			{
				try {
					br.close();
				} catch (Exception ignored) {
					// nothing useful to do on close failure
				}
			}
		}
		return links;
	}
}

主程式
import java.util.Set;


/**
 * Crawler entry point: seeds the frontier, then repeatedly dequeues a URL,
 * downloads it, marks it visited, and enqueues the links found on the page.
 * (Class name "Clawler" is kept as-is since it is the public entry point.)
 */
public class MyClawler {
	/** Puts every seed URL into the unvisited queue. */
	private void initCrawlerWithSeeds(String[] seeds)
	{
		for(String seed:seeds)
		{
			NewLinkQueue.addUnvisitedUrl(seed);
		}
	}

	/**
	 * Runs the crawl loop until the unvisited queue drains.
	 *
	 * @param seeds starting URLs
	 */
	public void crawling(String[] seeds)
	{
		initCrawlerWithSeeds(seeds);

		DownLoadFile downLoader=new DownLoadFile();
		while(!NewLinkQueue.unvisitedUrlIsEmpty())
		{
			try {
				// dequeue the next unvisited URL
				String visitUrl=(String)NewLinkQueue.unvisitedUrlDeQueue();
				System.out.println("提取未訪問的Url"+visitUrl);
				if(visitUrl==null)
				{
					continue;
				}
				// download the page, then mark the URL as visited
				String filePath=downLoader.downloadFile(visitUrl);
				NewLinkQueue.addVisitedUrl(visitUrl);
				// extract the page's links and enqueue each one
				Set<String> links=HtmlParserTool.extractLinks(filePath);
				System.out.println("網頁中的連結數:"+links.size());
				for(String link:links)
				{
					NewLinkQueue.addUnvisitedUrl(link);
					System.out.println(link);
				}
				System.out.println("網頁中的連結數:"+links.size());
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}

	public static void main(String[] args)
	{
		MyClawler clawler=new MyClawler();
		clawler.crawling(new String[]{"http://www.baidu.com"});
		System.out.println("done");
	}
}

參考文獻:《自己動手寫網路爬蟲》、Berkeley DB參考手冊等