1. 程式人生 > 用JAVA實現一個爬蟲,爬取知乎上的內容(程式碼已無法使用)

用JAVA實現一個爬蟲,爬取知乎上的內容(程式碼已無法使用)

在學習JAVA的過程中寫的一個程式,處理上還是有許多問題,爬簡單的頁面還行,遇到複雜的頁面就會失敗。
爬取內容主要使用URLConnection請求獲得頁面內容,使用正則匹配頁面內容獲得所需的資訊存入檔案,使用正則尋找這個頁面中可訪問的URL,使用佇列儲存未訪問的URL和已訪問的URL。另外,由於沒有用到第三方包,所以不需要去下載新的jar包,如果遇到報錯,可能是快捷匯入的包錯了,改過來就好了。

還是直接上程式碼吧。

public class MainTest {
/*
 * author:luo bangliu
 * SCUT
 * */
    public static void main(String[] args) {
        // TODO Auto-generated method stub
String url="http://www.zhihu.com/explore/recommendations"; String result=Spider.SendGet(url); } Spider.getTarget(result); } } public class Spider { static String getEdit="question_link.+?>(.+?)<"; static String getUrl="<h2>.+?question_link.+?href=\"(.+?)\".+?</h2>"
; static String getDeepUrl="question_link.+?href=\"(.+?)\""; static String SendGet(String url){ String result=""; System.out.println("SendGet正在抓取:"+url); BufferedReader in=null; try{ URL realUrl=new URL(url); URLConnection connection=realUrl.openConnection(); connection.connect(); in
=new BufferedReader(new InputStreamReader( connection.getInputStream(),"UTF-8")); String line; while((line=in.readLine())!=null) { result+=line; } } catch (Exception e) { // TODO: handle exception System.out.println("Spider get error:"+url); e.printStackTrace(); } finally{ try{ if(in!=null) in.close(); }catch(Exception e) { e.printStackTrace(); } } return result; } static ArrayList<Zhihu> getRec(String content) { ArrayList<Zhihu>result=new ArrayList<Zhihu>(); Zhihu zhihuTmp=new Zhihu(); Pattern pattern=Pattern.compile(getUrl); Matcher matcher=pattern.matcher(content); while(matcher.find()) { zhihuTmp.init(matcher.group(1)); result.add(zhihuTmp); } return result; } static String getRealUrl(String url) { String ret = "http://www.zhihu.com/explore/recommendations"; Pattern pattern=Pattern.compile("question/(.*?)/"); Matcher matcher=pattern.matcher(url); if(matcher.find()) ret="http://www.zhihu.com/question/"+matcher.group(1); else if(url.length()<25){ ret="http://www.zhihu.com"+url; } return ret; } static void getTarget(String content){ SpiderQueue queue=new SpiderQueue(); Pattern pattern=Pattern.compile(getUrl); Matcher matcher=pattern.matcher(content); Zhihu tmp=new Zhihu(); while(matcher.find()) { queue.addUnvisitedUrl(matcher.group(1)); System.out.println("from recommendations:"+matcher.group(1)); } while(!queue.unVisitedUrlsEmpty()) { String url=(String) queue.unVisitedUrlDequeue(); queue.addVisiteUrl(url); tmp.init(url); url=getRealUrl(url); String c=SendGet(url); Pattern p=Pattern.compile(getDeepUrl); Matcher m=p.matcher(c); if(queue.getUnVisitedUrlNum()<=1000) while(m.find()){ System.out.println("get url from:"+m.group(1)+" number:"+queue.getUnVisitedUrlNum()); queue.addUnvisitedUrl(m.group(1)); } System.out.println("the loop :"+url); FileReaderWriter.writeIntoFile(tmp.writeString(), "E:/test.txt", true); //將獲得的資料寫入檔案中 } System.out.println("queue is empty"+queue.getVisitedUrlNum()); } } public class FileReaderWriter { public static boolean createNewFile(String 
filePath){ boolean ifSuccess=true; String filePathTurn=filePath.replaceAll("\\\\","/"); int index=filePathTurn.lastIndexOf("/"); String dir=filePathTurn.substring(0,index); File fileDir=new File(dir); ifSuccess=fileDir.mkdirs(); File file=new File(filePathTurn); try{ ifSuccess=file.createNewFile(); }catch(IOException e) { ifSuccess=false; e.printStackTrace(); } return ifSuccess; } public static boolean writeIntoFile(String content,String filePath,boolean ifAppend){ boolean ifSuccess=true; int index=filePath.lastIndexOf("/"); String dir=filePath.substring(0,index); File fileDir=new File(dir); fileDir.mkdirs(); File file=null; try{ file = new File(filePath); file.createNewFile(); } catch (IOException e) { ifSuccess = false; e.printStackTrace(); } FileWriter fileWriter=null; try{ fileWriter=new FileWriter(file,ifAppend); fileWriter.write(content); fileWriter.flush(); }catch(IOException e) { ifSuccess=false; e.printStackTrace(); } finally{ try { if(fileWriter!=null) fileWriter.close(); } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } } return ifSuccess; } } public class SpiderQueue { private static Set<Object>visitedUrl=new HashSet<>(); private static Queue unVisitedUrl=new Queue(); public void addVisiteUrl(String Url){ visitedUrl.add(Url); } public void removeVisitedUrl(String url){ visitedUrl.remove(url); } public int getVisitedUrlNum(){ return visitedUrl.size(); } public Object unVisitedUrlDequeue(){ return unVisitedUrl.deQueue(); } public void addUnvisitedUrl(String url){ if(url!=null&&!url.trim().equals("")&&!visitedUrl.contains(url) &&!unVisitedUrl.contians(url)){ unVisitedUrl.enQueue(url); System.out.println("add to list success"+url); } else if(url==null){ System.out.println("url=null"); } else if(url.trim().equals("")){ System.out.println("url equals null"); } else if(visitedUrl.contains(url)){ System.out.println("vistedList alearld have"); } else if(unVisitedUrl.contians(url)){ System.out.println("unVisitedList alearld have"); } else 
System.out.println("something happened"); } public boolean unVisitedUrlsEmpty(){ return unVisitedUrl.empty(); } public int getUnVisitedUrlNum(){ return unVisitedUrl.getNum(); } } public class Zhihu { //獲得標題的正則 public static String getQuestion="zh-question-title.+?<h2.+?>(.+?)</h2>"; //獲得描述的正則 public static String getDetail="zh-question-detail.+?<div.+?>(.*?)</div>"; //獲得答案的正則 public static String getAnswer="data-author-name=\"(.+?)\".+?<div.+?>(.+?)</div>"; public String question; public String zhihuUrl; public ArrayList<String> authorName; public ArrayList<String> answers; public String questionDesc; public void init(String Url) { question=""; zhihuUrl=""; answers=new ArrayList<String>(); authorName=new ArrayList<String>(); questionDesc=""; try { if(getRealUrl(Url)){ String content=Spider.SendGet(zhihuUrl); System.out.println("zhihu spider begin:"+zhihuUrl); Pattern pattern; Matcher matcher; if(content!=null){ pattern=Pattern.compile(getQuestion); matcher=pattern.matcher(content); if(matcher.find()) question=matcher.group(1); else{ question="lost"; System.out.println("lost question:"+Url); } pattern=Pattern.compile(getDetail); matcher=pattern.matcher(content); if(matcher.find()) questionDesc=matcher.group(1); else{ questionDesc="lost"; System.out.println("lost questionDesc:"+Url); } pattern=Pattern.compile(getAnswer); matcher=pattern.matcher(content); while(matcher.find()) { authorName.add(matcher.group(1)); answers.add(matcher.group(2)); } } } } catch (Exception e) { // TODO: handle exception System.out.println("zhihu class error"); e.printStackTrace(); } } public Zhihu(){ question=""; zhihuUrl=""; answers=new ArrayList<String>(); authorName=new ArrayList<String>(); questionDesc=""; // //System.out.println("Zhihu類正在抓取:"+zhihuUrl); } public boolean getAll() { return true; } @Override public String toString() { return "question:"+question+"\n description:"+questionDesc+"\n link:"+zhihuUrl +"\n answer:"+answers+"\n"; } public boolean getRealUrl(String url) { Pattern 
pattern=Pattern.compile("question/(.*?)/"); Matcher matcher=pattern.matcher(url); if(matcher.find()) zhihuUrl="http://www.zhihu.com/question/"+matcher.group(1); else if(url.length()<25){ zhihuUrl="http://www.zhihu.com"+url; } else return false; return true; } public String writeString() { String result = ""; result += "問題:" + question + "\r\n"; result += "描述:" + questionDesc+ "\r\n"; result += "連結:" + zhihuUrl + "\r\n"; for (int i = 0; i < answers.size(); i++) { result += "作者" + i + ":" + authorName.get(i) + "\r\n"; result += "回答" + i + ":" + answers.get(i) + "\r\n"; } result += "\r\n\r\n"; result = result.replaceAll("<br>", "\r\n"); result = result.replaceAll("<.*?>", ""); return result; } } public class Queue { private LinkedList<Object> queue = new LinkedList<Object>(); public void enQueue(Object t){ queue.addLast(t); } public Object deQueue(){ return queue.removeFirst(); } public boolean isQueueEmpty(){ return queue.isEmpty(); } public boolean contians(Object t){ return queue.contains(t); } public boolean empty() { return queue.isEmpty(); } public int getNum() { return queue.size(); } }