1. 程式人生 > >【Java】基於jsoup爬蟲實現(從智聯獲取工作資訊)

【Java】基於jsoup爬蟲實現(從智聯獲取工作資訊)

這幾天在學習Java解析xml,突然想到Dom能不能解析html,結果試了半天行不通,然後就去查了一些資料,發現很多人都在用Jsoup解析html檔案,然後研究了一下,寫了一個簡單的例項,感覺還有很多地方需要潤色,在這裡分享一下我的例項,歡迎交流指教!後續想通過Java把資料匯入到Excel或者生成一個報表!

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches job postings from the Zhaopin search result pages with jsoup and prints
 * them to standard output, one posting per line.
 *
 * <p>Each request URL is built from a fixed search endpoint, the city to search in,
 * the search keywords and a 1-based page number.
 */
public class JsoupHtml {

    /** Zhaopin search endpoint; the city is appended directly after {@code jl=}. */
    private static final String SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";

    /** Number of result pages requested per run. */
    private static final int PAGE_COUNT = 10;

    /** Separator printed between the columns of one posting. */
    private static final String SEPARATOR = "*****";

    /** City to search jobs in. */
    private final String city;

    /** Keywords to search jobs for. */
    private final String keywords;

    /**
     * Creates a scraper for the given search.
     *
     * @param city     city to search jobs in
     * @param keywords keywords to search jobs for
     */
    public JsoupHtml(String city, String keywords) {
        this.city = city;
        this.keywords = keywords;
    }

    /**
     * Requests result pages 1 to {@link #PAGE_COUNT} and prints every posting found.
     *
     * <p>A page whose result container is missing is reported on standard error and
     * skipped. The first {@link IOException} aborts the remaining pages; its stack
     * trace is printed and the method returns normally.
     */
    public void getZhiLianWork() {
        try {
            for (int page = 1; page <= PAGE_COUNT; page++) {
                System.out.println("*********開始遍歷第" + page + "頁的求職資訊*********");
                Document doc = Jsoup.connect(
                        SEARCH_URL + city + "&kw=" + keywords + "&p=" + page + "&isadv=0").get();
                Element content = doc.getElementById("newlist_list_content_table");
                if (content == null) {
                    // getElementById returns null when the id is absent from the page.
                    System.err.println("Page " + page
                            + ": element #newlist_list_content_table not found, skipping");
                } else {
                    printPostings(content);
                }
                System.out.println("*********結束遍歷第" + page + "頁的求職資訊*********");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints one line per posting, followed by an empty line.
     *
     * <p>The five columns are looked up by CSS class and combined by index, so the
     * loop is bounded by the shortest column to stay in range if the lists differ.
     */
    private static void printPostings(Element content) {
        Elements zwmcEls = content.getElementsByClass("zwmc");
        Elements gsmcEls = content.getElementsByClass("gsmc");
        Elements zwyxEls = content.getElementsByClass("zwyx");
        Elements gzddEls = content.getElementsByClass("gzdd");
        Elements gxsjEls = content.getElementsByClass("gxsj");

        int rows = Math.min(
                Math.min(zwmcEls.size(), gsmcEls.size()),
                Math.min(zwyxEls.size(), Math.min(gzddEls.size(), gxsjEls.size())));

        for (int j = 0; j < rows; j++) {
            // text() already includes the text of nested elements such as links;
            // Element.tagName(String) is a setter that renames the tag, so it is not used.
            System.out.println(
                    zwmcEls.get(j).text() + SEPARATOR + gsmcEls.get(j).text()
                    + SEPARATOR + zwyxEls.get(j).text() + SEPARATOR + gzddEls.get(j).text()
                    + SEPARATOR + gxsjEls.get(j).text());
            System.out.println();
        }
    }

    /** Runs a sample search for "java" jobs in Shanghai. */
    public static void main(String[] args) {
        JsoupHtml jHtml = new JsoupHtml("上海", "java");
        jHtml.getZhiLianWork();
    }

}

更新原始碼,支援生成html表格:

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
 
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 
/**
 * Fetches job postings from the Zhaopin search result pages with jsoup and renders
 * them as rows of an HTML table.
 *
 * <p>The template {@code input.html} is parsed, the element with id {@code workinfo}
 * is emptied and refilled with a header row plus one row per posting, and the
 * resulting document is printed and written to {@code output.html} as UTF-8.
 */
public class JsoupHtml {

    /** Number of result pages requested per run. */
    private static final int PAGE_COUNT = 10;

    /** Column titles of the generated table, in output order. */
    private static final String[] COLUMN_TITLES = {
        "序號", "職位名稱", "公司名稱", "職位月薪", "工作地點", "釋出日期"
    };

    /**
     * Builds the table and writes the output file.
     *
     * <p>Any {@link IOException} (template unreadable, request failed, output not
     * writable) aborts the run and its stack trace is printed.
     */
    public static void main(String[] args) {
        try {
            // The city value is sent as the "jl" query parameter.
            String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";
            String city = "西安";
            String keywords = "java";

            File input = new File("input.html");
            Document doc2 = Jsoup.parse(input, "UTF-8", "");
            Element table = doc2.getElementById("workinfo");
            if (table == null) {
                System.err.println("input.html contains no element with id \"workinfo\"");
                return;
            }

            // Drop whatever the template held before adding the fresh rows.
            table.text("");
            Element theader = table.appendElement("tr");
            for (String title : COLUMN_TITLES) {
                theader.appendElement("th").text(title);
            }

            // Pages are requested 1-based so the request matches the row label.
            for (int page = 1; page <= PAGE_COUNT; page++) {
                Document doc = Jsoup.connect(url + city + "&kw=" + keywords + "&p=" + page).get();
                Element content = doc.getElementById("newlist_list_content_table");
                if (content == null) {
                    System.err.println("Page " + page
                            + ": element #newlist_list_content_table not found, skipping");
                    continue;
                }
                Elements zwmcEls = content.getElementsByClass("zwmc");
                Elements gsmcEls = content.getElementsByClass("gsmc");
                Elements zwyxEls = content.getElementsByClass("zwyx");
                Elements gzddEls = content.getElementsByClass("gzdd");
                Elements gxsjEls = content.getElementsByClass("gxsj");

                // The columns are combined by index; bound by the shortest list.
                int rows = Math.min(
                        Math.min(zwmcEls.size(), gsmcEls.size()),
                        Math.min(zwyxEls.size(), Math.min(gzddEls.size(), gxsjEls.size())));

                // Index 0 is skipped — presumably the header row of the source table; verify.
                for (int i = 1; i < rows; i++) {
                    Element tr = table.appendElement("tr");
                    tr.appendElement("td").text(page + "-" + i);
                    // text() already includes nested link text; Element.tagName(String)
                    // is a setter that renames the tag, so it is not used.
                    tr.appendElement("td").text(zwmcEls.get(i).text());
                    tr.appendElement("td").text(gsmcEls.get(i).text());
                    tr.appendElement("td").text(zwyxEls.get(i).text());
                    tr.appendElement("td").text(gzddEls.get(i).text());
                    tr.appendElement("td").text(gxsjEls.get(i).text());
                }
            }

            String html = doc2.html();
            System.out.println(html);

            // Open the output only once the document is complete, so a failed run
            // does not leave a truncated output.html, and always close the writer.
            try (BufferedWriter bWriter = new BufferedWriter(
                    new OutputStreamWriter(
                            new FileOutputStream("output.html"), "utf-8"))) {
                bWriter.write(html);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

input.html模板(程式讀取此檔案,填入資料後輸出為output.html):

<!doctype html>
<html lang="zh-Hant">
 <head>
  <meta charset="UTF-8">
  <meta name="Generator" content="EditPlus®">
  <meta name="Author" content="">
  <meta name="Keywords" content="">
  <meta name="Description" content="">
  <title>智聯工作資訊</title>
  <style>
  body{margin:0;padding:0;}
    .header{height:100px;width:100%;background:#39c;color:#fff;text-align:center;line-height:100px;font-size:40px;
        font-family:"微軟雅黑";}
    .body{width:100%;background:#fff;}
    .body table{width:90%;margin:0 auto;color:#2e2e2e;border:1px solid #cad9ea; border-collapse: collapse; }
    /* Both selectors are scoped to the table; a bare "td" would match every cell on the page. */
    .body table th, .body table td{min-width:50px;max-width:300px;}
    .feeter{height:30px;width:100%;background:#39c;color:#fff;text-align:center;line-height:30px;font-size:14px;
        font-family:"微軟雅黑";}
  </style>
 </head>
 <body>
    <div class="header">智聯工作資訊</div>
    <div class="body">
        <table class="work" border="1">
            <!-- Placeholder looked up by id "workinfo"; its content is replaced with the generated rows. -->
            <tbody id="workinfo">
            </tbody>
        </table>
    </div>
    <div class="feeter">版權所有 翻版必究@2018 Joker</div>
 </body>
</html>