【Java】基於jsoup爬蟲實現(從智聯獲取工作資訊)
阿新 • • 發佈:2018-12-17
這幾天在學習Java解析xml,突然想到Dom能不能解析html,結果試了半天行不通,然後就去查了一些資料,發現很多人都在用Jsoup解析html檔案,然後研究了一下,寫了一個簡單的例項,感覺還有很多地方需要潤色,在這裡分享一下我的例項,歡迎交流指教!後續想通過Java把資料匯入到Excel或者生成一個報表!
import java.io.IOException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /**從智聯招聘獲取招聘資訊 * @url 智聯招聘網站連結(建議不要更改) * @city 搜尋工作的城市 * @keywrods 搜尋工作的相關關鍵字 */ public class JsoupHtml { private String url="http://sou.zhaopin.com/jobs/searchresult.ashx?jl="; //智聯招聘網站 private String city="西安"; //搜尋工作的城市 private String keywords="java"; //搜尋工作的關鍵字 public JsoupHtml(String city,String keywords){ this.city=city; this.keywords =keywords; } public void getZhiLianWork(){ try { for (int i=0;i<10;i++) { System.out.println("*********開始遍歷第"+(i+1)+"頁的求職資訊*********"); Document doc = Jsoup.connect(url+city+"&kw="+keywords+"&p="+(i+1)+"&isadv=0").get(); Element content = doc.getElementById("newlist_list_content_table"); Elements zwmcEls = content.getElementsByClass("zwmc"); Elements gsmcEls = content.getElementsByClass("gsmc"); Elements zwyxEls = content.getElementsByClass("zwyx"); Elements gzddEls = content.getElementsByClass("gzdd"); Elements gxsjEls = content.getElementsByClass("gxsj"); for(int j = 0;j<zwmcEls .size();j++){ System.out.println( zwmcEls.get(j).tagName("a").text()+"*****"+gsmcEls.get(j).tagName("a").text()+ "*****"+zwyxEls.get(j).tagName("a").text()+"*****"+gzddEls.get(j).tagName("a").text()+ "*****"+gxsjEls.get(j).tagName("a").text()); System.out.println(); } System.out.println("*********結束遍歷第"+(i+1)+"頁的求職資訊*********"); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void main(String[] args) { JsoupHtml jHtml = new JsoupHtml("上海", "java"); jHtml.getZhiLianWork(); } }
更新原始碼,支援生成html表格:
import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class JsoupHtml { public static void main(String[] args) { try { String url ="http://sou.zhaopin.com/jobs/searchresult.ashx?"; String city ="西安"; String keywords = "java"; BufferedWriter bWriter = new BufferedWriter( new OutputStreamWriter( new FileOutputStream("output.html"),"utf-8")); bWriter.write(""); File input = new File("input.html"); Document doc2 = Jsoup.parse(input, "UTF-8", ""); Element table = doc2.getElementById("workinfo"); table.text(""); Element theader = table.appendElement("tr"); theader.appendElement("th").text("序號"); theader.appendElement("th").text("職位名稱"); theader.appendElement("th").text("公司名稱"); theader.appendElement("th").text("職位月薪"); theader.appendElement("th").text("工作地點"); theader.appendElement("th").text("釋出日期"); for(int page=0;page<10;page++){ Document doc = Jsoup.connect(url+city+"&kw="+keywords+"&p="+page).get(); Element content = doc.getElementById("newlist_list_content_table"); Elements zwmcEls = content.getElementsByClass("zwmc"); Elements gsmcEls = content.getElementsByClass("gsmc"); Elements zwyxEls = content.getElementsByClass("zwyx"); Elements gzddEls = content.getElementsByClass("gzdd"); Elements gxsjEls = content.getElementsByClass("gxsj"); for(int i = 1;i<zwmcEls .size();i++){ Element tr =table.appendElement("tr"); tr.appendElement("td").text((page+1)+"-"+i); tr.appendElement("td").text(zwmcEls.get(i).tagName("a").text()); tr.appendElement("td").text(gsmcEls.get(i).tagName("a").text()); tr.appendElement("td").text(zwyxEls.get(i).tagName("a").text()); tr.appendElement("td").text(gzddEls.get(i).tagName("a").text()); tr.appendElement("td").text(gxsjEls.get(i).tagName("a").text()); } } System.out.println(doc2.html()); bWriter.write(doc2.html()); 
bWriter.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
input.html模板(程式會讀取此檔案,填入表格後輸出為output.html):
<!doctype html>
<!-- Report template: the scraper fills the element with id "workinfo" with table rows. -->
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="Generator" content="EditPlus®">
  <meta name="Author" content="">
  <meta name="Keywords" content="">
  <meta name="Description" content="">
  <title>智聯工作資訊</title>
  <style>
    body{margin:0;padding:0;}
    .header{height:100px;width:100%;background:#39c;color:#fff;text-align:center;line-height:100px;font-size:40px;
      font-family:"Microsoft YaHei","微軟雅黑",sans-serif;}
    .body{width:100%;background:#fff;}
    .body table{width:90%;margin:0 auto;color:#2e2e2e;border:1px solid #cad9ea;
      border-collapse: collapse; }
    /* Both selectors are scoped to the report table; a bare "td" would match every cell on the page. */
    .body table th,.body table td{min-width:50px;max-width:300px;}
    .feeter{height:30px;width:100%;background:#39c;color:#fff;text-align:center;line-height:30px;font-size:14px;
      font-family:"Microsoft YaHei","微軟雅黑",sans-serif;}
  </style>
</head>
<body>
  <div class="header">智聯工作資訊</div>
  <div class="body">
    <table class="work" border="1">
      <!-- Rows are generated into this element; existing content is discarded. -->
      <tbody id="workinfo">
      </tbody>
    </table>
  </div>
  <div class="feeter">版權所有 翻版必究@2018 Joker</div>
</body>
</html>