最新手寫圖片爬蟲(針對千庫網和一個福利網站)
阿新 • • 發佈:2018-12-10
前言:由於前一段時期需要從網站上扒一些圖片下來,css、js都好扒,就是圖片數量眾多,需要用程式碼實現。在網上找了一堆已實現的程式碼,要麼沒有用,要麼功能不是自己想要的,乾脆自己寫一個。寫的時候發現還挺簡單的,而且不單單可以下載圖片,只要是網站資源都可以,只不過需要針對單個網站去寫程式碼,因為每個網站的html佈局都不一樣。
1.圖片下載工具類:檔案路徑自己設定
package com.example.demo.util.netbug.downloadImage; /** * Descripition:image download util * Created by jin.tang on 2018/9/7...... */ import lombok.extern.java.Log; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; @Log public class DownloadPicFromURL { private static String[] imgType = {".jpg", ".jpeg", ".bmp", ".png", ".tif", ".gif", ".fpx", ".svg", ".psd", ".pcx", ".tga", ".exif", "psd", "cdr", "ufo", "raw"};//bmp,jpg,png,tif,gif,pcx,tga,exif,fpx,svg,psd,cdr,pcd,dxf,ufo,eps,ai,raw,WMF,webp public static void main(String[] args) { String url = "http://36.33.40.131:8200/uploadFile//userImg/1531368430530title.png"; String path = "d:/html/img/"; downloadPicture(url, path); } public static Boolean coverString(String url, String[] imgType) { Boolean flag = false; // 查詢是否存在圖片格式 for (String type : imgType) { if (url.indexOf(type) != -1 || url.toUpperCase().indexOf(type.toUpperCase()) != -1) { flag = true; break; } } return flag; } //連結url下載圖片 public static void downloadPicture(String urlList, String path) { path = path + new SimpleDateFormat("yyyyMMdd").format(new Date()) + "_" + urlList.substring(urlList.lastIndexOf("/") + 1); if(!coverString(urlList, imgType)){ path=path+".jpg"; } URL url = null; try { url = new URL(urlList); DataInputStream dataInputStream = new DataInputStream(url.openStream()); FileOutputStream fileOutputStream = new FileOutputStream(new File(path)); ByteArrayOutputStream output = new ByteArrayOutputStream(); byte[] buffer = new byte[1024]; int length; while ((length = dataInputStream.read(buffer)) > 0) { output.write(buffer, 0, length); } fileOutputStream.write(output.toByteArray()); dataInputStream.close(); fileOutputStream.close(); } catch (MalformedURLException e) { e.printStackTrace(); } catch 
(IOException e) { e.printStackTrace(); } log.info("img: " + urlList.substring(urlList.lastIndexOf("/") + 1) + " download " + path + " done....."); } }
2、爬蟲實現類
package com.example.demo.util.netbug.downloadImage; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.*; import lombok.extern.java.Log; import java.util.List; /** * Descripition:spider for image,batch get image url,then you can download ---just for 千庫網 * Created by jin.tang on 2018/9/7...... */ @Log public class SpiderForInternetImage { public static void doSearchResourceToQianku(HtmlPage htmlpage) { try { // 根據名字得到一個表單,檢視上面這個網頁的原始碼可以發現表單的名字叫“f” //final HtmlForm form = htmlpage.getFormByName("f"); // 同樣道理,獲取”百度一下“這個按鈕 //final HtmlSubmitInput button = form.getInputByValue("百度一下"); // 得到搜尋框 //final HtmlTextInput textField = form.getInputByName("q1"); // 最近周星馳比較火呀,我這裡設定一下在搜尋框內填入”周星馳“ //textField.setValueAttribute("周星馳"); // 輸入好了,我們點一下這個按鈕 //final HtmlPage nextPage = button.click(); // 我把結果轉成String //String result = nextPage.asXml(); //當前頁的圖片下載 List<HtmlElement> a = htmlpage.getByXPath("//a[@class='db']"); a.stream().forEach(href -> { //單個a標籤內所有img的下載 DomNodeList<HtmlElement> imgs = href.getElementsByTagName("img"); doDownload(imgs); }); //跳轉下一頁的頁面圖片下載 List<HtmlAnchor> next = htmlpage.getByXPath("//a[@class='downPage']"); HtmlPage nextPage_ = next.get(0).click(); doSearchResourceToQianku(nextPage_); // for(HtmlAnchor ach:achList){ // System.out.println(ach.getHrefAttribute()); // ach.click(); // } //DomNodeList<HtmlElement> p = a.getElementsByTagName("p"); //List<HtmlElement> byXPath = p.get(0).getByXPath("span"); //DomNodeList<HtmlElement> imgs = byXPath.get(0).getElementsByTagName("img"); } catch (Exception e) { log.info("error happen ->" + e.getMessage()); e.printStackTrace(); } } public static void doSearchResourceToMmonly(WebClient webclient, HtmlPage htmlpage) { try { //當前頁的圖片下載 List<HtmlElement> divs = htmlpage.getByXPath("//div[@class='ABox']"); divs.stream().forEach(div -> { //單個div標籤內所有img的下載 DomNodeList<HtmlElement> a = div.getElementsByTagName("a"); //取第一個a標籤,裡面包含圖片連結頁面 String currentPageUrl = 
a.get(0).getAttribute("href"); //點選進入該頁面 try { HtmlPage currentHtmlpage = webclient.getPage(currentPageUrl); doDownloadCHildPage(currentHtmlpage, 1); } catch (Exception e) { e.printStackTrace(); } }); //跳轉下一頁的頁面圖片下載 List<HtmlElement> div_jump = htmlpage.getByXPath("//div[@id='pageNum']"); DomNodeList<HtmlElement> as=div_jump.get(0).getElementsByTagName("a"); log.info(""+as.get(as.size()-2)); HtmlElement nexta = as.get(as.size()-2); HtmlPage np=nexta.click(); doSearchResourceToMmonly(webclient,np ); } catch (Exception e) { log.info("error happen ->" + e.getMessage()); e.printStackTrace(); } } public static void doDownloadCHildPage(HtmlPage currentHtmlpage, int times) throws Exception { List<HtmlElement> div_2 = currentHtmlpage.getByXPath("//div[@class='big-pic']"); HtmlElement bigDiv = div_2.get(0);//大圖 //每頁只下一張大圖 DomNodeList<HtmlElement> imgs = bigDiv.getElementsByTagName("img"); doDownloadToMmonly(imgs); //調到下一頁的大圖頁面 List<HtmlElement> li = currentHtmlpage.getByXPath("//li[@id='nl']"); // DomNodeList<DomElement> li = currentHtmlpage.getElementsById("nl"); DomNodeList<HtmlElement> a_ = li.get(0).getElementsByTagName("a"); HtmlElement next_ = a_.get(0); //只下前8張圖片,因為每套圖數量不一樣,不好統一 if (times <=8) { HtmlPage nextPage_ = next_.click(); doDownloadCHildPage(nextPage_, times + 1); } } //單個標籤內子img標籤的迴圈下載 public static void doDownload(DomNodeList<HtmlElement> imgs) { // http://bpic.588ku.com/back_pic/03/72/92/6657b9a240d3d1f.jpg!/fh/300/quality/90/unsharp/true/compress/true System.out.println("總共" + imgs.size() + " 張圖片 , 開始下載到本地 路徑為 : "); // 遍歷 下載圖片到本地 for (HtmlElement img : imgs) { if ("".equals(img.getAttribute("data-original")) || !img.getAttribute("data-original").contains("http")) { log.info("current image src ==> " + img.getAttribute("data-original") + " :is not right!"); } else { log.info("current image src ==> " + img.getAttribute("data-original") + " :is right!"); String imgUrl = img.getAttribute("data-original").substring(0, 
img.getAttribute("data-original").contains("!") ? img.getAttribute("data-original").lastIndexOf("!") : img.getAttribute("data-original").length()); DownloadPicFromURL.downloadPicture(imgUrl, "d:/html/img/"); } } } //單個標籤內子img標籤的迴圈下載 public static void doDownloadToMmonly(DomNodeList<HtmlElement> imgs) { // http://bpic.588ku.com/back_pic/03/72/92/6657b9a240d3d1f.jpg!/fh/300/quality/90/unsharp/true/compress/true System.out.println("總共" + imgs.size() + " 張圖片 , 開始下載到本地 路徑為 : "); // 遍歷 下載圖片到本地 for (HtmlElement img : imgs) { if ("".equals(img.getAttribute("src")) || !img.getAttribute("src").contains("http")) { log.info("current image src ==> " + img.getAttribute("src") + " :is not right!"); } else { log.info("current image src ==> " + img.getAttribute("src") + " :is right!"); String imgUrl = img.getAttribute("src").substring(0, img.getAttribute("src").contains("!") ? img.getAttribute("src").lastIndexOf("!") : img.getAttribute("src").length()); DownloadPicFromURL.downloadPicture(imgUrl, "d:/html/img/"); } } } public static void main(String[] args) { try { // 得到瀏覽器物件,直接New一個就能得到,現在就好比說你得到了一個瀏覽器了 WebClient webclient = new WebClient(); // 這裡是配置一下不載入css和javaScript,配置起來很簡單,是不是 webclient.getOptions().setCssEnabled(false); webclient.getOptions().setJavaScriptEnabled(false); //做的第一件事,去拿到這個網頁,只需要呼叫getPage這個方法即可 HtmlPage htmlpage = webclient.getPage("http://588ku.com/beijing/0-0-pxnum-0-8-0-0-0-1/?hd=205"); doSearchResourceToQianku(htmlpage); // HtmlPage htmlpage = webclient.getPage("http://www.mmonly.cc/mmtp/"); // doSearchResourceToMmonly(webclient, htmlpage); } catch (Exception e) { log.info("error happen ->" + e.getMessage()); e.printStackTrace(); } } }
3、Maven 依賴(htmlunit)
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit --> <dependency> <groupId>net.sourceforge.htmlunit</groupId> <artifactId>htmlunit</artifactId> <version>2.32</version> </dependency>
4、福利網站的下載結果: