Java 程式爬取網頁上的圖片
阿新 • • 發佈:2018-12-18
最近需要在網上找一些圖片,所以寫了一個爬取圖片的程式。我是新手,如有寫得不足之處,還請各位大佬指點一二。
原始碼如下
package com.sysh.ssm.service; import org.apache.commons.lang3.StringEscapeUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import java.io.*; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.List; import java.util.Scanner; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * @author: * @Date:2018/5/22 */ public class SpiderPicturesFromBaiduByWord { public static void main(String[] args) throws Exception{ String downloadPath = "C:\\sunhoo\\image"; System.out.println("輸入爬取關鍵字(可用空格,、號分隔多個想爬的關鍵字):"); Scanner KeyWord = new Scanner(System.in); String Word =KeyWord.nextLine(); System.out.println("輸入要下載的頁數(1表示一頁,一頁有30張圖片)"); Integer pageSize=KeyWord.nextInt(); List<String> list = nameList(Word); getPictures(list,pageSize,downloadPath); //1代表下載一頁,一頁一般有30張圖片 } public static void getPictures(List<String> keywordList, int max,String downloadPath) throws Exception{ // key為關鍵詞,max作為爬取的頁數 String gsm=Integer.toHexString(max)+""; String finalURL = ""; String tempPath = ""; //每頁的數量 Integer pagenumber=10; for(String keyword : keywordList){ tempPath = downloadPath; if(!tempPath.endsWith("\\")){ tempPath = downloadPath+"\\"; } tempPath = tempPath+keyword+"\\"; File f = new File(tempPath); if(!f.exists()){ f.mkdirs(); } int picCount = 1; for(int page=0;page<=max;page++) { sop("正在下載第"+page+"頁面"); Document document = null; try { String url ="http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word="+keyword+"&cg=star&pn="+page*pagenumber+"&rn=30&itg=0&z=0&fr=&width=&height=&lm=-1&ic=0&s=0&st=-1&gsm="+Integer.toHexString(page*pagenumber); //String url ="https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1540974009530_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%8D%8E%E5%B1%B1"+Integer.toHexString(page*30); sop(url); document = 
Jsoup.connect(url).data("query", "Java")//請求引數 .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")//設定urer-agent get(); .timeout(5000) .get(); String xmlSource = document.toString(); xmlSource = StringEscapeUtils.unescapeHtml3(xmlSource); sop("頁面"+xmlSource.length()); String reg = "objURL\":\"http://.+?\\.jpg"; Pattern pattern = Pattern.compile(reg); Matcher m = pattern.matcher(xmlSource); sop("mmm"+m); while (m.find()) { finalURL = m.group().substring(9); sop(keyword+picCount+++":"+finalURL); download(finalURL,tempPath); sop("下載成功"); } } catch (IOException e) { e.printStackTrace(); } } } sop("下載完畢"); delMultyFile(downloadPath); sop("已經刪除所有空圖"); } public static void delMultyFile(String path){ File file = new File(path); if(!file.exists()) { throw new RuntimeException("File \""+path+"\" NotFound when excute the method of delMultyFile()....");} File[] fileList = file.listFiles(); File tempFile=null; for(File f : fileList){ if(f.isDirectory()){ { delMultyFile(f.getAbsolutePath());} }else{ if(f.length()==0) { sop(f.delete()+"---"+f.getName());} } } } public static List<String> nameList(String nameList){ List<String> arr = new ArrayList<String>(); String[] list; if(nameList.contains(",")) { list= nameList.split(",");} else if(nameList.contains("、")) { list= nameList.split("、");} else if(nameList.contains(" ")) {list= nameList.split(" ");} else{ arr.add(nameList); return arr; } for(String s : list){ arr.add(s); } return arr; } public static void sop(Object obj){ System.out.println(obj); } //根據圖片網路地址下載圖片 public static void download(String url,String path){ //path = path.substring(0,path.length()-2); File file= null; File dirFile=null; FileOutputStream fos=null; HttpURLConnection httpCon = null; URLConnection con = null; URL urlObj=null; InputStream in =null; byte[] size = new byte[1024]; int num=0; try { String downloadName= url.substring(url.lastIndexOf("/")+1); dirFile = new File(path); if(!dirFile.exists() && path.length()>0){ 
if(dirFile.mkdir()){ sop("creat document file \""+path.substring(0,path.length()-1)+"\" success...\n"); } }else{ file = new File(path+downloadName); fos = new FileOutputStream(file); if(url.startsWith("http")){ urlObj = new URL(url); con = urlObj.openConnection(); httpCon =(HttpURLConnection) con; in = httpCon.getInputStream(); while((num=in.read(size)) != -1){ for(int i=0;i<num;i++) { fos.write(size[i]);} } } } }catch (FileNotFoundException notFoundE) { sop("找不到該網路圖片...."); }catch(NullPointerException nullPointerE){ sop("找不到該網路圖片...."); }catch(IOException ioE){ sop("產生IO異常....."); }catch (Exception e) { e.printStackTrace(); }finally{ try { fos.close(); } catch (Exception e) { e.printStackTrace(); } } } }