1. 程式人生 > >Java爬蟲爬取網易汽車車型庫

Java爬蟲爬取網易汽車車型庫

最近由於工作需要,寫了一個小的爬蟲,主要用於爬取網易汽車車型庫(http://product.auto.163.com/)上不同品牌/車標(共175個車標)下不同車系(共1650個系列)的圖片(每個車系各八張)。
這裡寫圖片描述

程式碼下載

程式碼如下:
共CarBrand.java,CarCrawer.java,CarCrawerDemo.java三個檔案。

實體
CarBrand.java

package com.mingo.crawer;

import java.util.ArrayList;

/**
 * Plain data holder passed between the crawler stages.
 *
 * <p>One instance carries whichever subset of fields the producing stage
 * fills in; the remaining fields stay {@code null}. The prefixes appear to
 * abbreviate the pinyin for brand ("pp"), series ("cx") and picture ("tp")
 * - NOTE(review): inferred from how the crawler populates them; confirm.
 */
public class CarBrand {

    // Brand-level data.
    private String ppName;
    private String ppUrl;
    private ArrayList<CarBrand> ppList;

    // Series-level data.
    private String cxName;
    private String cxUrl;
    private ArrayList<CarBrand> cxList;

    // Per-series picture (view) data.
    private String cxTpName;
    private String cxTpUrl;
    private ArrayList<CarBrand> cxTpList;

    // Final image name / URL.
    private String tpName;
    private String tpNameUrl;

    /** @return the brand name, or {@code null} if not set */
    public String getPpName() {
        return ppName;
    }

    /** @param ppName the brand name */
    public void setPpName(String ppName) {
        this.ppName = ppName;
    }

    /** @return the brand page URL, or {@code null} if not set */
    public String getPpUrl() {
        return ppUrl;
    }

    /** @param ppUrl the brand page URL */
    public void setPpUrl(String ppUrl) {
        this.ppUrl = ppUrl;
    }

    /** @return the brand list, or {@code null} if not set */
    public ArrayList<CarBrand> getPpList() {
        return ppList;
    }

    /** @param ppList the brand list */
    public void setPpList(ArrayList<CarBrand> ppList) {
        this.ppList = ppList;
    }

    /** @return the series name, or {@code null} if not set */
    public String getCxName() {
        return cxName;
    }

    /** @param cxName the series name */
    public void setCxName(String cxName) {
        this.cxName = cxName;
    }

    /** @return the series photo page URL, or {@code null} if not set */
    public String getCxUrl() {
        return cxUrl;
    }

    /** @param cxUrl the series photo page URL */
    public void setCxUrl(String cxUrl) {
        this.cxUrl = cxUrl;
    }

    /** @return the series list, or {@code null} if not set */
    public ArrayList<CarBrand> getCxList() {
        return cxList;
    }

    /** @param cxList the series list */
    public void setCxList(ArrayList<CarBrand> cxList) {
        this.cxList = cxList;
    }

    /** @return the picture (view) label, or {@code null} if not set */
    public String getCxTpName() {
        return cxTpName;
    }

    /** @param cxTpName the picture (view) label */
    public void setCxTpName(String cxTpName) {
        this.cxTpName = cxTpName;
    }

    /** @return the picture viewer page URL, or {@code null} if not set */
    public String getCxTpUrl() {
        return cxTpUrl;
    }

    /** @param cxTpUrl the picture viewer page URL */
    public void setCxTpUrl(String cxTpUrl) {
        this.cxTpUrl = cxTpUrl;
    }

    /** @return the picture list, or {@code null} if not set */
    public ArrayList<CarBrand> getCxTpList() {
        return cxTpList;
    }

    /** @param cxTpList the picture list */
    public void setCxTpList(ArrayList<CarBrand> cxTpList) {
        this.cxTpList = cxTpList;
    }

    /** @return the image file name, or {@code null} if not set */
    public String getTpName() {
        return tpName;
    }

    /** @param tpName the image file name */
    public void setTpName(String tpName) {
        this.tpName = tpName;
    }

    /** @return the image URL, or {@code null} if not set */
    public String getTpNameUrl() {
        return tpNameUrl;
    }

    /** @param tpNameUrl the image URL */
    public void setTpNameUrl(String tpNameUrl) {
        this.tpNameUrl = tpNameUrl;
    }
}

具體實現
CarCrawer.java

package com.mingo.crawer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import
java.io.RandomAccessFile; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; public class CarCrawer { public static String carUrl = "http://product.auto.163.com"; public static String SendGet(String url) { // 定義一個字串用來儲存網頁內容 String result = ""; // 定義一個緩衝字元輸入流 BufferedReader in = null; try { // 將string轉成url物件 URL realUrl = new URL(url); // 初始化一個連結到那個url的連線 URLConnection connection = realUrl.openConnection(); // 開始實際的連線 connection.connect(); // 初始化 BufferedReader輸入流來讀取URL的響應 in = new BufferedReader(new InputStreamReader( connection.getInputStream(), "GB2312")); // 用來臨時儲存抓取到的每一行的資料 String line; while ((line = in.readLine()) != null) { // 遍歷抓取到的每一行並將其儲存到result裡面 result += line; } } catch (Exception e) { System.out.println("傳送GET請求出現異常!" + e); e.printStackTrace(); } // 使用finally來關閉輸入流 finally { try { if (in != null) { in.close(); } } catch (Exception e2) { e2.printStackTrace(); } } return result; } /** * 下載檔案到本地 * * @param urlString * 被下載的檔案地址 * @param filename * 本地檔名 * @throws Exception * 各種異常 */ public static void download(String urlString, String filename,String savePath) throws Exception { // 構造URL URL url = new URL(urlString); // 開啟連線 URLConnection con = url.openConnection(); //設定請求超時為5s con.setConnectTimeout(5*1000); // 輸入流 InputStream is = con.getInputStream(); // 1K的資料緩衝 byte[] bs = new byte[1024]; // 讀取到的資料長度 int len; // 輸出的檔案流 File sf=new File(savePath); if(!sf.exists()){ sf.mkdirs(); } OutputStream os = new FileOutputStream(sf.getPath()+"\\"+filename); // 開始讀取 while ((len = is.read(bs)) != -1) { os.write(bs, 0, len); } // 完畢,關閉所有連結 os.close(); is.close(); } public static void writeTxtFile(String content,String txtfilename)throws Exception{ FileWriter writer = new FileWriter(txtfilename, true); writer.write(content); writer.close(); } public static ArrayList<CarBrand> 
removeDuplicate(ArrayList<CarBrand> list) { List<CarBrand> newlist= new ArrayList<CarBrand>(); Set<String> set=new HashSet<String>(); for (CarBrand car:list) { if (car == null) {continue;} String str = car.getCxName(); if (str != null) { if (!set.contains(str)) { //set中不包含重複的 set.add(str); newlist.add(car); } } } return (ArrayList<CarBrand>) newlist; } /* * @param url * 示例 http://product.auto.163.com/brand/a/ */ public static ArrayList<CarBrand> getPpUrl(String url) throws Exception { ArrayList<CarBrand> ppList = new ArrayList<CarBrand>(); String content = CarCrawer.SendGet(url); Pattern patternName = Pattern.compile("title=\"進入.{1,20}品牌頻道"); Pattern patternUrl = Pattern.compile("<a href='/brand/[a-z]/.{1,20}' title"); Matcher matcherName = patternName.matcher(content); Matcher matcherUrl = patternUrl.matcher(content); while(matcherName.find()&&matcherUrl.find()){ CarBrand carBrand = new CarBrand(); carBrand.setPpName(matcherName.group(0).substring(9, matcherName.group(0).length()-4)); carBrand.setPpUrl(carUrl+matcherUrl.group(0).substring(9, matcherUrl.group(0).length()-7)); //System.out.println(carBrand.getPpName()+": "+carBrand.getPpUrl()); ppList.add(carBrand); } return ppList; } /* * @param url * 示例 http://product.auto.163.com/brand/a/ */ public static ArrayList<CarBrand> getCxUrl(String url) throws Exception { ArrayList<CarBrand> cxPicList = new ArrayList<CarBrand>(); String content = CarCrawer.SendGet(url); //Pattern pattern = Pattern.compile("class=\"group\">.*<div class=\"gbox gbox2\" >"); //Matcher matcher = pattern.matcher(content); int i=0; while(content.indexOf("class=\"group\">",i)>0){ int subS = content.indexOf("class=\"group\">",i); int subE = content.indexOf("<div class=\"gbox gbox2\" >",i); String subContent = content.substring(subS, subE); i=subE+10; //System.out.println("subContent "+subContent); Pattern patternTitle = Pattern.compile("頻道\">進入.{1,20}品牌頻道</a>]</span>"); Matcher matcherTitle = patternTitle.matcher(subContent); String strtitle= 
null; if(matcherTitle.find()){ strtitle = matcherTitle.group(0).substring(6, matcherTitle.group(0).length()-16); } Pattern patternName = Pattern.compile("\"檢視.{1,20}圖片\">"); Pattern patternUrl = Pattern.compile("/series/photo/.{10,20}\""); Matcher matcherName = patternName.matcher(subContent); Matcher matcherUrl = patternUrl.matcher(subContent); while(matcherName.find()&&matcherUrl.find()){ CarBrand carBrand = new CarBrand(); carBrand.setPpName(strtitle); //System.out.println(carBrand.getPpName()); carBrand.setCxName(matcherName.group(0).substring(3, matcherName.group(0).length()-4)); carBrand.setCxUrl(carUrl+matcherUrl.group(0).substring(0, matcherUrl.group(0).length()-1)); //System.out.println(carBrand.getCxName()+": "+carBrand.getCxUrl()); cxPicList.add(carBrand); } } return cxPicList; } /* * @param url * 示例 http://product.auto.163.com/series/photo/2350.html#CX001 */ public static ArrayList<CarBrand> getCxPic(String url) throws Exception { ArrayList<CarBrand> cxPicList = new ArrayList<CarBrand>(); String content = CarCrawer.SendGet(url); Pattern pattern = Pattern.compile("http://product.auto.163.com/picture/photoview.{30,40}.html"); Matcher matcher = pattern.matcher(content); int num=1; while(matcher.find()&&num<9){ CarBrand carBrand = new CarBrand(); if(num==1){ carBrand.setCxTpName("左前");} else if(num==2){ carBrand.setCxTpName("正前"); }else if(num==3){ carBrand.setCxTpName("正側"); } else if(num==4){ carBrand.setCxTpName("左後"); }else if(num==5){ carBrand.setCxTpName("正後"); } else if(num==6){ carBrand.setCxTpName("車頂"); }else if(num==7){ carBrand.setCxTpName("前大燈區域性"); } else if(num==8){ carBrand.setCxTpName("後大燈區域性"); }else{ System.out.println("Error: num = "+num); return null;} carBrand.setCxTpUrl(matcher.group(0)); //System.out.println(carBrand.getCxTpName()+": "+matcher.group(0)); num = num + 1; cxPicList.add(carBrand); } return cxPicList; } public static String getBigPic(String url) throws Exception { String bigPicUrl = null; String content = 
CarCrawer.SendGet(url); Pattern pattern = Pattern.compile("<img class=\"main_photo hidden\" data-src=\".{60,70}.jpg"); Matcher matcher = pattern.matcher(content); if(matcher.find()){ //System.out.println(matcher.group(0).substring(41)); bigPicUrl = matcher.group(0).substring(41); } return bigPicUrl; } }

呼叫
CarCrawerDemo.java

package com.mingo.crawer;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Entry point: crawls every brand and series, then downloads up to eight
 * pictures per series into a local directory while appending a line per
 * picture to a text log.
 */
public class CarCrawerDemo {

    public static String carUrl = "http://product.auto.163.com";

    /**
     * Runs the full crawl.
     *
     * @param args unused
     * @throws Exception if listing brands/series or writing the log fails;
     *                   a failure on a single image is reported and skipped
     */
    public static void main(String[] args) throws Exception {
        // Target directory for images and the log.
        String savePath = "D:\\CarTp\\";
        // Log file: one "<file name> <image url>" line per picture.
        String txtfilename = savePath + "output.txt";

        // The log is written before any download happens, so the directory
        // must exist up front; otherwise the first write fails on a fresh machine.
        java.io.File saveDir = new java.io.File(savePath);
        if (!saveDir.exists()) {
            saveDir.mkdirs();
        }

        String url = "http://product.auto.163.com/brand/";
        System.out.println(url);
        ArrayList<CarBrand> pplist = CarCrawer.getPpUrl(url);
        System.out.println(pplist.size());

        ArrayList<CarBrand> cxUrllistNew = new ArrayList<CarBrand>();
        Set<String> ppUrlSet = new HashSet<String>();
        for (CarBrand pp : pplist) {
            String ppUrl = pp.getPpUrl();
            // The first 36 characters are the site root plus "/brand/<letter>/";
            // only the first brand URL seen for each such prefix is crawled.
            String ppUrlStr = ppUrl.substring(0, Math.min(36, ppUrl.length()));
            if (ppUrlSet.add(ppUrlStr)) {
                cxUrllistNew.addAll(CarCrawer.getCxUrl(ppUrl));
            }
        }

        System.out.println(cxUrllistNew.size());

        // getTime() gives a readable timestamp; concatenating the Calendar
        // itself would write its whole internal field dump.
        CarCrawer.writeTxtFile("\nCalendar: " + Calendar.getInstance().getTime(), txtfilename);
        for (CarBrand cxUrlNew : cxUrllistNew) {
            ArrayList<CarBrand> cxTplist = CarCrawer.getCxPic(cxUrlNew.getCxUrl());
            if (cxTplist == null) {
                continue;
            }

            for (CarBrand cxTp : cxTplist) {
                // File name pattern: <brand>_<series>_<view>.jpg
                String tpName = cxUrlNew.getPpName() + "_" + cxUrlNew.getCxName() + "_" + cxTp.getCxTpName() + ".jpg";
                String tpNameUrl = CarCrawer.getBigPic(cxTp.getCxTpUrl());

                CarCrawer.writeTxtFile("\n" + tpName + " " + tpNameUrl, txtfilename);

                if (tpNameUrl != null) {
                    try {
                        CarCrawer.download(tpNameUrl, tpName, savePath);
                    } catch (Exception e) {
                        // One unreachable image must not abort the rest of the crawl.
                        System.out.println("Error: download failed for " + tpNameUrl + ": " + e);
                    }
                }
            }
        }
        System.out.println("finished!");
    }
}

下載結果:
這裡寫圖片描述

改進點:
1 沒有爬取每個車系的年款;
2 庫有點小,車輛主要是小型車;
3 程式碼速度要進一步優化。