1. 程式人生 > 爬取京東收件地址下的所有資料

爬取京東收件地址下的所有資料

1.工具備用

package reptile;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

/**
 * JD.com service addresses: crawls the district and town levels of JD's
 * delivery-address tree from {@code https://d.jd.com/area/get} and stores the
 * result in a local database.
 * @author daiyang
 *
 */
public class Reptile4 {
	// Running counter; addTownData prints and increments it once per row it processes.
	public static  int i = 0;
	// Not referenced by any code visible in this class.
	public static  int j = 0;
	/**
	 * Entry point: builds the province and city lists, crawls the districts and towns of
	 * every city on a 50-thread pool, then stores the collected towns in the database.
	 *
	 * @param args unused
	 * @throws Exception if the wait for the crawl tasks is interrupted or a preparation
	 *         step fails
	 */
	public static void main(String[] args) throws Exception {
		// Decode the province table (name|id|code[|type], comma-separated).
		String jdProvince = cover("\u5317\u4eac|1|72|1,\u4e0a\u6d77|2|78|1,\u5929\u6d25|3|51035|1,"
				+ "\u91cd\u5e86|4|113|1,\u6cb3\u5317|5|142,\u5c71\u897f|6|303,\u6cb3\u5357|7|412,"
				+ "\u8fbd\u5b81|8|560,\u5409\u6797|9|639,\u9ed1\u9f99\u6c5f|10|698,\u5185\u8499\u53e4|11|799,"
				+ "\u6c5f\u82cf|12|904,\u5c71\u4e1c|13|1000,\u5b89\u5fbd|14|1116,\u6d59\u6c5f|15|1158,\u798f\u5efa|16|1303,"
				+ "\u6e56\u5317|17|1381,\u6e56\u5357|18|1482,\u5e7f\u4e1c|19|1601,\u5e7f\u897f|20|1715,\u6c5f\u897f|21|1827,"
				+ "\u56db\u5ddd|22|1930,\u6d77\u5357|23|2121,\u8d35\u5dde|24|2144,\u4e91\u5357|25|2235,\u897f\u85cf|26|2951,"
				+ "\u9655\u897f|27|2376,\u7518\u8083|28|2487,\u9752\u6d77|29|2580,\u5b81\u590f|30|2628,\u65b0\u7586|31|2652,"
				+ "\u6e2f\u6fb3|52993|52994,\u53f0\u6e7e|32|2768,\u9493\u9c7c\u5c9b|84|84");
		// Read the raw city source data.
		String unicodeCity = readFile("D:\\test\\city.txt");
		// Decode the city table.
		String jdCity = cover(unicodeCity);
		// Province rows.
		List<Map<String,Object>> provinceList = provinceDataHandle(jdProvince);
		// City rows.
		List<Map<String, Object>> cityList= cityDataHandle(jdCity);
		// Thread-safe collection of district rows, filled by the worker threads.
		ConcurrentLinkedQueue<Map<String,Object>> districtList = new ConcurrentLinkedQueue<Map<String,Object>>();
		// Thread-safe collection of town rows, filled by the worker threads.
		ConcurrentLinkedQueue<Map<String,Object>> courtList = new ConcurrentLinkedQueue<Map<String,Object>>();
		// Move the municipalities from the province level to the city level.
		dataHandle(provinceList, cityList, districtList);
		System.out.println(JSON.toJSON(provinceList));
		System.out.println(JSON.toJSON(cityList));
		System.out.println(JSON.toJSON(districtList));
		// Index of the next city to crawl; every task claims one index.
		// (Original author's note: Chengdu is entry 325 of the list, Mianyang entry 329.)
		AtomicInteger atoI = new AtomicInteger(0);
		// Fixed pool of 50 worker threads.
		ExecutorService es = Executors.newFixedThreadPool(50);
		System.out.println("===========>>>>>>>>>>>>>>>>>>開始搜尋資料");
		try {
			// One task per city; each task resolves its own city through atoI.
			for (int taskNum = 1; taskNum <= cityList.size(); taskNum++) {
				es.submit(() -> getDistrictInfo(courtList, districtList, cityList, atoI));
			}
		} finally {
			// Always stop accepting work so the pool threads can exit and the JVM can end.
			es.shutdown();
		}
		// Block until every submitted task has finished instead of polling isTerminated().
		es.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
		System.out.println("---END---\n");
		System.out.println("所有的子執行緒都結束了!");
		System.out.println("=================>>>>>>>>>>>>>>>開始存入資料庫");
		// Optional inserts for the other levels, left disabled by the original author:
		//addProvinceData(provinceList);
		//addCityData(cityList);
		//addDistructData(districtList);
		addTownData(courtList);
	}


	// Connection most recently opened by getConnection().
	static Connection conn;
	// Class-level prepared-statement slot (JDBC state here is shared, not per call).
	static PreparedStatement ps;
	// Result-set slot; never assigned by any active code in this class.
	static ResultSet rs;
    /**
     * Opens a MySQL connection with the hard-coded URL and credentials below and
     * remembers it in the static {@code conn} field.
     *
     * @return the value of {@code conn} after the attempt; when opening fails this is
     *         whatever {@code conn} held before the call (possibly {@code null})
     */
    public static Connection getConnection(){
        final String jdbcUrl="jdbc:mysql://localhost:port/database";
        final String user="username";
        final String secret="password";

        loadMysqlDriver();

        try {
            Connection opened=DriverManager.getConnection(jdbcUrl, user, secret);
            conn=opened;
            if(opened!=null){
                System.out.println("connection successful");
            }
        } catch (SQLException openFailure) {
            System.out.println( "connection fail");
            openFailure.printStackTrace();
        }
        return conn;
    }

    /** Loads the MySQL JDBC driver class, reporting (but not propagating) a missing driver. */
    private static void loadMysqlDriver(){
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException driverMissing) {
            System.out.println("找不到驅動!");
            driverMissing.printStackTrace();
        }
    }
    
    
    
    /**
     * Inserts every town row of {@code list} into {@code tb_town}.
     *
     * <p>Each map must hold the String values {@code name}, {@code districtId} and
     * {@code id}; the two ids are parsed as integers. The connection and statement are
     * closed when the method returns, including when an insert fails.
     *
     * @param list town rows to insert
     * @return the update count of the last executed insert, or 0 when nothing was
     *         inserted (empty input, no connection, or an SQL failure before the first row)
     */
    public static int addTownData(ConcurrentLinkedQueue<Map<String,Object>> list){
        int row=0;
        String sql="insert into tb_town(name,districtId,jdTownId) values(?,?,?)";
        Connection connection=getConnection();
        if(connection==null){
            // getConnection() has already reported the failure; nothing can be inserted.
            return row;
        }
        // try-with-resources closes the statement first, then the connection, even when
        // prepareStatement or an insert throws.
        try(Connection db=connection;
                PreparedStatement insert=db.prepareStatement(sql)){
            for(Map<String,Object> map:list){
                System.out.println("FBIWARNING  i....:"+(i++));
                insert.setString(1, (String)map.get("name"));
                insert.setInt(2, Integer.parseInt((String)map.get("districtId")));
                insert.setInt(3, Integer.parseInt((String)map.get("id")));
                row=insert.executeUpdate();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return row;
    }
    
    
    
    
    
    
    /**
     * Inserts every district row of {@code list} into {@code tb_district}.
     *
     * <p>Each map must hold the String values {@code name}, {@code cityId} and
     * {@code id}; the two ids are parsed as integers. The connection and statement are
     * closed when the method returns, including when an insert fails.
     *
     * @param list district rows to insert
     * @return the update count of the last executed insert, or 0 when nothing was
     *         inserted (empty input, no connection, or an SQL failure before the first row)
     */
    public static int addDistructData(ConcurrentLinkedQueue<Map<String,Object>> list){
        int row=0;
        String sql="insert into tb_district(name,cityId,jdDistrictId) values(?,?,?)";
        Connection connection=getConnection();
        if(connection==null){
            // getConnection() has already reported the failure; nothing can be inserted.
            return row;
        }
        // try-with-resources closes the statement first, then the connection, even when
        // prepareStatement or an insert throws.
        try(Connection db=connection;
                PreparedStatement insert=db.prepareStatement(sql)){
            for(Map<String,Object> map:list){
                insert.setString(1, (String)map.get("name"));
                insert.setInt(2, Integer.parseInt((String)map.get("cityId")));
                insert.setInt(3, Integer.parseInt((String)map.get("id")));
                row=insert.executeUpdate();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return row;
    }
    
    
    
    /**
     * Inserts every city row of {@code list} into {@code tb_city}.
     *
     * <p>Each map must hold the String values {@code cityName}, {@code provinceId} and
     * {@code cityId}; the two ids are parsed as integers. The connection and statement
     * are closed when the method returns, including when an insert fails.
     *
     * @param list city rows to insert
     * @return the update count of the last executed insert, or 0 when nothing was
     *         inserted (empty input, no connection, or an SQL failure before the first row)
     */
    public static int addCityData(List<Map<String,Object>> list){
        int row=0;
        String sql="insert into tb_city(name,provinceId,jdCityId) values(?,?,?)";
        Connection connection=getConnection();
        if(connection==null){
            // getConnection() has already reported the failure; nothing can be inserted.
            return row;
        }
        // try-with-resources closes the statement first, then the connection, even when
        // prepareStatement or an insert throws.
        try(Connection db=connection;
                PreparedStatement insert=db.prepareStatement(sql)){
            for(Map<String,Object> map:list){
                insert.setString(1, (String)map.get("cityName"));
                insert.setInt(2, Integer.parseInt((String)map.get("provinceId")));
                insert.setInt(3, Integer.parseInt((String)map.get("cityId")));
                row=insert.executeUpdate();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return row;
    }
      
    /**
     * Inserts every province row of {@code list} into {@code tb_province}.
     *
     * <p>Each map must hold the String values {@code provinceName}, {@code provinceCode},
     * {@code provinceType} and {@code provinceId}; type and id are parsed as integers.
     * The connection and statement are closed when the method returns, including when
     * an insert fails.
     *
     * @param list province rows to insert
     * @return the update count of the last executed insert, or 0 when nothing was
     *         inserted (empty input, no connection, or an SQL failure before the first row)
     */
    public static int addProvinceData(List<Map<String,Object>> list){
        int row=0;
        String sql="insert into tb_province(name,provinceCode,provinceType,jdProvinceId) values(?,?,?,?)";
        Connection connection=getConnection();
        if(connection==null){
            // getConnection() has already reported the failure; nothing can be inserted.
            return row;
        }
        // try-with-resources closes the statement first, then the connection, even when
        // prepareStatement or an insert throws.
        try(Connection db=connection;
                PreparedStatement insert=db.prepareStatement(sql)){
            for(Map<String,Object> map:list){
                insert.setString(1, (String)map.get("provinceName"));
                insert.setString(2, (String)map.get("provinceCode"));
                insert.setInt(3, Integer.parseInt((String)map.get("provinceType")));
                insert.setInt(4, Integer.parseInt((String)map.get("provinceId")));
                row=insert.executeUpdate();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return row;
    }
	
	
	/**
	 * Corrects the city list for the municipalities: the source data lists their
	 * districts as cities, so every city row whose {@code provinceId} is 4 or less is
	 * removed and the first four provinces are appended as city rows instead.
	 *
	 * @param provinceList province rows; the first four entries are read
	 * @param cityList city rows, modified in place
	 * @param districtList not used by this method
	 */
	private static void dataHandle(List<Map<String, Object>> provinceList,
			List<Map<String, Object>> cityList,
			ConcurrentLinkedQueue<Map<String, Object>> districtList) {
		// Drop the rows that really are districts of a municipality.
		for(Iterator<Map<String, Object>> cursor = cityList.iterator(); cursor.hasNext();){
			Map<String,Object> city = cursor.next();
			int owner = Integer.valueOf((String)city.get("provinceId"));
			if(owner<=4){
				cursor.remove();
			}
		}
		// Re-add each municipality as a city that is its own province.
		int index = 0;
		while(index<4){
			Map<String, Object> province = provinceList.get(index);
			Object id = province.get("provinceId");
			Map<String, Object> asCity = new HashMap<String, Object>();
			asCity.put("cityName", province.get("provinceName"));
			asCity.put("cityId", id);
			asCity.put("provinceId", id);
			cityList.add(asCity);
			index++;
		}
	}
	/**
	 * Fetches the fourth-level (town) entries below one district and appends one row
	 * per entry to {@code courtList}. Any failure is printed and ends the fetch for
	 * this district.
	 *
	 * @param courtList receives the rows, each with {@code name}, {@code id} and
	 *        {@code districtId}
	 * @param districtId id of the district to query
	 */
	private static void getCourtInfo(
			ConcurrentLinkedQueue<Map<String,Object>> courtList,String districtId) {
		String url = "https://d.jd.com/area/get?fid="+districtId;
		try {
			JSONArray towns=JSONArray.parseArray(request(url));
			for (Object element : towns) {
				JSONObject town=(JSONObject)element;
				Map<String,Object> row = new HashMap<>();
				row.put("name", town.get("name").toString());
				row.put("id", town.getString("id").toString());
				row.put("districtId",districtId);
				courtList.add(row);
			}
		} catch (Exception e) {
			e.printStackTrace();
			System.out.println("請求地址錯誤");
		}
	}
	
	/**
	 * Claims the next city index from {@code atoI}, fetches that city's third-level
	 * (district) entries and, for each district, its towns. Any failure while fetching
	 * or parsing is printed and ends the work for this city.
	 *
	 * @param courtList receives the town rows
	 * @param districtList receives the district rows, each with {@code name},
	 *        {@code id} and {@code cityId}
	 * @param cityList city rows; the entry at the claimed index is processed
	 * @param atoI shared counter handing out city indexes
	 */
	private static void getDistrictInfo(
			ConcurrentLinkedQueue<Map<String,Object>> courtList,ConcurrentLinkedQueue<Map<String,Object>> districtList,
			List<Map<String, Object>> cityList, AtomicInteger atoI) {
		int cityIndex = atoI.getAndIncrement();
		Map<String, Object> city = cityList.get(cityIndex);
		String url = "https://d.jd.com/area/get?fid="+city.get("cityId");
		try {
			JSONArray districts=JSONArray.parseArray(request(url));
			for (Object element : districts) {
				JSONObject district=(JSONObject)element;
				String districtName=district.get("name").toString();
				String districtId=district.getString("id").toString();
				Map<String,Object> row = new HashMap<>();
				row.put("name", districtName);
				row.put("id", districtId);
				row.put("cityId",city.get("cityId"));
				districtList.add(row);
				// Descend one level: collect the towns of this district right away.
				getCourtInfo(courtList,districtId);
			}
		} catch (Exception e) {
			e.printStackTrace();
			System.out.println("請求地址錯誤");
		}
	}
	
	/**
	 * Opens {@code url}, reads the whole response and returns it with the line
	 * terminators removed.
	 *
	 * <p>The response reader is always closed. Connect and read timeouts are set so a
	 * stalled server produces an exception instead of blocking the calling thread forever.
	 * NOTE(review): the body is decoded with the platform default charset, as before;
	 * confirm this matches the encoding the endpoint actually sends.
	 *
	 * @param url address to fetch
	 * @return the response body, lines concatenated without separators
	 * @throws Exception if the URL is malformed, the connection fails or times out
	 */
	private static String request(String url) throws Exception {
		URLConnection connection = new URL(url).openConnection();
		connection.setConnectTimeout(10000);
		connection.setReadTimeout(30000);
		connection.connect();
		StringBuilder content = new StringBuilder();
		// try-with-resources releases the stream (and its socket) even when reading fails.
		try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
			String line;
			while((line = in.readLine())!=null){
				content.append(line);
			}
		}
		return content.toString();
	}
	/**
	 * Parses the decoded city table into city rows.
	 *
	 * <p>Records are separated by {@code hello,dy}; after double quotes are stripped each
	 * record reads {@code provinceId:name|id,name|id,...}. Records without a {@code :}
	 * part (for example blank lines) and entries without an {@code |id} part are
	 * skipped instead of aborting the whole parse.
	 *
	 * @param cityStr decoded city table; {@code null} or empty yields an empty list
	 * @return one map per city with the keys {@code cityName}, {@code cityId} and
	 *         {@code provinceId}
	 */
	private static List<Map<String,Object>> cityDataHandle(String cityStr){
		List<Map<String,Object>> cityList = new ArrayList<Map<String,Object>>();
		if(cityStr==null){
			return cityList;
		}
		for(String record:cityStr.split("hello,dy")){
			String[] keyAndCities = record.replace("\"", "").split(":");
			if(keyAndCities.length<2){
				continue;// blank or malformed record: no city list to read
			}
			// Tabs are indentation of the source file; the id is parsed as an integer later.
			String provinceId = keyAndCities[0].replace("\t", "").trim();
			for(String entry:keyAndCities[1].split(",")){
				if(entry.trim().isEmpty()){
					continue;// empty slot, e.g. a trailing comma
				}
				String[] data = entry.split("\\|");
				if(data.length<2){
					continue;// a name without an id cannot be stored
				}
				Map<String,Object> one = new HashMap<String, Object>();
				one.put("cityName", data[0]);
				one.put("cityId", data[1]);
				one.put("provinceId", provinceId);
				cityList.add(one);
			}
		}
		return cityList;
	}
	/**
	 * Reads the city source file and joins its lines with the marker {@code hello,dy}.
	 *
	 * <p>The reader is always closed. A read failure is printed and whatever was read up
	 * to that point is returned; a missing or empty file yields an empty string.
	 *
	 * @param fileName path of the file to read
	 * @return the lines joined by {@code hello,dy}, without a trailing marker
	 */
	private static String readFile(String fileName){
		File file = new File(fileName);
		String separator = "hello,dy";
		StringBuilder content = new StringBuilder();
		try (BufferedReader br = new BufferedReader(new FileReader(file))) {
			String line;
			while((line=br.readLine())!=null){
				content.append(line).append(separator);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		if(content.length()<separator.length()){
			// Nothing was read, so there is no trailing marker to strip.
			return "";
		}
		return content.substring(0,content.length()-separator.length());
	}
	/**
	 * Parses the decoded province table into province rows.
	 *
	 * <p>Entries are comma-separated and read {@code name|id|code} with an optional
	 * fourth field holding the type. A type of 1 marks a municipality; entries without
	 * the field get type 2 (ordinary province).
	 *
	 * @param provinceStr decoded province table
	 * @return one map per province with the keys {@code provinceName},
	 *         {@code provinceId}, {@code provinceCode} and {@code provinceType}
	 */
	private static List<Map<String,Object>> provinceDataHandle(String provinceStr){
		List<Map<String,Object>> rows = new ArrayList<Map<String,Object>>();
		for(String entry:provinceStr.split(",")){
			String[] fields = entry.split("\\|");
			Map<String,Object> row = new HashMap<String, Object>();
			row.put("provinceName", fields[0]);
			row.put("provinceId", fields[1]);
			row.put("provinceCode", fields[2]);
			row.put("provinceType", fields.length>3 ? fields[3] : "2");
			rows.add(row);
		}
		return rows;
	}
	/**
	 * Decodes textual hexadecimal Unicode escapes (a backslash, the letter 'u' and four
	 * hex digits) into the characters they denote; all other text is copied unchanged.
	 *
	 * <p>The whole input is processed, including its final character. A backslash that
	 * does not start a complete, valid escape is copied through literally.
	 *
	 * @param s text that may contain escapes
	 * @return the decoded text
	 */
	public static String cover(String s){
		StringBuilder decoded = new StringBuilder(s.length());
		int pos = 0;
		while(pos<s.length()){
			int code = unicodeEscapeAt(s, pos);
			if(code>=0){
				decoded.append((char)code);
				pos += 6;// skip backslash, 'u' and the four hex digits
			}
			else{
				decoded.append(s.charAt(pos));
				pos++;
			}
		}
		return decoded.toString();
	}

	/** Returns the code unit of the escape starting at {@code pos}, or -1 if none starts there. */
	private static int unicodeEscapeAt(String s, int pos){
		if(pos+6>s.length() || s.charAt(pos)!='\\' || s.charAt(pos+1)!='u'){
			return -1;
		}
		int code = 0;
		for(int k=pos+2;k<pos+6;k++){
			int digit = Character.digit(s.charAt(k), 16);
			if(digit<0){
				return -1;
			}
			code = code*16+digit;
		}
		return code;
	}
	/**
	 * Appends {@code content} to {@code file} while holding an exclusive file lock.
	 *
	 * <p>The lock is requested once per second until it is granted. The append position
	 * is taken only after the lock is held, so data written by another lock holder in the
	 * meantime is not overwritten. I/O failures are printed, not propagated; an interrupt
	 * while waiting ends the attempt and restores the thread's interrupt flag.
	 * NOTE(review): the text is encoded with the platform default charset, as before.
	 *
	 * @param content text to append
	 * @param file target file; created if it does not exist
	 */
	public static void writeByNIO(String content,File file) {
        try (RandomAccessFile fout = new RandomAccessFile(file, "rw");
                FileChannel fcout = fout.getChannel()) {
            FileLock flout = null;
            while (flout == null) {
                try {
                    // tryLock() returns null when another process holds the lock.
                    flout = fcout.tryLock();
                } catch (OverlappingFileLockException held) {
                    // The lock is held elsewhere in this JVM; retry below.
                }
                if (flout == null) {
                    System.out.print("lock is exist ......");
                    Thread.sleep(1000);
                }
            }
            try {
                fout.seek(fout.length());// position at the current end of the file
                fout.write(content.getBytes());
            } finally {
                flout.release();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
            System.out.print("file no find ...");
        } catch (InterruptedException e) {
            e.printStackTrace();
            Thread.currentThread().interrupt();
        }
    }
}
以上程式能直接爬出京東的全國地址,並將資料存入本地資料庫;使用時請注意修改資料庫連線設定,並預先建立對應的表結構。