1. 程式人生 > >有搜尋條件根據url抓取網頁資料(java爬取網頁資料)

有搜尋條件根據url抓取網頁資料(java爬取網頁資料)

最近有一個任務:抓取如下圖的網頁資料。需要以前一天的日期作為查詢條件,逐頁抓取資料並存入資料庫。

 

如果只是抓取當前頁的資料,沒有查詢條件也不需要翻頁,那就比較簡單了;但是這裡要選取前一天的資料,而且結果還有分頁。

一開始的思路就想錯了(開始想的是模擬觸發查詢按鈕和翻頁按鈕),導致任務一度沒有進展。後來在技術經理的協助下搞定:不需要模擬點擊,直接用 POST 請求把查詢日期(startTime)和頁碼(pageIndex)作為表單引數提交即可。

話不多說 直接貼出程式碼

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties;

import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;



/**
	 * Scheduled job that scrapes the previous day's monitoring records for every
	 * configured company and inserts one {@code WryVoluntarilyMonitor} row per table row.
	 *
	 * <p>Companies are read from the classpath resource {@code config/syqy.properties},
	 * key {@code value}, formatted as {@code name,url;name,url;...}. For each company the
	 * first response is used only to discover the number of pages; every page is then
	 * fetched again with an explicit page index and parsed.
	 *
	 * <p>A company whose first request fails, a page whose request fails, a malformed
	 * configuration entry, and a table row with fewer than 13 cells are skipped instead of
	 * aborting the whole run.
	 *
	 * @throws ClientProtocolException declared by the HTTP helper methods
	 * @throws IOException if the configuration resource or its {@code value} key is missing,
	 *         or the resource cannot be read
	 * @throws ParseException if a detection time is not in {@code yyyy-MM-dd HH:mm:ss} format
	 */
	@Scheduled(cron = "0 0 03 * * ?") // run every day at 03:00
	//@Scheduled(cron="0/10 * *  * * ? ")  // testing: run every 10 seconds
	//@Scheduled(cron="0 */10 * * * ?") // testing: run every 10 minutes
	@Transactional
	public void getNotice() throws ClientProtocolException, IOException, ParseException {
		// Yesterday's date, sent to the site as the startTime filter.
		Calendar calendar = Calendar.getInstance();
		calendar.add(Calendar.DAY_OF_MONTH, -1);
		SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
		String format = df.format(calendar.getTime());

		// Load the company list; the stream is closed as soon as it has been read.
		Properties properties = new Properties();
		try (InputStream in = WryVoluntarilyMonitorService.class.getClassLoader()
				.getResourceAsStream("config/syqy.properties")) {
			if (in == null) {
				throw new IOException("Classpath resource config/syqy.properties not found");
			}
			// Read through an explicit charset so Chinese company names are not garbled.
			properties.load(new InputStreamReader(in, "utf-8"));
		}
		String property = properties.getProperty("value");
		if (property == null) {
			throw new IOException("Key 'value' is missing from config/syqy.properties");
		}

		String[] split = property.split(";");
		System.out.println(split.length);
		for (String s : split) {
			// split2[0] = company name, split2[1] = company url
			String[] split2 = s.split(",");
			if (split2.length < 2) {
				continue; // malformed entry, nothing to fetch
			}
			String companyName = split2[0];
			String url = split2[1];

			String html = getHtmlByUrl(url, format);
			if (html == null || "".equals(html)) {
				continue; // request failed or returned nothing
			}

			// Per the original author's note, the fifth <input> on the page carries the
			// total number of pages for the selected date. Parsed once, not per iteration.
			Elements inputs = Jsoup.parse(html).select("input");
			int pageCount = Integer.parseInt(inputs.get(4).attr("value"));

			for (int k = 1; k <= pageCount; k++) {
				String htmlByUrlData = getHtmlByUrlData(url, format, k);
				if (htmlByUrlData == null) {
					continue; // page could not be fetched; Jsoup.parse rejects null input
				}
				Document doc = Jsoup.parse(htmlByUrlData);
				Elements linksElements = doc.select(".tb_ls >tbody >tr");
				// Start at 1: the first row is the table header.
				for (int i = 1; i < linksElements.size(); i++) {
					Element element = linksElements.get(i);
					Elements cells = element.select(">td"); // select once per row
					// A data row needs cells 0..12; this also skips the single-cell
					// placeholder row the site renders when there is no data.
					if (cells.size() < 13 || cells.get(0).text().equals("暫無資料!")) {
						continue;
					}
					WryVoluntarilyMonitor wryVoluntarilyMonitor = new WryVoluntarilyMonitor();
					wryVoluntarilyMonitor.setPkid(keyGenerator.getNext());
					wryVoluntarilyMonitor.setCompanyName(companyName);
					wryVoluntarilyMonitor.setDetectionPoint(cells.get(1).text());               // detection point
					wryVoluntarilyMonitor.setDetectionTime(StringToDate(cells.get(2).text()));  // detection time
					wryVoluntarilyMonitor.setDetectionProject(cells.get(3).text());             // detection item
					wryVoluntarilyMonitor.setDetectionResult(cells.get(4).text());              // detection result
					wryVoluntarilyMonitor.setStandardLimitingValue(cells.get(5).text());        // limit value
					wryVoluntarilyMonitor.setUnit(cells.get(6).text());                         // unit
					wryVoluntarilyMonitor.setIsStandards(cells.get(7).text());                  // meets standard?
					wryVoluntarilyMonitor.setExceedingMultiple(cells.get(8).text());            // exceeding multiple
					// The next two cells hold long text; the full value is in the title attribute.
					wryVoluntarilyMonitor.setEvaluationCriterion(
							cells.get(9).getElementsByTag("td").attr("title"));                 // evaluation standard
					wryVoluntarilyMonitor.setEmissionsTo(
							cells.get(10).getElementsByTag("td").attr("title"));                // discharge destination
					wryVoluntarilyMonitor.setEmissionsWay(cells.get(11).text());                // discharge method
					wryVoluntarilyMonitor.setRemarks(cells.get(12).text());                     // remarks
					super.insert(wryVoluntarilyMonitor);
				}
			}
		}
		System.out.println("執行成功");
	}

	/**
	 * Parses a timestamp string into a {@link Date}.
	 *
	 * @param times text in {@code yyyy-MM-dd HH:mm:ss} format
	 * @return the parsed date
	 * @throws ParseException if {@code times} does not match the expected format
	 */
	public Date StringToDate(String times) throws ParseException {
		// A fresh formatter per call: SimpleDateFormat instances are not thread-safe.
		return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(times);
	}
	
	/**
	 * Requests the page at {@code url} filtered by date and returns its HTML.
	 *
	 * <p>The date is sent as the form field {@code startTime} in a POST request.
	 *
	 * @param url  address of the page to query
	 * @param date value for the {@code startTime} form field
	 * @return the response body, or {@code null} when the status is not 200, the response
	 *         has no body, or the request fails (the failure is printed to stderr)
	 * @throws IOException declared for backward compatibility; failures are reported by
	 *         returning {@code null}
	 * @throws ClientProtocolException declared for backward compatibility
	 */
	public static String getHtmlByUrl(String url,String date) throws ClientProtocolException, IOException{
		return postFormForHtml(url, "startTime=" + date);
	}

	/**
	 * Requests one result page at {@code url} filtered by date and returns its HTML.
	 *
	 * <p>The date and page number are sent as the form fields {@code startTime} and
	 * {@code pageIndex} in a POST request.
	 *
	 * @param url  address of the page to query
	 * @param date value for the {@code startTime} form field
	 * @param page value for the {@code pageIndex} form field
	 * @return the response body, or {@code null} when the status is not 200, the response
	 *         has no body, or the request fails (the failure is printed to stderr)
	 * @throws IOException declared for backward compatibility; failures are reported by
	 *         returning {@code null}
	 * @throws ClientProtocolException declared for backward compatibility
	 */
	public static String getHtmlByUrlData(String url,String date,Integer page) throws ClientProtocolException, IOException{
		return postFormForHtml(url, "startTime=" + date + "&pageIndex=" + page);
	}

	/**
	 * POSTs {@code query} as an {@code application/x-www-form-urlencoded} body to
	 * {@code url} and returns the response body as text.
	 *
	 * <p>The client, the response and the body stream are all closed before returning.
	 * Any failure is printed and reported as {@code null}, so callers can skip the page.
	 */
	private static String postFormForHtml(String url, String query) {
		try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
			HttpPost httppost = new HttpPost(url);
			StringEntity stringEntity = new StringEntity(query, "UTF-8");
			stringEntity.setContentType("application/x-www-form-urlencoded");
			httppost.setEntity(stringEntity);

			try (CloseableHttpResponse response = httpClient.execute(httppost)) {
				// Only a 200 response with a body is usable.
				if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK
						|| response.getEntity() == null) {
					return null;
				}
				// The body is decoded by getStreamString (explicit UTF-8) rather than
				// EntityUtils.toString, which the original author found could garble text.
				try (InputStream content = response.getEntity().getContent()) {
					return content == null ? null : getStreamString(content);
				}
			}
		} catch (Exception e) {
			// Best effort: report the failure and let the caller treat null as "no data".
			e.printStackTrace();
			return null;
		}
	}
	
	
	/**
	 * Reads an input stream line by line as UTF-8 text and returns it as one string.
	 *
	 * <p>Every line read is followed by a single {@code "\n"} in the result, whatever line
	 * terminator the stream used. The stream is not closed by this method.
	 *
	 * @param tInputStream stream to read; may be {@code null}
	 * @return the decoded text, or {@code null} if the stream is {@code null} or reading
	 *         fails (the failure is printed to stderr)
	 */
	public static String getStreamString(InputStream tInputStream) {
		if (tInputStream == null) {
			return null;
		}
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(tInputStream, "utf-8"));
			StringBuilder text = new StringBuilder();
			for (String line = reader.readLine(); line != null; line = reader.readLine()) {
				text.append(line).append("\n");
			}
			return text.toString();
		} catch (Exception ex) {
			ex.printStackTrace();
			return null;
		}
	}

迴圈

程式碼截圖:為什麼內層迴圈從 1 開始?因為第一行(索引 0)是表格的表頭。

頁面檢查截圖

 

讀取title

 

 

貼出檢查頁面圖:startTime 為選擇的時間,pageIndex 為當前頁碼。