有搜尋條件根據url抓取網頁資料(java爬取網頁資料)
阿新 • • 發佈:2018-11-30
最近有一個任務:抓取如下圖的網頁資料。需要以「前一天」作為查詢條件,逐頁翻頁抓取資料,並存入資料庫。
如果只是抓取當前頁的資料,沒有查詢條件也不需要翻頁,這個就比較簡單了;但這裡要選取前一天的資料,而且還有分頁資料。
一開始的思路就想錯了(開始想的是用程式去觸發查詢按鈕和翻頁按鈕),導致任務一度沒有進展。後來在技術經理的協助下搞定:不需要模擬點擊按鈕,直接以 POST 方式把查詢條件(startTime、pageIndex)作為表單參數提交給頁面的 URL 即可。
話不多說 直接貼出程式碼
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import java.util.Properties; import org.apache.http.HttpStatus; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; @Scheduled(cron = "0 0 03 * * ?")//每天凌晨3點抓取資料 //@Scheduled(cron="0/10 * * * * ? 
") //測試 10秒執行一次 //@Scheduled(cron="0 */10 * * * ?") //測試 10分鐘執行一次 @Transactional public void getNotice() throws ClientProtocolException, IOException, ParseException { //獲取當前時間的前一天 Calendar calendar = Calendar.getInstance(); calendar.setTime(new Date()); calendar.add(Calendar.DAY_OF_MONTH, -1); SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");//設定日期格式 String format = df.format(calendar.getTime()); //讀取配置檔案中的企業資訊 Properties properties = new Properties(); // 使用ClassLoader載入properties配置檔案生成對應的輸入流 InputStream in = WryVoluntarilyMonitorService.class.getClassLoader() .getResourceAsStream("config/syqy.properties"); // 使用properties物件載入輸入流 設定字符集以防讀取中文時亂碼 properties.load(new InputStreamReader(in, "utf-8")); // 獲取key對應的value值 String property = properties.getProperty("value"); String[] split = property.split(";"); System.out.println(split.length); for (String s : split) { String[] split2 = s.split(","); // System.out.println(split2[0]+"======="+split2[1]);split2[0]企業名稱;split2[1])企業url String html = getHtmlByUrl(split2[1],format); if (html != null && !"".equals(html)) { //獲取選中的時間有多少頁資料 Document doc1 = Jsoup.parse(html); Elements select = doc1.select("input"); String attr = select.get(4).attr("value"); //迴圈每頁的資料並寫入資料庫 for(int k=1;k<=Integer.parseInt(attr);k++) { String htmlByUrlData = getHtmlByUrlData(split2[1],format,k); Document doc = Jsoup.parse(htmlByUrlData); Elements linksElements = doc.select(".tb_ls >tbody >tr"); for (int i = 1; i < linksElements.size(); i++) { Element element = linksElements.get(i); /** * element.select(">td").get(0).text() 獲取到的是序號 * 判斷是否有資料 (element.select(">td").get(0).text().equals("暫無資料!") 返回true是沒有資料) */ if (!element.select(">td").get(0).text().equals("暫無資料!")) { String aqi1 = element.select(">td").get(1).text();//檢測點位 String aqi2 = element.select(">td").get(2).text();//檢測時間 String aqi3 = element.select(">td").get(3).text();//檢測專案 String aqi4 = element.select(">td").get(4).text();//檢測結果 String aqi5 = 
element.select(">td").get(5).text();//檢測限值 String aqi6 = element.select(">td").get(6).text();//檢測單位 String aqi7 = element.select(">td").get(7).text();//是否達標 String aqi8 = element.select(">td").get(8).text();//超標倍數 String att9 = element.select(">td").get(9).getElementsByTag("td").attr("title");//評價標準 String aqi10 = element.select(">td").get(10).getElementsByTag("td").attr("title");//排放去向 內容太多需要讀取title才能抓取完全 String aqi11 = element.select(">td").get(11).text();//排放方式 String aqi12 = element.select(">td").get(12).text();//備註 WryVoluntarilyMonitor wryVoluntarilyMonitor = new WryVoluntarilyMonitor(); wryVoluntarilyMonitor.setPkid(keyGenerator.getNext()); wryVoluntarilyMonitor.setCompanyName(split2[0]); wryVoluntarilyMonitor.setDetectionPoint(aqi1); wryVoluntarilyMonitor.setDetectionTime(StringToDate(aqi2)); wryVoluntarilyMonitor.setDetectionProject(aqi3); wryVoluntarilyMonitor.setDetectionResult(aqi4); wryVoluntarilyMonitor.setStandardLimitingValue(aqi5); wryVoluntarilyMonitor.setUnit(aqi6); wryVoluntarilyMonitor.setIsStandards(aqi7); wryVoluntarilyMonitor.setExceedingMultiple(aqi8); wryVoluntarilyMonitor.setEvaluationCriterion(att9); wryVoluntarilyMonitor.setEmissionsTo(aqi10); wryVoluntarilyMonitor.setEmissionsWay(aqi11); wryVoluntarilyMonitor.setRemarks(aqi12); super.insert(wryVoluntarilyMonitor); } } } } } System.out.println("執行成功"); } /** * String轉date * @param times * @return * @throws ParseException */ public Date StringToDate(String times) throws ParseException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); return sdf.parse(times); } /** * 根據URL和時間獲得所有的html資訊 * * @param url * @return * @throws IOException * @throws ClientProtocolException */ public static String getHtmlByUrl(String url,String date) throws ClientProtocolException, IOException{ String html = null; //建立httpClient物件 CloseableHttpClient httpClient = HttpClients.createDefault(); CloseableHttpResponse response=null; try { //以get方式請求該URL //HttpGet httpget = new HttpGet(url); 
HttpPost httppost = new HttpPost(url); String query="startTime="+date; StringEntity stringEntity = new StringEntity(query,"UTF-8"); stringEntity.setContentType("application/x-www-form-urlencoded"); httppost.setEntity(stringEntity); //CloseableHttpResponse response = httpClient.execute(httpget); response = httpClient.execute(httppost); //得到responce物件 //HttpResponse responce = httpClient.execute(httpget); //返回碼 int resStatu = response.getStatusLine().getStatusCode(); if (resStatu==HttpStatus.SC_OK) {//200正常 其他就不對 //獲得輸入流 InputStream entity = response.getEntity().getContent(); if (entity!=null) { //通過輸入流轉為字串獲得html原始碼 注:可以獲得實體,然後通過 EntityUtils.toString方法獲得html //但是有可能出現亂碼,因此在這裡採用了這種方式 html=getStreamString(entity); // System.out.println(html); } } } catch (Exception e) { //System.out.println("訪問【"+url+"】出現異常!"); e.printStackTrace(); } finally { //httpClient.getConnectionManager().shutdown(); //response.close(); try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } return html; } /** * 根據url,時間和當前頁獲取資料 * @param url * @param date * @param page * @return * @throws ClientProtocolException * @throws IOException */ public static String getHtmlByUrlData(String url,String date,Integer page) throws ClientProtocolException, IOException{ String html = null; //建立httpClient物件 CloseableHttpClient httpClient = HttpClients.createDefault(); CloseableHttpResponse response=null; try { //以get方式請求該URL //HttpGet httpget = new HttpGet(url); HttpPost httppost = new HttpPost(url); String query="startTime="+date+"&pageIndex="+page; StringEntity stringEntity = new StringEntity(query,"UTF-8"); stringEntity.setContentType("application/x-www-form-urlencoded"); httppost.setEntity(stringEntity); //CloseableHttpResponse response = httpClient.execute(httpget); response = httpClient.execute(httppost); //得到responce物件 //HttpResponse responce = httpClient.execute(httpget); //返回碼 int resStatu = response.getStatusLine().getStatusCode(); if (resStatu==HttpStatus.SC_OK) {//200正常 其他就不對 
//獲得輸入流 InputStream entity = response.getEntity().getContent(); if (entity!=null) { //通過輸入流轉為字串獲得html原始碼 注:可以獲得實體,然後通過 EntityUtils.toString方法獲得html //但是有可能出現亂碼,因此在這裡採用了這種方式 html=getStreamString(entity); // System.out.println(html); } } } catch (Exception e) { //System.out.println("訪問【"+url+"】出現異常!"); e.printStackTrace(); } finally { //httpClient.getConnectionManager().shutdown(); //response.close(); try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } return html; } /** * 將一個輸入流轉化為字串 */ public static String getStreamString(InputStream tInputStream) { if (tInputStream != null) { try { BufferedReader tBufferedReader = new BufferedReader(new InputStreamReader(tInputStream, "utf-8")); StringBuffer tStringBuffer = new StringBuffer(); String sTempOneLine = new String(""); while ((sTempOneLine = tBufferedReader.readLine()) != null) { tStringBuffer.append(sTempOneLine + "\n"); } return tStringBuffer.toString(); } catch (Exception ex) { ex.printStackTrace(); } } return null; }
迴圈
程式碼截圖。為什麼迴圈從 1 開始(而不是 0)?因為表格的第一行是表頭,不是資料。
頁面檢查截圖
讀取title
貼出檢查頁面圖:startTime 是選擇的時間(查詢日期),pageIndex 是當前頁碼。