Spring Boot + JSoup 抓取京東商品資訊
阿新 • • 發佈:2019-02-18
需求分析
- 匯入京東商品URL列表
- 生成京東商品資訊並輸出到excel表
思路
- 讀取excel獲取URL列表
- 訪問url並獲得HTML原始碼
- 提取對應的商品資訊欄位
- 輸出到excel
搭建框架
建立Spring Boot工程
選擇依賴
配置pom.xml
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation
重新命名application.properties為application.yml,修改內容
server:
  port: 8080 # HTTP port the application listens on
spring:
  thymeleaf:
    # Disable the template cache during development; otherwise page edits are not visible in real time
    cache: false
    prefix: classpath:/templates/
    suffix: .html
建立resources/templates/index.html
<!DOCTYPE html>
<!-- Upload page: posts an .xlsx file of product URLs to /excel/upload and receives the generated workbook back. -->
<html lang="zh-Hant" xmlns:th="http://www.thymeleaf.org">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
    <title>生成商品資訊</title>
</head>
<body>
<h3>請選擇excel檔案</h3>
<!-- multipart/form-data is required so the file part reaches the controller's "file" request parameter -->
<form action="/excel/upload" enctype="multipart/form-data" method="post">
    <!-- The server parses the upload with XSSFWorkbook, which only reads the .xlsx format -->
    <input type="file" name="file" accept=".xlsx" required/>
    <input type="submit" value="匯入"/>
</form>
</body>
</html>
建立PageController並訪問頁面
/**
 * Created with IntelliJ IDEA.
 * Project: operation-web
 * Package: com.landawang.operationweb.controller
 * User: Wangxin
 * Date: 2018/7/12
 * Time: 16:46
 * Copyright © 2018 黃旺鑫. All rights reserved.
 */
package com.landawang.operationweb.controller;

import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;

/**
 * Serves the application's HTML pages.
 */
@Controller
public class PageController {

    /** Logical name of the view returned for the root path. */
    private static final String INDEX_VIEW = "index";

    /**
     * Handles requests to the root path.
     *
     * @return the logical view name {@code "index"}
     */
    @RequestMapping("/")
    public String index() {
        return INDEX_VIEW;
    }
}
提取商品資訊
建立ExcelController.java,編寫實現邏輯
/**
 * Created with IntelliJ IDEA.
 * Project: operation-web
 * Package: com.landawang.operationweb.controller
 * User: Wangxin
 * Date: 2018/7/12
 * Time: 17:26
 * Copyright © 2018 黃旺鑫. All rights reserved.
 */
package com.landawang.operationweb.controller;

import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.usermodel.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;

import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

/**
 * Accepts an uploaded .xlsx file of product URLs, scrapes each page and
 * returns a new .xlsx file containing the extracted product fields.
 */
@RestController
@RequestMapping("/excel")
public class ExcelController {

    /**
     * Handles the upload and streams the generated workbook back as a download.
     *
     * <ol>
     *   <li>Read the URL from the first column of every row of the first sheet.</li>
     *   <li>Fetch the HTML source of each URL.</li>
     *   <li>Extract the product fields from the HTML.</li>
     *   <li>Write the fields to a new workbook and send it in the response.</li>
     * </ol>
     *
     * <p>Input row {@code i} is written to output row {@code i + 1}; output row 0 is the title row.
     * Rows whose first cell is empty, does not contain {@code "http"}, or whose page does not
     * answer with HTTP 200 are left blank in the output.
     *
     * @param response servlet response the generated workbook is written to
     * @param file     uploaded .xlsx workbook
     */
    @PostMapping("/upload")
    public void uploadExcel(HttpServletResponse response, @RequestParam("file") MultipartFile file) {
        // try-with-resources: both workbooks and the HTTP client are released even when scraping fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook(file.getInputStream());
             XSSFWorkbook newWorkbook = new XSSFWorkbook();
             CloseableHttpClient httpClient = HttpClients.createDefault()) {

            XSSFSheet sheet = workbook.getSheetAt(0);
            XSSFSheet newSheet = newWorkbook.createSheet();

            // Title row. setCellValue(String) already makes the cell a string cell,
            // so no explicit cell type is passed to createCell.
            XSSFRow titleRow = newSheet.createRow(0);
            titleRow.createCell(0).setCellValue("商品編碼");
            titleRow.createCell(1).setCellValue("商品名稱");
            titleRow.createCell(2).setCellValue("商品分類");

            // Column widths
            newSheet.setColumnWidth(0, 2560);
            newSheet.setColumnWidth(1, 25600);
            newSheet.setColumnWidth(2, 5120);

            // DataFormatter returns "" for a missing cell and renders non-string cells as text,
            // where getStringCellValue() would throw.
            DataFormatter formatter = new DataFormatter();

            // getLastRowNum() is the 0-based index of the last row, so the bound is inclusive.
            for (int i = 0; i <= sheet.getLastRowNum(); i++) {
                XSSFRow row = sheet.getRow(i);
                if (row == null) {
                    // Rows that were never written do not exist in the sheet.
                    continue;
                }

                String url = formatter.formatCellValue(row.getCell(0)).trim();

                if (!url.isEmpty() && url.contains("http")) {
                    Map<String, String> data = getProductInfo(httpClient, url);
                    if (data != null) {
                        XSSFRow newRow = newSheet.createRow(i + 1);
                        newRow.createCell(0).setCellValue(data.get("sku"));
                        newRow.createCell(1).setCellValue(data.get("name"));
                        newRow.createCell(2).setCellValue(data.get("cat"));
                    }
                }

                // Debug output
                System.out.println("\n內容是:" + url);
            }

            // Send the workbook as a download named after the current timestamp.
            response.setContentType("application/octet-stream");
            String fileName = new Date().getTime() + ".xlsx";
            response.setHeader("Content-disposition", "attachment;filename=" + fileName);
            newWorkbook.write(response.getOutputStream());
            // Flush only after the body is written so a failure above can still change the status.
            response.flushBuffer();
        } catch (Exception e) {
            e.printStackTrace();
            // Report the failure instead of returning an empty 200 response.
            if (!response.isCommitted()) {
                response.reset();
                response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
            }
        }
    }

    /**
     * Fetches a product page and extracts its product fields.
     *
     * @param httpClient client used to execute the request; owned and closed by the caller
     * @param url        product page URL
     * @return the extracted fields, or {@code null} when the server does not answer with
     *         HTTP 200 or sends no body
     * @throws IOException if the request fails or the body cannot be read
     */
    private Map<String, String> getProductInfo(CloseableHttpClient httpClient, String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        // Present a browser user agent
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0");

        // The response holds the underlying connection until it is closed.
        try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
            int statusCode = httpResponse.getStatusLine().getStatusCode();
            HttpEntity entity = httpResponse.getEntity();

            if (statusCode != 200 || entity == null) {
                // Drain the body so the connection can be reused; consume() ignores null.
                EntityUtils.consume(entity);
                return null;
            }

            // toString() reads the entity to the end, so no separate consume() is needed.
            String html = EntityUtils.toString(entity, Consts.UTF_8);
            return getData(html);
        }
    }

    /**
     * Extracts the product fields from a product page's HTML.
     *
     * @param html HTML source of the product page
     * @return map with the keys {@code "name"}, {@code "sku"} and {@code "cat"}; a field that
     *         is not found in the page maps to the empty string
     */
    private static Map<String, String> getData(String html) {
        Map<String, String> data = new HashMap<>();
        Document doc = Jsoup.parse(html);

        // Product title
        String name = doc.select("div[class=sku-name]").text();
        data.put("name", name);

        // SKU: taken from the first link that carries a data-sku attribute
        Elements skuLinks = doc.select("a[data-sku]");
        Element firstSkuLink = skuLinks.first();
        String sku = firstSkuLink == null ? "" : firstSkuLink.attr("data-sku");
        data.put("sku", sku);

        // Category
        String cat = doc.select("a[clstag=shangpin|keycount|product|mbNav-1]").text();
        data.put("cat", cat);

        // Debug output
        System.out.print(sku + "---------" + cat + "---------" + name);

        return data;
    }
}
測試
準備excel表(.xlsx格式,在第一個工作表的A欄逐行填入商品URL)
獲取時間戳命名的excel表